def get_uncorr_drug_features(include_rnaseq=True):
    """Build the de-duplicated, correlation-filtered drug feature array.

    Starts from the LINCS compound features in
    'Data/phase2_compounds_morgan_2048.csv' (after ``remove_non_lncap``),
    optionally merges in the RNAseq compounds, and then runs the result
    through ``remove_dups`` -> ``get_array`` -> ``remove_corr_features`` ->
    ``remove_dup_np``.

    Args:
        include_rnaseq: When true, the compounds from
            'Data/inhouse_morgan_2048.csv' are added as well; an entry with
            the same key as a LINCS compound replaces it.

    Returns:
        Whatever ``remove_dup_np`` returns for the filtered feature array.
    """
    # Compounds that come from the LINCS dataset.
    features_by_drug = remove_non_lncap(
        get_feature_dict('Data/phase2_compounds_morgan_2048.csv')  # , use_int=True)
    )

    # Compounds that were in RNAseq.
    if include_rnaseq:
        features_by_drug.update(get_feature_dict('Data/inhouse_morgan_2048.csv'))

    deduped_by_drug = remove_dups(features_by_drug)
    feature_matrix = get_array(deduped_by_drug)
    uncorrelated = remove_corr_features(feature_matrix)
    return remove_dup_np(uncorrelated)
def get_predictions(up_model_filename, down_model_filename):
    """Run the "up" and "down" models on every in-house drug / gene pair.

    One input row is built per (drug, gene) pair by joining the drug's
    features with the gene's features using ``+``. Rows are drug-major: all
    genes for the first drug, then all genes for the second drug, and so on,
    in the iteration order of the two returned dictionaries.

    Args:
        up_model_filename: File prefix handed to
            ``load_model_from_file_prefix`` for the "up" model.
        down_model_filename: File prefix handed to
            ``load_model_from_file_prefix`` for the "down" model.

    Returns:
        A 4-tuple ``(up_predictions, down_predictions, drug_features_dict,
        gene_features_dict)``. ``gene_features_dict`` is returned after the
        genes missing from the RNAseq dataset have been removed from it.
    """
    # load the models
    up_model = load_model_from_file_prefix(up_model_filename)
    down_model = load_model_from_file_prefix(down_model_filename)

    # build your input features
    gene_features_dict = get_feature_dict("Data/go_fingerprints.csv")
    drug_features_dict = get_feature_dict("Data/inhouse_morgan_2048.csv")
    #drug_features_dict.pop("Enzalutamide")
    #drug_features_dict.pop("VPC14449")
    # drug_features_dict.pop("VPC17005")

    rnaseq_missing_genes = (  # these genes were not in the rnaseq dataset
        'GATA3',
        'RPL39L',
        'IKZF1',
        'CXCL2',
        'HMGA2',
        'TLR4',
        'SPP1',
        'MEF2C',
        'PRKCQ',
        'MMP1',
        'PTGS2',
        'ICAM3',
        'INPP1',
    )
    # pop(..., None) so a gene that is already absent is not an error.
    for gene in rnaseq_missing_genes:
        gene_features_dict.pop(gene, None)

    # NOTE(review): assumes the feature values are plain lists, so ``+``
    # concatenates drug and gene features -- confirm against get_feature_dict.
    data = np.asarray(
        [
            drug_features_dict[drug] + gene_features_dict[gene]
            for drug in drug_features_dict
            for gene in gene_features_dict
        ],
        dtype=np.float16,
    )

    # get predictions
    up_predictions = up_model.predict(data)
    down_predictions = down_model.predict(data)

    return up_predictions, down_predictions, drug_features_dict, gene_features_dict
def get_jaccard_score_of_rnaseq_drug(drug_id, lincs_drugs):
    """Return the mean Jaccard score of one in-house drug against ``lincs_drugs``.

    The drug's features are read from 'Data/inhouse_morgan_2048.csv' on every
    call, converted to a single-row float16 array and passed through
    ``remove_corr_features`` before being scored against each entry.

    Args:
        drug_id: Key of the drug in the in-house feature file.
        lincs_drugs: Iterable of feature vectors to compare against.
            Presumably these went through the same correlation filtering so
            the lengths match -- verify against the caller.

    Returns:
        ``np.mean`` of the per-entry ``jaccard_similarity_score`` values.
    """
    inhouse_features = get_feature_dict('Data/inhouse_morgan_2048.csv')

    # Single-row 2-D array: one drug, all of its features.
    query = np.array(inhouse_features[drug_id], np.float16).reshape((1, -1))
    query = remove_corr_features(query)

    # NOTE(review): if jaccard_similarity_score is scikit-learn's function of
    # that name, it was removed in scikit-learn 0.23 -- confirm the pinned
    # version before upgrading.
    return np.mean([
        jaccard_similarity_score(lincs_drug, query[0])
        for lincs_drug in lincs_drugs
    ])
Example #4
0
        return 0
    else:
        return highest_vote_class


def get_their_id(good_id):
    """Wrap ``good_id`` as ``b'<good_id>'``, the text form of a bytes literal.

    Inverse of ``get_our_id`` for ids produced by this function.
    """
    prefix, suffix = "b'", "'"
    return prefix + good_id + suffix


def get_our_id(bad_id):
    """Strip the first two characters and the last character of ``bad_id``.

    For an id shaped like ``b'5720'`` this yields ``5720``. The slice is
    applied unconditionally; the input is not checked for that shape.
    """
    inner = slice(2, -1)
    return bad_id[inner]


# Module-level loading script: every statement below runs as soon as this part
# of the file executes, and the names it binds are module globals.
# get the dictionaries
print(datetime.datetime.now(), "Loading drug and gene features")
drug_features_dict = get_feature_dict('Data/phase2_compounds_morgan_2048.csv')
gene_features_dict = get_feature_dict('Data/go_fingerprints.csv')
# NOTE(review): the extra arguments ('\t', 2) and ('\t', 0) look like a field
# delimiter and a key-column index -- confirm against get_feature_dict.
cell_name_to_id_dict = get_feature_dict('Data/Phase2_Cell_Line_Metadata.txt',
                                        '\t', 2)
experiments_dose_dict = get_feature_dict(
    LINCS_data_path + 'GSE70138_Broad_LINCS_sig_info.txt', '\t', 0)
gene_id_dict = get_gene_id_dict()

# Re-key every gene id into the b'...' text form produced by get_their_id
# before handing the list to the gctx loader.
lm_gene_entrez_ids = []
for gene in gene_id_dict:
    lm_gene_entrez_ids.append(get_their_id(gene))

# The id list is passed as the second argument to the loader; presumably it
# restricts which genes are read from the file -- verify against
# load_gene_expression_data.
print("Loading gene expressions from gctx")
level_5_gctoo = load_gene_expression_data(
    LINCS_data_path + "GSE70138_Broad_LINCS_Level5_COMPZ_n118050x12328.gctx",
    lm_gene_entrez_ids)