def get_uncorr_drug_features(include_rnaseq=True):
    """Assemble the drug feature array used downstream.

    The LINCS compound fingerprints are loaded and passed through
    ``remove_non_lncap``; optionally the in-house RNAseq compounds are merged
    in. The merged dictionary then goes through ``remove_dups``,
    ``get_array``, ``remove_corr_features`` and ``remove_dup_np`` in that
    order. (These helpers are defined elsewhere; going by their names they
    drop duplicate entries and correlated feature columns -- confirm against
    their definitions.)

    Args:
        include_rnaseq: When true, the compounds from
            ``Data/inhouse_morgan_2048.csv`` are added to the LINCS set.
            An in-house entry replaces a LINCS entry with the same key.

    Returns:
        Whatever ``remove_dup_np`` returns for the decorrelated array.
    """
    # Compounds that come from the LINCS dataset.
    merged_features = remove_non_lncap(
        get_feature_dict('Data/phase2_compounds_morgan_2048.csv'))

    # Compounds that were in the RNAseq experiments.
    if include_rnaseq:
        merged_features.update(get_feature_dict('Data/inhouse_morgan_2048.csv'))

    feature_array = get_array(remove_dups(merged_features))
    return remove_dup_np(remove_corr_features(feature_array))
def get_predictions(up_model_filename, down_model_filename):
    """Predict up- and down-regulation for every in-house drug / gene pair.

    Both models are loaded with ``load_model_from_file_prefix``. One input
    row is built per (drug, gene) combination by applying ``+`` to the drug's
    and the gene's feature values (presumably list concatenation -- confirm
    the value type returned by ``get_feature_dict``). Rows are ordered
    drug-major: for each drug in ``drug_features_dict`` iteration order, all
    genes in ``gene_features_dict`` iteration order.

    Args:
        up_model_filename: File prefix of the saved up-regulation model.
        down_model_filename: File prefix of the saved down-regulation model.

    Returns:
        A 4-tuple ``(up_predictions, down_predictions, drug_features_dict,
        gene_features_dict)``. The gene dictionary has already had the genes
        that were not in the RNAseq dataset removed, so the two dictionaries
        describe exactly the rows that were fed to the models.
    """
    # Load the models.
    up_model = load_model_from_file_prefix(up_model_filename)
    down_model = load_model_from_file_prefix(down_model_filename)

    # Build the input features.
    gene_features_dict = get_feature_dict("Data/go_fingerprints.csv")
    drug_features_dict = get_feature_dict("Data/inhouse_morgan_2048.csv")
    # Individual compounds can be excluded here by popping them from
    # drug_features_dict (e.g. "Enzalutamide", "VPC14449", "VPC17005").

    # These genes were not in the RNAseq dataset.
    rnaseq_missing_genes = (
        'GATA3',
        'RPL39L',
        'IKZF1',
        'CXCL2',
        'HMGA2',
        'TLR4',
        'SPP1',
        'MEF2C',
        'PRKCQ',
        'MMP1',
        'PTGS2',
        'ICAM3',
        'INPP1',
    )
    for gene in rnaseq_missing_genes:
        # A default is passed so genes absent from the fingerprint file are
        # silently ignored instead of raising KeyError.
        gene_features_dict.pop(gene, None)

    # One row per (drug, gene) pair, drug-major. The per-row description
    # strings the previous version accumulated were never read, so they are
    # no longer built.
    data = np.asarray(
        [
            drug_features + gene_features
            for drug_features in drug_features_dict.values()
            for gene_features in gene_features_dict.values()
        ],
        dtype=np.float16,
    )

    # Get predictions.
    up_predictions = up_model.predict(data)
    down_predictions = down_model.predict(data)
    return up_predictions, down_predictions, drug_features_dict, gene_features_dict
def get_jaccard_score_of_rnaseq_drug(drug_id, lincs_drugs):
    """Return the mean Jaccard score between one in-house drug and LINCS drugs.

    The drug's features are read from ``Data/inhouse_morgan_2048.csv``,
    converted to a single-row float16 array and passed through
    ``remove_corr_features`` before being compared with each entry of
    ``lincs_drugs``.

    Args:
        drug_id: Key of the drug in the in-house feature dictionary.
        lincs_drugs: Iterable of feature vectors to compare against
            (presumably already reduced by ``remove_corr_features`` so the
            lengths match -- verify against the caller).

    Returns:
        ``np.mean`` of the individual scores.
    """
    # NOTE(review): if jaccard_similarity_score is the scikit-learn function,
    # it was deprecated in favour of jaccard_score and later removed --
    # confirm the pinned scikit-learn version.
    inhouse_features = get_feature_dict('Data/inhouse_morgan_2048.csv')
    # Shape (1, n_features): remove_corr_features is fed a 2-D array.
    fingerprint_row = np.array(inhouse_features[drug_id], np.float16).reshape(1, -1)
    reduced_row = remove_corr_features(fingerprint_row)
    return np.mean([
        jaccard_similarity_score(candidate, reduced_row[0])
        for candidate in lincs_drugs
    ])
return 0 else: return highest_vote_class def get_their_id(good_id): return 'b\'' + good_id + '\'' def get_our_id(bad_id): return bad_id[2:-1] # get the dictionaries print(datetime.datetime.now(), "Loading drug and gene features") drug_features_dict = get_feature_dict('Data/phase2_compounds_morgan_2048.csv') gene_features_dict = get_feature_dict('Data/go_fingerprints.csv') cell_name_to_id_dict = get_feature_dict('Data/Phase2_Cell_Line_Metadata.txt', '\t', 2) experiments_dose_dict = get_feature_dict( LINCS_data_path + 'GSE70138_Broad_LINCS_sig_info.txt', '\t', 0) gene_id_dict = get_gene_id_dict() lm_gene_entrez_ids = [] for gene in gene_id_dict: lm_gene_entrez_ids.append(get_their_id(gene)) print("Loading gene expressions from gctx") level_5_gctoo = load_gene_expression_data( LINCS_data_path + "GSE70138_Broad_LINCS_Level5_COMPZ_n118050x12328.gctx", lm_gene_entrez_ids)