def add_design(session, design_file, dataset_name, name=None, description=None,
               sample_groups_file=None, sample_group_levels_file=None,
               sample_group_pheno_column=None):
    """Register a new design for an existing data set and persist its sample subset.

    :param session: Open SQLAlchemy session used for all queries and inserts.
    :param design_file: Tab-separated design matrix file (first column is the index).
    :param dataset_name: Name of the data set this design belongs to.
    :param name: Design name; guessed from the file name when omitted.
    :param description: Optional free-text description.
    :param sample_groups_file: Optional file listing sample group assignments.
    :param sample_group_levels_file: Optional file listing sample group levels.
    :param sample_group_pheno_column: Optional pheno-data column holding group labels.
    :return: The (possibly guessed) design name.
    :raises ROGERUsageError: If a design of the same name already exists.
    """
    dataset = get_ds(session, dataset_name)
    name = get_or_guess_name(name, design_file)

    # Refuse to silently overwrite an existing design of the same name.
    if query_design(session, name, dataset_name).one_or_none() is not None:
        raise ROGERUsageError("Design of data set '%s' with name '%s' already exist"
                              % (dataset_name, name))

    design_matrix = read_table(design_file, sep='\t', index_col=0)
    design_data = create_design_data(design_matrix,
                                     dataset.pheno_data,
                                     name,
                                     description,
                                     read_array(sample_groups_file, nullable=True),
                                     read_array(sample_group_levels_file, nullable=True),
                                     sample_group_pheno_column)
    design_data.design.DataSetID = dataset.ID
    session.add(design_data.design)
    session.flush()  # assigns design.ID, needed by the subset rows below

    design_data.sample_subset["DesignID"] = design_data.design.ID
    insert_data_frame(session, design_data.sample_subset, SampleSubset.__table__)
    session.commit()
    return name
def add_contrast(session, contrast_file, design_name, dataset_name, name=None, description=None):
    """Import a contrast matrix for an existing design and persist its columns.

    :param session: Open SQLAlchemy session used for all queries and inserts.
    :param contrast_file: Tab-separated contrast matrix file (first column is the index).
    :param design_name: Name of the design the contrast applies to.
    :param dataset_name: Name of the data set owning the design.
    :param name: Contrast name; guessed from the file name when omitted.
    :param description: Optional free-text description.
    :return: The (possibly guessed) contrast name.
    :raises ROGERUsageError: If a contrast of the same name already exists.
    """
    design = get_design(session, design_name, dataset_name)
    name = get_or_guess_name(name, contrast_file)

    # Guard against importing the same contrast twice.
    existing = query_contrast(session, name, design_name, dataset_name).one_or_none()
    if existing is not None:
        raise ROGERUsageError("Contrast '%s' already exist in '%s'" % (name, design_name))

    contrast = Contrast(DesignID=design.ID,
                        Name=name,
                        Description=description,
                        CreatedBy=get_current_user_name(),
                        CreationTime=get_current_datetime())
    session.add(contrast)
    session.flush()  # assigns contrast.ID

    matrix = read_table(contrast_file, sep='\t', index_col=0)
    check_contrast_matrix(design.design_matrix.columns, matrix)

    # One ContrastColumn row per column of the matrix; the raw column vector is
    # stored alongside its name.
    columns = matrix.columns
    column_table = DataFrame({
        "ContrastID": contrast.ID,
        "DesignID": design.ID,
        "Name": columns,
        "Description": columns,
        "ColumnData": [matrix[col].values.tolist() for col in columns]
    })
    insert_data_frame(session, column_table, ContrastColumn.__table__)
    session.commit()
    return name
def perform_gse(session: Session,
                roger_wd_dir: str,
                dge_model: DGEmodel,
                algorithm: GSEAlgorithm,
                gene_set_category_filter: List[str] = None):
    """Run gene-set enrichment on a fitted DGE model and persist the results.

    :param session: Open SQLAlchemy session used for all queries and inserts.
    :param roger_wd_dir: ROGER working directory; the raw GSE table is written below it.
    :param dge_model: Fitted differential gene expression model to enrich.
    :param algorithm: GSE algorithm wrapper whose ``exec_gse`` performs the analysis.
    :param gene_set_category_filter: Optional list restricting gene-set categories.
    :raises ROGERUsageError: If a result for this combination already exists or
        no gene sets have been imported yet.
    """
    existing_results = get_gse_result(session,
                                      dge_model.Contrast.Name,
                                      dge_model.Contrast.Design.Name,
                                      dge_model.Contrast.Design.DataSet.Name,
                                      dge_model.Method.Name,
                                      algorithm.name)
    if existing_results:
        raise ROGERUsageError("Result for %s:%s:%s:%s:%s already exists"
                              % (dge_model.Contrast.Name,
                                 dge_model.Contrast.Design.Name,
                                 dge_model.Contrast.Design.DataSet.Name,
                                 dge_model.Method.Name,
                                 algorithm.name))

    gene_sets = get_gmt_locations(session, gene_set_category_filter)
    gscs_list = {gene_set.Category: gene_set.FileWC
                 for index, gene_set in gene_sets.iterrows()}
    if len(gscs_list) == 0:
        raise ROGERUsageError(
            "Cannot perform GSE without preexisting gene sets (did you import GMT files?)")
    gscs = ribios_gsea.readGmt(ListVector(gscs_list))

    contrast_columns = dge_model.Contrast.contrast_columns
    gse_method_id = session.query(GSEmethod.ID) \
        .filter(GSEmethod.DGEmethodID == DGEmethod.ID) \
        .filter(DGEmethod.ID == dge_model.Method.ID) \
        .filter(GSEmethod.Name == algorithm.name).scalar()

    gse_algo_result = algorithm.exec_gse(dge_model, gscs)
    enrich_tbl = gse_algo_result.raw_gse_table

    # Lower-case the join keys on both sides so the join is case-insensitive.
    gene_sets.Category = gene_sets.Category.str.lower()
    gene_sets.Name = gene_sets.Name.str.lower()
    enrich_tbl.Category = enrich_tbl.Category.str.lower()
    enrich_tbl.GeneSet = enrich_tbl.GeneSet.str.lower()
    merged_enrich_tbl = enrich_tbl \
        .join(gene_sets.set_index(['Category', 'Name']), on=['Category', "GeneSet"]) \
        .join(contrast_columns.set_index("Name"), on="Contrast", lsuffix="_GENE_SET")

    # Write the raw enrichment table into the per-(contrast, method) sub-directory.
    gse_method_sub_dir = "%d_%s" % (dge_model.Contrast.ID, algorithm.name)
    gse_models_path = os.path.join(roger_wd_dir, GSE_RESULT_SUB_FOLDER)
    gse_model_path = os.path.join(gse_models_path, gse_method_sub_dir)
    if not os.path.exists(gse_model_path):
        os.makedirs(gse_model_path)
    gse_result_file = os.path.join(gse_model_path, "gse_table.txt")
    write_df(enrich_tbl, gse_result_file)

    gse_result = GSEresult(ContrastID=dge_model.ContrastID,
                           DGEmethodID=dge_model.DGEmethodID,
                           GSEmethodID=gse_method_id,
                           OutputFile=gse_result_file,
                           MethodDescription=gse_algo_result.method_desc)
    session.add(gse_result)
    session.flush()  # assigns gse_result.ID for the per-row table below

    # Map "Up"/"Down" to +1/-1 once and reuse it for both columns
    # (previously the same mapping was computed twice).
    direction = merged_enrich_tbl.Direction.map({"Up": 1, "Down": -1})
    gse_tbl = DataFrame({
        "GSEresultID": gse_result.ID,
        "ContrastColumnID": merged_enrich_tbl.ID,
        "GeneSetID": merged_enrich_tbl.ID_GENE_SET,
        "Correlation": merged_enrich_tbl.Correlation,
        "Direction": direction,
        "PValue": merged_enrich_tbl.PValue,
        "FDR": merged_enrich_tbl.FDR,
        # Signed enrichment score: direction times |log10(p-value)|.
        "EnrichmentScore": direction * abs(log10(merged_enrich_tbl.PValue)),
        "EffGeneCount": merged_enrich_tbl.NGenes
    })

    unmapped = gse_tbl[gse_tbl.GeneSetID.isnull()]
    mapped = gse_tbl[~gse_tbl.GeneSetID.isnull()]
    if unmapped.shape[0] > 0:
        print("Warning: unable to map %d of %d entries to gene sets "
              % (unmapped.shape[0], merged_enrich_tbl.shape[0]))

    # Keep only the first row per (contrast column, gene set) pair.
    # (Renamed from the misleading `mapped_duplications`: this frame holds the
    # de-duplicated survivors, not the duplicates.)
    mapped_deduplicated = mapped.drop_duplicates(subset=['ContrastColumnID', 'GeneSetID'])
    if mapped_deduplicated.shape[0] < mapped.shape[0]:
        print("Warning: %d of %d entries of mapped result entries are duplicated"
              % (mapped.shape[0] - mapped_deduplicated.shape[0], mapped.shape[0]))
    insert_data_frame(session, mapped_deduplicated, GSEtable.__table__)
    session.commit()
def add_gmt(session, roger_wd_dir, category_name, file, tax_id, description=None):
    """Import a GMT gene-set file as a new gene-set category.

    Copies the file into the ROGER working directory, creates the category and
    its gene sets, and bulk-inserts all gene memberships that can be matched
    against the gene annotation of *tax_id*.

    :param session: Open SQLAlchemy session.
    :param roger_wd_dir: ROGER working directory (target of the file copy).
    :param category_name: Name of the new gene-set category.
    :param file: Path to the GMT file to import.
    :param tax_id: Taxon whose gene annotation is used for symbol matching.
    :param description: Optional description applied to every gene set.
    :return: Fraction of gene-symbol entries that could not be matched against
        the gene annotation (0.0 when the file contained no entries at all).
    """
    gene_anno = as_data_frame(
        session.query(GeneAnnotation).filter(GeneAnnotation.TaxID == tax_id))
    # TODO Make min_size configurable?
    gmt = gsea_gmt_parser(file, min_size=1, max_size=sys.maxsize)

    # Copy the GMT file into the working directory so the import is self-contained.
    gene_sets_path = os.path.join(roger_wd_dir, GENE_SET_SUB_FOLDER)
    file_copy_path = os.path.join(gene_sets_path, os.path.basename(file))
    category = GeneSetCategory(Name=category_name,
                               FileWC=file_copy_path,
                               FileSrc=os.path.abspath(file))
    session.add(category)
    if not os.path.exists(gene_sets_path):
        os.makedirs(gene_sets_path)
    shutil.copy(file, file_copy_path)
    session.flush()

    gene_sets = [GeneSet(Category=category,
                         Name=gene_set_name,
                         TaxID=tax_id,
                         Description=description,
                         GeneCount=len(genes),
                         IsPrivate=False)
                 for gene_set_name, genes in gmt.items()]
    session.add_all(gene_sets)
    session.flush()  # assigns GeneSet.ID values used below

    # Expand the GMT mapping to one (GeneSetID, GeneSymbol) row per member gene.
    gene_set_dict = {gene_set.Name: gene_set for gene_set in gene_sets}
    gene_set_data = {'GeneSetID': [], 'GeneSymbol': []}
    for gene_set_name, genes in gmt.items():
        gene_set_data['GeneSetID'] += [gene_set_dict[gene_set_name].ID] * len(genes)
        gene_set_data['GeneSymbol'] += genes
    genes_table = pd.DataFrame.from_dict(gene_set_data)
    annotated_genes = genes_table.join(gene_anno.set_index('GeneSymbol'), on='GeneSymbol')

    # Filter out non-matching genes; keep=False also drops every row of a
    # duplicated (gene, gene set) pair entirely.
    matched_genes = annotated_genes[annotated_genes.RogerGeneIndex.notna()] \
        .drop_duplicates(subset=['RogerGeneIndex', 'GeneSetID'], keep=False)

    # Bulk insert all gene set genes
    insert_data_frame(session, matched_genes, GeneSetGene.__table__, chunk_size=100000)
    session.commit()

    # Report number of gene symbols that could not be matched with gene annotation.
    # Guard the degenerate empty import (previously raised ZeroDivisionError).
    if annotated_genes.shape[0] == 0:
        return 0.0
    p_unknown_gene_symbols = (annotated_genes.shape[0] - matched_genes.shape[0]) \
        / float(annotated_genes.shape[0])
    return p_unknown_gene_symbols
def add_species(session, dataset_name, tax_id):
    """Import gene annotation (and human orthologs) for one species from BioMart.

    :param session: Open SQLAlchemy session.
    :param dataset_name: BioMart dataset name (e.g. ``hsapiens_gene_ensembl``).
    :param tax_id: NCBI taxon ID of the species being imported.
    :raises ROGERUsageError: If human annotation is missing (for a non-human
        import) or the species already exists in the database.
    """
    annotation_service = roger.logic.mart.provider.get_annotation_service()
    # Check if dataset is already preset in the database
    species_table = list_species(session)
    # Human annotation must exist first: the ortholog mapping below joins every
    # other species against the human gene index.
    if species_table[species_table.TaxID == human_tax_id].empty and human_tax_id != tax_id:
        raise ROGERUsageError(
            'No human species annotation data present - import human gene annotation first'
        )
    if not species_table[species_table.TaxID == tax_id].empty:
        raise ROGERUsageError('Species already exists in database: %s'
                              % dataset_name)

    # Derive the BioMart homolog attribute/filter names from the dataset name,
    # e.g. "mmusculus_gene_ensembl" -> "mmusculus_homolog_ensembl_gene".
    homolog_attr = re.sub(r'(\w+)_gene_ensembl', r'\1_homolog_ensembl_gene',
                          dataset_name)
    homolog_filter = re.sub(r'(\w+)_gene_ensembl', r'with_\1_homolog',
                            dataset_name)

    # Insert Gene annotation
    dataset = annotation_service.get_dataset(dataset_name)
    # TODO fix this, should move into provider.py
    # Version string combines the dataset name with the parenthesized part of
    # the BioMart display name, e.g. the genome build / release.
    version = "%s %s" % (dataset_name,
                         re.search(r'[^(]+\(([^)]+)\)',
                                   dataset.display_name).group(1))
    gene_anno = dataset.get_bulk_query(
        params={
            'attributes': [
                "ensembl_gene_id", "entrezgene", "gene_biotype",
                "external_gene_name"
            ]
        })
    # Assign contiguous ROGER gene indices starting after the current maximum.
    next_id = get_next_free_db_id(session, GeneAnnotation.RogerGeneIndex)
    genes = DataFrame({
        'RogerGeneIndex': range(next_id, next_id + gene_anno.shape[0]),
        'Version': version,
        'TaxID': tax_id,
        'EnsemblGeneID': gene_anno["ensembl_gene_id"],
        'EntrezGeneID': gene_anno["entrezgene"],
        'GeneType': gene_anno["gene_biotype"],
        'GeneSymbol': gene_anno["external_gene_name"],
        'IsObsolete': False
    })
    insert_data_frame(session, genes, GeneAnnotation.__table__)

    # Insert orthologs
    if tax_id == human_tax_id:
        # Human genes are trivially their own orthologs.
        orthologs = DataFrame({
            'RogerGeneIndex': genes["RogerGeneIndex"],
            'HumanRogerGeneIndex': genes["RogerGeneIndex"]
        })
        insert_data_frame(session, orthologs, Ortholog.__table__)
        session.commit()
        return

    # NOTE(review): local name `huma_anno_query` looks like a typo for
    # `human_anno_query` — harmless, but worth renaming.
    huma_anno_query = as_data_frame(
        session.query(GeneAnnotation).filter(
            GeneAnnotation.TaxID == human_tax_id))
    # NOTE(review): `human_dataset` is not defined anywhere in this function or
    # in the visible scope — this branch likely raises NameError for non-human
    # species. Verify against the module globals / provider API before relying
    # on it (possibly intended: annotation_service.get_dataset(<human dataset>)).
    ortho = annotation_service.get_bulk_query(
        human_dataset,
        params={
            'attributes': ["ensembl_gene_id", homolog_attr],
            'filters':
            {
                homolog_filter: True
            }
        })
    # Join BioMart ortholog pairs to the human annotation (by human Ensembl ID)
    # and to the freshly inserted species genes (by the homolog Ensembl ID).
    merged_ortho = ortho.join(huma_anno_query.set_index('EnsemblGeneID'),
                              on='ensembl_gene_id') \
        .join(genes.set_index('EnsemblGeneID'),
              on=homolog_attr,
              lsuffix='Human',
              rsuffix='Other')
    orthologs = DataFrame({
        'RogerGeneIndex': merged_ortho["RogerGeneIndexOther"],
        'HumanRogerGeneIndex': merged_ortho["RogerGeneIndexHuman"]
    })
    insert_data_frame(session, orthologs, Ortholog.__table__)
    session.commit()
def run_dge(session, roger_wd_dir, contrast, design, dataset, algorithm: DGEAlgorithm):
    """Fit a differential gene expression model for one contrast and persist it.

    Serializes the R input/fit objects under the working directory, records the
    model, the feature subset used, and the full per-feature DGE table.

    :param session: Open SQLAlchemy session.
    :param roger_wd_dir: ROGER working directory for serialized model files.
    :param contrast: Contrast name.
    :param design: Design name.
    :param dataset: Data set name.
    :param algorithm: DGE algorithm wrapper whose ``exec_dge`` fits the model.
    :raises ROGERUsageError: If this combination was already fitted by the method.
    """
    existing = query_dge_models(session, contrast, design, dataset,
                                algorithm.name, DGEmodel).one_or_none()
    if existing is not None:
        raise ROGERUsageError(
            "A model for %s:%s:%s has already been generated by the method '%s'"
            % (dataset, design, contrast, algorithm.name))

    print("Retrieving data from database")
    contrast_entity = get_contrast(session, contrast, design, dataset)
    design_entity = contrast_entity.Design
    dataset_entity = design_entity.DataSet
    features = dataset_entity.feature_data

    print("Performing differential gene expression analysis using %s" % algorithm.name)
    result = algorithm.exec_dge(dataset_entity.ExprsWC,
                                features,
                                design_entity,
                                contrast_entity.contrast_matrix)

    print("Persisting model information")
    method = session.query(DGEmethod).filter(DGEmethod.Name == algorithm.name).one()
    # One sub-directory per (contrast, method) pair inside the working directory.
    model_dir = os.path.join(roger_wd_dir,
                             DGE_RESULT_SUB_FOLDER,
                             "%d_%d" % (contrast_entity.ID, method.ID))
    if not os.path.exists(model_dir):
        os.makedirs(model_dir)
    # Serialize the R input and fit objects as RDS files next to each other.
    input_obj_file = os.path.abspath(os.path.join(model_dir, "limma_input_obj.rds"))
    base.saveRDS(result.input_obj, file=input_obj_file)
    fit_obj_file = os.path.abspath(os.path.join(model_dir, "limma_fit_obj"))
    base.saveRDS(result.fit_obj, file=fit_obj_file)

    model = DGEmodel(ContrastID=contrast_entity.ID,
                     DGEmethodID=method.ID,
                     InputObjFile=input_obj_file,
                     FitObjFile=fit_obj_file,
                     MethodDescription=result.method_description)
    session.add(model)
    session.flush()

    print("Persisting feature subsets")
    feature_subset = pd.DataFrame({
        "FeatureIndex": features["FeatureIndex"],
        "DataSetID": dataset_entity.ID,
        "ContrastID": contrast_entity.ID,
        "DGEmethodID": method.ID,
        "IsUsed": result.used_feature_list,
        "Description": "Default filtering by '%s'" % algorithm.name
    })
    insert_data_frame(session, feature_subset, FeatureSubset.__table__)

    # Attach contrast-column IDs and feature annotation to the raw DGE table,
    # then project into the DGEtable schema.
    annotated = result.dge_table \
        .join(contrast_entity.contrast_columns.set_index("Name"),
              on="Contrast", rsuffix="_C") \
        .join(features.set_index("FeatureIndex"),
              on="FeatureIndex", rsuffix="_F")
    dge_rows = pd.DataFrame({
        'ContrastColumnID': annotated["ID"],
        'FeatureIndex': annotated["FeatureIndex"],
        "ContrastID": contrast_entity.ID,
        "DGEmethodID": method.ID,
        'DataSetID': annotated["DataSetID"],
        'AveExprs': annotated["AveExpr"],
        'Statistic': annotated["t"],
        'LogFC': annotated["logFC"],
        'PValue': annotated["PValue"],
        'FDR': annotated["FDR"]
    })
    insert_data_frame(session, dge_rows, DGEtable.__table__)
    session.commit()