def get_ident(form_field):
    """Return the identifier submitted under *form_field* in the current request.

    Raises ROGERUsageError when the field is missing or its value does not
    match IDENT_PATTERN.
    """
    if form_field in request.values:
        ident = request.values[form_field]
        # Reject values that are not well-formed identifiers.
        if IDENT_PATTERN.match(ident):
            return ident
        raise ROGERUsageError("Invalid value for `%s': `%s`" % (form_field, ident))
    raise ROGERUsageError('No \'%s\' part' % form_field)
def remove_species(session, tax_id):
    """Delete all gene annotation of one species from the database.

    The human annotation is the reference that other species map onto, so it
    may never be removed.  Raises ROGERUsageError for the human taxon or for
    taxa that are not present.
    """
    known_species = list_species(session)
    if tax_id == human_tax_id:
        raise ROGERUsageError('Cannot delete gene annotation from human species')
    if known_species[known_species.TaxID == tax_id].empty:
        raise ROGERUsageError('Species does not exist in database: %s' % tax_id)
    session.query(GeneAnnotation).filter(GeneAnnotation.TaxID == tax_id).delete()
    session.commit()
def __check_matrix(ref_columns, matrix, matrix_name, ref_list_name):
    """Validate a matrix against a reference column list.

    Checks that the row count matches the reference list, that string row
    labels (when present) agree with the reference names, and that every
    column holds integer values.  Raises ROGERUsageError on any violation.
    """
    n_refs = len(ref_columns)
    n_rows = matrix.shape[0]
    if n_refs != n_rows:
        raise ROGERUsageError(
            "Number of rows in %s does not match the number of %s: %d vs %d"
            % (matrix_name, ref_list_name, n_refs, n_rows))
    # Only string-labelled ("object" dtype) indices are compared by name.
    if matrix.index.dtype.name == "object":
        if set(matrix.index) != set(ref_columns):
            raise ROGERUsageError("Row names of %s and %s do not match"
                                  % (matrix_name, ref_list_name))
    for column in matrix.columns:
        if not np.issubdtype(matrix[column].dtype, np.integer):
            raise ROGERUsageError("Column '%s' is not an integer type" % column)
def get_file(form_field):
    """Extract an uploaded file from the current request, validate its name,
    and save it into the configured ROGER data folder.

    :param form_field: name of the multipart form field holding the file
    :return: the werkzeug file-storage object that was saved
    :raises ROGERUsageError: when the field is missing, no file was selected,
        or the file name is not allowed
    """
    # check if the post request has the file part
    if form_field not in request.files:
        raise ROGERUsageError('No \'%s\' part' % form_field)
    file = request.files[form_field]
    # if user does not select file, browser also
    # submit a empty part without filename
    if file.filename == '':
        raise ROGERUsageError('No file specified for `%s`' % form_field)
    if not allowed_file(file.filename):
        raise ROGERUsageError("Invalid file name in `%s`: %s"
                              % (form_field, file.filename))
    filename = secure_filename(file.filename)
    # BUG FIX: `current_app` is a context-local proxy and must not be called;
    # `current_app()` would invoke the WSGI application itself with no
    # arguments and raise a TypeError.
    file.save(os.path.join(current_app.config['ROGER_DATA_FOLDER'], filename))
    return file
def add_design(session, design_file, dataset_name, name=None, description=None,
               sample_groups_file=None, sample_group_levels_file=None,
               sample_group_pheno_column=None):
    """Import a design matrix for an existing data set and persist it.

    The design name is guessed from the file name when not given.  Raises
    ROGERUsageError when a design of that name already exists for the data
    set.  Returns the design name.
    """
    dataset = get_ds(session, dataset_name)
    name = get_or_guess_name(name, design_file)
    if query_design(session, name, dataset_name).one_or_none() is not None:
        raise ROGERUsageError("Design of data set '%s' with name '%s' already exist"
                              % (dataset_name, name))
    design_matrix = read_table(design_file, sep='\t', index_col=0)
    groups = read_array(sample_groups_file, nullable=True)
    group_levels = read_array(sample_group_levels_file, nullable=True)
    design_data = create_design_data(design_matrix, dataset.pheno_data, name,
                                     description, groups, group_levels,
                                     sample_group_pheno_column)
    design_data.design.DataSetID = dataset.ID
    session.add(design_data.design)
    # Flush so the design row receives its primary key before the sample
    # subset rows reference it.
    session.flush()
    design_data.sample_subset["DesignID"] = design_data.design.ID
    insert_data_frame(session, design_data.sample_subset, SampleSubset.__table__)
    session.commit()
    return name
def get_dge_tbl(session, contrast_name, design_name, dataset_name, method_name) -> DataFrame:
    """Return the differential gene expression result table for one contrast
    column, joined with the data set's feature annotation.

    The query resolves the contrast column by walking the relationship chain
    DGEmodel -> DGEmethod / Contrast -> Design -> DataSet through filter
    conditions (implicit joins) rather than explicit ``join`` calls.

    NOTE(review): the original return annotation was ``DGEmodel``, but the
    function returns a joined pandas DataFrame; the annotation was corrected
    accordingly.

    :raises ROGERUsageError: when no matching contrast column exists
    """
    contrast = session.query(ContrastColumn) \
        .filter(DGEmodel.DGEmethodID == DGEmethod.ID) \
        .filter(DGEmodel.ContrastID == Contrast.ID) \
        .filter(Contrast.DesignID == Design.ID) \
        .filter(Design.DataSetID == DataSet.ID) \
        .filter(ContrastColumn.ContrastID == Contrast.ID) \
        .filter(Design.Name == design_name) \
        .filter(DataSet.Name == dataset_name) \
        .filter(DGEmethod.Name == method_name) \
        .filter(ContrastColumn.Name == contrast_name).one_or_none()
    if contrast is None:
        raise ROGERUsageError("Model for %s:%s:%s does not exists"
                              % (dataset_name, design_name, contrast_name))
    # All per-gene DGE rows belonging to the resolved contrast column.
    dge_table = as_data_frame(
        session.query(DGEtable).filter(DGEtable.ContrastColumnID == contrast.ID))
    feature_data = contrast.Design.DataSet.feature_data
    # Denormalize context columns so the table is self-describing.
    dge_table['Contrast'] = contrast.Name
    dge_table['Design'] = contrast.Design.Name
    dge_table['DGEMethod'] = method_name
    # Attach gene/feature annotation by the shared FeatureIndex key.
    return dge_table.join(feature_data.set_index('FeatureIndex'),
                          on='FeatureIndex', rsuffix="Feature")
def get_design(session, design_name, ds_name) -> Design:
    """Look up a design by name within a data set.

    :raises ROGERUsageError: when no such design exists
    """
    found = query_design(session, design_name, ds_name).one_or_none()
    if found is not None:
        return found
    raise ROGERUsageError("Design of data set '%s' with name '%s' does not exist"
                          % (ds_name, design_name))
def annotate_ds_pheno_data(gct_data, pheno_data=None):
    """Validate pheno data against an expression matrix and make sure it
    carries a sample-name column.

    :param gct_data: expression DataFrame whose columns are the sample names
    :param pheno_data: optional pheno DataFrame (one row per sample); when
        omitted, a fresh one is created holding only the sample names
    :return: the pheno DataFrame with a ROGER_SAMPLE_NAME column present
    :raises ROGERUsageError: when row counts or sample names do not match
    """
    # BUG FIX: the original default argument `pheno_data=pd.DataFrame()` was
    # created once at definition time and mutated by `insert` below, leaking
    # sample names between calls.  A None sentinel gives each call a fresh
    # frame while remaining backward-compatible for callers.
    if pheno_data is None:
        pheno_data = pd.DataFrame()
    if pheno_data.shape[0] > 0:
        if pheno_data.shape[0] != len(gct_data.columns):
            raise ROGERUsageError(
                "Number of rows in pheno data and number of samples don't match: %d vs %d"
                % (pheno_data.shape[0], len(gct_data.columns)))
    # Ensure the canonical sample-name column exists, taking the names from
    # the expression matrix when absent.
    if ROGER_SAMPLE_NAME not in pheno_data:
        pheno_data.insert(0, ROGER_SAMPLE_NAME, list(gct_data))
    if ROGER_SAMPLE_NAME in pheno_data and set(
            pheno_data[ROGER_SAMPLE_NAME]) != set(gct_data):
        raise ROGERUsageError(
            "Sample names given by column '%s' don't match the sample names in expression data"
            % ROGER_SAMPLE_NAME)
    return pheno_data
def get_dge_model(session, contrast_name, design_name, dataset_name, method_name) -> DGEmodel:
    """Fetch the persisted DGE model for the given contrast, design, data set
    and method.

    :raises ROGERUsageError: when no matching model exists
    """
    found = query_dge_models(session, contrast_name, design_name,
                             dataset_name, method_name, DGEmodel).one_or_none()
    if found is not None:
        return found
    raise ROGERUsageError("Model for %s:%s:%s does not exists"
                          % (dataset_name, design_name, contrast_name))
def delete_method(session, name):
    """Remove a GSE method entry by name.

    :raises ROGERUsageError: when no method of that name is stored
    """
    methods = list_methods(session)
    if methods[methods.Name == name].empty:
        raise ROGERUsageError('GSE does not exist in database: %s' % name)
    session.query(GSEmethod) \
        .filter(GSEmethod.Name == name) \
        .delete()
    session.commit()
def get_algorithm(name) -> DGEAlgorithm:
    """Instantiate the DGE algorithm registered under *name*.

    Every concrete subclass of DGEAlgorithm is instantiated and indexed by
    its ``name`` attribute.

    :raises ROGERUsageError: when no algorithm carries that name
    """
    available = {}
    for cls in all_subclasses(DGEAlgorithm):
        instance = cls()
        available[instance.name] = instance
    if name in available:
        return available[name]
    raise ROGERUsageError("Algorithm '%s' does not exist" % name)
def get_contrast(session, contrast_name, design_name, ds_name) -> Contrast:
    """Look up a contrast by name within a design.

    :raises ROGERUsageError: when no such contrast exists
    """
    contrast = query_contrast(session, contrast_name, design_name, ds_name).one_or_none()
    if contrast is not None:
        return contrast
    raise ROGERUsageError("Contrast of design '%s' with name '%s' does not exist"
                          % (design_name, contrast_name))
def delete_gmt(session, category_name):
    """Delete an imported gene set category (GMT) by name.

    :raises ROGERUsageError: when no category of that name is stored
    """
    categories = list_gmt(session)
    if categories[categories.Name == category_name].empty:
        raise ROGERUsageError('GMT does not exist in database: %s' % category_name)
    session.query(GeneSetCategory) \
        .filter(GeneSetCategory.Name == category_name) \
        .delete()
    session.commit()
def get_gse_table(session, contrast, design, dataset, dge_method, gse_method) -> DataFrame:
    """Return the stored gene set enrichment result table.

    :raises ROGERUsageError: when no GSE result exists for the combination
    """
    result = get_gse_result(session, contrast, design, dataset, dge_method, gse_method)
    if result:
        return result.result_table
    raise ROGERUsageError("GSE results for %s:%s:%s:%s:%s do not exist"
                          % (contrast, design, dataset, dge_method, gse_method))
def get_dataset_of(session, tax_id):
    """Return the Ensembl BioMart dataset belonging to a taxon id.

    :raises ROGERUsageError: when the taxon is not present in the database
    """
    annotation_service = get_annotation_service()
    species = roger.persistence.geneanno.list_species(session)
    if species[species.TaxID == tax_id].empty:
        raise ROGERUsageError('Unknown taxon id: %s' % tax_id)
    # The stored Version string starts with the BioMart dataset name,
    # followed by a space and further version detail.
    version = species.loc[species["TaxID"] == tax_id, "Version"].values[0]
    return annotation_service.get_dataset(version.split(' ')[0])
def annotate(session, gct_data, tax_id, symbol_type):
    """Map the feature identifiers of an expression matrix to ROGER gene
    indices via Ensembl BioMart.

    :param gct_data: expression DataFrame whose index holds the feature names
    :param tax_id: taxon of the data set's species
    :param symbol_type: BioMart attribute name the features are encoded in
    :return: tuple of (feature annotation DataFrame, BioMart display name)
    :raises ROGERUsageError: when not a single feature could be annotated
    """
    ensembl_dataset = get_dataset_of(session, tax_id)
    attributes = ensembl_dataset.attributes
    params = {
        "attributes": [symbol_type, "ensembl_gene_id"],
    }
    # Restrict the query to genes that actually carry the symbol, when the
    # mart offers such a filter for this symbol type.
    filter_attr = "with_%s" % symbol_type
    if filter_attr in attributes["name"]:
        params["filters"] = {filter_attr: True}
    all_sym = ensembl_dataset.get_bulk_query(params).dropna()
    # One row per input feature, keyed by the original feature name.
    feature_anno = pd.DataFrame(data={
        "Name": gct_data.index,
        "FeatureIndex": range(0, gct_data.shape[0])
    }, index=gct_data.index)
    feature_anno = feature_anno.join(all_sym.set_index(symbol_type))
    # If every single feature failed to join, the symbol type is wrong.
    if feature_anno[feature_anno.isnull().any(
            axis=1)].shape[0] == feature_anno.shape[0]:
        raise ROGERUsageError("Unable to annotate features in expression file")
    # TODO Find a better heuristic to drop multiple Ensembl ID association
    feature_anno = feature_anno[~feature_anno.index.duplicated(keep='first')]
    feature_anno = feature_anno.set_index("ensembl_gene_id")
    # TODO include origin tax id and origin roger gene index
    # For human data the gene index is its own reference index.
    query = session.query(GeneAnnotation.RogerGeneIndex,
                          GeneAnnotation.RogerGeneIndex.label("OriRogerGeneIndex"),
                          literal(human_tax_id).label("OriTaxID"),
                          GeneAnnotation.EnsemblGeneID) \
        .filter_by(TaxID=tax_id)
    # Non-human species map onto human reference indices via the ortholog table.
    if tax_id != human_tax_id:
        query = session \
            .query(Ortholog.HumanRogerGeneIndex.label("RogerGeneIndex"),
                   Ortholog.RogerGeneIndex.label("OriRogerGeneIndex"),
                   literal(tax_id).label("OriTaxID"),
                   GeneAnnotation.EnsemblGeneID) \
            .filter(GeneAnnotation.RogerGeneIndex == Ortholog.RogerGeneIndex) \
            .filter(GeneAnnotation.TaxID == tax_id)
    roger_gene_indices = roger.logic.util.data.as_data_frame(query)
    # TODO Find a better heuristic to drop multiple Ensembl ID association
    # feature_anno.join(roger_gene_indices.set_index("EnsemblGeneID")).to_csv("test.txt", sep="\t")
    feature_anno = feature_anno.join(roger_gene_indices.set_index("EnsemblGeneID")). \
        drop_duplicates("FeatureIndex"). \
        sort_values('FeatureIndex'). \
        reset_index().drop(columns="index")
    return feature_anno, ensembl_dataset.display_name
def __get_sample_groups(design_data, pheno_data, sample_groups=None,
                        sample_group_pheno_column=None):
    """Resolve the per-sample group assignment from one of three sources:
    an explicit list, a pheno-data column, or (as fallback) labels derived
    from the design matrix rows.

    :raises ROGERUsageError: when both an explicit list and a pheno column
        are given, or the named column is missing from the pheno data
    """
    have_list = sample_groups is not None
    have_column = sample_group_pheno_column is not None
    if have_list and have_column:
        raise ROGERUsageError(
            "You cannot give a list of sample groups and specify a "
            "sample group column within the pheno data at the same time")
    if have_list:
        return sample_groups
    if have_column:
        if sample_group_pheno_column not in pheno_data:
            raise ROGERUsageError(
                "Column '%s' does not exist in the pheno matrix of the given study"
                % sample_group_pheno_column)
        return pheno_data[sample_group_pheno_column].tolist()

    # No information given? infer sample groups then from the design matrix:
    # each sample is labelled by its row of column/value pairs.
    def row_label(row):
        return "_".join("%s.%d" % (key, value) for key, value in row.items())

    return design_data.apply(row_label, axis=1).tolist()
def create_ds(session, ds_type: Type[DataSet], exprs_file, tax_id, symbol_type,
              pheno_file=None, name=None, normalization_method=None,
              description=None, xref=None):
    """Assemble the properties bundle for a new expression data set.

    Parses the GCT expression file, annotates its features via BioMart, and
    validates the optional pheno data.  Returns a DataSetProperties instance;
    nothing is persisted here.

    :raises ROGERUsageError: for unknown taxa or duplicate data set names
    """
    name = get_or_guess_name(name, exprs_file)
    # Reject unknown species and duplicate data set names up front.
    species_list = list_species(session)
    if species_list[species_list.TaxID == tax_id].empty:
        raise ROGERUsageError('Unknown taxon id: %s' % tax_id)
    duplicate = session.query(DataSet).filter(DataSet.Name == name).one_or_none()
    if duplicate is not None:
        raise ROGERUsageError("Data set with name '%s' already exists" % name)
    exprs_data = parse_gct(file_path=exprs_file)
    annotation_data, annotation_version = annotate(session, exprs_data, tax_id, symbol_type)
    pheno_data = pd.DataFrame() if pheno_file is None else read_df(pheno_file)
    annotated_pheno_data = annotate_ds_pheno_data(exprs_data, pheno_data)
    return DataSetProperties(ds_type, tax_id, exprs_file, pheno_file,
                             exprs_data, annotated_pheno_data, annotation_data,
                             annotation_version, name, normalization_method,
                             description, xref)
def add_contrast(session, contrast_file, design_name, dataset_name, name=None, description=None):
    """Import a contrast matrix for an existing design and persist it.

    :param contrast_file: tab-separated contrast matrix, first column = index
    :param name: contrast name; guessed from the file name when omitted
    :return: the (possibly guessed) contrast name
    :raises ROGERUsageError: when a contrast of that name already exists
    """
    design = get_design(session, design_name, dataset_name)
    name = get_or_guess_name(name, contrast_file)
    if query_contrast(session, name, design_name, dataset_name).one_or_none() is not None:
        raise ROGERUsageError("Contrast '%s' already exist in '%s'" % (name, design_name))
    contrast = Contrast(DesignID=design.ID,
                        Name=name,
                        Description=description,
                        CreatedBy=get_current_user_name(),
                        CreationTime=get_current_datetime())
    session.add(contrast)
    # Flush so `contrast.ID` is populated before the column rows reference it.
    session.flush()
    contrast_data = read_table(contrast_file, sep='\t', index_col=0)
    # Contrast rows must line up with the design matrix columns.
    check_contrast_matrix(design.design_matrix.columns, contrast_data)
    contrast_cols = contrast_data.columns
    # One ContrastColumn row per column of the contrast matrix; the column's
    # coefficient vector is stored in ColumnData.
    contrast_table = DataFrame({
        "ContrastID": contrast.ID,
        "DesignID": design.ID,
        "Name": contrast_cols,
        "Description": contrast_cols,
        "ColumnData": [
            contrast_data[col_name].values.tolist() for col_name in contrast_cols
        ]
    })
    insert_data_frame(session, contrast_table, ContrastColumn.__table__)
    session.commit()
    return name
def create_design_data(design_data, pheno_data, name=None, description=None,
                       sample_groups=None, sample_group_levels=None,
                       sample_group_pheno_column=None) -> DesignData:
    """Build the persistable design entity plus its sample subset table.

    :param design_data: design matrix DataFrame (samples x variables)
    :param pheno_data: pheno DataFrame carrying the ROGER_SAMPLE_NAME column
    :param sample_groups: optional explicit per-sample group labels
    :param sample_group_levels: optional explicit ordering of group levels
    :param sample_group_pheno_column: optional pheno column to read groups from
    :return: DesignData bundling the Design entity and the sample subset frame
    :raises ROGERUsageError: when sample groups fall outside the given levels
    """
    check_design_matrix(pheno_data[ROGER_SAMPLE_NAME], design_data)
    sample_groups = __get_sample_groups(design_data, pheno_data,
                                        sample_groups, sample_group_pheno_column)
    if sample_group_levels is None:
        # BUG FIX: `list(set(...))` yields an arbitrary, run-dependent order;
        # dedup in first-occurrence order so the stored levels are reproducible.
        sample_group_levels = list(dict.fromkeys(sample_groups))
    known_levels = set(sample_group_levels)
    if any(group not in known_levels for group in sample_groups):
        raise ROGERUsageError(
            "Sample group list contains groups that are not part of sample group levels: %s vs %s"
            % (sample_groups, sample_group_levels))
    # TODO make this customizable by user
    sample_subset = DataFrame({
        "SampleIndex": range(0, pheno_data.shape[0]),
        "IsUsed": True,
        "Description": "No filtering"
    })
    # TODO make this customizable by user
    json_obj = [{
        "columnName": col_name,
        "isCovariate": False,
        "values": design_data[col_name].values.tolist()
    } for col_name in design_data.columns]
    design_entry = Design(VariableCount=design_data.shape[1],
                          Name=name,
                          Description=description,
                          DesignMatrix=json_obj,
                          SampleGroups=sample_groups,
                          SampleGroupLevels=sample_group_levels,
                          CreatedBy=get_current_user_name(),
                          CreationTime=get_current_datetime())
    return DesignData(design_entry, sample_subset)
def perform_gse(session: Session,
                roger_wd_dir: str,
                dge_model: DGEmodel,
                algorithm: GSEAlgorithm,
                gene_set_category_filter: List[str] = None):
    """Run gene set enrichment for an existing DGE model and persist the
    result table plus a flat-file copy under the ROGER working directory.

    :param roger_wd_dir: ROGER working directory receiving the result file
    :param gene_set_category_filter: optional list restricting the gene set
        categories used
    :raises ROGERUsageError: when a result already exists for this
        combination, or no gene sets have been imported
    """
    existing_results = get_gse_result(session, dge_model.Contrast.Name,
                                      dge_model.Contrast.Design.Name,
                                      dge_model.Contrast.Design.DataSet.Name,
                                      dge_model.Method.Name,
                                      algorithm.name)
    if existing_results:
        raise ROGERUsageError(
            "Result for %s:%s:%s:%s:%s already exists"
            % (dge_model.Contrast.Name, dge_model.Contrast.Design.Name,
               dge_model.Contrast.Design.DataSet.Name, dge_model.Method.Name,
               algorithm.name))
    gene_sets = get_gmt_locations(session, gene_set_category_filter)
    # Map each category to its GMT file for the R-side reader.
    gscs_list = {
        gene_set.Category: gene_set.FileWC
        for index, gene_set in gene_sets.iterrows()
    }
    if len(gscs_list) == 0:
        raise ROGERUsageError(
            "Cannot perform GSE without preexisting gene sets (did you import GMT files?)"
        )
    gscs = ribios_gsea.readGmt(ListVector(gscs_list))
    contrast_columns = dge_model.Contrast.contrast_columns
    gse_method_id = session.query(GSEmethod.ID) \
        .filter(GSEmethod.DGEmethodID == DGEmethod.ID) \
        .filter(DGEmethod.ID == dge_model.Method.ID) \
        .filter(GSEmethod.Name == algorithm.name).scalar()
    gse_algo_result = algorithm.exec_gse(dge_model, gscs)
    enrich_tbl = gse_algo_result.raw_gse_table
    # Lower-case category/set names on both sides so the join below is
    # case-insensitive.
    gene_sets.Category = gene_sets.Category.str.lower()
    gene_sets.Name = gene_sets.Name.str.lower()
    enrich_tbl.Category = enrich_tbl.Category.str.lower()
    enrich_tbl.GeneSet = enrich_tbl.GeneSet.str.lower()
    # Attach gene set IDs (by category + set name) and contrast column IDs
    # (by contrast name) to the raw enrichment rows.
    merged_enrich_tbl = enrich_tbl.join(
        gene_sets.set_index(['Category', 'Name']),
        on=['Category', "GeneSet"]).join(contrast_columns.set_index("Name"),
                                         on="Contrast",
                                         lsuffix="_GENE_SET")
    gse_method_sub_dir = "%d_%s" % (dge_model.Contrast.ID, algorithm.name)
    gse_models_path = os.path.join(roger_wd_dir, GSE_RESULT_SUB_FOLDER)
    gse_model_path = os.path.join(gse_models_path, gse_method_sub_dir)
    if not os.path.exists(gse_model_path):
        os.makedirs(gse_model_path)
    # Keep a flat-file copy of the raw enrichment table next to the DB rows.
    gse_result_file = os.path.join(gse_model_path, "gse_table.txt")
    write_df(enrich_tbl, gse_result_file)
    gse_result = GSEresult(ContrastID=dge_model.ContrastID,
                           DGEmethodID=dge_model.DGEmethodID,
                           GSEmethodID=gse_method_id,
                           OutputFile=gse_result_file,
                           MethodDescription=gse_algo_result.method_desc)
    session.add(gse_result)
    # Flush so `gse_result.ID` is available for the per-row table below.
    session.flush()
    gse_tbl = DataFrame({
        "GSEresultID": gse_result.ID,
        "ContrastColumnID": merged_enrich_tbl.ID,
        "GeneSetID": merged_enrich_tbl.ID_GENE_SET,
        "Correlation": merged_enrich_tbl.Correlation,
        # Direction is stored as a sign: Up -> +1, Down -> -1.
        "Direction": merged_enrich_tbl.Direction.map({
            "Up": 1,
            "Down": -1
        }),
        "PValue": merged_enrich_tbl.PValue,
        "FDR": merged_enrich_tbl.FDR,
        # Signed enrichment score: direction sign times |log10(p)|.
        "EnrichmentScore": merged_enrich_tbl.Direction.map({
            "Up": 1,
            "Down": -1
        }) * abs(log10(merged_enrich_tbl.PValue)),
        "EffGeneCount": merged_enrich_tbl.NGenes
    })
    # Rows whose gene set could not be matched get a null GeneSetID and are
    # reported, not persisted.
    unmapped = gse_tbl[gse_tbl.GeneSetID.isnull()]
    mapped = gse_tbl[~gse_tbl.GeneSetID.isnull()]
    if unmapped.shape[0] > 0:
        print("Warning: unable to map %d of %d entries to gene sets " %
              (unmapped.shape[0], merged_enrich_tbl.shape[0]))
    mapped_duplications = mapped.drop_duplicates(
        subset=['ContrastColumnID', 'GeneSetID'])
    if mapped_duplications.shape[0] < mapped.shape[0]:
        print(
            "Warning: %d of %d entries of mapped result entries are duplicated"
            % (mapped.shape[0] - mapped_duplications.shape[0], mapped.shape[0]))
    insert_data_frame(session, mapped_duplications, GSEtable.__table__)
    session.commit()
def get_dataset(self, dataset_name):
    """Resolve *dataset_name* against the remote Ensembl BioMart server.

    :raises ROGERUsageError: when the server does not know the dataset
    """
    try:
        remote = self.__server.datasets[dataset_name]
    except KeyError:
        raise ROGERUsageError("Dataset not found on Ensembl BioMart: %s"
                              % dataset_name)
    return RemoteBioMartDataSet(remote)
def parse_gct(file_path):
    """Parse a GCT 1.2 expression file into a pandas DataFrame.

    The returned frame is indexed by gene name (as string), with the
    Description column dropped and one column per sample.

    :param file_path: path to the GCT file
    :return: DataFrame of shape (n_genes, n_samples)
    :raises ROGERUsageError: on any structural violation of the GCT format
    """
    # BUG FIX: a file with fewer than three lines used to escape as a raw
    # StopIteration instead of a usage error.
    try:
        with open(file_path) as gct_file:
            header = [next(gct_file).rstrip() for _ in range(3)]
    except StopIteration:
        raise ROGERUsageError(
            "Unable to parse GCT file '%s': missing GCT header" % file_path)
    version_line = header[0]
    dim_line = header[1].split("\t")
    col_line = header[2].split("\t")
    if version_line != "#1.2":
        raise ROGERUsageError(
            "Unable to parse GCT file '%s': missing GCT header" % file_path)
    # Number of genes + number of samples
    n_dim_elems = 2
    if len(dim_line) != n_dim_elems:
        raise ROGERUsageError(
            "Unable to parse GCT file '%s': missing dimension header in GCT header"
            % file_path)
    try:
        # Reuse the already-split dimension line instead of re-splitting it.
        dims = [int(x) for x in dim_line]
    except ValueError:
        raise ROGERUsageError(
            "Unable to parse GCT file '%s': ill-formatted dimension header '%s'"
            % (file_path, header[1]))
    # Name col + Description col + at least one sample col
    n_minimum_cols = 3
    if (len(col_line) < n_minimum_cols
            or col_line[0].lower() != "name"
            or col_line[1].lower() != "description"):
        raise ROGERUsageError(
            "Unable to parse GCT file '%s': ill-formatted column header '%s ...'"
            % (file_path, header[2][0:100]))
    sample_names = col_line[2:]
    if len(sample_names) != len(set(sample_names)):
        raise ROGERUsageError(
            "Unable to parse GCT file '%s': duplicated sample names" % file_path)
    # Skip the version and dimension lines; the third line becomes the header.
    df = pd.read_table(file_path, sep="\t", skiprows=2, index_col=0)
    df = df.drop(columns=df.columns[0])
    df.index = df.index.astype(str)
    if dims[0] != df.shape[0]:
        raise ROGERUsageError(
            "Unable to parse GCT file '%s': Number of expected genes don't match (%d vs %d)"
            % (file_path, dims[0], df.shape[0]))
    if dims[1] != df.shape[1]:
        raise ROGERUsageError(
            "Unable to parse GCT file '%s': Number of expected samples don't match (%d vs %d)"
            % (file_path, dims[1], df.shape[1]))
    # BUG FIX: error message typo "Uable" -> "Unable".
    if any(col_type.name == "object" for col_type in df.dtypes):
        raise ROGERUsageError(
            "Unable to parse GCT file '%s': counts / signal columns have non-numeric values"
            % file_path)
    gene_duplicates = df.index.duplicated()
    if any(gene_duplicates):
        raise ROGERUsageError(
            "Unable to parse GCT file '%s': duplicated row names '%s ...'"
            % (file_path, df[gene_duplicates].index[0:2].tolist()))
    return df
def add_species(session, dataset_name, tax_id):
    """Import the gene annotation of one species from Ensembl BioMart and
    link its genes to the human reference via the ortholog table.

    The human species must be imported first, because all other species map
    their genes onto human ROGER gene indices.

    :param dataset_name: BioMart dataset name, e.g. ``mmusculus_gene_ensembl``
    :param tax_id: taxon id of the species
    :raises ROGERUsageError: when the human annotation is missing or the
        species is already imported
    """
    annotation_service = roger.logic.mart.provider.get_annotation_service()
    # Check if dataset is already preset in the database
    species_table = list_species(session)
    if species_table[species_table.TaxID == human_tax_id].empty and human_tax_id != tax_id:
        raise ROGERUsageError(
            'No human species annotation data present - import human gene annotation first'
        )
    if not species_table[species_table.TaxID == tax_id].empty:
        raise ROGERUsageError('Species already exists in database: %s' % dataset_name)
    # Derive the BioMart homolog attribute / filter names from the dataset
    # name, e.g. "mmusculus_homolog_ensembl_gene" / "with_mmusculus_homolog".
    homolog_attr = re.sub(r'(\w+)_gene_ensembl', r'\1_homolog_ensembl_gene', dataset_name)
    homolog_filter = re.sub(r'(\w+)_gene_ensembl', r'with_\1_homolog', dataset_name)
    # Insert Gene annotation
    dataset = annotation_service.get_dataset(dataset_name)
    # TODO fix this, should move into provider.py
    version = "%s %s" % (dataset_name,
                         re.search(r'[^(]+\(([^)]+)\)', dataset.display_name).group(1))
    gene_anno = dataset.get_bulk_query(
        params={
            'attributes': [
                "ensembl_gene_id", "entrezgene", "gene_biotype",
                "external_gene_name"
            ]
        })
    # Allocate a contiguous range of ROGER gene indices for the new genes.
    next_id = get_next_free_db_id(session, GeneAnnotation.RogerGeneIndex)
    genes = DataFrame({
        'RogerGeneIndex': range(next_id, next_id + gene_anno.shape[0]),
        'Version': version,
        'TaxID': tax_id,
        'EnsemblGeneID': gene_anno["ensembl_gene_id"],
        'EntrezGeneID': gene_anno["entrezgene"],
        'GeneType': gene_anno["gene_biotype"],
        'GeneSymbol': gene_anno["external_gene_name"],
        'IsObsolete': False
    })
    insert_data_frame(session, genes, GeneAnnotation.__table__)
    # Insert orthologs
    if tax_id == human_tax_id:
        # Human genes are their own orthologs; nothing else to resolve.
        orthologs = DataFrame({
            'RogerGeneIndex': genes["RogerGeneIndex"],
            'HumanRogerGeneIndex': genes["RogerGeneIndex"]
        })
        insert_data_frame(session, orthologs, Ortholog.__table__)
        session.commit()
        return
    huma_anno_query = as_data_frame(
        session.query(GeneAnnotation).filter(
            GeneAnnotation.TaxID == human_tax_id))
    # NOTE(review): `human_dataset` is not defined in this function —
    # presumably a module-level BioMart dataset handle; confirm it exists.
    ortho = annotation_service.get_bulk_query(
        human_dataset,
        params={
            'attributes': ["ensembl_gene_id", homolog_attr],
            'filters': {
                homolog_filter: True
            }
        })
    # Join human annotation (by human Ensembl ID) and the new species' genes
    # (by the homolog Ensembl ID) onto the ortholog pairs.
    merged_ortho = ortho.join(huma_anno_query.set_index('EnsemblGeneID'),
                              on='ensembl_gene_id') \
        .join(genes.set_index('EnsemblGeneID'),
              on=homolog_attr, lsuffix='Human', rsuffix='Other')
    orthologs = DataFrame({
        'RogerGeneIndex': merged_ortho["RogerGeneIndexOther"],
        'HumanRogerGeneIndex': merged_ortho["RogerGeneIndexHuman"]
    })
    insert_data_frame(session, orthologs, Ortholog.__table__)
    session.commit()
def get_ds(session, name) -> DataSet:
    """Fetch a data set by its unique name.

    :raises ROGERUsageError: when no data set carries that name
    """
    dataset = session.query(DataSet).filter(DataSet.Name == name).one_or_none()
    if dataset is not None:
        return dataset
    raise ROGERUsageError("Data set with name '%s' does not exist" % name)
def run_dge(session, roger_wd_dir, contrast, design, dataset, algorithm: DGEAlgorithm):
    """Run differential gene expression analysis for one contrast and persist
    the model, feature subset, and per-gene result table.

    Serialized R objects (input and fitted model) are written below
    *roger_wd_dir* and their paths stored on the DGEmodel row.

    :raises ROGERUsageError: when a model for this combination already exists
    """
    model = query_dge_models(session, contrast, design, dataset,
                             algorithm.name, DGEmodel).one_or_none()
    if model is not None:
        raise ROGERUsageError(
            "A model for %s:%s:%s has already been generated by the method '%s'"
            % (dataset, design, contrast, algorithm.name))
    print("Retrieving data from database")
    contrast_data = get_contrast(session, contrast, design, dataset)
    design_data = contrast_data.Design
    ds_data = design_data.DataSet
    feature_data = ds_data.feature_data
    print("Performing differential gene expression analysis using %s" % algorithm.name)
    contrast_matrix = contrast_data.contrast_matrix
    dge_result = algorithm.exec_dge(ds_data.ExprsWC, feature_data,
                                    design_data, contrast_matrix)
    print("Persisting model information")
    method = session.query(DGEmethod).filter(
        DGEmethod.Name == algorithm.name).one()
    # Per-(contrast, method) sub-directory for the serialized R objects.
    dge_method_sub_dir = "%d_%d" % (contrast_data.ID, method.ID)
    dge_models_path = os.path.join(roger_wd_dir, DGE_RESULT_SUB_FOLDER)
    dge_model_path = os.path.join(dge_models_path, dge_method_sub_dir)
    if not os.path.exists(dge_model_path):
        os.makedirs(dge_model_path)
    input_obj_file = os.path.abspath(
        os.path.join(dge_model_path, "limma_input_obj.rds"))
    base.saveRDS(dge_result.input_obj, file=input_obj_file)
    # NOTE(review): unlike the input object, this path has no ".rds" suffix —
    # looks inconsistent; confirm whether this is intentional.
    fit_obj_file = os.path.abspath(
        os.path.join(dge_model_path, "limma_fit_obj"))
    base.saveRDS(dge_result.fit_obj, file=fit_obj_file)
    dge_model = DGEmodel(ContrastID=contrast_data.ID,
                         DGEmethodID=method.ID,
                         InputObjFile=input_obj_file,
                         FitObjFile=fit_obj_file,
                         MethodDescription=dge_result.method_description)
    session.add(dge_model)
    session.flush()
    print("Persisting feature subsets")
    # Record which features the algorithm kept after its default filtering.
    feature_subset = pd.DataFrame({
        "FeatureIndex": feature_data["FeatureIndex"],
        "DataSetID": ds_data.ID,
        "ContrastID": contrast_data.ID,
        "DGEmethodID": method.ID,
        "IsUsed": dge_result.used_feature_list,
        "Description": "Default filtering by '%s'" % algorithm.name
    })
    insert_data_frame(session, feature_subset, FeatureSubset.__table__)
    # Resolve contrast-column IDs (by contrast name) and feature annotation
    # (by FeatureIndex) for each result row before flattening into DGEtable.
    dge_tbl = dge_result.dge_table \
        .join(contrast_data.contrast_columns.set_index("Name"),
              on="Contrast", rsuffix="_C") \
        .join(feature_data.set_index("FeatureIndex"),
              on="FeatureIndex", rsuffix="_F")
    dgetable = pd.DataFrame({
        'ContrastColumnID': dge_tbl["ID"],
        'FeatureIndex': dge_tbl["FeatureIndex"],
        "ContrastID": contrast_data.ID,
        "DGEmethodID": method.ID,
        'DataSetID': dge_tbl["DataSetID"],
        'AveExprs': dge_tbl["AveExpr"],
        'Statistic': dge_tbl["t"],
        'LogFC': dge_tbl["logFC"],
        'PValue': dge_tbl["PValue"],
        'FDR': dge_tbl["FDR"]
    })
    insert_data_frame(session, dgetable, DGEtable.__table__)
    session.commit()