def test_duplicate_column(self):
    columns = [
        'ID', 'COL', 'ROW', 'NAME', 'SPOT_ID', 'CONTROL_TYPE', 'REFSEQ',
        'GB_ACC', 'GENE', 'GENE_SYMBOL', 'GENE_NAME', 'UNIGENE_ID',
        'ENSEMBL_ID', 'TIGR_ID', 'ACCESSION_STRING', 'CHROMOSOMAL_LOCATION',
        'CYTOBAND', 'DESCRIPTION', 'GO_ID', 'SEQUENCE', 'SPOT_ID.1', 'ORDER'
    ]
    columns2 = [
        'ID', 'COL', 'ROW', 'NAME', 'SPOT_ID', 'CONTROL_TYPE', 'ENSEMBL_ID',
        'GB_ACC', 'GENE', 'GENE_SYMBOL', 'ENSEMBL_ID.1', 'UNIGENE_ID',
        'ENSEMBL_ID.2', 'TIGR_ID', 'ACCESSION_STRING', 'CHROMOSOMAL_LOCATION',
        'CYTOBAND', 'DESCRIPTION', 'GO_ID', 'SEQUENCE', 'SPOT_ID.1', 'ORDER'
    ]
    gpl = GEO.get_GEO(filepath=join(download_geo, "GPL4133.txt"))
    self.assertEqual(list(gpl.columns.index), columns)
    gpl2 = GEO.get_GEO(filepath=join(download_geo, "GPL4134.txt"))
    self.assertEqual(list(gpl2.columns.index), columns2)
def test_get_geo_and_data(self):
    gsm = GEO.get_GEO(geo="GSM11805", destdir=download_geo)
    self.assertTrue(isinstance(gsm, GSM))
    self.assertEqual(gsm.get_accession(), "GSM11805")
    self.assertEqual(len(gsm.table.index), 22283)
    self.assertEqual(len(gsm.columns), 3)
    self.assertEqual(len(gsm.metadata.keys()), 28)
def test_merge_and_average(self): gse = GEO.get_GEO(filepath=join(download_geo, "soft_ex_family.txt"), geotype="GSE") result = read_table( join(download_geo, "test_merged_by_id_and_averaged_by_gb_acc.tab"), index_col=0, ) result = result.loc[sorted( result.index), sorted( result.columns )] # gse.gsms is a dict so the columns might be in different order merged = gse.merge_and_average( gse.gpls[next(iter(gse.gpls))], "VALUE", "GB_ACC", gpl_on="ID", gsm_on="ID_REF", ) merged = merged[sorted( merged.columns )] # gse.gsms is a dict so the columns might be in different order assert_frame_equal(merged, result) with self.assertRaises(KeyError): gse.merge_and_average("platform", "VALUE", "GB_ACC", gpl_on="ID", gsm_on="ID_REF") with self.assertRaises(ValueError): gse.merge_and_average(["platform"], "VALUE", "GB_ACC", gpl_on="ID", gsm_on="ID_REF")
def test_download_SRA_parallel_by_sra(self): geo_id = "GSE63525" # Hi-C dataset from Rao et al. def filterby(x): return ("HIC173" in x.metadata["title"][0] or "HIC174" in x.metadata["title"][0] or "HIC175" in x.metadata["title"][0]) destdir = "./TMP_SOFT_parallel_by_sra" gse = GEO.get_GEO(geo=geo_id, destdir=destdir) downloaded_paths = gse.download_SRA( "*****@*****.**", # some unused e-mail directory=destdir, filetype="sra", filterby=filterby, silent=True, keep_sra=True, nproc=3, ) print(downloaded_paths) self.assertTrue(isdir(destdir)) self.assertEqual(len(downloaded_paths), 3) for k in downloaded_paths.keys(): self.assertTrue(k in gse.gsms.keys()) for k in ["GSM1551718", "GSM1551719", "GSM1551720"]: self.assertTrue(k in downloaded_paths.keys()) for k in downloaded_paths.keys(): for f in downloaded_paths[k]["SRA"]: self.assertTrue(isfile(f))
def test_download_SRA_parallel_by_gsm(self): geo_id = "GSE63525" # Hi-C dataset from Rao et al. def filterby(x): return ("HIC173" in x.metadata["title"][0] or "HIC174" in x.metadata["title"][0] or "HIC175" in x.metadata["title"][0]) destdir = "./TMP_SOFT_parallel_by_gsm" gse = GEO.get_GEO(geo=geo_id, destdir=destdir) gsms_to_use = [gsm for gsm in gse.gsms.values() if filterby(gsm)] downloaded_paths = dict() for gsm in gsms_to_use: downloaded_paths[gsm.name] = gsm.download_SRA( "*****@*****.**", # some unused e-mail directory=destdir, nproc=3, return_list=False, filetype="sra", silent=True, keep_sra=True, ) self.assertTrue(isdir(destdir)) self.assertEqual(len(downloaded_paths), 3) for k in downloaded_paths.keys(): self.assertTrue(k in gse.gsms.keys()) for k in ["GSM1551718", "GSM1551719", "GSM1551720"]: self.assertTrue(k in downloaded_paths.keys()) for k in downloaded_paths.keys(): for f in downloaded_paths[k]["SRA"]: self.assertTrue(isfile(f))
def GSEA(geo_ID, gene_list):
    gse = GEOparse.get_GEO(geo=geo_ID, destdir="./")
    expression = gse.pivot_samples('VALUE').T
    experiments = {}
    for i, (idx, row) in enumerate(gse.phenotype_data.iterrows()):
        tmp = {}
        tmp["Type"] = 1 if "control" in row["description"] else 0
        experiments[i] = tmp
    experiments = pd.DataFrame(experiments).T
    counter = 0
    all_genes_set = []
    all_corr_set = []
    genes_corr_set = []
    for gene in expression:
        counter += 1
        if counter <= 3:  # skip the first three columns
            continue
        all_genes_set.append(gene)
        corr_matrix = np.corrcoef(
            [list(experiments['Type']), list(expression[gene])])
        all_corr_set.append(corr_matrix[0, 1])
        if gene in gene_list:
            genes_corr_set.append(corr_matrix[0, 1])
    p_value = ks_2samp(genes_corr_set, all_corr_set)[1]
    return str(p_value)
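# A minimal usage sketch for GSEA above (not from the original source): the
# accession and probe IDs are illustrative placeholders, and the call is
# guarded because it downloads the series from GEO.
import GEOparse
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp

if __name__ == "__main__":
    # Compare correlations of a candidate gene set against all genes.
    p_value = GSEA("GSE1563", ["AFFX-BioB-5_at", "AFFX-BioB-M_at"])
    print("KS-test p-value:", p_value)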
def gene_expression_pipeline(geo_id, tissue_origin, gene_locations):
    """
    Given a GEO id and a gene location table, runs the whole data pipeline of
    annotation -> duplication removal -> filtering on the GEO gene expression
    table. The tissue origin parameter is used to save intermediate results
    of the pipeline and reduce runtime when trying different filtering
    parameters. An empty string can be supplied to disable saving of
    intermediate files.

    NOTE:
    1. GEOparse will download an expression SOFT file and save it in destdir
       unless one is already provided.
    2. table_add_gene_annotations requires several minutes to run (because of
       eutils requests). Do NOT terminate early.
    3. After running table_remove_duplicates, for some reason the ID column
       header is not saved in the csv. This can easily be fixed manually once
       (not worth code intervention).
    """
    gse_data = geo.get_GEO(geo=geo_id, destdir='./expression_data')
    expression_table = generate_raw_expression_table(gse_data)
    expression_table = table_add_gene_annotations(expression_table,
                                                  gene_locations,
                                                  tissue_origin)
    expression_table = table_remove_duplicates(expression_table,
                                               tissue_origin)
    filtered_expression = table_filter(expression_table)
    return filtered_expression
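# A hedged usage sketch for gene_expression_pipeline above. The helpers it
# calls (generate_raw_expression_table, table_add_gene_annotations,
# table_remove_duplicates, table_filter) are defined elsewhere in the original
# project; the accession, tissue label, and locations file are placeholders.
import pandas as pd

if __name__ == "__main__":
    gene_locations = pd.read_csv("gene_locations.csv")  # hypothetical input
    filtered = gene_expression_pipeline("GSE64913", "airway", gene_locations)
    print(filtered.shape)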
def test_get_geo_and_data(self):
    gpl = GEO.get_GEO(geo="GPL96", destdir=download_geo)
    self.assertTrue(isinstance(gpl, GPL))
    self.assertEqual(gpl.name, "GPL96")
    self.assertEqual(gpl.get_accession(), "GPL96")
    self.assertEqual(len(gpl.table.index), 22283)
    self.assertEqual(len(gpl.columns), 16)
def read_soft(datapath_or_datastring,
              is_datafile=True,
              return_filtered_data=False,
              rows=None,
              columns=None):
    """Read a file in SOFT format, either from a file or from a string of
    raw data.

    :param (string) datapath_or_datastring: Either the path to the SOFT data
        file (can be relative or absolute), or a string corresponding to the
        content of a SOFT file (including newline characters).
    :param (bool, optional) is_datafile: Either True (default) if passing the
        filepath to the data, or False if passing a string of raw data.
    :param (bool) return_filtered_data: Either False (default) to return all
        the metadata, or True to return only the data filtered by rows and/or
        columns.
    :param (list[string]) rows: The rows to keep when `return_filtered_data`
        is True.
    :param (list[string]) columns: The columns to keep when
        `return_filtered_data` is True.

    :rtype (tuple|ndarray): Either a tuple containing the description
        (metadata), subsets, row names, and column names for the SOFT data if
        `return_filtered_data` is False, or an array of the filtered SOFT
        data if `return_filtered_data` is True.
    """
    # ensure required argument is a string
    err_msg = ('Please pass either the filepath to the data, '
               'or the data as a string.')
    assert isinstance(datapath_or_datastring, str), err_msg

    if is_datafile:
        filepath = datapath_or_datastring
    else:
        # write the raw data to a temporary file so GEOparse can read it
        with tempfile.NamedTemporaryFile(mode='w+', delete=False,
                                         suffix='.soft') as tf:
            tf.write(datapath_or_datastring)
            filepath = tf.name

    geo_file = gp.get_GEO(filepath=filepath, geotype='GDS')
    df = geo_file.table
    df.set_index('ID_REF', inplace=True)

    all_rows = list(df.index.values)
    # keep only sample (GSM) columns; building a new list avoids the bug of
    # removing items from a list while iterating over it, which skips entries
    all_cols = [col for col in df.columns.values if 'GSM' in col]

    if return_filtered_data:
        return _get_selected_data(df, all_rows, all_cols, rows, columns)

    desc = geo_file.metadata
    subsets = geo_file.subsets
    for subset in subsets:
        subsets[subset] = subsets[subset].metadata

    return desc, subsets, all_rows, all_cols
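# A short usage sketch for read_soft above (paths are placeholders, not from
# the source). The module is assumed to import GEOparse as `gp` and to define
# _get_selected_data, which the filtered branch relies on.
if __name__ == "__main__":
    # Unfiltered call: metadata, subsets, and all row/column names.
    desc, subsets, all_rows, all_cols = read_soft("GDS507.soft")
    # Filtered call: only the requested slice of the data table.
    filtered = read_soft("GDS507.soft", return_filtered_data=True,
                         rows=all_rows[:10], columns=all_cols[:2])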
def test_get_geo_and_data_with_annotations(self):
    gpl = GEO.get_GEO(geo="GPL96", destdir=download_geo, annotate_gpl=True)
    self.assertTrue(isinstance(gpl, GPL))
    self.assertEqual(gpl.name, "GPL96")
    self.assertEqual(gpl.get_metadata_attribute('platform'), "GPL96")
    self.assertEqual(len(gpl.table.index), 22283)
    self.assertEqual(len(gpl.columns), 21)
def test_pivot_samples(self):
    gse = GEO.get_GEO(filepath=join(download_geo, "soft_ex_family.txt"),
                      geotype="GSE")
    result = read_table(
        join(download_geo, "test_sample_pivoted_by_value.tab"), index_col=0)
    result.columns.name = 'name'
    assert_frame_equal(gse.pivot_samples("VALUE"), result)
def download_regular(name,
                     filterby=None,
                     metadata_path="./",
                     destdir="./TMP_SOFT",
                     nthreads=20):
    geo_id = gse_dict[name]
    gse = GEOparse.get_GEO(geo=geo_id, destdir=destdir)
    gsms = gse.gsms

    if filterby is None:
        downloaded_paths = gse.download_SRA(
            '*****@*****.**',
            filetype='fastq',
            fastq_dump_options=fastq_dump_options,
            nproc=nthreads,
            silent=True)
    else:
        downloaded_paths = gse.download_SRA(
            '*****@*****.**',
            filetype='fastq',
            filterby=filterby,
            fastq_dump_options=fastq_dump_options,
            nproc=nthreads,
            silent=True)

    # fix a bug with multiple replicates!
    metadata_collected_dict = {
        x: parse_metadata(gsms[x], mode=name)
        for x in downloaded_paths.keys()
    }
    metadata_collected_list = []
    for k in metadata_collected_dict:
        for i in range(len(downloaded_paths[k]) // 2):
            d = metadata_collected_dict[k]
            d['path fastq R1'] = downloaded_paths[k][2 * i]
            d['path fastq R2'] = downloaded_paths[k][2 * i + 1]
            d['gse'] = geo_id
            metadata_collected_list.append(dict(d))

    df1 = pd.DataFrame(metadata_collected_list)
    df1 = pd.concat([
        g.drop('index', axis=1).reset_index(drop=True)
        for i, g in df1.reset_index().groupby("index")
    ]).reset_index()
    df1.loc[:, 'technical_rep'] = df1.loc[:, 'index'] + 1
    df1 = df1.drop('index', axis=1)
    df1 = df1.applymap(lambda x: str(x).replace(' ', '-'))
    df1.loc[:, "running_mode"] = name
    df1.to_csv(os.path.join(metadata_path,
                            '{}_metadata.tsv'.format(geo_id)),
               sep='\t')
    df1 = df1.drop(['processing', 'protocol'], axis=1)
    df1.to_csv(os.path.join(metadata_path,
                            '{}_metadata_short.tsv'.format(geo_id)),
               sep='\t')
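# A hedged sketch of driving download_regular above. gse_dict,
# fastq_dump_options, and parse_metadata are module-level names in the
# original project; the key and filter below are illustrative only.
if __name__ == "__main__":
    download_regular(
        "hic",  # hypothetical key into gse_dict
        filterby=lambda gsm: "rep1" in gsm.metadata["title"][0],
        metadata_path="./metadata",
        nthreads=8,
    )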
def get_geo_metadata(acc, experiment_type=None):
    '''
    Parses information associated with a GEO Series or single experiment.

    Uses the GEOparse library, which downloads records from the NCBI FTP
    site rather than using NCBI Entrez e-utils, resulting in a single request
    rather than many. This function will parse information from the files and
    then delete them. Returns a Dataset object holding information about all
    the associated experiments and biosamples.
    '''
    if acc.startswith('GSE') or '/GSE' in acc:  # experiment series
        if '/' in acc:
            gse = GEOparse.get_GEO(filepath=acc)
        else:
            gse = GEOparse.get_GEO(geo=acc)  # pragma: no cover
        # create Experiment objects from each GSM file
        experiments = [obj for obj in
                       [parse_gsm(gsm, experiment_type)
                        for gsm in gse.gsms.values()] if obj]
        # delete file after GSMs are parsed
        if '/' not in acc:
            print('GEO parsing done. Removing downloaded soft file.')
            os.remove('{}_family.soft.gz'.format(acc))
        if not experiments:
            print('Sequencing experiments not found. Exiting.')
            return
        gds = Dataset(acc, gse.metadata['sample_id'], experiments,
                      [parse_bs_record(experiment.bs)
                       for experiment in experiments])
        return gds
    elif acc.startswith('GSM') or '/GSM' in acc:  # single experiment
        if '/' in acc:
            gsm = GEOparse.get_GEO(filepath=acc)
        else:
            gsm = GEOparse.get_GEO(geo=acc)  # pragma: no cover
        exp = parse_gsm(gsm, experiment_type)
        print("GEO parsing done. Removing downloaded soft file.")
        try:
            os.remove('{}.txt'.format(acc))  # delete file after GSM is parsed
        except Exception:
            pass
        if not exp:
            print("Accession not a sequencing experiment, or couldn't be "
                  "parsed. Exiting.")
            return
        gds = Dataset(None, [acc], [exp], [parse_bs_record(exp.bs)])
        return gds
    else:
        print('Input not a valid GEO accession.')
        return
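# Usage sketch for get_geo_metadata above, reusing the fixture path that the
# test later in this file also uses; parse_gsm, parse_bs_record, and Dataset
# are defined in the original module.
if __name__ == "__main__":
    dataset = get_geo_metadata('./tests/data_files/GSE93431_family.soft.gz')
    if dataset is None:
        print('No sequencing experiments parsed.')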
def open_gds(filename):
    # Take a filename and parse the GDS data; clean the table to remove
    # unnecessary (control) rows.
    gds = GEOparse.get_GEO(filepath=filename)
    Table = gds.table
    metadata = gds.columns
    Table = Table.drop(columns='ID_REF')
    # keep rows whose identifier is neither control label; the original used
    # `|`, which is always True and therefore filtered nothing
    Table = Table[(Table['IDENTIFIER'] != '--Control')
                  & (Table['IDENTIFIER'] != 'control')]
    Table = Table.groupby('IDENTIFIER').mean().reset_index()
    return Table
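# Usage sketch for open_gds above; the filename is a placeholder for a local
# GDS SOFT file like those loaded in the caching example later in this file.
if __name__ == "__main__":
    table = open_gds("./GDS2947.soft.gz")
    print(table.head())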
def main(args):
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    utils.create_dir_if_not_exist(args.out_expr_dir)
    utils.create_dir_if_not_exist(join(args.out_expr_dir, 'raw'))
    utils.create_dir_if_not_exist(join(args.out_expr_dir, 'processed'))

    gse = GEOparse.get_GEO(geo='GSE64913',
                           destdir=join(args.out_expr_dir, 'raw'))
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL570'],
                                       'ENTREZ_GENE_ID')
    annotated2 = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    annotated2 = annotated2.loc[~annotated2.isnull().values.all(axis=1)]
    annotated2['ENTREZ_GENE_ID'] = annotated2.ENTREZ_GENE_ID.str.split(
        '///').str[0].astype(int)
    annotated2 = annotated2.set_index('ENTREZ_GENE_ID')

    classes = {}
    classes['healthy_cae'] = [
        'diagnosis: Healthy', 'cell type: Central airway epithelium'
    ]
    classes['healthy_pae'] = [
        'diagnosis: Healthy', 'cell type: Peripheral airway epithelium'
    ]
    classes['asthma_cae'] = [
        'diagnosis: Severe Asthmatic', 'cell type: Central airway epithelium'
    ]
    classes['asthma_pae'] = [
        'diagnosis: Severe Asthmatic',
        'cell type: Peripheral airway epithelium'
    ]
    logging.info(classes)

    gsms = {
        cls: [
            gsm for gsm in gse.gsms
            if gse.gsms[gsm].metadata['characteristics_ch1'][1] ==
            classes[cls][0]
            and gse.gsms[gsm].metadata['characteristics_ch1'][5] ==
            classes[cls][1]
        ]
        for cls in classes
    }
    logging.info(' '.join(
        ['{} GSM:{}'.format(cls, len(gsms[cls])) for cls in classes]))

    utils.create_dir_if_not_exist(args.out_expr_dir)
    utils.write_expr(join(args.out_expr_dir, 'processed', 'expr.tsv'),
                     annotated2)
    for cls in classes:
        utils.write_text(
            join(args.out_expr_dir, 'processed', '{}_gsms.txt'.format(cls)),
            gsms[cls])
def test_get_geo_and_data(self):
    gds = GEO.get_GEO(geo="GDS507", destdir=download_geo)
    self.assertTrue(isinstance(gds, GDS))
    self.assertEqual(len(gds.table.index), 22645)
    self.assertEqual(len(gds.table.columns), 19)
    self.assertEqual(len(gds.metadata.keys()), 16)
    # we omit DATABASE and SUBSET ! entries
    self.assertEqual(len(gds.database.metadata.keys()), 5)
    for subset_name, subset in iteritems(gds.subsets):
        self.assertEqual(len(subset.metadata.keys()), 4)
        self.assertTrue(isinstance(subset, GDSSubset))
def load_dataset(dataset_id, download_location="."):
    """
    Load the dataset from disk (or download it if it does not exist).

    Arguments:
    - dataset_id: the ID of the dataset to load

    Output:
    - GSE object (GEOparse Series)
    """
    path = download_location + "/" + dataset_id + "_family.soft.gz"
    if os.path.exists(path):
        # Load from an existing file
        print("- Loading from", path)
        gse = GEOparse.get_GEO(filepath=path)
    else:
        # Download GSE and load it
        print("- Downloading", dataset_id)
        gse = GEOparse.get_GEO(geo=dataset_id,
                               destdir=download_location + "/")
    return gse
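# Usage sketch for load_dataset above: the first call downloads the series
# and caches it; later calls load the cached SOFT file. The accession is a
# placeholder.
if __name__ == "__main__":
    gse = load_dataset("GSE1563")
    print(len(gse.gsms), "samples loaded")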
def test_get_geo_metadata_sra_hidden(capfd, mocker, hidden_sra):
    gse_all = GEOparse.get_GEO(
        filepath='./tests/data_files/GSE93431_family.soft.gz')
    mocker.patch('scripts.geo2fdn.parse_bs_record',
                 return_value='SAMNXXXXXXXX')
    mocker.patch('scripts.geo2fdn.parse_gsm', return_value=hidden_sra)
    gse = geo.get_geo_metadata('./tests/data_files/GSE93431_family.soft.gz')
    out, err = capfd.readouterr()
    assert not gse
    assert len(gse_all.gsms.values()) > 10
def get_geo(geo_accesions: list = None, paths=None):
    """
    Fetch GEO records either by accession (downloaded via GEOparse) or from
    local files.

    Args:
        geo_accesions: list of GEO accession IDs to download.
        paths: list of local file paths to parse instead of downloading.

    Returns:
        A list of GEOparse objects, one per accession or path.
    """
    geos = list()
    if paths is None:
        for i in geo_accesions:
            geos.append(geo.get_GEO(i))
    else:
        for i in paths:
            geos.append(geo.get_GEO(filepath=os.path.join(i)))
    return geos
def get_geo(geo_id):
    # Import the GSE dataset and pivot it into a probes-by-samples table.
    gse = GEOparse.get_GEO(geo=str(geo_id).upper())
    data = gse.pivot_samples('VALUE')
    data.index.name = None  # `del data.index.name` breaks on newer pandas
    data = clean_df(data)
    return data
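# Usage sketch for get_geo above; clean_df is a helper from the original
# project, and the accession is illustrative.
if __name__ == "__main__":
    data = get_geo("gse1563")  # case-insensitive: the function upper-cases it
    print(data.shape)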
def comp_test(accession):
    start = time.time()
    gpl_data = GEOparse.get_GEO(geo=accession, destdir="./cache",
                                silent=True)
    table = gpl_data.table
    for line in table:
        # just for minimal processing overhead so the loop is not optimized
        # away (can Python even do that?); should never trigger
        if len(line) == 10000000000:
            print("loooooong")
    end = time.time()
    print(end - start)
def get_from_geo(accession, disease):
    """Downloads a dataset from GEO."""
    geodir = tempfile.TemporaryDirectory()
    print("geodir is {}".format(geodir.name))
    print("Downloading data set {} from GEO....".format(accession),
          flush=True)
    # silent=True has no effect,
    # see https://github.com/guma44/GEOparse/issues/19
    raw_gse = GEOparse.get_GEO(geo=accession, destdir=geodir.name,
                               silent=True)
    return GEOSeries(raw_gse, disease)
def main():
    fname = "cache.pkl"
    rebuildcache = True
    if rebuildcache:
        data1 = GEOparse.get_GEO(
            filepath="./GDS2947.soft.gz"
        )  # Adenoma/Healthy Set (Testing?) (True Count)
        data2 = GEOparse.get_GEO(
            filepath="./GDS4379.soft.gz"
        )  # Adenocarcinoma (Testing?) (True Count) Samples 64
        data4 = GEOparse.get_GEO(
            filepath="./GDS4393.soft.gz")  # Metastatic/Tumor Set (True Count)
        data5 = GEOparse.get_GEO(
            filepath="./GDS4513.soft.gz"
        )  # Tumor/Excised Set (Transformed count) Samples 53
        data6 = GEOparse.get_GEO(
            filepath="./GDS4516.soft.gz"
        )  # Metastatic/Stage 3 Set (Transformed count) Samples 104
        data7 = GEOparse.get_GEO(filepath="./GSE137140_family.soft.gz")
        data8 = GEOparse.get_GEO(filepath="./GSE134347_family.soft.gz")
        tup = (data1, data2, data4, data5, data6, data7, data8)
        with open(fname, 'wb') as f:
            pickle.dump(tup, f)
    else:
        with open(fname, 'rb') as f:
            tup = pickle.load(f)
        (data1, data2, data4, data5, data6, data7, data8) = tup

    gseData, gseClass, geneIndex = unpackValues(data8)

    # table1_expression, table1_info = getValues(data1)
    # table1_expression = np.log2(table1_expression)
    # table6_expression, table6_info = getValues(data6)
    # table2_expression, table2_info = getValues(data2)
    # catDataE, catDataI = catTables(
    #     [table1_expression, table6_expression, table2_expression],
    #     [table1_info, table6_info, table2_info])
    # infolen = catDataI.shape
    # infolen = infolen[0]
    # classification = np.zeros((infolen), dtype=int)
    # for i in range(infolen):
    #     if (catDataI[i] == "adenoma"
    #             or catDataI[i] == "Large Intestine, Villous Adenoma"):
    #         classification[i] = 1
    #     elif catDataI[i] == "normal mucosa":
    #         classification[i] = 2

    trainSizes = np.linspace(0.2, 0.8, 4)
    # ragged rows need dtype=object on modern NumPy
    regMax = np.array([0, 0, []], dtype=object)
    regularization = np.array([0, 0, [], 0, 0], dtype=object)
    for t in trainSizes:
        cMax, cVals = crossValidation(gseData, gseClass, t)
        regMax = np.vstack((regMax, cMax))
        regularization = np.vstack((regularization, cVals))
    regMax = regMax[1:, :]
    regularization = regularization[1:, :]
    plotRegularization(regularization, trainSizes)
    geneMax(regMax[3], geneIndex)
    print(1)
def make_meta(gse_name):
    gse = GEOparse.get_GEO(geo=gse_name)
    gsms = list(gse.gsms.keys())
    gsms.sort()
    meta = [{
        "gsm": gsm,
        "sra": gse.gsms[gsm].relations["SRA"][0].split("=")[1],
        "title": gse.gsms[gsm].metadata["title"][0]
    } for gsm in gsms]
    return meta
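# Usage sketch for make_meta above. GSE63525 (used in the SRA tests earlier
# in this file) carries SRA relations, which make_meta requires.
if __name__ == "__main__":
    meta = make_meta("GSE63525")
    for entry in meta[:3]:
        print(entry["gsm"], entry["sra"], entry["title"])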
def get_value_from_sample_by_ids(gsm, id_refs):
    gsm = GEOparse.get_GEO(geo=gsm, destdir=cache)
    # print(gsm.table["ID_REF"])
    data = gsm.table
    values = data.loc[data["ID_REF"].isin(id_refs)].to_dict("records")
    return values
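# Usage sketch for get_value_from_sample_by_ids above; `cache` is a
# module-level destination directory in the original code, and the accession
# and probe ID below are placeholders.
if __name__ == "__main__":
    records = get_value_from_sample_by_ids("GSM11805", ["AFFX-BioB-5_at"])
    print(records)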
def test_annotate(self):
    gse = GEO.get_GEO(filepath=join(download_geo, "soft_ex_family.txt"),
                      geotype="GSE")
    gsm = gse.gsms[
        "Triple-Fusion Transfected Embryonic Stem Cells Replicate 1"]
    result = read_table(join(download_geo, "test_gsm_annotated.tab"))
    gpl = gse.gpls[next(iter(gse.gpls))]
    assert_frame_equal(result,
                       gsm.annotate(gpl, annotation_column="GB_ACC"))
    assert_frame_equal(result,
                       gsm.annotate(gpl.table, annotation_column="GB_ACC"))
    with self.assertRaises(TypeError):
        gsm.annotate("platform", annotation_column="GB_ACC")
    gsm.annotate(gpl.table, annotation_column="GB_ACC", in_place=True)
    assert_frame_equal(result, gsm.table)
def test_merge_and_average(self): gse = GEO.get_GEO(filepath=join(download_geo, "soft_ex_family.txt"), geotype="GSE") result = read_table(join(download_geo, "test_merged_by_id_and_averaged_by_gb_acc.tab"), index_col=0) result = result.ix[sorted(result.index), sorted(result.columns)] # gse.gsms is a dict so the columns might be in different order merged = gse.merge_and_average(gse.gpls[gse.gpls.keys()[0]], "VALUE", "GB_ACC", gpl_on="ID", gsm_on="ID_REF") merged = merged[sorted(merged.columns)] # gse.gsms is a dict so the columns might be in different order assert_frame_equal(merged, result) with self.assertRaises(KeyError): gse.merge_and_average("platform", "VALUE", "GB_ACC", gpl_on="ID", gsm_on="ID_REF") with self.assertRaises(ValueError): gse.merge_and_average(["platform"], "VALUE", "GB_ACC", gpl_on="ID", gsm_on="ID_REF")
def on_click():
    genes = genesEdit.text().split(',')
    gse = GEOparse.get_GEO(geo='GSE' + str(gseEdit.text()), destdir='./')
    for gsm_name, gsm in gse.gsms.items():
        print("Name: ", gsm_name)
        print("Metadata:")
        for key, value in gsm.metadata.items():
            print(" - %s : %s" % (key, ", ".join(value)))
        print("Table data:")
        print(gsm.table.head())
        break
def test_pivot_and_annotate(self):
    gse = GEO.get_GEO(filepath=join(download_geo, "soft_ex_family.txt"),
                      geotype="GSE")
    gpl = gse.gpls[next(iter(gse.gpls))]
    result = read_table(
        join(download_geo,
             "test_sample_pivoted_by_value_and_annotated_by_gbacc.tab"),
        index_col=0)
    result.columns.name = 'name'
    pivoted = gse.pivot_and_annotate(values="VALUE", gpl=gpl,
                                     annotation_column="GB_ACC")
    assert_frame_equal(result, pivoted)
    assert_frame_equal(
        gse.pivot_and_annotate(values="VALUE", gpl=gpl.table,
                               annotation_column="GB_ACC"), result)
    with self.assertRaises(TypeError):
        gse.pivot_and_annotate(values="VALUE", gpl="gpl",
                               annotation_column="GB_ACC")
def main(args):
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    utils.create_dir_if_not_exist(args.out_expr_dir)
    utils.create_dir_if_not_exist(join(args.out_expr_dir, 'raw'))
    utils.create_dir_if_not_exist(join(args.out_expr_dir, 'processed'))

    gse = GEOparse.get_GEO(geo='GSE54837',
                           destdir=join(args.out_expr_dir, 'raw'))
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL570'],
                                       'ENTREZ_GENE_ID')
    annotated2 = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    annotated2 = annotated2.loc[~annotated2.isnull().values.all(axis=1)]
    annotated2['ENTREZ_GENE_ID'] = annotated2.ENTREZ_GENE_ID.str.split(
        '///').str[0].astype(int)
    annotated2 = annotated2.set_index('ENTREZ_GENE_ID')
    annotated2[annotated2 <= 0] = 0.001
    annotated2 = np.log(annotated2)

    disease_cls = ['subject type: COPD Subjects']
    healthy_cls = [
        'subject type: Non-smoker Controls', 'subject type: Smoker Controls'
    ]
    healthy_non_smoker_cls = ['subject type: Non-smoker Controls']
    logging.info(disease_cls)
    logging.info(healthy_cls)
    logging.info(healthy_non_smoker_cls)

    disease_gsm = [
        gsm for gsm in gse.gsms
        if gse.gsms[gsm].metadata['characteristics_ch1'][5] in disease_cls
    ]
    healthy_gsm = [
        gsm for gsm in gse.gsms
        if gse.gsms[gsm].metadata['characteristics_ch1'][5] in healthy_cls
    ]
    healthy_non_smoker_gsm = [
        gsm for gsm in gse.gsms
        if gse.gsms[gsm].metadata['characteristics_ch1'][5] in
        healthy_non_smoker_cls
    ]
    logging.info(
        "Disease GSM: {}, Healthy GSM: {}, Healthy non smoker GSM: {}".format(
            len(disease_gsm), len(healthy_gsm), len(healthy_non_smoker_gsm)))

    utils.create_dir_if_not_exist(args.out_expr_dir)
    utils.write_expr(join(args.out_expr_dir, 'processed', 'expr.tsv'),
                     annotated2)
    utils.write_text(join(args.out_expr_dir, 'processed', 'disease_gsms.txt'),
                     disease_gsm)
    utils.write_text(join(args.out_expr_dir, 'processed', 'healthy_gsms.txt'),
                     healthy_gsm)
def test_soft_format_gse(self):
    print(download_geo)
    gse = GEO.get_GEO(geo="GSE1563", destdir=download_geo)
    self.assertTrue(isinstance(gse, GSE))
    self.assertEqual(gse.get_accession(), "GSE1563")
    self.assertEqual(len(gse.gsms.keys()), 62)
    self.assertEqual(len(gse.gpls.keys()), 1)
    self.assertEqual(len(gse.gpls[next(iter(gse.gpls))].table.index), 12625)
    self.assertEqual(len(gse.gsms[next(iter(gse.gsms))].table.index), 12625)
    for gsm_name, gsm in iteritems(gse.gsms):
        self.assertEqual(len(gsm.table.index), 12625)
        self.assertTrue(isinstance(gsm, GSM))
    for gpl_name, gpl in iteritems(gse.gpls):
        self.assertEqual(len(gpl.table.index), 12625)
        self.assertTrue(isinstance(gpl, GPL))
def test_get_geo_gpl_partially(self):
    partial = ["GSM1662787", "GSM1662789", "GSM1662791", "GSM1859499"]
    gpl = GEO.get_GEO(geo="GPL20082", destdir=download_geo,
                      include_data=True, partial=partial)
    self.assertTrue(isinstance(gpl, GPL))
    self.assertEqual(gpl.get_accession(), "GPL20082")
    for gsm in gpl.gsms:
        self.assertTrue(gsm in partial)
    self.assertEqual(4, len(gpl.gsms))
def test_download_SRA(self):
    gse = GEO.get_GEO(geo="GSE1563", destdir=download_geo)
    self.assertTrue(isinstance(gse, GSE))
    self.assertEqual(gse.get_accession(), "GSE1563")
    self.assertEqual(len(gse.gsms.keys()), 62)
    self.assertEqual(len(gse.gpls.keys()), 1)
    self.assertEqual(len(gse.gpls[next(iter(gse.gpls))].table.index), 12625)
    self.assertEqual(len(gse.gsms[next(iter(gse.gsms))].table.index), 12625)
    for gsm_name, gsm in iteritems(gse.gsms):
        self.assertEqual(len(gsm.table.index), 12625)
        self.assertTrue(isinstance(gsm, GSM))
    for gpl_name, gpl in iteritems(gse.gpls):
        self.assertEqual(len(gpl.table.index), 12625)
        self.assertTrue(isinstance(gpl, GPL))
def test_get_geo_gpl_sequencing(self):
    gpl = GEO.get_GEO(geo="GPL20082", destdir=download_geo,
                      include_data=True)
    self.assertTrue(isinstance(gpl, GPL))
    self.assertEqual(gpl.get_accession(), "GPL20082")
    samples = [
        "GSM1662787", "GSM1662788", "GSM1662789", "GSM1662790",
        "GSM1662791", "GSM1677167", "GSM1859499", "GSM1875285"
    ]
    for sample in samples:
        self.assertTrue(sample in gpl.gsms)
    self.assertEqual(6, len(gpl.gses["GSE68087"].gsms))
    self.assertEqual(2, len(gpl.gses["GSE67974"].gsms))
def test_no_table(self):
    try:
        gsm = GEO.get_GEO(filepath=join(download_geo, 'GSM2795971.txt'),
                          geotype='GSM')
    except Exception:
        self.fail("A file with no data table raised an error.")
def download(geo_accession):
    if not os.path.exists("../../data/geo/"):
        os.makedirs("../../data/geo/")
    gse = GEOparse.get_GEO(geo=geo_accession, destdir="../../data/geo/")
    return gse
def test_empty_line(self):
    try:
        gsm = GEO.get_GEO(filepath=join(download_geo, 'GSM32878.txt'),
                          geotype='GSM')
    except IndexError:
        self.fail("Empty line in the file causes an error.")
def test_name(self):
    gpl = GEO.get_GEO(filepath=join(download_geo, "GPL20814_family.soft"),
                      geotype="GPL")
    self.assertEqual(gpl.name, "GPL20814")
def test_name(self):
    gse = GEO.get_GEO(filepath=join(download_geo, "GSE105845_family.soft"),
                      geotype="GSE")
    self.assertEqual(gse.name, "GSE105845")