def main(args):
    """Fetch GEO series GSE16032 and export its expression matrix and disease samples.

    Creates ``args.out_expr_dir`` with ``raw`` and ``processed`` subdirectories,
    downloads the series into ``raw``, annotates the GPL570 probes with
    ``ENTREZ_GENE_ID`` and passes the natural-log matrix, indexed by the first
    listed Entrez ID of each probe, to ``utils.write_expr`` as
    ``processed/expr.tsv``.  The names of the samples whose second
    ``characteristics_ch1`` entry is one of the disease states below are
    written to ``processed/disease_gsms.txt``.

    Args:
        args: Object exposing ``out_expr_dir``, the output root directory.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    for directory in (args.out_expr_dir, raw_dir, processed_dir):
        utils.create_dir_if_not_exist(directory)

    gse = GEOparse.get_GEO(geo='GSE16032', destdir=raw_dir)
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL570'],
                                       'ENTREZ_GENE_ID')

    annotated2 = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    # The annotation column is non-null at this point, so the "all values
    # missing" test has to look at the sample columns only; including the
    # annotation column would make the filter unable to drop any row.
    sample_values = annotated2.drop('ENTREZ_GENE_ID', axis=1)
    # .copy() gives an independent frame so the column assignment below does
    # not write to a slice of ``annotated``.
    annotated2 = annotated2.loc[
        ~sample_values.isnull().values.all(axis=1)].copy()
    # Probes annotated with several genes ("a /// b") keep the first ID only.
    annotated2['ENTREZ_GENE_ID'] = (
        annotated2.ENTREZ_GENE_ID.str.split('///').str[0].astype(int))
    annotated2 = np.log(annotated2.set_index('ENTREZ_GENE_ID'))

    disease_cls = ['disease state: Acute', 'disease state: Convalescence']
    logging.info(disease_cls)
    disease_gsm = [
        gsm for gsm in gse.gsms
        if gse.gsms[gsm].metadata['characteristics_ch1'][1] in disease_cls
    ]
    logging.info("Disease GSM: {}".format(len(disease_gsm)))

    utils.write_expr(join(processed_dir, 'expr.tsv'), annotated2)
    utils.write_text(join(processed_dir, 'disease_gsms.txt'), disease_gsm)
def main(args):
    """Fetch GEO series GSE13896 and export its expression matrix and sample groups.

    Creates ``args.out_expr_dir`` with ``raw`` and ``processed`` subdirectories,
    downloads the series into ``raw``, annotates the GPL570 probes with
    ``ENTREZ_GENE_ID`` and passes the natural-log matrix, indexed by the first
    listed Entrez ID of each probe, to ``utils.write_expr`` as
    ``processed/expr.tsv``.  Samples are grouped on the fourth
    ``characteristics_ch1`` entry: containing ``COPD`` (disease), not
    containing ``COPD`` (healthy) and containing ``non-smoker``; each group is
    written to its own ``*_gsms.txt`` file under ``processed``.

    Args:
        args: Object exposing ``out_expr_dir``, the output root directory.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    for directory in (args.out_expr_dir, raw_dir, processed_dir):
        utils.create_dir_if_not_exist(directory)

    gse = GEOparse.get_GEO(geo='GSE13896', destdir=raw_dir)
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL570'],
                                       'ENTREZ_GENE_ID')

    # .copy() gives an independent frame so the column assignment below does
    # not write to a slice of ``annotated``.
    annotated2 = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)].copy()
    # Probes annotated with several genes ("a /// b") keep the first ID only.
    annotated2['ENTREZ_GENE_ID'] = (
        annotated2.ENTREZ_GENE_ID.str.split('///').str[0].astype(int))
    annotated2 = np.log(annotated2.set_index('ENTREZ_GENE_ID'))

    # Read each sample's grouping label once instead of once per comparison.
    label_of = {
        gsm: gse.gsms[gsm].metadata['characteristics_ch1'][3]
        for gsm in gse.gsms
    }

    # A sample belongs to a group exactly when its own label passes the
    # substring test, so the sample lists are derived directly from the test
    # rather than by searching the label lists again.
    disease_gsm = [gsm for gsm in gse.gsms if 'COPD' in label_of[gsm]]
    healthy_gsm = [gsm for gsm in gse.gsms if 'COPD' not in label_of[gsm]]
    healthy_non_smoker = [
        gsm for gsm in gse.gsms if 'non-smoker' in label_of[gsm]
    ]

    # The label lists are still logged, one entry per matching sample.
    logging.info([label_of[gsm] for gsm in disease_gsm])
    logging.info([label_of[gsm] for gsm in healthy_gsm])
    logging.info([label_of[gsm] for gsm in healthy_non_smoker])
    logging.info(
        "Disease GSM: {}, Healthy GSM: {}, Healthy non smoker: {}".format(
            len(disease_gsm), len(healthy_gsm), len(healthy_non_smoker)))

    utils.write_expr(join(processed_dir, 'expr.tsv'), annotated2)
    utils.write_text(join(processed_dir, 'disease_gsms.txt'), disease_gsm)
    utils.write_text(join(processed_dir, 'healthy_gsms.txt'), healthy_gsm)
    utils.write_text(join(processed_dir, 'healthy_non_smoker_gsms.txt'),
                     healthy_non_smoker)
def main(args):
    """Fetch GEO series GSE64913 and export its expression matrix and sample groups.

    Creates ``args.out_expr_dir`` with ``raw`` and ``processed`` subdirectories,
    downloads the series into ``raw``, annotates the GPL570 probes with
    ``ENTREZ_GENE_ID`` and passes the matrix, indexed by the first listed
    Entrez ID of each probe, to ``utils.write_expr`` as ``processed/expr.tsv``.
    Samples are grouped by the pair (second, sixth) ``characteristics_ch1``
    entries, and each group is written to ``processed/<group>_gsms.txt``.

    Args:
        args: Object exposing ``out_expr_dir``, the output root directory.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    for directory in (args.out_expr_dir, raw_dir, processed_dir):
        utils.create_dir_if_not_exist(directory)

    gse = GEOparse.get_GEO(geo='GSE64913', destdir=raw_dir)
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL570'],
                                       'ENTREZ_GENE_ID')

    annotated2 = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    # The annotation column is non-null at this point, so the "all values
    # missing" test has to look at the sample columns only; including the
    # annotation column would make the filter unable to drop any row.
    sample_values = annotated2.drop('ENTREZ_GENE_ID', axis=1)
    # .copy() gives an independent frame so the column assignment below does
    # not write to a slice of ``annotated``.
    annotated2 = annotated2.loc[
        ~sample_values.isnull().values.all(axis=1)].copy()
    # Probes annotated with several genes ("a /// b") keep the first ID only.
    annotated2['ENTREZ_GENE_ID'] = (
        annotated2.ENTREZ_GENE_ID.str.split('///').str[0].astype(int))
    annotated2 = annotated2.set_index('ENTREZ_GENE_ID')

    # group name -> [expected characteristics_ch1[1], expected characteristics_ch1[5]]
    classes = {
        'healthy_cae': [
            'diagnosis: Healthy', 'cell type: Central airway epithelium'
        ],
        'healthy_pae': [
            'diagnosis: Healthy', 'cell type: Peripheral airway epithelium'
        ],
        'asthma_cae': [
            'diagnosis: Severe Asthmatic',
            'cell type: Central airway epithelium'
        ],
        'asthma_pae': [
            'diagnosis: Severe Asthmatic',
            'cell type: Peripheral airway epithelium'
        ],
    }
    logging.info(classes)

    gsms = {}
    for cls, (diagnosis, cell_type) in classes.items():
        gsms[cls] = [
            gsm for gsm in gse.gsms
            if gse.gsms[gsm].metadata['characteristics_ch1'][1] == diagnosis
            and gse.gsms[gsm].metadata['characteristics_ch1'][5] == cell_type
        ]
    logging.info(' '.join(
        ['{} GSM:{}'.format(cls, len(gsms[cls])) for cls in classes]))

    utils.write_expr(join(processed_dir, 'expr.tsv'), annotated2)
    for cls in classes:
        utils.write_text(join(processed_dir, '{}_gsms.txt'.format(cls)),
                         gsms[cls])
def main(args):
    """Fetch GEO series GSE37147 and export its expression matrix and sample groups.

    Creates ``args.out_expr_dir`` with ``raw`` and ``processed`` subdirectories,
    downloads the series into ``raw`` and annotates the GPL13243 probes with
    ``SPOT_ID``, which is renamed to ``ENTREZ_GENE_ID`` and used as the index
    of the matrix passed to ``utils.write_expr`` as ``processed/expr.tsv``.
    Samples are split on the fifth ``characteristics_ch1`` entry
    (``copd: yes`` / ``copd: no``); a second pair of lists keeps only samples
    whose tenth entry is ``history of asthma: no``.

    Args:
        args: Object exposing ``out_expr_dir``, the output root directory.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    out_root = args.out_expr_dir
    raw_dir = join(out_root, 'raw')
    processed_dir = join(out_root, 'processed')
    utils.create_dir_if_not_exist(out_root)
    utils.create_dir_if_not_exist(raw_dir)
    utils.create_dir_if_not_exist(processed_dir)

    gse = GEOparse.get_GEO(geo='GSE37147', destdir=raw_dir)
    platform = gse.gpls['GPL13243']
    expr = gse.pivot_and_annotate('VALUE', platform, 'SPOT_ID')
    expr = expr.rename(columns={'SPOT_ID': 'ENTREZ_GENE_ID'})
    expr = expr.set_index('ENTREZ_GENE_ID')

    disease_cls = ['copd: yes']
    healthy_cls = ['copd: no']
    logging.info(disease_cls)
    logging.info(healthy_cls)

    no_asthma = 'history of asthma: no'
    disease_gsm = []
    healthy_gsm = []
    for gsm in gse.gsms:
        copd_status = gse.gsms[gsm].metadata['characteristics_ch1'][4]
        if copd_status in disease_cls:
            disease_gsm.append(gsm)
        if copd_status in healthy_cls:
            healthy_gsm.append(gsm)

    # The asthma-history entry is only read for samples already in a group.
    disease_no_asthma_gsm = []
    for gsm in disease_gsm:
        if gse.gsms[gsm].metadata['characteristics_ch1'][9] == no_asthma:
            disease_no_asthma_gsm.append(gsm)
    healthy_no_asthma_gsm = []
    for gsm in healthy_gsm:
        if gse.gsms[gsm].metadata['characteristics_ch1'][9] == no_asthma:
            healthy_no_asthma_gsm.append(gsm)

    n_disease = len(disease_gsm)
    n_healthy = len(healthy_gsm)
    logging.info("Disease GSM: {}, Healthy GSM: {}".format(
        n_disease, n_healthy))
    logging.info(
        "Disease (no history of asthma) GSM: {} Healthy (no history of asthma) GSM: {}"
        .format(len(disease_no_asthma_gsm), len(healthy_no_asthma_gsm)))

    utils.create_dir_if_not_exist(out_root)
    utils.write_expr(join(processed_dir, 'expr.tsv'), expr)
    outputs = (
        ('disease_gsms.txt', disease_gsm),
        ('healthy_gsms.txt', healthy_gsm),
        ('disease_no_history_asthma_gsms.txt', disease_no_asthma_gsm),
        ('healthy_no_history_asthma_gsms.txt', healthy_no_asthma_gsm),
    )
    for file_name, sample_names in outputs:
        utils.write_text(join(processed_dir, file_name), sample_names)
def main(args):
    """Fetch GEO series GSE473 and export the GPL96 expression matrix and sample groups.

    Creates ``args.out_expr_dir`` with ``raw`` and ``processed`` subdirectories,
    downloads the series into ``raw`` and annotates the GPL96 probes with
    ``ENTREZ_GENE_ID``.  Only samples whose ``platform_id`` is ``GPL96`` are
    kept; rows without an Entrez ID or with any missing value are dropped, and
    the natural-log matrix indexed by the first listed Entrez ID is passed to
    ``utils.write_expr`` as ``processed/expr.tsv``.  Samples are grouped by a
    substring of their title and each group is written to
    ``processed/<group>_gsms.txt``.

    Args:
        args: Object exposing ``out_expr_dir``, the output root directory.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    for directory in (args.out_expr_dir, raw_dir, processed_dir):
        utils.create_dir_if_not_exist(directory)

    gse = GEOparse.get_GEO(geo='GSE473', destdir=raw_dir)
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL96'],
                                       'ENTREZ_GENE_ID')

    # we choose only GPL96 platform
    gsm96 = [
        gsm for gsm in gse.gsms
        if gse.gsms[gsm].metadata['platform_id'][0] == 'GPL96'
    ]
    annotated2 = annotated[gsm96 + ['ENTREZ_GENE_ID']]
    print(annotated2.shape)
    annotated2 = annotated2[~pd.isnull(annotated2.ENTREZ_GENE_ID)]
    print(annotated2.shape)
    # .copy() gives an independent frame so the column assignment below does
    # not write to a slice of ``annotated``.
    annotated2 = annotated2.loc[
        ~annotated2.isnull().values.any(axis=1)].copy()
    print(annotated2.shape)
    # Probes annotated with several genes ("a /// b") keep the first ID only.
    annotated2['ENTREZ_GENE_ID'] = (
        annotated2.ENTREZ_GENE_ID.str.split('///').str[0].astype(int))
    annotated2 = np.log(annotated2.set_index('ENTREZ_GENE_ID'))

    # Title tags seen in this series:
    # {'astM_atop', 'astM_nonatop', 'astS_atop', 'ctr_atop', 'ctr_nonatop'}
    classes = {
        'asthma_med_nonatop': ['astM_nonatop'],
        'control_nonatop': ['ctr_nonatop'],
    }
    logging.info(classes)

    gsms = {
        cls: [
            gsm for gsm in gsm96
            if classes[cls][0] in gse.gsms[gsm].metadata['title'][0]
        ]
        for cls in classes
    }
    logging.info(' '.join(
        ['{} GSM:{}'.format(cls, len(gsms[cls])) for cls in classes]))

    utils.write_expr(join(processed_dir, 'expr.tsv'), annotated2)
    for cls in classes:
        utils.write_text(join(processed_dir, '{}_gsms.txt'.format(cls)),
                         gsms[cls])
def main(args):
    """Fetch GEO series GSE31773 and export its expression matrix and sample groups.

    Creates ``args.out_expr_dir`` with ``raw`` and ``processed`` subdirectories,
    downloads the series into ``raw``, annotates the GPL570 probes with
    ``ENTREZ_GENE_ID`` and passes the matrix, indexed by the first listed
    Entrez ID of each probe, to ``utils.write_expr`` as ``processed/expr.tsv``.
    Samples are grouped by their first ``source_name_ch1`` entry, and each
    group is written to ``processed/<group>_gsms.txt``.

    Args:
        args: Object exposing ``out_expr_dir``, the output root directory.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    for directory in (args.out_expr_dir, raw_dir, processed_dir):
        utils.create_dir_if_not_exist(directory)

    gse = GEOparse.get_GEO(geo='GSE31773', destdir=raw_dir)
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL570'],
                                       'ENTREZ_GENE_ID')

    annotated2 = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    # The annotation column is non-null at this point, so the "all values
    # missing" test has to look at the sample columns only; including the
    # annotation column would make the filter unable to drop any row.
    sample_values = annotated2.drop('ENTREZ_GENE_ID', axis=1)
    # .copy() gives an independent frame so the column assignment below does
    # not write to a slice of ``annotated``.
    annotated2 = annotated2.loc[
        ~sample_values.isnull().values.all(axis=1)].copy()
    # Probes annotated with several genes ("a /// b") keep the first ID only.
    annotated2['ENTREZ_GENE_ID'] = (
        annotated2.ENTREZ_GENE_ID.str.split('///').str[0].astype(int))
    annotated2 = annotated2.set_index('ENTREZ_GENE_ID')

    # group name -> accepted source_name_ch1 values
    classes = {
        'cd4_severe': ['CD4_Tcells_severe_asthma'],
        'cd8_severe': ['CD8_Tcells_severe_asthma'],
        'cd4_healthy': ['CD4_Tcells_healthy_donor'],
        'cd8_healthy': ['CD8_Tcells_healthy_donor'],
        'cd8_non_severe': ['CD8_Tcells_non_severe_asthma'],
        'cd4_non_severe': ['CD4_Tcells_non_severe_asthma'],
        'asthma_severe': [
            'CD4_Tcells_severe_asthma', 'CD8_Tcells_severe_asthma'
        ],
        'healthy': ['CD4_Tcells_healthy_donor', 'CD8_Tcells_healthy_donor'],
    }
    logging.info(classes)

    gsms = {
        cls: [
            gsm for gsm in gse.gsms
            if gse.gsms[gsm].metadata['source_name_ch1'][0] in accepted
        ]
        for cls, accepted in classes.items()
    }

    utils.write_expr(join(processed_dir, 'expr.tsv'), annotated2)
    for cls in classes:
        logging.info("{} GSM: {}".format(cls, len(gsms[cls])))
        utils.write_text(join(processed_dir, '{}_gsms.txt'.format(cls)),
                         gsms[cls])
def main(args):
    """Fetch GEO series GSE16972 and export its expression matrix and sample groups.

    Creates ``args.out_expr_dir`` with ``raw`` and ``processed`` subdirectories,
    downloads the series into ``raw`` and annotates the GPL96 probes with
    ``ENTREZ_GENE_ID``.  Rows without an Entrez ID or with any missing value
    are dropped, and the natural-log matrix indexed by the first listed Entrez
    ID is passed to ``utils.write_expr`` as ``processed/expr.tsv``.  Only
    samples whose first ``characteristics_ch1`` entry is
    ``cell type: alveolar macrophage`` are grouped, by their second entry; each
    group is written to ``processed/<group>_gsms.txt``.

    Args:
        args: Object exposing ``out_expr_dir``, the output root directory.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    for directory in (args.out_expr_dir, raw_dir, processed_dir):
        utils.create_dir_if_not_exist(directory)

    gse = GEOparse.get_GEO(geo='GSE16972', destdir=raw_dir)
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL96'],
                                       'ENTREZ_GENE_ID')
    print(annotated.shape)
    annotated2 = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    print(annotated2.shape)
    # .copy() gives an independent frame so the column assignment below does
    # not write to a slice of ``annotated``.
    annotated2 = annotated2.loc[
        ~annotated2.isnull().values.any(axis=1)].copy()
    print(annotated2.shape)
    # Probes annotated with several genes ("a /// b") keep the first ID only.
    annotated2['ENTREZ_GENE_ID'] = (
        annotated2.ENTREZ_GENE_ID.str.split('///').str[0].astype(int))
    annotated2 = np.log(annotated2.set_index('ENTREZ_GENE_ID'))

    classes = {
        'copd': ['disease status: COPD patient'],
        'control': ['disease status: control patient'],
    }
    logging.info(classes)

    cell_type = 'cell type: alveolar macrophage'
    gsms = {
        cls: [
            gsm for gsm in gse.gsms
            if gse.gsms[gsm].metadata['characteristics_ch1'][0] == cell_type
            and gse.gsms[gsm].metadata['characteristics_ch1'][1]
            == classes[cls][0]
        ]
        for cls in classes
    }
    logging.info(' '.join(
        ['{} GSM:{}'.format(cls, len(gsms[cls])) for cls in classes]))

    utils.write_expr(join(processed_dir, 'expr.tsv'), annotated2)
    for cls in classes:
        utils.write_text(join(processed_dir, '{}_gsms.txt'.format(cls)),
                         gsms[cls])
def main(args):
    """Fetch GEO series GSE18965 and export its expression matrix and sample groups.

    Creates ``args.out_expr_dir`` with ``raw`` and ``processed`` subdirectories,
    downloads the series into ``raw``, annotates the GPL96 probes with
    ``ENTREZ_GENE_ID`` and passes the matrix, indexed by the first listed
    Entrez ID of each probe, to ``utils.write_expr`` as ``processed/expr.tsv``.
    Samples whose title contains ``AA`` are written to
    ``processed/disease_gsms.txt`` and those whose title contains ``HN`` to
    ``processed/healthy_gsms.txt``.

    Args:
        args: Object exposing ``out_expr_dir``, the output root directory.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    for directory in (args.out_expr_dir, raw_dir, processed_dir):
        utils.create_dir_if_not_exist(directory)

    gse = GEOparse.get_GEO(geo='GSE18965', destdir=raw_dir)
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL96'],
                                       'ENTREZ_GENE_ID')

    annotated2 = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    # The annotation column is non-null at this point, so the "all values
    # missing" test has to look at the sample columns only; including the
    # annotation column would make the filter unable to drop any row.
    sample_values = annotated2.drop('ENTREZ_GENE_ID', axis=1)
    # .copy() gives an independent frame so the column assignment below does
    # not write to a slice of ``annotated``.
    annotated2 = annotated2.loc[
        ~sample_values.isnull().values.all(axis=1)].copy()
    # Probes annotated with several genes ("a /// b") keep the first ID only.
    annotated2['ENTREZ_GENE_ID'] = (
        annotated2.ENTREZ_GENE_ID.str.split('///').str[0].astype(int))
    annotated2 = annotated2.set_index('ENTREZ_GENE_ID')

    # Read each sample's title once instead of once per comparison.
    title_of = {gsm: gse.gsms[gsm].metadata['title'][0] for gsm in gse.gsms}

    # A sample belongs to a group exactly when its own title passes the
    # substring test, so the sample lists are derived directly from the test
    # rather than by searching the title lists again.
    disease_gsm = [gsm for gsm in gse.gsms if 'AA' in title_of[gsm]]
    healthy_gsm = [gsm for gsm in gse.gsms if 'HN' in title_of[gsm]]

    # The title lists are still logged, one entry per matching sample.
    logging.info([title_of[gsm] for gsm in disease_gsm])
    logging.info([title_of[gsm] for gsm in healthy_gsm])
    logging.info("Disease GSM: {}, Healthy GSM: {}".format(
        len(disease_gsm), len(healthy_gsm)))

    utils.write_expr(join(processed_dir, 'expr.tsv'), annotated2)
    utils.write_text(join(processed_dir, 'disease_gsms.txt'), disease_gsm)
    utils.write_text(join(processed_dir, 'healthy_gsms.txt'), healthy_gsm)
def main(args):
    """Fetch GEO series GSE4302 and export its expression matrix and sample groups.

    Creates ``args.out_expr_dir`` with ``raw`` and ``processed`` subdirectories,
    downloads the series into ``raw``, annotates the GPL570 probes with
    ``ENTREZ_GENE_ID`` and passes the matrix, indexed by the first listed
    Entrez ID of each probe, to ``utils.write_expr`` as ``processed/expr.tsv``.
    Samples are split on their first ``characteristics_ch1`` entry into the
    disease and healthy label sets below and written to
    ``processed/disease_gsms.txt`` and ``processed/healthy_gsms.txt``.

    Args:
        args: Object exposing ``out_expr_dir``, the output root directory.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    for directory in (args.out_expr_dir, raw_dir, processed_dir):
        utils.create_dir_if_not_exist(directory)

    gse = GEOparse.get_GEO(geo='GSE4302', destdir=raw_dir)
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL570'],
                                       'ENTREZ_GENE_ID')

    annotated2 = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    # The annotation column is non-null at this point, so the "all values
    # missing" test has to look at the sample columns only; including the
    # annotation column would make the filter unable to drop any row.
    sample_values = annotated2.drop('ENTREZ_GENE_ID', axis=1)
    # .copy() gives an independent frame so the column assignment below does
    # not write to a slice of ``annotated``.
    annotated2 = annotated2.loc[
        ~sample_values.isnull().values.all(axis=1)].copy()
    # Probes annotated with several genes ("a /// b") keep the first ID only.
    annotated2['ENTREZ_GENE_ID'] = (
        annotated2.ENTREZ_GENE_ID.str.split('///').str[0].astype(int))

    disease_cls = [
        'sample type: Asthmatic at baseline',
        'sample type: Asthmatic after Flovent',
        'sample type: Asthmatic after Placebo'
    ]
    healthy_cls = ['sample type: Healthy control', 'sample type: Smoker']
    logging.info(disease_cls)
    logging.info(healthy_cls)

    disease_gsm = [
        gsm for gsm in gse.gsms
        if gse.gsms[gsm].metadata['characteristics_ch1'][0] in disease_cls
    ]
    healthy_gsm = [
        gsm for gsm in gse.gsms
        if gse.gsms[gsm].metadata['characteristics_ch1'][0] in healthy_cls
    ]
    logging.info("Disease GSM: {}, Healthy GSM: {}".format(
        len(disease_gsm), len(healthy_gsm)))

    utils.write_expr(join(processed_dir, 'expr.tsv'),
                     annotated2.set_index('ENTREZ_GENE_ID'))
    utils.write_text(join(processed_dir, 'disease_gsms.txt'), disease_gsm)
    utils.write_text(join(processed_dir, 'healthy_gsms.txt'), healthy_gsm)
def main(args):
    """Fetch GEO series GSE89809 and export its expression matrix and sample groups.

    Creates ``args.out_expr_dir`` with ``raw`` and ``processed`` subdirectories,
    downloads the series into ``raw``, annotates the GPL13158 probes with
    ``ENTREZ_GENE_ID`` and passes the matrix, indexed by the first listed
    Entrez ID of each probe, to ``utils.write_expr`` as ``processed/expr.tsv``.
    Group names are the third and fourth ``_``-separated fields of each sample
    title joined by ``_``; a sample belongs to every group whose name occurs in
    its title, and each group is written to ``processed/<group>_gsms.txt``.

    Args:
        args: Object exposing ``out_expr_dir``, the output root directory.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    for directory in (args.out_expr_dir, raw_dir, processed_dir):
        utils.create_dir_if_not_exist(directory)

    gse = GEOparse.get_GEO(geo='GSE89809', destdir=raw_dir)
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL13158'],
                                       'ENTREZ_GENE_ID')

    annotated2 = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    # The annotation column is non-null at this point, so the "all values
    # missing" test has to look at the sample columns only; including the
    # annotation column would make the filter unable to drop any row.
    sample_values = annotated2.drop('ENTREZ_GENE_ID', axis=1)
    # .copy() gives an independent frame so the column assignment below does
    # not write to a slice of ``annotated``.
    annotated2 = annotated2.loc[
        ~sample_values.isnull().values.all(axis=1)].copy()
    # Probes annotated with several genes ("a /// b") keep the first ID only.
    annotated2['ENTREZ_GENE_ID'] = (
        annotated2.ENTREZ_GENE_ID.str.split('///').str[0].astype(int))
    annotated2 = annotated2.set_index('ENTREZ_GENE_ID')

    # Group names noted by the original author for this series:
    # ['Severe_Spm', 'Healthy_Spm', 'Healthy_Epithelial', 'Mild_Epithelial',
    #  'Severe_Epithelial', 'Mild_Spm', 'Healthy_BAL', 'Moderate_Epithelial',
    #  'Severe_BAL', 'Mild_BAL', 'Moderate_Spm', 'Moderate_BAL']
    classes = {
        '_'.join(gse.gsms[gsm].metadata['title'][0].split('_')[2:4])
        for gsm in gse.gsms
    }
    logging.info(classes)

    gsms = {
        cls: [
            gsm for gsm in gse.gsms
            if cls in gse.gsms[gsm].metadata['title'][0]
        ]
        for cls in classes
    }
    logging.info(' '.join(
        ['{} GSM:{}'.format(cls, len(gsms[cls])) for cls in classes]))

    utils.write_expr(join(processed_dir, 'expr.tsv'), annotated2)
    for cls in classes:
        utils.write_text(join(processed_dir, '{}_gsms.txt'.format(cls)),
                         gsms[cls])
def main(args):
    """Fetch GEO series GSE57148 and export its FPKM matrix and sample groups.

    Creates ``args.out_expr_dir`` with ``raw`` and ``processed`` subdirectories,
    downloads the series metadata and the supplementary normalized FPKM table
    into ``raw``, renames the table's sample columns to GSM names (matched on
    the last space-separated word of each sample title), and maps ``GeneName``
    to Entrez IDs through the ``biomart`` table in ``utils.gm``.  Genes without
    an Entrez ID are dropped and the matrix indexed by ``ENTREZ_GENE_ID`` is
    passed to ``utils.write_expr`` as ``processed/expr.tsv``.  Samples are
    split on their first ``characteristics_ch1`` entry into
    ``processed/disease_gsms.txt`` and ``processed/healthy_gsms.txt``.

    Args:
        args: Object exposing ``out_expr_dir``, the output root directory.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    for directory in (args.out_expr_dir, raw_dir, processed_dir):
        utils.create_dir_if_not_exist(directory)

    gse = GEOparse.get_GEO(geo='GSE57148', destdir=raw_dir)

    counts_path = join(raw_dir, 'GSE57148_COPD_FPKM_Normalized.txt.gz')
    with closing(request.urlopen(
            'ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE57nnn/GSE57148/suppl/GSE57148_COPD_FPKM_Normalized.txt.gz'
    )) as remote, open(counts_path, 'wb') as local:
        shutil.copyfileobj(remote, local)
    counts = pd.read_csv(counts_path, sep='\t')

    annotated2 = counts.rename(columns={
        gse.gsms[gsm].metadata['title'][0].split(' ')[-1]: gsm
        for gsm in gse.gsms
    })
    print(annotated2.isnull().values.any(axis=1).sum())
    print(annotated2.isnull().values.all(axis=1).sum())

    convmatr = utils.gm._sources['biomart'].set_index('symbol')
    # ``.loc`` with a list raises KeyError on current pandas as soon as one
    # symbol is missing from the table; ``reindex`` yields NaN for those
    # symbols, which is what the null filter below relies on.  ``reindex``
    # needs unique labels, so the first entry of a repeated symbol is kept.
    entrez_by_symbol = convmatr['entrez']
    entrez_by_symbol = entrez_by_symbol[
        ~entrez_by_symbol.index.duplicated(keep='first')]
    annotated2['ENTREZ_GENE_ID'] = entrez_by_symbol.reindex(
        annotated2['GeneName'].tolist()).tolist()
    print(annotated2.shape)
    # .copy() gives an independent frame so the assignments below do not
    # write to a slice of the unfiltered frame.
    annotated2 = annotated2[~annotated2.ENTREZ_GENE_ID.isnull()].copy()
    print(annotated2.shape)
    annotated2['ENTREZ_GENE_ID'] = annotated2['ENTREZ_GENE_ID'].astype(int)
    del annotated2['GeneName']
    annotated2 = annotated2.set_index('ENTREZ_GENE_ID')

    disease_cls = ['disease state: COPD']
    healthy_cls = ['disease state: Normal']
    logging.info(disease_cls)
    logging.info(healthy_cls)

    disease_gsm = [
        gsm for gsm in gse.gsms
        if gse.gsms[gsm].metadata['characteristics_ch1'][0] in disease_cls
    ]
    healthy_gsm = [
        gsm for gsm in gse.gsms
        if gse.gsms[gsm].metadata['characteristics_ch1'][0] in healthy_cls
    ]
    logging.info("Disease GSM: {}, Healthy GSM: {}".format(
        len(disease_gsm), len(healthy_gsm)))

    utils.write_expr(join(processed_dir, 'expr.tsv'), annotated2)
    utils.write_text(join(processed_dir, 'disease_gsms.txt'), disease_gsm)
    utils.write_text(join(processed_dir, 'healthy_gsms.txt'), healthy_gsm)
def main(args):
    """Fetch GEO series GSE104468 and export its expression matrix and sample groups.

    Creates ``args.out_expr_dir`` with ``raw`` and ``processed`` subdirectories,
    downloads the series into ``raw``, annotates the GPL21185 probes with
    ``GENE_SYMBOL`` and maps the symbols to Entrez IDs through the ``biomart``
    table in ``utils.gm``.  Probes without a symbol or without an Entrez ID are
    dropped and the matrix indexed by ``ENTREZ_GENE_ID`` is passed to
    ``utils.write_expr`` as ``processed/expr.tsv``.  Samples are grouped by the
    pair (third, second) ``characteristics_ch1`` entries, and each group is
    written to ``processed/<group>_gsms.txt``.

    Args:
        args: Object exposing ``out_expr_dir``, the output root directory.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    for directory in (args.out_expr_dir, raw_dir, processed_dir):
        utils.create_dir_if_not_exist(directory)

    gse = GEOparse.get_GEO(geo='GSE104468', destdir=raw_dir)
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL21185'],
                                       'GENE_SYMBOL')
    convmatr = utils.gm._sources['biomart'].set_index('symbol')

    annotated2 = annotated[~pd.isnull(annotated.GENE_SYMBOL)]
    # The symbol column is non-null at this point, so the "all values
    # missing" test has to look at the sample columns only; including the
    # symbol column would make the filter unable to drop any row.
    sample_values = annotated2.drop('GENE_SYMBOL', axis=1)
    # .copy() gives an independent frame so the column assignment below does
    # not write to a slice of ``annotated``.
    annotated2 = annotated2.loc[
        ~sample_values.isnull().values.all(axis=1)].copy()

    # ``.loc`` with a list raises KeyError on current pandas as soon as one
    # symbol is missing from the table; ``reindex`` yields NaN for those
    # symbols, which is what the null filter below relies on.  ``reindex``
    # needs unique labels, so the first entry of a repeated symbol is kept.
    entrez_by_symbol = convmatr['entrez']
    entrez_by_symbol = entrez_by_symbol[
        ~entrez_by_symbol.index.duplicated(keep='first')]
    annotated2['ENTREZ_GENE_ID'] = entrez_by_symbol.reindex(
        annotated2['GENE_SYMBOL'].tolist()).tolist()
    annotated2 = annotated2[~annotated2.ENTREZ_GENE_ID.isnull()].copy()
    annotated2['ENTREZ_GENE_ID'] = annotated2['ENTREZ_GENE_ID'].astype(int)
    del annotated2['GENE_SYMBOL']
    annotated2 = annotated2.set_index('ENTREZ_GENE_ID')

    # group name -> [expected characteristics_ch1[2], expected characteristics_ch1[1]]
    classes = {
        'asthma_pbmc': ['disease state: Asthma', 'cell type: PBMC'],
        'asthma_bronch': [
            'disease state: Asthma', 'cell type: bronchial epithelia'
        ],
        'asthma_nasal': [
            'disease state: Asthma', 'cell type: nasal epithelia'
        ],
        'normal_pbmc': ['disease state: Normal', 'cell type: PBMC'],
        'normal_bronch': [
            'disease state: Normal', 'cell type: bronchial epithelia'
        ],
        'normal_nasal': [
            'disease state: Normal', 'cell type: nasal epithelia'
        ],
    }
    logging.info(classes)

    gsms = {}
    for cls, (disease_state, cell_type) in classes.items():
        gsms[cls] = [
            gsm for gsm in gse.gsms
            if gse.gsms[gsm].metadata['characteristics_ch1'][2]
            == disease_state
            and gse.gsms[gsm].metadata['characteristics_ch1'][1] == cell_type
        ]
    logging.info(' '.join(
        ['{} GSM:{}'.format(cls, len(gsms[cls])) for cls in classes]))

    utils.write_expr(join(processed_dir, 'expr.tsv'), annotated2)
    for cls in classes:
        utils.write_text(join(processed_dir, '{}_gsms.txt'.format(cls)),
                         gsms[cls])