def main(args):
    """Download GSE16032 and export its expression matrix and disease sample list.

    The raw GEO series is stored under ``<out_expr_dir>/raw``. The natural-log
    expression table indexed by ``ENTREZ_GENE_ID`` is written to
    ``<out_expr_dir>/processed/expr.tsv`` and the selected sample names to
    ``<out_expr_dir>/processed/disease_gsms.txt``.

    Args:
        args: parsed command-line namespace; only ``out_expr_dir`` is read.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    for path in (args.out_expr_dir, raw_dir, processed_dir):
        utils.create_dir_if_not_exist(path)

    gse = GEOparse.get_GEO(geo='GSE16032', destdir=raw_dir)

    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL570'],
                                       'ENTREZ_GENE_ID')
    # Drop probes without an ENTREZ_GENE_ID annotation.
    expr = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    # NOTE(review): ENTREZ_GENE_ID is non-null in every remaining row, so this
    # "all values are NaN" filter cannot remove anything; kept as-is so the
    # written table does not change.
    expr = expr.loc[~expr.isnull().values.all(axis=1)]
    # Keep only the text before the first '///' and cast it to int. assign()
    # builds a new frame instead of writing into the filtered slice, which
    # avoids pandas' SettingWithCopyWarning.
    expr = expr.assign(
        ENTREZ_GENE_ID=expr.ENTREZ_GENE_ID.str.split('///').str[0].astype(int))
    expr = np.log(expr.set_index('ENTREZ_GENE_ID'))

    disease_cls = ['disease state: Acute', 'disease state: Convalescence']
    logging.info(disease_cls)
    # The second characteristics_ch1 entry of each sample is matched
    # against the labels above.
    disease_gsm = [
        gsm for gsm, sample in gse.gsms.items()
        if sample.metadata['characteristics_ch1'][1] in disease_cls
    ]
    logging.info("Disease GSM: {}".format(len(disease_gsm)))

    utils.write_expr(join(processed_dir, 'expr.tsv'), expr)
    utils.write_text(join(processed_dir, 'disease_gsms.txt'), disease_gsm)
def main(args):
    """Download GSE13896 and export its expression matrix and sample groups.

    Samples are grouped by their fourth ``characteristics_ch1`` entry:
    entries containing ``'COPD'`` are "disease", all others are "healthy",
    and entries containing ``'non-smoker'`` form the healthy non-smoker group.
    The natural-log expression table indexed by ``ENTREZ_GENE_ID`` and one
    text file per group are written under ``<out_expr_dir>/processed``.

    Args:
        args: parsed command-line namespace; only ``out_expr_dir`` is read.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    for path in (args.out_expr_dir, raw_dir, processed_dir):
        utils.create_dir_if_not_exist(path)

    gse = GEOparse.get_GEO(geo='GSE13896', destdir=raw_dir)
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL570'],
                                       'ENTREZ_GENE_ID')
    # Drop probes without an ENTREZ_GENE_ID annotation.
    expr = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    # Keep only the text before the first '///' and cast it to int. assign()
    # builds a new frame instead of writing into the filtered slice, which
    # avoids pandas' SettingWithCopyWarning.
    expr = expr.assign(
        ENTREZ_GENE_ID=expr.ENTREZ_GENE_ID.str.split('///').str[0].astype(int))
    expr = np.log(expr.set_index('ENTREZ_GENE_ID'))

    # Read every sample's label once instead of re-reading the metadata for
    # each group and testing membership against lists of labels.
    labels = {
        gsm: sample.metadata['characteristics_ch1'][3]
        for gsm, sample in gse.gsms.items()
    }
    disease_gsm = [gsm for gsm, label in labels.items() if 'COPD' in label]
    healthy_gsm = [gsm for gsm, label in labels.items() if 'COPD' not in label]
    healthy_non_smoker = [
        gsm for gsm, label in labels.items() if 'non-smoker' in label
    ]

    # Log the matching labels (one per selected sample), as before.
    logging.info([labels[gsm] for gsm in disease_gsm])
    logging.info([labels[gsm] for gsm in healthy_gsm])
    logging.info([labels[gsm] for gsm in healthy_non_smoker])
    logging.info(
        "Disease GSM: {}, Healthy GSM: {}, Healthy non smoker: {}".format(
            len(disease_gsm), len(healthy_gsm), len(healthy_non_smoker)))

    utils.write_expr(join(processed_dir, 'expr.tsv'), expr)
    utils.write_text(join(processed_dir, 'disease_gsms.txt'), disease_gsm)
    utils.write_text(join(processed_dir, 'healthy_gsms.txt'), healthy_gsm)
    utils.write_text(join(processed_dir, 'healthy_non_smoker_gsms.txt'),
                     healthy_non_smoker)
def main(args):
    """Download GSE64913 and export its expression matrix and sample groups.

    Each group is defined by a (diagnosis, cell type) pair that must equal
    the sample's ``characteristics_ch1`` entries at positions 1 and 5.
    The expression table indexed by ``ENTREZ_GENE_ID`` and one
    ``<group>_gsms.txt`` file per group are written under
    ``<out_expr_dir>/processed``.

    Args:
        args: parsed command-line namespace; only ``out_expr_dir`` is read.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    for path in (args.out_expr_dir, raw_dir, processed_dir):
        utils.create_dir_if_not_exist(path)

    gse = GEOparse.get_GEO(geo='GSE64913', destdir=raw_dir)
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL570'],
                                       'ENTREZ_GENE_ID')
    # Drop probes without an ENTREZ_GENE_ID annotation.
    expr = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    # NOTE(review): ENTREZ_GENE_ID is non-null in every remaining row, so this
    # "all values are NaN" filter cannot remove anything; kept as-is so the
    # written table does not change.
    expr = expr.loc[~expr.isnull().values.all(axis=1)]
    # Keep only the text before the first '///' and cast it to int. assign()
    # builds a new frame instead of writing into the filtered slice, which
    # avoids pandas' SettingWithCopyWarning.
    expr = expr.assign(
        ENTREZ_GENE_ID=expr.ENTREZ_GENE_ID.str.split('///').str[0].astype(int))
    expr = expr.set_index('ENTREZ_GENE_ID')

    # group name -> [expected entry at index 1, expected entry at index 5]
    classes = {
        'healthy_cae': [
            'diagnosis: Healthy', 'cell type: Central airway epithelium'
        ],
        'healthy_pae': [
            'diagnosis: Healthy', 'cell type: Peripheral airway epithelium'
        ],
        'asthma_cae': [
            'diagnosis: Severe Asthmatic',
            'cell type: Central airway epithelium'
        ],
        'asthma_pae': [
            'diagnosis: Severe Asthmatic',
            'cell type: Peripheral airway epithelium'
        ],
    }
    logging.info(classes)

    gsms = {}
    for cls, (diagnosis, cell_type) in classes.items():
        gsms[cls] = [
            gsm for gsm, sample in gse.gsms.items()
            if sample.metadata['characteristics_ch1'][1] == diagnosis
            and sample.metadata['characteristics_ch1'][5] == cell_type
        ]

    logging.info(' '.join(
        ['{} GSM:{}'.format(cls, len(gsms[cls])) for cls in classes]))

    utils.write_expr(join(processed_dir, 'expr.tsv'), expr)
    for cls in classes:
        utils.write_text(join(processed_dir, '{}_gsms.txt'.format(cls)),
                         gsms[cls])
def main(args):
    """Download GSE37147 and export its expression matrix and sample groups.

    The ``SPOT_ID`` annotation column is renamed to ``ENTREZ_GENE_ID`` and
    used as the index of the exported table. Samples are split on their
    ``characteristics_ch1`` entry at position 4 (``copd: yes`` / ``copd: no``),
    and additionally on the entry at position 9 being
    ``history of asthma: no``.

    Args:
        args: parsed command-line namespace; only ``out_expr_dir`` is read.
    """
    log_handler = logging.FileHandler("../logs/report.log")
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[log_handler])
    logging.info(args)

    out_dir = args.out_expr_dir
    utils.create_dir_if_not_exist(out_dir)
    utils.create_dir_if_not_exist(join(out_dir, 'raw'))
    utils.create_dir_if_not_exist(join(out_dir, 'processed'))

    gse = GEOparse.get_GEO(geo='GSE37147', destdir=join(out_dir, 'raw'))
    platform = gse.gpls['GPL13243']
    table = gse.pivot_and_annotate('VALUE', platform, 'SPOT_ID')
    expr = table.rename(columns={'SPOT_ID': 'ENTREZ_GENE_ID'})
    expr = expr.set_index('ENTREZ_GENE_ID')

    disease_cls = ['copd: yes']
    healthy_cls = ['copd: no']
    logging.info(disease_cls)
    logging.info(healthy_cls)

    no_asthma = 'history of asthma: no'
    disease_gsm = []
    healthy_gsm = []
    disease_no_asthma_gsm = []
    healthy_no_asthma_gsm = []
    # One pass over the samples; the asthma entry is only looked at for
    # samples that already matched a COPD label.
    for gsm in gse.gsms:
        traits = gse.gsms[gsm].metadata['characteristics_ch1']
        copd_label = traits[4]
        if copd_label in disease_cls:
            disease_gsm.append(gsm)
            if traits[9] == no_asthma:
                disease_no_asthma_gsm.append(gsm)
        if copd_label in healthy_cls:
            healthy_gsm.append(gsm)
            if traits[9] == no_asthma:
                healthy_no_asthma_gsm.append(gsm)

    logging.info("Disease GSM: {}, Healthy GSM: {}".format(
        len(disease_gsm), len(healthy_gsm)))
    logging.info(
        "Disease (no history of asthma) GSM: {} Healthy (no history of asthma) GSM: {}"
        .format(len(disease_no_asthma_gsm), len(healthy_no_asthma_gsm)))

    utils.create_dir_if_not_exist(out_dir)
    utils.write_expr(join(out_dir, 'processed', 'expr.tsv'), expr)
    outputs = (
        ('disease_gsms.txt', disease_gsm),
        ('healthy_gsms.txt', healthy_gsm),
        ('disease_no_history_asthma_gsms.txt', disease_no_asthma_gsm),
        ('healthy_no_history_asthma_gsms.txt', healthy_no_asthma_gsm),
    )
    for filename, names in outputs:
        utils.write_text(join(out_dir, 'processed', filename), names)
def main(args):
    """Download GSE473 and export GPL96 expression data and sample groups.

    Only samples whose ``platform_id`` is ``GPL96`` are kept. Rows with a
    missing ``ENTREZ_GENE_ID`` or any missing value are dropped, and the
    remaining values are natural-log transformed. Groups are selected by a
    marker substring in the sample title.

    Args:
        args: parsed command-line namespace; only ``out_expr_dir`` is read.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    for path in (args.out_expr_dir, raw_dir, processed_dir):
        utils.create_dir_if_not_exist(path)

    gse = GEOparse.get_GEO(geo='GSE473', destdir=raw_dir)
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL96'],
                                       'ENTREZ_GENE_ID')
    # We choose only samples measured on the GPL96 platform.
    gsm96 = [
        gsm for gsm, sample in gse.gsms.items()
        if sample.metadata['platform_id'][0] == 'GPL96'
    ]
    expr = annotated[gsm96 + ['ENTREZ_GENE_ID']]
    print(expr.shape)
    # Drop probes without an ENTREZ_GENE_ID annotation.
    expr = expr[~pd.isnull(expr.ENTREZ_GENE_ID)]
    print(expr.shape)
    # Drop rows that have a missing value in any column.
    expr = expr.loc[~expr.isnull().values.any(axis=1)]
    print(expr.shape)
    # Keep only the text before the first '///' and cast it to int. assign()
    # builds a new frame instead of writing into the filtered slice, which
    # avoids pandas' SettingWithCopyWarning.
    expr = expr.assign(
        ENTREZ_GENE_ID=expr.ENTREZ_GENE_ID.str.split('///').str[0].astype(int))
    expr = np.log(expr.set_index('ENTREZ_GENE_ID'))

    # Title markers listed by the original author:
    # {'astM_atop', 'astM_nonatop', 'astS_atop', 'ctr_atop', 'ctr_nonatop'}
    classes = {
        'asthma_med_nonatop': ['astM_nonatop'],
        'control_nonatop': ['ctr_nonatop'],
    }
    logging.info(classes)

    gsms = {
        cls: [
            gsm for gsm in gsm96
            if markers[0] in gse.gsms[gsm].metadata['title'][0]
        ]
        for cls, markers in classes.items()
    }

    logging.info(' '.join(
        ['{} GSM:{}'.format(cls, len(gsms[cls])) for cls in classes]))

    utils.write_expr(join(processed_dir, 'expr.tsv'), expr)
    for cls in classes:
        utils.write_text(join(processed_dir, '{}_gsms.txt'.format(cls)),
                         gsms[cls])
# Example #6
# 0
def main(args):
    """Download GSE31773 and export its expression matrix and sample groups.

    A sample belongs to a group when its first ``source_name_ch1`` entry is
    one of the names listed for that group. The expression table indexed by
    ``ENTREZ_GENE_ID`` and one ``<group>_gsms.txt`` file per group are written
    under ``<out_expr_dir>/processed``.

    Args:
        args: parsed command-line namespace; only ``out_expr_dir`` is read.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    for path in (args.out_expr_dir, raw_dir, processed_dir):
        utils.create_dir_if_not_exist(path)

    gse = GEOparse.get_GEO(geo='GSE31773', destdir=raw_dir)

    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL570'],
                                       'ENTREZ_GENE_ID')
    # Drop probes without an ENTREZ_GENE_ID annotation.
    expr = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    # NOTE(review): ENTREZ_GENE_ID is non-null in every remaining row, so this
    # "all values are NaN" filter cannot remove anything; kept as-is so the
    # written table does not change.
    expr = expr.loc[~expr.isnull().values.all(axis=1)]
    # Keep only the text before the first '///' and cast it to int. assign()
    # builds a new frame instead of writing into the filtered slice, which
    # avoids pandas' SettingWithCopyWarning.
    expr = expr.assign(
        ENTREZ_GENE_ID=expr.ENTREZ_GENE_ID.str.split('///').str[0].astype(int))
    expr = expr.set_index('ENTREZ_GENE_ID')

    # group name -> accepted source_name_ch1 values
    classes = {
        'cd4_severe': ['CD4_Tcells_severe_asthma'],
        'cd8_severe': ['CD8_Tcells_severe_asthma'],
        'cd4_healthy': ['CD4_Tcells_healthy_donor'],
        'cd8_healthy': ['CD8_Tcells_healthy_donor'],
        'cd8_non_severe': ['CD8_Tcells_non_severe_asthma'],
        'cd4_non_severe': ['CD4_Tcells_non_severe_asthma'],
        'asthma_severe': [
            'CD4_Tcells_severe_asthma', 'CD8_Tcells_severe_asthma'
        ],
        'healthy': ['CD4_Tcells_healthy_donor', 'CD8_Tcells_healthy_donor'],
    }
    logging.info(classes)

    gsms = {
        cls: [
            gsm for gsm, sample in gse.gsms.items()
            if sample.metadata['source_name_ch1'][0] in sources
        ]
        for cls, sources in classes.items()
    }

    utils.write_expr(join(processed_dir, 'expr.tsv'), expr)
    for cls, names in gsms.items():
        logging.info("{} GSM: {}".format(cls, len(names)))
        utils.write_text(join(processed_dir, '{}_gsms.txt'.format(cls)),
                         names)
# Example #7
# 0
def main(args):
    """Download GSE16972 and export its expression matrix and sample groups.

    Rows with a missing ``ENTREZ_GENE_ID`` or any missing value are dropped
    and the remaining values are natural-log transformed. Only samples whose
    first ``characteristics_ch1`` entry is ``cell type: alveolar macrophage``
    are grouped, by their second entry (COPD patient vs. control patient).

    Args:
        args: parsed command-line namespace; only ``out_expr_dir`` is read.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    for path in (args.out_expr_dir, raw_dir, processed_dir):
        utils.create_dir_if_not_exist(path)

    gse = GEOparse.get_GEO(geo='GSE16972', destdir=raw_dir)
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL96'],
                                       'ENTREZ_GENE_ID')
    print(annotated.shape)
    # Drop probes without an ENTREZ_GENE_ID annotation.
    expr = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    print(expr.shape)
    # Drop rows that have a missing value in any column.
    expr = expr.loc[~expr.isnull().values.any(axis=1)]
    print(expr.shape)
    # Keep only the text before the first '///' and cast it to int. assign()
    # builds a new frame instead of writing into the filtered slice, which
    # avoids pandas' SettingWithCopyWarning.
    expr = expr.assign(
        ENTREZ_GENE_ID=expr.ENTREZ_GENE_ID.str.split('///').str[0].astype(int))
    expr = np.log(expr.set_index('ENTREZ_GENE_ID'))

    classes = {
        'copd': ['disease status: COPD patient'],
        'control': ['disease status: control patient'],
    }
    logging.info(classes)

    cell_type = 'cell type: alveolar macrophage'
    gsms = {
        cls: [
            gsm for gsm, sample in gse.gsms.items()
            if sample.metadata['characteristics_ch1'][0] == cell_type
            and sample.metadata['characteristics_ch1'][1] == statuses[0]
        ]
        for cls, statuses in classes.items()
    }

    logging.info(' '.join(
        ['{} GSM:{}'.format(cls, len(gsms[cls])) for cls in classes]))

    utils.write_expr(join(processed_dir, 'expr.tsv'), expr)
    for cls in classes:
        utils.write_text(join(processed_dir, '{}_gsms.txt'.format(cls)),
                         gsms[cls])
# Example #8
# 0
def main(args):
    """Download GSE18965 and export its expression matrix and sample groups.

    Samples whose title contains ``'AA'`` are written as the disease group and
    samples whose title contains ``'HN'`` as the healthy group. The
    expression table indexed by ``ENTREZ_GENE_ID`` and the two sample lists
    are written under ``<out_expr_dir>/processed``.

    Args:
        args: parsed command-line namespace; only ``out_expr_dir`` is read.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    for path in (args.out_expr_dir, raw_dir, processed_dir):
        utils.create_dir_if_not_exist(path)

    gse = GEOparse.get_GEO(geo='GSE18965', destdir=raw_dir)

    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL96'],
                                       'ENTREZ_GENE_ID')
    # Drop probes without an ENTREZ_GENE_ID annotation.
    expr = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    # NOTE(review): ENTREZ_GENE_ID is non-null in every remaining row, so this
    # "all values are NaN" filter cannot remove anything; kept as-is so the
    # written table does not change.
    expr = expr.loc[~expr.isnull().values.all(axis=1)]
    # Keep only the text before the first '///' and cast it to int. assign()
    # builds a new frame instead of writing into the filtered slice, which
    # avoids pandas' SettingWithCopyWarning.
    expr = expr.assign(
        ENTREZ_GENE_ID=expr.ENTREZ_GENE_ID.str.split('///').str[0].astype(int))
    expr = expr.set_index('ENTREZ_GENE_ID')

    # Read every sample title once and select on it directly, rather than
    # collecting matching titles and testing membership in those lists.
    titles = {
        gsm: sample.metadata['title'][0] for gsm, sample in gse.gsms.items()
    }
    disease_gsm = [gsm for gsm, title in titles.items() if 'AA' in title]
    healthy_gsm = [gsm for gsm, title in titles.items() if 'HN' in title]

    # Log the matching titles (one per selected sample), as before.
    logging.info([titles[gsm] for gsm in disease_gsm])
    logging.info([titles[gsm] for gsm in healthy_gsm])
    logging.info("Disease GSM: {}, Healthy GSM: {}".format(
        len(disease_gsm), len(healthy_gsm)))

    utils.write_expr(join(processed_dir, 'expr.tsv'), expr)
    utils.write_text(join(processed_dir, 'disease_gsms.txt'), disease_gsm)
    utils.write_text(join(processed_dir, 'healthy_gsms.txt'), healthy_gsm)
def main(args):
    """Download GSE4302 and export its expression matrix and sample groups.

    Samples are grouped by their first ``characteristics_ch1`` entry into
    disease (three asthmatic sample types) and healthy (healthy control and
    smoker). The expression table indexed by ``ENTREZ_GENE_ID`` and the two
    sample lists are written under ``<out_expr_dir>/processed``.

    Args:
        args: parsed command-line namespace; only ``out_expr_dir`` is read.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    for path in (args.out_expr_dir, raw_dir, processed_dir):
        utils.create_dir_if_not_exist(path)

    gse = GEOparse.get_GEO(geo='GSE4302', destdir=raw_dir)
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL570'],
                                       'ENTREZ_GENE_ID')
    # Drop probes without an ENTREZ_GENE_ID annotation.
    expr = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    # NOTE(review): ENTREZ_GENE_ID is non-null in every remaining row, so this
    # "all values are NaN" filter cannot remove anything; kept as-is so the
    # written table does not change.
    expr = expr.loc[~expr.isnull().values.all(axis=1)]
    # Keep only the text before the first '///' and cast it to int. assign()
    # builds a new frame instead of writing into the filtered slice, which
    # avoids pandas' SettingWithCopyWarning.
    expr = expr.assign(
        ENTREZ_GENE_ID=expr.ENTREZ_GENE_ID.str.split('///').str[0].astype(int))
    expr = expr.set_index('ENTREZ_GENE_ID')

    disease_cls = [
        'sample type: Asthmatic at baseline',
        'sample type: Asthmatic after Flovent',
        'sample type: Asthmatic after Placebo'
    ]
    healthy_cls = ['sample type: Healthy control', 'sample type: Smoker']
    logging.info(disease_cls)
    logging.info(healthy_cls)

    disease_gsm = [
        gsm for gsm, sample in gse.gsms.items()
        if sample.metadata['characteristics_ch1'][0] in disease_cls
    ]
    healthy_gsm = [
        gsm for gsm, sample in gse.gsms.items()
        if sample.metadata['characteristics_ch1'][0] in healthy_cls
    ]
    logging.info("Disease GSM: {}, Healthy GSM: {}".format(
        len(disease_gsm), len(healthy_gsm)))

    utils.write_expr(join(processed_dir, 'expr.tsv'), expr)
    utils.write_text(join(processed_dir, 'disease_gsms.txt'), disease_gsm)
    utils.write_text(join(processed_dir, 'healthy_gsms.txt'), healthy_gsm)
def main(args):
    """Download GSE89809 and export its expression matrix and sample groups.

    Group names are derived from the sample titles: the third and fourth
    underscore-separated fields joined by ``'_'``. A sample belongs to a
    group when the group name occurs in its title. The expression table
    indexed by ``ENTREZ_GENE_ID`` and one ``<group>_gsms.txt`` file per group
    are written under ``<out_expr_dir>/processed``.

    Args:
        args: parsed command-line namespace; only ``out_expr_dir`` is read.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    for path in (args.out_expr_dir, raw_dir, processed_dir):
        utils.create_dir_if_not_exist(path)

    gse = GEOparse.get_GEO(geo='GSE89809', destdir=raw_dir)
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL13158'],
                                       'ENTREZ_GENE_ID')
    # Drop probes without an ENTREZ_GENE_ID annotation.
    expr = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    # NOTE(review): ENTREZ_GENE_ID is non-null in every remaining row, so this
    # "all values are NaN" filter cannot remove anything; kept as-is so the
    # written table does not change.
    expr = expr.loc[~expr.isnull().values.all(axis=1)]
    # Keep only the text before the first '///' and cast it to int. assign()
    # builds a new frame instead of writing into the filtered slice, which
    # avoids pandas' SettingWithCopyWarning.
    expr = expr.assign(
        ENTREZ_GENE_ID=expr.ENTREZ_GENE_ID.str.split('///').str[0].astype(int))
    expr = expr.set_index('ENTREZ_GENE_ID')

    titles = {
        gsm: sample.metadata['title'][0] for gsm, sample in gse.gsms.items()
    }
    # Group names noted by the original author:
    # ['Severe_Spm', 'Healthy_Spm', 'Healthy_Epithelial', 'Mild_Epithelial',
    #  'Severe_Epithelial', 'Mild_Spm', 'Healthy_BAL', 'Moderate_Epithelial',
    #  'Severe_BAL', 'Mild_BAL', 'Moderate_Spm', 'Moderate_BAL']
    classes = {'_'.join(title.split('_')[2:4]) for title in titles.values()}
    logging.info(classes)

    gsms = {
        cls: [gsm for gsm, title in titles.items() if cls in title]
        for cls in classes
    }

    logging.info(' '.join(
        ['{} GSM:{}'.format(cls, len(gsms[cls])) for cls in classes]))

    utils.write_expr(join(processed_dir, 'expr.tsv'), expr)
    for cls in classes:
        utils.write_text(join(processed_dir, '{}_gsms.txt'.format(cls)),
                         gsms[cls])
def main(args):
    """Download GSE57148 and export its FPKM table and sample groups.

    The series metadata comes from GEO, and the expression values come from
    the supplementary ``GSE57148_COPD_FPKM_Normalized.txt.gz`` file. Its
    sample columns are renamed to GSM names using the last space-separated
    token of each sample title. Gene symbols in ``GeneName`` are mapped to
    Entrez IDs through ``utils.gm._sources['biomart']``; rows without a
    mapping are dropped.

    Args:
        args: parsed command-line namespace; only ``out_expr_dir`` is read.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    for path in (args.out_expr_dir, raw_dir, processed_dir):
        utils.create_dir_if_not_exist(path)

    gse = GEOparse.get_GEO(geo='GSE57148', destdir=raw_dir)

    fpkm_path = join(raw_dir, 'GSE57148_COPD_FPKM_Normalized.txt.gz')
    fpkm_url = ('ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE57nnn/GSE57148/'
                'suppl/GSE57148_COPD_FPKM_Normalized.txt.gz')
    with closing(request.urlopen(fpkm_url)) as response:
        with open(fpkm_path, 'wb') as out_file:
            shutil.copyfileobj(response, out_file)
    counts = pd.read_csv(fpkm_path, sep='\t')

    # Rename columns from the title's last token to the GSM name.
    expr = counts.rename(columns={
        sample.metadata['title'][0].split(' ')[-1]: gsm
        for gsm, sample in gse.gsms.items()
    })
    print(expr.isnull().values.any(axis=1).sum())
    print(expr.isnull().values.all(axis=1).sum())

    # Symbol -> Entrez lookup. `.loc[list_of_symbols]` raises KeyError on
    # current pandas as soon as one symbol is absent, although the null
    # filter below shows absent symbols are expected; map() yields NaN for
    # them instead. map() needs a unique index, so the first entry per
    # symbol is kept.
    symbol_to_entrez = utils.gm._sources['biomart'].set_index('symbol').entrez
    symbol_to_entrez = symbol_to_entrez[
        ~symbol_to_entrez.index.duplicated(keep='first')]
    expr['ENTREZ_GENE_ID'] = expr.GeneName.map(symbol_to_entrez).values
    print(expr.shape)
    expr = expr[~expr.ENTREZ_GENE_ID.isnull()]
    print(expr.shape)
    # assign() avoids writing into the filtered slice (SettingWithCopyWarning).
    expr = expr.assign(ENTREZ_GENE_ID=expr['ENTREZ_GENE_ID'].astype(int))
    expr = expr.drop(columns='GeneName').set_index('ENTREZ_GENE_ID')

    disease_cls = ['disease state: COPD']
    healthy_cls = ['disease state: Normal']
    logging.info(disease_cls)
    logging.info(healthy_cls)
    disease_gsm = [
        gsm for gsm, sample in gse.gsms.items()
        if sample.metadata['characteristics_ch1'][0] in disease_cls
    ]
    healthy_gsm = [
        gsm for gsm, sample in gse.gsms.items()
        if sample.metadata['characteristics_ch1'][0] in healthy_cls
    ]
    logging.info("Disease GSM: {}, Healthy GSM: {}".format(
        len(disease_gsm), len(healthy_gsm)))

    utils.write_expr(join(processed_dir, 'expr.tsv'), expr)
    utils.write_text(join(processed_dir, 'disease_gsms.txt'), disease_gsm)
    utils.write_text(join(processed_dir, 'healthy_gsms.txt'), healthy_gsm)
def main(args):
    """Download GSE104468 and export its expression matrix and sample groups.

    Probes are annotated with ``GENE_SYMBOL``, which is mapped to Entrez IDs
    through ``utils.gm._sources['biomart']``; rows without a mapping are
    dropped. Each group is a (disease state, cell type) pair that must equal
    the sample's ``characteristics_ch1`` entries at positions 2 and 1.

    Args:
        args: parsed command-line namespace; only ``out_expr_dir`` is read.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    raw_dir = join(args.out_expr_dir, 'raw')
    processed_dir = join(args.out_expr_dir, 'processed')
    for path in (args.out_expr_dir, raw_dir, processed_dir):
        utils.create_dir_if_not_exist(path)

    gse = GEOparse.get_GEO(geo='GSE104468', destdir=raw_dir)
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL21185'],
                                       'GENE_SYMBOL')

    # Symbol -> Entrez lookup. `.loc[list_of_symbols]` raises KeyError on
    # current pandas as soon as one symbol is absent, although the null
    # filter below shows absent symbols are expected; map() yields NaN for
    # them instead. map() needs a unique index, so the first entry per
    # symbol is kept.
    symbol_to_entrez = utils.gm._sources['biomart'].set_index('symbol').entrez
    symbol_to_entrez = symbol_to_entrez[
        ~symbol_to_entrez.index.duplicated(keep='first')]

    # Drop probes without a gene symbol.
    expr = annotated[~pd.isnull(annotated.GENE_SYMBOL)]
    # NOTE(review): GENE_SYMBOL is non-null in every remaining row, so this
    # "all values are NaN" filter cannot remove anything; kept as-is so the
    # written table does not change.
    expr = expr.loc[~expr.isnull().values.all(axis=1)]
    # assign() builds new frames instead of writing into filtered slices,
    # which avoids pandas' SettingWithCopyWarning.
    expr = expr.assign(
        ENTREZ_GENE_ID=expr.GENE_SYMBOL.map(symbol_to_entrez).values)
    expr = expr[~expr.ENTREZ_GENE_ID.isnull()]
    expr = expr.assign(ENTREZ_GENE_ID=expr['ENTREZ_GENE_ID'].astype(int))
    expr = expr.drop(columns='GENE_SYMBOL').set_index('ENTREZ_GENE_ID')

    # group name -> [expected entry at index 2, expected entry at index 1]
    classes = {
        'asthma_pbmc': ['disease state: Asthma', 'cell type: PBMC'],
        'asthma_bronch': [
            'disease state: Asthma', 'cell type: bronchial epithelia'
        ],
        'asthma_nasal': ['disease state: Asthma', 'cell type: nasal epithelia'],
        'normal_pbmc': ['disease state: Normal', 'cell type: PBMC'],
        'normal_bronch': [
            'disease state: Normal', 'cell type: bronchial epithelia'
        ],
        'normal_nasal': ['disease state: Normal', 'cell type: nasal epithelia'],
    }
    logging.info(classes)

    gsms = {}
    for cls, (disease_state, cell_type) in classes.items():
        gsms[cls] = [
            gsm for gsm, sample in gse.gsms.items()
            if sample.metadata['characteristics_ch1'][2] == disease_state
            and sample.metadata['characteristics_ch1'][1] == cell_type
        ]

    logging.info(' '.join(
        ['{} GSM:{}'.format(cls, len(gsms[cls])) for cls in classes]))

    utils.write_expr(join(processed_dir, 'expr.tsv'), expr)
    for cls in classes:
        utils.write_text(join(processed_dir, '{}_gsms.txt'.format(cls)),
                         gsms[cls])