Example #1
 def test_duplicate_column(self):
     columns = ['ID',
                'COL',
                'ROW',
                'NAME',
                'SPOT_ID',
                'CONTROL_TYPE',
                'REFSEQ',
                'GB_ACC',
                'GENE',
                'GENE_SYMBOL',
                'GENE_NAME',
                'UNIGENE_ID',
                'ENSEMBL_ID',
                'TIGR_ID',
                'ACCESSION_STRING',
                'CHROMOSOMAL_LOCATION',
                'CYTOBAND',
                'DESCRIPTION',
                'GO_ID',
                'SEQUENCE',
                'SPOT_ID.1',
                'ORDER']
     columns2 = ['ID', 'COL', 'ROW', 'NAME', 'SPOT_ID', 'CONTROL_TYPE',
                 'ENSEMBL_ID', 'GB_ACC',
                 'GENE', 'GENE_SYMBOL', 'ENSEMBL_ID.1', 'UNIGENE_ID',
                 'ENSEMBL_ID.2', 'TIGR_ID',
                 'ACCESSION_STRING', 'CHROMOSOMAL_LOCATION', 'CYTOBAND',
                 'DESCRIPTION', 'GO_ID',
                 'SEQUENCE', 'SPOT_ID.1', 'ORDER']
     gpl = GEO.get_GEO(filepath=join(download_geo, "GPL4133.txt"))
     self.assertEqual(list(gpl.columns.index), columns)
     gpl2 = GEO.get_GEO(filepath=join(download_geo, "GPL4134.txt"))
     self.assertEqual(list(gpl2.columns.index), columns2)
Example #2
 def test_get_geo_and_data(self):
     gsm = GEO.get_GEO(geo="GSM11805", destdir=download_geo)
     self.assertTrue(isinstance(gsm, GSM))
     self.assertEqual(gsm.get_accession(), "GSM11805")
     self.assertEqual(len(gsm.table.index), 22283)
     self.assertEqual(len(gsm.columns), 3)
     self.assertEqual(len(gsm.metadata.keys()), 28)
Example #3
 def test_merge_and_average(self):
     gse = GEO.get_GEO(filepath=join(download_geo, "soft_ex_family.txt"),
                       geotype="GSE")
     result = read_table(
         join(download_geo, "test_merged_by_id_and_averaged_by_gb_acc.tab"),
         index_col=0,
     )
     result = result.loc[sorted(
         result.index), sorted(
             result.columns
         )]  # gse.gsms is a dict so the columns might be in different order
     merged = gse.merge_and_average(
         gse.gpls[next(iter(gse.gpls))],
         "VALUE",
         "GB_ACC",
         gpl_on="ID",
         gsm_on="ID_REF",
     )
     merged = merged[sorted(
         merged.columns
     )]  # gse.gsms is a dict so the columns might be in different order
     assert_frame_equal(merged, result)
     with self.assertRaises(KeyError):
         gse.merge_and_average("platform",
                               "VALUE",
                               "GB_ACC",
                               gpl_on="ID",
                               gsm_on="ID_REF")
     with self.assertRaises(ValueError):
         gse.merge_and_average(["platform"],
                               "VALUE",
                               "GB_ACC",
                               gpl_on="ID",
                               gsm_on="ID_REF")
Example #4
    def test_download_SRA_parallel_by_sra(self):
        geo_id = "GSE63525"  # Hi-C dataset from Rao et al.

        def filterby(x):
            return ("HIC173" in x.metadata["title"][0]
                    or "HIC174" in x.metadata["title"][0]
                    or "HIC175" in x.metadata["title"][0])

        destdir = "./TMP_SOFT_parallel_by_sra"
        gse = GEO.get_GEO(geo=geo_id, destdir=destdir)
        downloaded_paths = gse.download_SRA(
            "*****@*****.**",  # some unused e-mail
            directory=destdir,
            filetype="sra",
            filterby=filterby,
            silent=True,
            keep_sra=True,
            nproc=3,
        )
        print(downloaded_paths)
        self.assertTrue(isdir(destdir))
        self.assertEqual(len(downloaded_paths), 3)
        for k in downloaded_paths.keys():
            self.assertTrue(k in gse.gsms.keys())
        for k in ["GSM1551718", "GSM1551719", "GSM1551720"]:
            self.assertTrue(k in downloaded_paths.keys())
        for k in downloaded_paths.keys():
            for f in downloaded_paths[k]["SRA"]:
                self.assertTrue(isfile(f))
Example #5
    def test_download_SRA_parallel_by_gsm(self):
        geo_id = "GSE63525"  # Hi-C dataset from Rao et al.

        def filterby(x):
            return ("HIC173" in x.metadata["title"][0]
                    or "HIC174" in x.metadata["title"][0]
                    or "HIC175" in x.metadata["title"][0])

        destdir = "./TMP_SOFT_parallel_by_gsm"

        gse = GEO.get_GEO(geo=geo_id, destdir=destdir)
        gsms_to_use = [gsm for gsm in gse.gsms.values() if filterby(gsm)]
        downloaded_paths = dict()
        for gsm in gsms_to_use:
            downloaded_paths[gsm.name] = gsm.download_SRA(
                "*****@*****.**",  # some unused e-mail
                directory=destdir,
                nproc=3,
                return_list=False,
                filetype="sra",
                silent=True,
                keep_sra=True,
            )
        self.assertTrue(isdir(destdir))
        self.assertEqual(len(downloaded_paths), 3)
        for k in downloaded_paths.keys():
            self.assertTrue(k in gse.gsms.keys())
        for k in ["GSM1551718", "GSM1551719", "GSM1551720"]:
            self.assertTrue(k in downloaded_paths.keys())
        for k in downloaded_paths.keys():
            for f in downloaded_paths[k]["SRA"]:
                self.assertTrue(isfile(f))
Example #6
def GSEA(geo_ID, gene_list):
    gse = GEOparse.get_GEO(geo=geo_ID, destdir="./")
    expression = gse.pivot_samples('VALUE').T
    experiments = {}
    for i, (idx, row) in enumerate(gse.phenotype_data.iterrows()):
        tmp = {}
        tmp["Type"] = 1 if "control" in row["description"] else 0
        experiments[i] = tmp
    experiments = pd.DataFrame(experiments).T
    counter = 0
    all_genes_set = []
    all_corr_set = []
    genes_corr_set = []
    for gene in expression:
        counter += 1
        if counter <= 3:
            continue
        all_genes_set.append(gene)
        corr_matrix = np.corrcoef(
            [list(experiments['Type']),
             list(expression[gene])])
        all_corr_set.append(corr_matrix[0, 1])
        if gene in gene_list:
            genes_corr_set.append(corr_matrix[0, 1])
    p_value = ks_2samp(genes_corr_set, all_corr_set)[1]
    return (str(p_value))
Example #7
def gene_expression_pipeline(geo_id, tissue_origin, gene_locations):
    """
    Given a GEO id and a gene location table, runs the whole data pipeline of
    annotation -> duplicate removal -> filtering on the GEO gene expression table.
    The tissue_origin parameter is used to save intermediate results of the pipeline
    and reduce runtime when trying different filtering parameters. Pass an empty
    string to disable saving of intermediate files.

    NOTE:
    1. GEOparse will download an expression SOFT file and save it in destdir unless
       one already exists there.
    2. table_add_gene_annotations takes several minutes to run (because of eutils
       requests). Do NOT terminate it early.
    3. After running table_remove_duplicates, the ID column header is not saved in
       the CSV for some reason. This is easily fixed manually once (not worth a code
       change).
    """
    gse_data = geo.get_GEO(geo=geo_id, destdir='./expression_data')
    expression_table = generate_raw_expression_table(gse_data)
    expression_table = table_add_gene_annotations(expression_table,
                                                  gene_locations,
                                                  tissue_origin)
    expression_table = table_remove_duplicates(expression_table, tissue_origin)
    filtered_expression = table_filter(expression_table)

    return filtered_expression
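A minimal usage sketch for the pipeline above (not from the original source): the accession, tissue label, and gene_locations.tsv path are placeholders, and the helper functions called by the pipeline are assumed to be importable from the same module.

import pandas as pd

# Hypothetical gene coordinate table; any DataFrame in the expected format works.
gene_locations = pd.read_csv("gene_locations.tsv", sep="\t")

# tissue_origin labels the intermediate files; pass "" to skip saving them.
filtered_expression = gene_expression_pipeline(
    geo_id="GSE64913",
    tissue_origin="airway_epithelium",
    gene_locations=gene_locations,
)
filtered_expression.to_csv("filtered_expression.csv")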
Example #8
    def test_download_SRA_parallel_by_sra(self):
        geo_id = 'GSE63525'  # Hi-C dataset from Rao et al.

        def filterby(x):
            return 'HIC173' in x.metadata['title'][0] \
                   or 'HIC174' in x.metadata['title'][0] \
                   or 'HIC175' in x.metadata['title'][0]

        destdir = "./TMP_SOFT_parallel_by_sra"
        gse = GEO.get_GEO(geo=geo_id, destdir=destdir)
        downloaded_paths = gse.download_SRA("*****@*****.**",  # some unused e-mail
                                            directory=destdir,
                                            filetype='sra',
                                            filterby=filterby,
                                            silent=True,
                                            keep_sra=True,
                                            nproc=3)
        print(downloaded_paths)
        self.assertTrue(isdir(destdir))
        self.assertEqual(len(downloaded_paths), 3)
        for k in downloaded_paths.keys():
            self.assertTrue(k in gse.gsms.keys())
        for k in ['GSM1551718', 'GSM1551719', 'GSM1551720']:
            self.assertTrue(k in downloaded_paths.keys())
        for k in downloaded_paths.keys():
            for f in downloaded_paths[k]['SRA']:
                self.assertTrue(isfile(f))
Example #9
 def test_get_geo_and_data(self):
     gpl = GEO.get_GEO(geo="GPL96", destdir=download_geo)
     self.assertTrue(isinstance(gpl, GPL))
     self.assertEqual(gpl.name, "GPL96")
     self.assertEqual(gpl.get_accession(), "GPL96")
     self.assertEqual(len(gpl.table.index), 22283)
     self.assertEqual(len(gpl.columns), 16)
Example #10
def read_soft(datapath_or_datastring,
              is_datafile=True,
              return_filtered_data=False,
              rows=None,
              columns=None):
    """Read a file in SOFT format, either from a file or from a string of raw data.

    :param (string) datapath_or_datastring: Either the path to the SOFT data file (can be relative
                                            or absolute), or a string corresponding to the content
                                            of a SOFT file (including newline characters).
    :param (bool, optional) is_datafile: Either True (default) if passing the filepath to the data,
                                         or False if passing a string of raw data.
    :param (bool) return_filtered_data: Either False (default) to return all the metadata, or True
                                        to return only the data filtered by rows and/or columns.
    :param (list[string]) rows: The rows that should be kept if `return_filtered_data`
                                is True.
    :param (list[string]) columns: The columns that should be kept if
                                   `return_filtered_data` is True.

    :rtype (tuple|ndarray): Either a tuple containing the description (metadata), subsets, row
                            names, and column names for the SOFT data if `return_filtered_data` is
                            False, or an array of the filtered SOFT data if `return_filtered_data`
                            is True.
    """

    # ensure required argument is a string
    err_msg = 'Please pass either the filepath to the data, or the data as a string.'
    assert isinstance(datapath_or_datastring, str), err_msg

    if is_datafile:
        filepath = datapath_or_datastring
    else:
        with tempfile.NamedTemporaryFile(mode='w+',
                                         delete=False,
                                         suffix='.soft') as tf:
            tf.write(datapath_or_datastring)
            filepath = tf.name

    geo_file = gp.get_GEO(filepath=filepath, geotype='GDS')

    df = geo_file.table
    df.set_index('ID_REF', inplace=True)

    all_rows = list(df.index.values)
    all_cols = list(df.columns.values)

    # keep only the sample (GSM) columns; building a new list avoids the bug of
    # removing items from a list while iterating over it (which skips elements)
    all_cols = [col for col in all_cols if 'GSM' in col]

    if return_filtered_data:
        return _get_selected_data(df, all_rows, all_cols, rows, columns)

    desc = geo_file.metadata
    subsets = geo_file.subsets

    for subset in subsets:
        subsets[subset] = subsets[subset].metadata

    return desc, subsets, all_rows, all_cols
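A short usage sketch for read_soft (not part of the original snippet): the GDS file path is a placeholder, and _get_selected_data is the module's own helper used internally by the filtered call.

# Metadata pass: returns description, subsets, and the full row/column lists.
desc, subsets, all_rows, all_cols = read_soft("GDS507.soft.gz")
print(len(all_rows), "probes,", len(all_cols), "GSM columns")

# Data pass: return only the values for a few probes and samples.
filtered = read_soft(
    "GDS507.soft.gz",
    return_filtered_data=True,
    rows=all_rows[:10],
    columns=all_cols[:3],
)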
Example #11
 def test_get_geo_and_data_with_annotations(self):
     gpl = GEO.get_GEO(geo="GPL96", destdir=download_geo, annotate_gpl=True)
     self.assertTrue(isinstance(gpl, GPL))
     self.assertEqual(gpl.name, "GPL96")
     self.assertEqual(gpl.get_metadata_attribute('platform'), "GPL96")
     self.assertEqual(len(gpl.table.index), 22283)
     self.assertEqual(len(gpl.columns), 21)
Example #12
 def test_get_geo_and_data_with_annotations(self):
     gpl = GEO.get_GEO(geo="GPL96", destdir=download_geo, annotate_gpl=True)
     self.assertTrue(isinstance(gpl, GPL))
     self.assertEqual(gpl.name, "GPL96")
     self.assertEqual(gpl.get_metadata_attribute('platform'), "GPL96")
     self.assertEqual(len(gpl.table.index), 22283)
     self.assertEqual(len(gpl.columns), 21)
Example #13
 def test_pivot_samples(self):
     gse = GEO.get_GEO(filepath=join(download_geo, "soft_ex_family.txt"),
                       geotype="GSE")
     result = read_table(
         join(download_geo, "test_sample_pivoted_by_value.tab"), index_col=0)
     result.columns.name = 'name'
     assert_frame_equal(gse.pivot_samples("VALUE"), result)
Example #14
 def test_get_geo_and_data(self):
     gpl = GEO.get_GEO(geo="GPL96", destdir=download_geo)
     self.assertTrue(isinstance(gpl, GPL))
     self.assertEqual(gpl.name, "GPL96")
     self.assertEqual(gpl.get_accession(), "GPL96")
     self.assertEqual(len(gpl.table.index), 22283)
     self.assertEqual(len(gpl.columns), 16)
Example #15
 def test_pivot_samples(self):
     gse = GEO.get_GEO(filepath=join(download_geo, "soft_ex_family.txt"),
                       geotype="GSE")
     result = read_table(
         join(download_geo, "test_sample_pivoted_by_value.tab"), index_col=0)
     result.columns.name = 'name'
     assert_frame_equal(gse.pivot_samples("VALUE"), result)
Example #16
 def test_get_geo_and_data(self):
     gsm = GEO.get_GEO(geo="GSM11805", destdir=download_geo)
     self.assertTrue(isinstance(gsm, GSM))
     self.assertEqual(gsm.get_accession(), "GSM11805")
     self.assertEqual(len(gsm.table.index), 22283)
     self.assertEqual(len(gsm.columns), 3)
     self.assertEqual(len(gsm.metadata.keys()), 28)
Example #17
    def test_download_SRA_parallel_by_gsm(self):
        geo_id = 'GSE63525'  # Hi-C dataset from Rao et al.

        def filterby(x):
            return 'HIC173' in x.metadata['title'][0] \
                   or 'HIC174' in x.metadata['title'][0] \
                   or 'HIC175' in x.metadata['title'][0]

        destdir = "./TMP_SOFT_parallel_by_gsm"

        gse = GEO.get_GEO(geo=geo_id, destdir=destdir)
        gsms_to_use = [gsm for gsm in gse.gsms.values() if filterby(gsm)]
        downloaded_paths = dict()
        for gsm in gsms_to_use:
            downloaded_paths[gsm.name] = gsm.download_SRA("*****@*****.**",  # some unused e-mail
                                                          directory=destdir,
                                                          nproc=3,
                                                          return_list=False,
                                                          filetype='sra',
                                                          silent=True,
                                                          keep_sra=True)
        self.assertTrue(isdir(destdir))
        self.assertEqual(len(downloaded_paths), 3)
        for k in downloaded_paths.keys():
            self.assertTrue(k in gse.gsms.keys())
        for k in ['GSM1551718', 'GSM1551719', 'GSM1551720']:
            self.assertTrue(k in downloaded_paths.keys())
        for k in downloaded_paths.keys():
            for f in downloaded_paths[k]['SRA']:
                self.assertTrue(isfile(f))
Example #18
def download_regular(name,
                     filterby=None,
                     metadata_path="./",
                     destdir="./TMP_SOFT",
                     nthreads=20):

    geo_id = gse_dict[name]

    gse = GEOparse.get_GEO(geo=geo_id, destdir=destdir)
    gsms = gse.gsms

    if filterby is None:
        downloaded_paths = gse.download_SRA(
            '*****@*****.**',
            filetype='fastq',
            fastq_dump_options=fastq_dump_options,
            nproc=nthreads,
            silent=True)
    else:
        downloaded_paths = gse.download_SRA(
            '*****@*****.**',
            filetype='fastq',
            filterby=filterby,
            fastq_dump_options=fastq_dump_options,
            nproc=nthreads,
            silent=True)  # fix a bug with multiple replicates!

    metadata_collected_dict = {
        x: parse_metadata(gsms[x], mode=name)
        for x in downloaded_paths.keys()
    }

    metadata_collected_list = []
    for k in metadata_collected_dict:
        for i in range(len(downloaded_paths[k]) // 2):
            d = metadata_collected_dict[k]
            d['path fastq R1'] = downloaded_paths[k][2 * i]
            d['path fastq R2'] = downloaded_paths[k][2 * i + 1]
            d['gse'] = geo_id
            metadata_collected_list.append(dict(d))

    df1 = pd.DataFrame(metadata_collected_list)

    df1 = pd.concat([
        g.drop('index', axis=1).reset_index(drop=True)
        for i, g in df1.reset_index().groupby("index")
    ]).reset_index()
    df1.loc[:, 'technical_rep'] = df1.loc[:, 'index'] + 1
    df1 = df1.drop('index', axis=1)
    df1 = df1.applymap(lambda x: str(x).replace(' ', '-'))

    df1.loc[:, "running_mode"] = name
    df1.to_csv(os.path.join(metadata_path, '{}_metadata.tsv'.format(geo_id)),
               sep='\t')
    df1 = df1.drop(['processing', 'protocol'], axis=1)
    df1.to_csv(os.path.join(metadata_path,
                            '{}_metadata_short.tsv'.format(geo_id)),
               sep='\t')
Example #19
def get_geo_metadata(acc, experiment_type=None):
    '''
    Parses information associated with a GEO Series or a single experiment.
    Uses the GEOparse library, which downloads records from the NCBI FTP site
    rather than using NCBI Entrez E-utilities, resulting in a single request
    rather than many. The function parses information from the downloaded files
    and then deletes them. Returns a Dataset object holding information about
    all the associated experiments and biosamples.
    '''
    if acc.startswith('GSE') or '/GSE' in acc:  # experiment series
        if '/' in acc:
            gse = GEOparse.get_GEO(filepath=acc)
        else:
            gse = GEOparse.get_GEO(geo=acc)  # pragma: no cover
        # create Experiment objects from each GSM file
        experiments = [obj for obj in [parse_gsm(gsm, experiment_type) for gsm in gse.gsms.values()] if obj]
        # delete file after GSMs are parsed
        if '/' not in acc:
            print('GEO parsing done. Removing downloaded soft file.')
            os.remove('{}_family.soft.gz'.format(acc))
        if not experiments:
            print('Sequencing experiments not found. Exiting.')
            return
        gds = Dataset(acc, gse.metadata['sample_id'], experiments,
                      [parse_bs_record(experiment.bs) for experiment in experiments])
        return gds
    elif acc.startswith('GSM') or '/GSM' in acc:  # single experiment
        if '/' in acc:
            gsm = GEOparse.get_GEO(filepath=acc)
        else:
            gsm = GEOparse.get_GEO(geo=acc)  # pragma: no cover
        exp = parse_gsm(gsm, experiment_type)
        print("GEO parsing done. Removing downloaded soft file.")
        try:
            os.remove('{}.txt'.format(acc))  # delete file after GSM is parsed
        except Exception:
            pass
        if not exp:
            print("Accession not a sequencing experiment, or couldn't be parsed. Exiting.")
            return
        gds = Dataset(None, [acc], [exp], [parse_bs_record(exp.bs)])
        return gds
    else:
        print('Input not a valid GEO accession.')
        return
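A hedged usage sketch (not part of the original source): the accession call assumes network access, while the filepath call reuses the local family SOFT file referenced in the tests elsewhere in this listing.

# Parse a whole series by accession; the SOFT file is downloaded and then removed.
dataset = get_geo_metadata("GSE93431")

# Parse a previously downloaded family SOFT file instead of re-downloading it.
dataset_local = get_geo_metadata("./tests/data_files/GSE93431_family.soft.gz")
if dataset_local is None:
    print("No sequencing experiments found, or the accession could not be parsed.")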
Example #20
def open_gds(filename):
    gds = GEOparse.get_GEO(filepath="%s" % filename)
    Table = gds.table
    metadata = gds.columns
    Table = Table.drop(columns='ID_REF')
    Table = Table[(Table['IDENTIFIER'] != '--Control') &
                  (Table['IDENTIFIER'] != 'control')]  # '|' kept every row; '&' drops both control labels
    Table = Table.groupby('IDENTIFIER').mean().reset_index()
    return Table
Example #21
def main(args):
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    utils.create_dir_if_not_exist(args.out_expr_dir)
    utils.create_dir_if_not_exist(join(args.out_expr_dir, 'raw'))
    utils.create_dir_if_not_exist(join(args.out_expr_dir, 'processed'))

    gse = GEOparse.get_GEO(geo='GSE64913',
                           destdir=join(args.out_expr_dir, 'raw'))
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL570'],
                                       'ENTREZ_GENE_ID')
    annotated2 = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    annotated2 = annotated2.loc[~annotated2.isnull().values.all(axis=1)]
    annotated2['ENTREZ_GENE_ID'] = annotated2.ENTREZ_GENE_ID.str.split(
        '///').str[0].astype(int)
    annotated2 = annotated2.set_index('ENTREZ_GENE_ID')

    classes = {}
    classes['healthy_cae'] = [
        'diagnosis: Healthy', 'cell type: Central airway epithelium'
    ]
    classes['healthy_pae'] = [
        'diagnosis: Healthy', 'cell type: Peripheral airway epithelium'
    ]
    classes['asthma_cae'] = [
        'diagnosis: Severe Asthmatic', 'cell type: Central airway epithelium'
    ]
    classes['asthma_pae'] = [
        'diagnosis: Severe Asthmatic',
        'cell type: Peripheral airway epithelium'
    ]

    logging.info(classes)
    gsms = {
        cls: [
            gsm for gsm in gse.gsms
            if gse.gsms[gsm].metadata['characteristics_ch1'][1] == classes[cls][0]
            and gse.gsms[gsm].metadata['characteristics_ch1'][5] == classes[cls][1]
        ]
        for cls in classes
    }

    logging.info(' '.join(
        ['{} GSM:{}'.format(cls, len(gsms[cls])) for cls in classes]))
    utils.create_dir_if_not_exist(args.out_expr_dir)
    utils.write_expr(join(args.out_expr_dir, 'processed', 'expr.tsv'),
                     annotated2)
    for cls in classes:
        utils.write_text(
            join(args.out_expr_dir, 'processed', '{}_gsms.txt'.format(cls)),
            gsms[cls])
Example #22
 def test_get_geo_and_data(self):
     gds = GEO.get_GEO(geo="GDS507", destdir=download_geo)
     self.assertTrue(isinstance(gds, GDS))
     self.assertEqual(len(gds.table.index), 22645)
     self.assertEqual(len(gds.table.columns), 19)
     self.assertEqual(len(gds.metadata.keys()), 16) # we omit DATABASE and SUBSET ! entries
     self.assertEqual(len(gds.database.metadata.keys()), 5)
     for subset_name, subset in iteritems(gds.subsets):
         self.assertEqual(len(subset.metadata.keys()), 4)
         self.assertTrue(isinstance(subset, GDSSubset))
Example #23
def load_dataset(dataset_id, download_location="."):
    """
    Load the dataset from disk (or download it if it does not exist).

    Arguments:
    - dataset_id: the ID of the dataset to load
    - download_location: directory where the SOFT file is stored or downloaded

    Output:
    - GSE object (GEOparse Series)
    """
    path = download_location + "/" + dataset_id + "_family.soft.gz"
    if os.path.exists(path):
        # Load from an existing file
        print("- Loading from", path)
        gse = GEOparse.get_GEO(filepath=path)
    else:
        # Download GSE and load it
        print("- Downloading", dataset_id)
        gse = GEOparse.get_GEO(geo=dataset_id, destdir=download_location + "/")
    return gse
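A quick usage sketch, assuming GEOparse is installed and using an accession that appears in the tests above; the first call downloads GSE1563_family.soft.gz into the current directory and later calls load it from disk.

gse = load_dataset("GSE1563")
print(len(gse.gsms), "samples on", len(gse.gpls), "platform(s)")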
Example #24
def test_get_geo_metadata_sra_hidden(capfd, mocker, hidden_sra):
    gse_all = GEOparse.get_GEO(
        filepath='./tests/data_files/GSE93431_family.soft.gz')
    mocker.patch('scripts.geo2fdn.parse_bs_record',
                 return_value='SAMNXXXXXXXX')
    mocker.patch('scripts.geo2fdn.parse_gsm', return_value=hidden_sra)
    gse = geo.get_geo_metadata('./tests/data_files/GSE93431_family.soft.gz')
    out, err = capfd.readouterr()
    assert not gse
    assert len(gse_all.gsms.values()) > 10
Example #25
def get_geo(geo_accesions: list = None, paths=None):
    """

    Args:
        geo_accesions:

    Returns:

    """
    geos = list()
    if paths is None:
        for i in geo_accesions:
            geos.append(geo.get_GEO(i))
        geo_entries = geos
    else:
        for i in paths:
            geos.append(geo.get_GEO(filepath=os.path.join(i)))
        geo_entries = geos
    return geo_entries
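A small usage sketch (not from the original source): the accessions and file paths below are borrowed from other examples in this listing and stand in for real inputs.

# Fetch platforms by accession (downloads SOFT files to the working directory) ...
platforms = get_geo(geo_accesions=["GPL96", "GPL570"])

# ... or parse SOFT files that are already on disk.
local_entries = get_geo(paths=["./GDS2947.soft.gz", "./GDS4379.soft.gz"])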
Example #26
def get_geo(geo_id):

    #Import GSE dataset
    gse = GEOparse.get_GEO(geo=str(geo_id).upper())
    data = gse.pivot_samples('VALUE')
    data.index.name = None  # setting to None; 'del data.index.name' fails in newer pandas

    data = clean_df(data)

    return data
Example #27
def comp_test(accession):
    start = time.time()
    gpl_data = GEOparse.get_GEO(geo = accession, destdir = "./cache", silent = True)
    table = gpl_data.table
    for column in table:  # iterating a DataFrame yields its column labels
        # minimal per-column work so the loop is not optimized away; this should never trigger
        if len(column) == 10000000000:
            print("loooooong")
    end = time.time()
    print(end - start)
Example #28
def open_gds(filename):
    # Take the filename input and parse the GEO data. Clean the table to remove unnecessary rows.
    gds = GEOparse.get_GEO(filepath="%s" % filename)
    Table = gds.table
    metadata = gds.columns
    Table = Table.drop(columns='ID_REF')
    Table = Table[(Table['IDENTIFIER'] != '--Control') &
                  (Table['IDENTIFIER'] != 'control')]  # '|' kept every row; '&' drops both control labels
    Table = Table.groupby("IDENTIFIER").mean().reset_index()
    return Table
Example #29
def get_from_geo(accession, disease):
    """Downloads a dataset from GEO."""
    geodir = tempfile.TemporaryDirectory()
    print("geodir is {}".format(geodir.name))
    print("Downloading data set {} from GEO....".format(accession),
          flush=True)
    # silent=True has no effect,
    # see https://github.com/guma44/GEOparse/issues/19
    raw_gse = GEOparse.get_GEO(geo=accession, destdir=geodir.name, silent=True)
    return GEOSeries(raw_gse, disease)
Example #30
 def test_duplicate_column(self):
     columns = [
         'ID', 'COL', 'ROW', 'NAME', 'SPOT_ID', 'CONTROL_TYPE', 'REFSEQ',
         'GB_ACC', 'GENE', 'GENE_SYMBOL', 'GENE_NAME', 'UNIGENE_ID',
         'ENSEMBL_ID', 'TIGR_ID', 'ACCESSION_STRING',
         'CHROMOSOMAL_LOCATION', 'CYTOBAND', 'DESCRIPTION', 'GO_ID',
         'SEQUENCE', 'SPOT_ID.1', 'ORDER'
     ]
     columns2 = [
         'ID', 'COL', 'ROW', 'NAME', 'SPOT_ID', 'CONTROL_TYPE',
         'ENSEMBL_ID', 'GB_ACC', 'GENE', 'GENE_SYMBOL', 'ENSEMBL_ID.1',
         'UNIGENE_ID', 'ENSEMBL_ID.2', 'TIGR_ID', 'ACCESSION_STRING',
         'CHROMOSOMAL_LOCATION', 'CYTOBAND', 'DESCRIPTION', 'GO_ID',
         'SEQUENCE', 'SPOT_ID.1', 'ORDER'
     ]
     gpl = GEO.get_GEO(filepath=join(download_geo, "GPL4133.txt"))
     self.assertEqual(list(gpl.columns.index), columns)
     gpl2 = GEO.get_GEO(filepath=join(download_geo, "GPL4134.txt"))
     self.assertEqual(list(gpl2.columns.index), columns2)
Example #31
def main():
    fname = "cache.pkl"
    rebuildcache = True

    if rebuildcache:
        data1 = GEOparse.get_GEO(
            filepath="./GDS2947.soft.gz"
        )  # Adenoma/Healthy Set (Testing?) (True Count)
        data2 = GEOparse.get_GEO(
            filepath="./GDS4379.soft.gz"
        )  # Adenocarcinoma (Testing?) (True Count) Samples 64
        data4 = GEOparse.get_GEO(
            filepath="./GDS4393.soft.gz")  # Metastatic/Tumor Set (True Count)
        data5 = GEOparse.get_GEO(
            filepath="./GDS4513.soft.gz"
        )  # Tumor/Excised Set (Transformed count) Samples 53
        data6 = GEOparse.get_GEO(
            filepath="./GDS4516.soft.gz"
        )  # Metastatic/Stage 3 Set (Transformed count) Samples 104
        data7 = GEOparse.get_GEO(filepath="./GSE137140_family.soft.gz")
        data8 = GEOparse.get_GEO(filepath="./GSE134347_family.soft.gz")
        tup = (data1, data2, data4, data5, data6, data7, data8)
        with open(fname, 'wb') as f:
            pickle.dump(tup, f)
    else:
        with open(fname, 'rb') as f:
            tup = pickle.load(f)
        (data1, data2, data4, data5, data6, data7, data8) = tup
    gseData, gseClass, geneIndex = unpackValues(data8)

    # table1_expression, table1_info = getValues(data1)
    # table1_expression = np.log2(table1_expression)
    # table6_expression, table6_info = getValues(data6)
    # table2_expression, table2_info = getValues(data2)

    # catDataE, catDataI = catTables([table1_expression, table6_expression, table2_expression], [table1_info, table6_info,table2_info])
    # infolen = catDataI.shape
    # infolen = infolen[0]
    # classification = np.zeros((infolen), dtype=int)
    # for i in range(infolen):
    #     if (catDataI[i] == "adenoma" or catDataI[i] == "Large Intestine, Villous Adenoma"):
    #         classification[i] = 1
    #     elif (catDataI[i] == "normal mucosa"):
    #         classification[i] = 2
    trainSizes = np.linspace(0.2, 0.8, 4)
    regMax = np.array([0, 0, []])
    regularization = np.array([0, 0, [], 0, 0])

    for t in trainSizes:
        cMax, cVals = crossValidation(gseData, gseClass, t)
        regMax = np.vstack((regMax, cMax))
        regularization = np.vstack((regularization, cVals))
    regMax = regMax[1:, :]
    regularization = regularization[1:, :]
    plotRegularization(regularization, trainSizes)
    geneMax(regMax[3], geneIndex)
    print(1)
Example #32
def make_meta(gse_name):
    gse = GEOparse.get_GEO(geo=gse_name)
    gsms = list(gse.gsms.keys())
    gsms.sort()

    meta = [{
        "gsm": gsm,
        "sra": gse.gsms[gsm].relations["SRA"][0].split("=")[1],
        "title": gse.gsms[gsm].metadata["title"][0]
    } for gsm in gsms]
    return (meta)
Example #33
def get_value_from_sample_by_ids(gsm, id_refs):

    gsm = GEOparse.get_GEO(geo=gsm, destdir=cache)

    #print(gsm.table["ID_REF"])

    data = gsm.table

    values = data.loc[data["ID_REF"].isin(id_refs)].to_dict("records")

    return values
Example #34
 def test_annotate(self):
     gse = GEO.get_GEO(filepath=join(download_geo, "soft_ex_family.txt"), geotype="GSE")
     gsm = gse.gsms["Triple-Fusion Transfected Embryonic Stem Cells Replicate 1"]
     result = read_table(join(download_geo, "test_gsm_annotated.tab"))
     gpl = gse.gpls[next(iter(gse.gpls))]
     assert_frame_equal(result, gsm.annotate(gpl, annotation_column="GB_ACC"))
     assert_frame_equal(result, gsm.annotate(gpl.table, annotation_column="GB_ACC"))
     with self.assertRaises(TypeError):
         gsm.annotate("platform", annotation_column="GB_ACC")
     gsm.annotate(gpl.table, annotation_column="GB_ACC", in_place=True)
     assert_frame_equal(result, gsm.table)
Example #35
 def test_merge_and_average(self):
     gse = GEO.get_GEO(filepath=join(download_geo, "soft_ex_family.txt"), geotype="GSE")
     result = read_table(join(download_geo, "test_merged_by_id_and_averaged_by_gb_acc.tab"), index_col=0)
     result = result.loc[sorted(result.index), sorted(result.columns)]  # gse.gsms is a dict so the columns might be in different order
     merged = gse.merge_and_average(gse.gpls[next(iter(gse.gpls))], "VALUE", "GB_ACC", gpl_on="ID", gsm_on="ID_REF")
     merged = merged[sorted(merged.columns)]  # gse.gsms is a dict so the columns might be in different order
     assert_frame_equal(merged, result)
     with self.assertRaises(KeyError):
         gse.merge_and_average("platform", "VALUE", "GB_ACC", gpl_on="ID", gsm_on="ID_REF")
     with self.assertRaises(ValueError):
         gse.merge_and_average(["platform"], "VALUE", "GB_ACC", gpl_on="ID", gsm_on="ID_REF")
Example #36
 def on_click():
     genes = genesEdit.text().split(',')
     gse = GEOparse.get_GEO(geo='GSE' + str(gseEdit.text()), destdir='./')
     for gsm_name, gsm in gse.gsms.items():
         print("Name: ", gsm_name)
         print("Metadata:", )
         for key, value in gsm.metadata.items():
             print(" - %s : %s" % (key, ", ".join(value)))
         print("Table data:", )
         print(gsm.table.head())
         break
Example #37
 def test_pivot_and_annotate(self):
     gse = GEO.get_GEO(filepath=join(download_geo, "soft_ex_family.txt"), geotype="GSE")
     gpl = gse.gpls[next(iter(gse.gpls))]
     result = read_table(join(download_geo, "test_sample_pivoted_by_value_and_annotated_by_gbacc.tab"), index_col=0)
     result.columns.name = 'name'
     pivoted = gse.pivot_and_annotate(values="VALUE", gpl=gpl, annotation_column="GB_ACC")
     assert_frame_equal(result, pivoted)
     assert_frame_equal(gse.pivot_and_annotate(values="VALUE", gpl=gpl.table, annotation_column="GB_ACC"),
                        result)
     with self.assertRaises(TypeError):
         gse.pivot_and_annotate(values="VALUE", gpl="gpl", annotation_column="GB_ACC")
Example #38
def main(args):
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[logging.FileHandler("../logs/report.log")])
    logging.info(args)

    utils.create_dir_if_not_exist(args.out_expr_dir)
    utils.create_dir_if_not_exist(join(args.out_expr_dir, 'raw'))
    utils.create_dir_if_not_exist(join(args.out_expr_dir, 'processed'))

    gse = GEOparse.get_GEO(geo='GSE54837',
                           destdir=join(args.out_expr_dir, 'raw'))
    annotated = gse.pivot_and_annotate('VALUE', gse.gpls['GPL570'],
                                       'ENTREZ_GENE_ID')
    annotated2 = annotated[~pd.isnull(annotated.ENTREZ_GENE_ID)]
    annotated2 = annotated2.loc[~annotated2.isnull().values.all(axis=1)]
    annotated2['ENTREZ_GENE_ID'] = annotated2.ENTREZ_GENE_ID.str.split(
        '///').str[0].astype(int)
    annotated2 = annotated2.set_index('ENTREZ_GENE_ID')
    annotated2[annotated2 <= 0] = 0.001
    annotated2 = np.log(annotated2)
    disease_cls = ['subject type: COPD Subjects']
    healthy_cls = [
        'subject type: Non-smoker Controls', 'subject type: Smoker Controls'
    ]
    healthy_non_smoker_cls = ['subject type: Non-smoker Controls']
    logging.info(disease_cls)
    logging.info(healthy_cls)
    logging.info(healthy_non_smoker_cls)
    disease_gsm = [
        gsm for gsm in gse.gsms
        if gse.gsms[gsm].metadata['characteristics_ch1'][5] in disease_cls
    ]
    healthy_gsm = [
        gsm for gsm in gse.gsms
        if gse.gsms[gsm].metadata['characteristics_ch1'][5] in healthy_cls
    ]
    healthy_non_smoker_gsm = [
        gsm for gsm in gse.gsms
        if gse.gsms[gsm].metadata['characteristics_ch1'][5] in
        healthy_non_smoker_cls
    ]
    logging.info(
        "Disease GSM: {}, Healthy GSM: {}, Healthy non smoker GSM: {}".format(
            len(disease_gsm), len(healthy_gsm), len(healthy_non_smoker_gsm)))
    utils.create_dir_if_not_exist(args.out_expr_dir)
    utils.write_expr(join(args.out_expr_dir, 'processed', 'expr.tsv'),
                     annotated2)
    utils.write_text(join(args.out_expr_dir, 'processed', 'disease_gsms.txt'),
                     disease_gsm)
    utils.write_text(join(args.out_expr_dir, 'processed', 'healthy_gsms.txt'),
                     healthy_gsm)
Example #39
 def test_soft_format_gse(self):
     print(download_geo)
     gse = GEO.get_GEO(geo="GSE1563", destdir=download_geo)
     self.assertTrue(isinstance(gse, GSE))
     self.assertEqual(gse.get_accession(), "GSE1563")
     self.assertEqual(len(gse.gsms.keys()), 62)
     self.assertEqual(len(gse.gpls.keys()), 1)
     self.assertEqual(len(gse.gpls[next(iter(gse.gpls))].table.index), 12625)
     self.assertEqual(len(gse.gsms[next(iter(gse.gsms))].table.index), 12625)
     for gsm_name, gsm in gse.gsms.items():
         self.assertEqual(len(gsm.table.index), 12625)
         self.assertTrue(isinstance(gsm, GSM))
     for gpl_name, gpl in gse.gpls.items():
         self.assertEqual(len(gpl.table.index), 12625)
         self.assertTrue(isinstance(gpl, GPL))
Example #40
    def test_get_geo_gpl_partially(self):
        partial = [
            "GSM1662787",
            "GSM1662789",
            "GSM1662791",
            "GSM1859499"
        ]

        gpl = GEO.get_GEO(geo="GPL20082", destdir=download_geo,
                          include_data=True, partial=partial)
        self.assertTrue(isinstance(gpl, GPL))
        self.assertEqual(gpl.get_accession(), "GPL20082")

        for gsm in gpl.gsms:
            self.assertTrue(gsm in partial)

        self.assertEqual(4, len(gpl.gsms))
Example #41
    def test_download_SRA(self):

        gse = GEO.get_GEO(geo="GSE1563", destdir=download_geo)
        self.assertTrue(isinstance(gse, GSE))
        self.assertEqual(gse.get_accession(), "GSE1563")
        self.assertEqual(len(gse.gsms.keys()), 62)
        self.assertEqual(len(gse.gpls.keys()), 1)
        self.assertEqual(len(gse.gpls[next(iter(gse.gpls))].table.index),
                         12625)
        self.assertEqual(len(gse.gsms[next(iter(gse.gsms))].table.index),
                         12625)
        for gsm_name, gsm in iteritems(gse.gsms):
            self.assertEqual(len(gsm.table.index), 12625)
            self.assertTrue(isinstance(gsm, GSM))
        for gpl_name, gpl in iteritems(gse.gpls):
            self.assertEqual(len(gpl.table.index), 12625)
            self.assertTrue(isinstance(gpl, GPL))
Example #42
    def test_get_geo_gpl_sequencing(self):
        gpl = GEO.get_GEO(geo="GPL20082", destdir=download_geo, include_data=True)
        self.assertTrue(isinstance(gpl, GPL))
        self.assertEqual(gpl.get_accession(), "GPL20082")
        
        samples = [
            "GSM1662787",
            "GSM1662788",
            "GSM1662789",
            "GSM1662790",
            "GSM1662791",
            "GSM1677167",
            "GSM1859499",
            "GSM1875285",
        ]

        for sample in samples:
            self.assertTrue(sample in gpl.gsms)
   
        self.assertEqual(6, len(gpl.gses["GSE68087"].gsms))
        self.assertEqual(2, len(gpl.gses["GSE67974"].gsms))
Example #43
def GSEA(geo_ID, gene_list):
    gse = GEOparse.get_GEO(geo=geo_ID, destdir="./")
    expression = gse.pivot_samples('VALUE').T
    experiments = {}
    for i, (idx, row) in enumerate(gse.phenotype_data.iterrows()):
        tmp = {}
        tmp["Type"] = 1 if "control" in row["description"] else 0
        experiments[i] = tmp
    experiments = pd.DataFrame(experiments).T
    counter = 0
    all_genes_set = []
    all_corr_set = []
    genes_corr_set = []
    for gene in expression:
        counter += 1
        if counter <= 3:
            continue
        all_genes_set.append(gene)               
        corr_matrix = np.corrcoef([list(experiments['Type']), list(expression[gene])])
        all_corr_set.append(corr_matrix[0,1])
        if gene in gene_list:
            genes_corr_set.append(corr_matrix[0,1])
    p_value = ks_2samp(genes_corr_set, all_corr_set)[1]
    return(str(p_value))
Example #44
 def test_no_table(self):
     try:
         gsm = GEO.get_GEO(filepath=join(download_geo, 'GSM2795971.txt'),
                           geotype='GSM')
     except Exception:
         self.fail("Parsing a GSM file without a data table raised an error.")
Example #45
def download(geo_accession):
    if not os.path.exists("../../data/geo/"):
        os.makedirs("../../data/geo/")
    gse = GEOparse.get_GEO(geo=geo_accession, destdir="../../data/geo/")
    return gse
Example #46
 def test_empty_line(self):
     try:
         gsm = GEO.get_GEO(filepath=join(download_geo, 'GSM32878.txt'),
                           geotype='GSM')
     except IndexError:
         self.fail("Empty line in the file causes an error.")
Example #47
 def test_name(self):
     gpl = GEO.get_GEO(filepath=join(download_geo, "GPL20814_family.soft"),
                       geotype="GPL")
     self.assertEqual(gpl.name, "GPL20814")
Example #48
 def test_name(self):
     gse = GEO.get_GEO(filepath=join(download_geo, "GSE105845_family.soft"),
                       geotype="GSE")
     self.assertEqual(gse.name, "GSE105845")