Example #1
def test_teetsv_unicode():

    t1 = ((u"name", u"id"), (u"Արամ Խաչատրյան", 1), (u"Johann Strauß", 2), (u"Вагиф Сәмәдоғлу", 3), (u"章子怡", 4))

    f1 = NamedTemporaryFile(delete=False)
    f2 = NamedTemporaryFile(delete=False)

    (etl.wrap(t1).teetsv(f1.name, encoding="utf-8").selectgt("id", 1).totsv(f2.name, encoding="utf-8"))

    ieq(t1, etl.fromtsv(f1.name, encoding="utf-8").convertnumbers())
    ieq(etl.wrap(t1).selectgt("id", 1), etl.fromtsv(f2.name, encoding="utf-8").convertnumbers())
Example #2
def test_teetsv():

    t1 = (("foo", "bar"), ("a", 2), ("b", 1), ("c", 3))

    f1 = NamedTemporaryFile(delete=False)
    f2 = NamedTemporaryFile(delete=False)

    (etl.wrap(t1).teetsv(f1.name, encoding="ascii").selectgt("bar", 1).totsv(f2.name, encoding="ascii"))

    ieq(t1, etl.fromtsv(f1.name, encoding="ascii").convertnumbers())
    ieq(etl.wrap(t1).selectgt("bar", 1), etl.fromtsv(f2.name, encoding="ascii").convertnumbers())
Example #3
def test_teetsv():

    t1 = (('foo', 'bar'), ('a', 2), ('b', 1), ('c', 3))

    f1 = NamedTemporaryFile(delete=False)
    f2 = NamedTemporaryFile(delete=False)

    (etl.wrap(t1)
     .teetsv(f1.name, encoding='ascii')
     .selectgt('bar', 1)
     .totsv(f2.name, encoding='ascii'))

    ieq(t1, etl.fromtsv(f1.name, encoding='ascii').convertnumbers())
    ieq(
        etl.wrap(t1).selectgt('bar', 1),
        etl.fromtsv(f2.name, encoding='ascii').convertnumbers())
Example #4
def init(release_dir):
    """Initialise data resources.

    Parameters
    ----------
    release_dir : string
        Local filesystem path where data from the release are stored.

    """

    # variation
    ###########

    global callset, callset_pass
    variation_dir = os.path.join(release_dir, 'variation')

    # main callset
    callset_zarr_fn = os.path.join(variation_dir, 'main', 'zarr2',
                                   'ag1000g.phase1.ar3')
    if os.path.exists(callset_zarr_fn):
        callset = zarr.open_group(callset_zarr_fn, mode='r')

    # main callset, PASS variants only
    callset_pass_zarr_fn = os.path.join(variation_dir, 'main', 'zarr2',
                                        'ag1000g.phase1.ar3.pass')
    if os.path.exists(callset_pass_zarr_fn):
        callset_pass = zarr.open_group(callset_pass_zarr_fn, mode='r')

    # haplotypes
    ############

    global callset_phased, tbl_haplotypes, lkp_haplotypes, df_haplotypes
    haplotypes_dir = os.path.join(release_dir, 'haplotypes')

    # try HDF5 first
    callset_phased_h5_fn = os.path.join(haplotypes_dir, 'main', 'hdf5',
                                        'ag1000g.phase1.ar3.1.haplotypes.h5')
    if os.path.exists(callset_phased_h5_fn):
        callset_phased = h5py.File(callset_phased_h5_fn, mode='r')

    # prefer Zarr if available
    # N.B., the Zarr data is not consistent with HDF5 or shapeit outputs,
    # it is based on a previous phasing run.
    #
    #callset_phased_zarr_fn = os.path.join(haplotypes_dir, 'main', 'zarr2',
    #                                      'ag1000g.phase1.ar3.1.haplotypes')
    #if os.path.exists(callset_phased_zarr_fn):
    #    callset_phased = zarr.open_group(callset_phased_zarr_fn, mode='r')

    # haplotypes metadata
    haplotypes_fn = os.path.join(haplotypes_dir, 'haplotypes.meta.txt')
    if os.path.exists(haplotypes_fn):
        tbl_haplotypes = (etl.fromtsv(haplotypes_fn).convert(
            ('index', 'kt_2la', 'kt_2rb'), int))
        lkp_haplotypes = tbl_haplotypes.recordlookupone('label')
        df_haplotypes = pandas.read_csv(haplotypes_fn,
                                        sep='\t',
                                        index_col='index')
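A minimal usage sketch of the function above, assuming it lives in a module that defines the globals it assigns (the release path is a placeholder, not from the original):

init('/data/ag1000g/phase1/AR3')  # placeholder path to the release directory

# when the corresponding files exist on disk, the module-level globals are populated, e.g.:
print(callset_pass)          # zarr group of PASS variants
print(df_haplotypes.head())  # pandas DataFrame of haplotype metadata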
Example #5
    def get_table(source=None,
                  nrows=None,
                  skip=None,
                  fields=None,
                  exclude=None,
                  rownumbers=True,
                  **petlargs):
        """
        :param source: full path filename of the delimited file
        :param nrows: number of rows to include in the table
        :param skip: number of rows to skip from the file
        :param fields: selected fields to extract from the file
        :param exclude: selected fields to be excluded from the file
        :param rownumbers: Add a rowID column. This is True by default
        This is similar to pandas.RangeIndex see petl.transform.basics.addrownumbers()
        Notice: skip and nrows parameters require that addrownumbers() is applied to petl table
        If `fields` is specified and `rowID` is not included in the list the column will not be included in petl table
        :param petlargs: see petl.io.csv.fromcsv and petl.io.csv.fromtsv
        :return: petl table container
        Notice: petl makes extensive use of lazy evaluation and iterators
        the file is not loaded in memory instead a container/iterator is returned

        Examples:
        etl.get_table('movies.csv', 20, 100, ['rowID', 'movie_title', 'title_year']).lookall()
        etl.get_table('movies.csv', 20, 100, exclude=['language', 'actor_1_name']).lookall()
        """

        # Get the extension of the filename
        table = None
        if source:
            ext = os.path.splitext(source)[1]

            # Create a lazy petl table container from the delimited file
            if ext == '.csv':
                table = petl.fromcsv(source, **petlargs)
            elif ext == '.tsv':
                table = petl.fromtsv(source, **petlargs)

            if rownumbers:
                table = table.addrownumbers(start=1, step=1, field='rowID')

            if skip and rownumbers:
                if nrows:
                    table = table.select(
                        lambda num: num.rowID > skip and num.rowID <= nrows + skip)
                else:
                    table = table.select(lambda num: num.rowID > skip)

            if not skip and nrows and rownumbers:
                table = table.select(lambda num: num.rowID <= nrows)

            if fields:
                table = table.cut(*fields)

            if exclude:
                table = table.cutout(*exclude)

        return table
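For reference, a sketch of the row-window filtering that the skip and nrows parameters implement, written directly against petl (the file and field names below are placeholders, not from the original):

import petl

nrows, skip = 20, 100

# roughly equivalent to get_table('movies.csv', nrows=20, skip=100, fields=[...])
table = (petl.fromcsv('movies.csv')                        # placeholder file name
             .addrownumbers(start=1, step=1, field='rowID')
             .select(lambda rec: skip < rec.rowID <= nrows + skip)
             .cut('rowID', 'movie_title', 'title_year'))   # placeholder fields

# evaluation is lazy: the file is only read when the table is iterated
print(table.nrows())   # at most 20 rows (rowID 101..120)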
Example #6
    def get_file_header(ftype, fname):
        ext = os.path.splitext(fname)[1][1:]
        if not ftype.lower() == ext:
            raise Exception(
                f'Failed: Filename extension does not match < ftype={ftype} >')

        if ftype == 'CSV':
            return petl.fromcsv(fname).head(0).tol()[0]
        elif ftype == 'TSV':
            return petl.fromtsv(fname).head(0).tol()[0]
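As a usage note, petl can also return the header row directly; a minimal sketch assuming a tab-delimited file named data.tsv:

import petl

# equivalent to get_file_header('TSV', 'data.tsv'): petl.header() reads only the
# header row and returns it as a tuple
print(list(petl.header(petl.fromtsv('data.tsv'))))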
Example #7
def test_ZipSource():

    # setup
    table = [('foo', 'bar'), ('a', '1'), ('b', '2')]
    totsv(table, 'tmp/issue_241.tsv')
    z = zipfile.ZipFile('tmp/issue_241.zip', mode='w')
    z.write('tmp/issue_241.tsv', 'data.tsv')
    z.close()

    # test
    actual = fromtsv(ZipSource('tmp/issue_241.zip', 'data.tsv'))
    ieq(table, actual)
Example #9
def test_fromtsv():

    f = NamedTemporaryFile(mode="w", delete=False, newline="")  # text mode for csv.writer (Python 3)
    writer = csv.writer(f, delimiter="\t")
    table = (("foo", "bar"), ("a", 1), ("b", 2), ("c", 2))
    for row in table:
        writer.writerow(row)
    f.close()

    actual = fromtsv(f.name)
    expect = (("foo", "bar"), ("a", "1"), ("b", "2"), ("c", "2"))
    ieq(expect, actual)
    ieq(expect, actual)  # verify can iterate twice
Example #10
def test_issue_231():

    table = [['foo', 'bar'], ['a', '1'], ['b', '2']]
    t = cut(table, 'foo')
    totsv(t, 'tmp/issue_231.tsv')
    u = fromtsv('tmp/issue_231.tsv')
    ieq(t, u)
    tocsv(t, 'tmp/issue_231.csv')
    u = fromcsv('tmp/issue_231.csv')
    ieq(t, u)
    topickle(t, 'tmp/issue_231.pickle')
    u = frompickle('tmp/issue_231.pickle')
    ieq(t, u)
Example #12
def test_fromtsv():

    f = NamedTemporaryFile(mode='w', delete=False, newline='')  # text mode for csv.writer (Python 3)
    writer = csv.writer(f, delimiter='\t')
    table = (('foo', 'bar'), ('a', 1), ('b', 2), ('c', 2))
    for row in table:
        writer.writerow(row)
    f.close()

    actual = fromtsv(f.name)
    expect = (('foo', 'bar'), ('a', '1'), ('b', '2'), ('c', '2'))
    ieq(expect, actual)
    ieq(expect, actual)  # verify can iterate twice
Example #13
def test_teetsv_unicode():

    t1 = (
        (u'name', u'id'),
        (u'Արամ Խաչատրյան', 1),
        (u'Johann Strauß', 2),
        (u'Вагиф Сәмәдоғлу', 3),
        (u'章子怡', 4),
    )

    f1 = NamedTemporaryFile(delete=False)
    f2 = NamedTemporaryFile(delete=False)

    (etl.wrap(t1)
     .teetsv(f1.name, encoding='utf-8')
     .selectgt('id', 1)
     .totsv(f2.name, encoding='utf-8'))

    ieq(t1, etl.fromtsv(f1.name, encoding='utf-8').convertnumbers())
    ieq(
        etl.wrap(t1).selectgt('id', 1),
        etl.fromtsv(f2.name, encoding='utf-8').convertnumbers())
Example #14
def test_zipsource():

    # setup
    tbl = [('foo', 'bar'), ('a', '1'), ('b', '2')]
    fn_tsv = NamedTemporaryFile().name
    etl.totsv(tbl, fn_tsv)
    fn_zip = NamedTemporaryFile().name
    z = zipfile.ZipFile(fn_zip, mode='w')
    z.write(fn_tsv, 'data.tsv')
    z.close()

    # test
    actual = etl.fromtsv(ZipSource(fn_zip, 'data.tsv'))
    ieq(tbl, actual)
Example #16
def test_fromtsv():
    
    f = NamedTemporaryFile(mode='w', delete=False, newline='')  # text mode for csv.writer (Python 3)
    writer = csv.writer(f, delimiter='\t')
    table = (('foo', 'bar'),
             ('a', 1),
             ('b', 2),
             ('c', 2))
    for row in table:
        writer.writerow(row)
    f.close()
    
    actual = fromtsv(f.name)
    expect = (('foo', 'bar'),
              ('a', '1'),
              ('b', '2'),
              ('c', '2'))
    ieq(expect, actual)
    ieq(expect, actual) # verify can iterate twice
Example #17
def xls_tidy(xls, qvalue):
    d = etl.fromtsv(xls)
    sd = etl.select(d, lambda x: float(x.PepQValue) <= float(qvalue))
    psmsummary = sd

    ssd = etl.cut(sd, 'Peptide', 'Protein', 'PepQValue')
    # remove the modification info from the peptide
    ssd = etl.transform.regex.sub(ssd, 'Peptide', r'^[\w-]\.(.+)\.[\w-]$', r'\1')
    ssd = etl.transform.regex.sub(ssd, 'Peptide', r'[\d\.\+]+', r'')

    aggregation = OrderedDict()
    aggregation['SpecCount'] = len
    cssd = etl.aggregate(ssd, 'Peptide', aggregation)

    fssd = etl.groupselectfirst(ssd, key=('Peptide', 'Protein', 'PepQValue'))
    aggregation = OrderedDict()
    aggregation['Protein'] = 'Protein', etl.strjoin(';')
    aggregation['PepQValue'] = 'PepQValue', etl.strjoin(';')
    assd = etl.aggregate(fssd, 'Peptide', aggregation)
    pepsummary = etl.join(assd, cssd, key='Peptide')

    return (psmsummary, pepsummary)
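For reference, a small sketch of what the two regular-expression substitutions above do to a single peptide string, applied directly with Python's re module (the peptide value is a made-up example):

import re

peptide = 'K.AAA+15.995BBB.R'   # hypothetical peptide with flanking residues and a modification mass

# strip the flanking residues ('K.' prefix and '.R' suffix)
stripped = re.sub(r'^[\w-]\.(.+)\.[\w-]$', r'\1', peptide)   # 'AAA+15.995BBB'

# remove modification masses (runs of digits, dots and plus signs)
clean = re.sub(r'[\d\.\+]+', '', stripped)                   # 'AAABBB'

print(stripped, clean)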
Example #18
def condense(path=".", use_tsv=True):
    """
    Couldn't be more pleased with this thing.
    
    utility method for moi - 
    all I want it to do is generate a bunch of PETL
    objects that will allow me to extract columns.
    
    Should take some of the output in such a way that I
    can populate a database with patterns for rapid 
    matching later.
    
    I do want to add my scoring mechanism in to the 
    row/column generator when I can.
    """
    objects = []
    # I'll use os.listdir(path) to try and seek everything
    try:
        x = os.listdir(path)
        # get the state at the start.
        log_output(x)  # make a note of what was found.
        for a in x:
            pair = []
            try:
                # os.listdir() returns bare file names, so join with the search path
                if use_tsv: pair.append(petl.fromtsv(os.path.join(path, a)))
                else: pair.append(petl.fromcsv(os.path.join(path, a)))
                pair.append(os.path.basename(a))
                log_output("Added petl object for %s" % (a))
                objects.append(tuple(pair))
            except Exception as ECHO:
                log_output(ECHO, "./error_log.log")
                log_output("Exception has occurred: %s" % (ECHO))
    except Exception as eddy:
        log_output(eddy, "./error_log.log")
        log_output("Exception has occurred: %s" % (eddy))
    return objects
Example #19
def init(release_dir, load_geneset=False):
    """Initialise data resources.

    Parameters
    ----------
    release_dir : string
        Local filesystem path where data from the release are stored.
    load_geneset : bool
        If True, load geneset into memory.

    """

    # reference sequence
    ####################

    global genome_fn, genome
    genome_dir = os.path.join(release_dir, 'genome')
    genome_fn = os.path.join(genome_dir,
                             'Anopheles-gambiae-PEST_CHROMOSOMES_AgamP3.fa')
    if os.path.exists(genome_fn):
        genome = pyfasta.Fasta(genome_fn)

    # genome annotations
    ####################

    global geneset_agamp42_fn, geneset_agamp42
    geneset_dir = os.path.join(release_dir, 'geneset')
    geneset_agamp42_fn = os.path.join(
        geneset_dir,
        'Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.2.sorted.gff3.gz')
    if os.path.exists(geneset_agamp42_fn) and load_geneset:
        geneset_agamp42 = allel.FeatureTable.from_gff3(geneset_agamp42_fn)

    # variant callsets
    ##################

    global callset, callset_pass
    variation_dir = os.path.join(release_dir, 'variation')

    # main callset
    callset_h5_fn = os.path.join(variation_dir, 'main', 'hdf5',
                                 'ag1000g.phase1.ar3.h5')
    if os.path.exists(callset_h5_fn):
        callset = h5py.File(callset_h5_fn, mode='r')

    # main callset, PASS variants only
    callset_pass_h5_fn = os.path.join(variation_dir, 'main', 'hdf5',
                                      'ag1000g.phase1.ar3.pass.h5')
    if os.path.exists(callset_pass_h5_fn):
        callset_pass = h5py.File(callset_pass_h5_fn, mode='r')

    # accessibility
    ###############

    global accessibility
    accessibility_dir = os.path.join(release_dir, 'accessibility')
    accessibility_fn = os.path.join(accessibility_dir, 'accessibility.h5')
    if os.path.exists(accessibility_fn):
        accessibility = h5py.File(accessibility_fn, mode='r')

    # sample metadata
    #################

    global samples_fn, tbl_samples, lkp_samples, sample_ids, df_samples
    samples_dir = os.path.join(release_dir, 'samples')
    samples_fn = os.path.join(samples_dir, 'samples.all.txt')
    if os.path.exists(samples_fn):
        tbl_samples = (etl.fromtsv(samples_fn).convert(
            ('index', 'year', 'n_sequences', 'kt_2la', 'kt_2rb'), int).convert(
                ('mean_coverage', 'latitude', 'longitude') +
                tuple(range(20, 36)), float))
        lkp_samples = tbl_samples.recordlookupone('ox_code')
        sample_ids = tbl_samples.values('ox_code').list()
        df_samples = pandas.read_csv(samples_fn, sep='\t', index_col='index')

    # extras
    ########

    global allele_counts, allele_counts_gq10, outgroup_alleles, outgroup_allele_counts, \
        outgroup_species
    extras_dir = os.path.join(release_dir, 'extras')

    # allele counts
    allele_counts_fn = os.path.join(extras_dir, 'allele_counts.h5')
    if os.path.exists(allele_counts_fn):
        allele_counts = h5py.File(allele_counts_fn, mode='r')
    allele_counts_gq10_fn = os.path.join(extras_dir, 'allele_counts.gq10.h5')
    if os.path.exists(allele_counts_gq10_fn):
        allele_counts_gq10 = h5py.File(allele_counts_gq10_fn, mode='r')

    # outgroup data
    outgroup_species = 'arab', 'meru', 'mela', 'quad', 'epir', 'chri'
    outgroup_alleles_fn = os.path.join(extras_dir, 'outgroup_alleles.h5')
    if os.path.exists(outgroup_alleles_fn):
        outgroup_alleles = h5py.File(outgroup_alleles_fn, mode='r')
    outgroup_allele_counts_fn = os.path.join(extras_dir,
                                             'outgroup_allele_counts.h5')
    if os.path.exists(outgroup_allele_counts_fn):
        outgroup_allele_counts = h5py.File(outgroup_allele_counts_fn, mode='r')
Example #20
def fromgff3(filename, region=None):
    """
    Extract feature rows from a GFF3 file, e.g.::

        >>> import petl as etl
        >>> # activate bio extensions
        ... import petlx.bio
        >>> table1 = etl.fromgff3('fixture/sample.gff')
        >>> table1.look(truncate=30)
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | seqid        | source  | type          | start | end     | score | strand | phase | attributes                     |
        +==============+=========+===============+=======+=========+=======+========+=======+================================+
        | 'apidb|MAL1' | 'ApiDB' | 'supercontig' |     1 |  643292 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL2' | 'ApiDB' | 'supercontig' |     1 |  947102 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL3' | 'ApiDB' | 'supercontig' |     1 | 1060087 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL4' | 'ApiDB' | 'supercontig' |     1 | 1204112 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'supercontig' |     1 | 1343552 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        ...

    A region query string of the form '[seqid]' or '[seqid]:[start]-[end]'
    may be given for the `region` argument. If given, requires the GFF3
    file to be position sorted, bgzipped and tabix indexed. Requires pysam to be
    installed. E.g.::

        >>> # extract from a specific genome region via tabix
        ... table2 = etl.fromgff3('fixture/sample.sorted.gff.gz',
        ...                       region='apidb|MAL5:1289593-1289595')
        >>> table2.look(truncate=30)
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | seqid        | source  | type          | start   | end     | score | strand | phase | attributes                     |
        +==============+=========+===============+=========+=========+=======+========+=======+================================+
        | 'apidb|MAL5' | 'ApiDB' | 'supercontig' |       1 | 1343552 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'exon'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'size': '2092', 'Parent': 'ap |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'gene'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'ID': 'apidb|MAL5_18S', 'web_ |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'rRNA'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'ID': 'apidb|rna_MAL5_18S-1', |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+

    """

    if region is None:

        # parse file as tab-delimited
        table = etl.fromtsv(filename)

    else:

        # extract via tabix
        table = etl.fromtabix(filename, region=region)

    return (
        table
        .pushheader(GFF3_HEADER)
        .skipcomments('#')
        # ignore any row not 9 values long (e.g., trailing fasta)
        .rowlenselect(9)
        # parse attributes into a dict
        .convert('attributes', gff3_parse_attributes)
        # parse coordinates
        .convert(('start', 'end'), int)
    )
Example #21
import petl as etl
readFile = etl.fromtsv("donedeal_data_sample.tsv")
tmpTable = etl.addfield(readFile, 'InKms', lambda rec: rec['mileage'])
tmpTable2File = etl.convert(tmpTable,
                            'InKms',
                            lambda v: int(float(v) * 1.6),
                            where=lambda r: r.mileageType == 'miles')
etl.totsv(tmpTable2File, 'donedeal_inKms.tsv')
Example #22
import petl as etl
table1 = etl.fromtsv(r"D:\JOB\BI_Developer_Challenge\donedeal_data_sample.tsv")

table2 = etl.convert(table1, 'mileage', float)
table3 = etl.convert(table2,
                     'mileage',
                     lambda v: v * 1.60934,
                     where=lambda r: r.mileageType == 'miles')
table4 = etl.convert(table3,
                     'mileageType',
                     lambda v: 'km',
                     where=lambda r: r.mileageType in ('miles', 'kilometres'))
table5 = etl.convert(table4,
                     'mileageType',
                     lambda v: 'NA',
                     where=lambda r: r.mileageType not in ('km',))

etl.totsv(table5, r"D:\JOB\BI_Developer_Challenge\donedeal_data_etl.tsv")
Example #23
def init(release_dir, load_geneset=False, geneset_attributes=None):
    """Initialise data resources.

    Parameters
    ----------
    release_dir : string
        Local filesystem path where data from the release are stored.
    load_geneset : bool
        If True, load geneset into memory.
    geneset_attributes : dict-like
        Attributes to load.

    """

    # reference sequence
    ####################

    global genome_agamp3, genome_agamp4, genome_dir
    genome_dir = os.path.join(release_dir, 'genome')
    genome_agamp3_dir = os.path.join(genome_dir, 'agamP3')
    genome_agamp3_fn = os.path.join(
        genome_agamp3_dir, 'Anopheles-gambiae-PEST_CHROMOSOMES_AgamP3.fa')
    if os.path.exists(genome_agamp3_fn):
        genome_agamp3 = pyfasta.Fasta(genome_agamp3_fn,
                                      key_fn=lambda v: v.split()[0])
    genome_agamp4_dir = os.path.join(genome_dir, 'agamP4')
    genome_agamp4_fn = os.path.join(
        genome_agamp4_dir, 'Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa')
    if os.path.exists(genome_agamp4_fn):
        genome_agamp4 = pyfasta.Fasta(genome_agamp4_fn,
                                      key_fn=lambda v: v.split()[0])

    # genome annotations
    ####################

    global geneset_agamp44_fn, geneset_agamp44, geneset_dir
    geneset_dir = os.path.join(release_dir, 'geneset')
    geneset_agamp44_fn = os.path.join(
        geneset_dir,
        'Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.4.sorted.gff3.gz')
    if load_geneset:
        geneset_agamp44 = allel.FeatureTable.from_gff3(
            geneset_agamp44_fn, attributes=geneset_attributes)

    # variant callsets
    ##################

    global callset, callset_pass, callset_pass_biallelic, variation_dir, \
        callset_snpeff_agamp42
    variation_dir = os.path.join(release_dir, 'variation')

    # main callset
    callset_h5_fn = os.path.join(variation_dir, 'main', 'hdf5', 'all',
                                 'ag1000g.phase2.ar1.h5')
    callset_lite_h5_fn = os.path.join(variation_dir, 'main', 'hdf5', 'lite',
                                      'ag1000g.phase2.ar1.lite.h5')
    callset_zarr_fn = os.path.join(variation_dir, 'main', 'zarr', 'all',
                                   'ag1000g.phase2.ar1')

    # preference: zarr > hdf5 > hdf5 (lite)
    if os.path.exists(callset_zarr_fn):
        callset = zarr.open_group(callset_zarr_fn, mode='r')
    elif os.path.exists(callset_h5_fn):
        callset = h5py.File(callset_h5_fn, mode='r')
    elif os.path.exists(callset_lite_h5_fn):
        callset = h5py.File(callset_lite_h5_fn, mode='r')

    # main callset, PASS variants only
    callset_pass_h5_fn = os.path.join(variation_dir, 'main', 'hdf5', 'pass',
                                      'ag1000g.phase2.ar1.pass.h5')
    callset_pass_lite_h5_fn = os.path.join(variation_dir, 'main', 'hdf5',
                                           'lite',
                                           'ag1000g.phase2.ar1.pass.lite.h5')
    callset_pass_zarr_fn = os.path.join(variation_dir, 'main', 'zarr', 'pass',
                                        'ag1000g.phase2.ar1.pass')

    # preference: zarr > hdf5 > hdf5 (lite)
    if os.path.exists(callset_pass_zarr_fn):
        callset_pass = zarr.open_group(callset_pass_zarr_fn, mode='r')
    elif os.path.exists(callset_pass_h5_fn):
        callset_pass = h5py.File(callset_pass_h5_fn, mode='r')
    elif os.path.exists(callset_pass_lite_h5_fn):
        callset_pass = h5py.File(callset_pass_lite_h5_fn, mode='r')

    # main callset, PASS biallelic variants only
    callset_pass_biallelic_h5_fn = os.path.join(
        variation_dir, 'main', 'hdf5', 'biallelic',
        'ag1000g.phase2.ar1.pass.biallelic.h5')
    callset_pass_biallelic_lite_h5_fn = os.path.join(
        variation_dir, 'main', 'hdf5', 'lite',
        'ag1000g.phase2.ar1.pass.biallelic.lite.h5')
    callset_pass_biallelic_zarr_fn = os.path.join(
        variation_dir, 'main', 'zarr', 'biallelic',
        'ag1000g.phase2.ar1.pass.biallelic')

    # preference: zarr > hdf5 > hdf5 (lite)
    if os.path.exists(callset_pass_biallelic_zarr_fn):
        callset_pass_biallelic = zarr.open_group(
            callset_pass_biallelic_zarr_fn, mode='r')
    elif os.path.exists(callset_pass_biallelic_h5_fn):
        callset_pass_biallelic = h5py.File(callset_pass_biallelic_h5_fn,
                                           mode='r')
    elif os.path.exists(callset_pass_biallelic_lite_h5_fn):
        callset_pass_biallelic = h5py.File(callset_pass_biallelic_lite_h5_fn,
                                           mode='r')

    # SNPEFF annotations
    callset_snpeff_agamp42_h5_fn_template = os.path.join(
        variation_dir, 'main', 'hdf5', 'all_snpeff',
        'ag1000g.phase2.ar1.snpeff.AgamP4.2.{chrom}.h5')
    # work around broken link file
    callset_snpeff_agamp42 = dict()
    for chrom in '2L', '2R', '3L', '3R', 'X':
        fn = callset_snpeff_agamp42_h5_fn_template.format(chrom=chrom)
        if os.path.exists(fn):
            callset_snpeff_agamp42[chrom] = h5py.File(fn, mode='r')[chrom]

    # accessibility
    ###############

    global accessibility, accessibility_dir
    accessibility_dir = os.path.join(release_dir, 'accessibility')
    accessibility_fn = os.path.join(accessibility_dir, 'accessibility.h5')
    if os.path.exists(accessibility_fn):
        accessibility = h5py.File(accessibility_fn, mode='r')

    # sample metadata
    #################

    global tbl_samples, lkp_samples, sample_ids, df_samples, samples_dir
    samples_dir = os.path.join(release_dir, 'samples')
    samples_fn = os.path.join(samples_dir, 'samples.meta.txt')
    if os.path.exists(samples_fn):
        tbl_samples = (etl.fromtsv(samples_fn).convert(
            ('year', 'n_sequences'), int).convert(('mean_coverage', ), float))
        lkp_samples = tbl_samples.recordlookupone('ox_code')
        sample_ids = tbl_samples.values('ox_code').list()
        df_samples = pandas.read_csv(samples_fn, sep='\t', index_col='ox_code')

    # extras
    ########

    global allele_counts
    extras_dir = os.path.join(release_dir, 'extras')

    # allele counts
    allele_counts_fn = os.path.join(extras_dir, 'allele_counts.h5')
    if os.path.exists(allele_counts_fn):
        allele_counts = h5py.File(allele_counts_fn, mode='r')

    # haplotypes
    ############

    global haplotypes_dir, callset_phased, tbl_haplotypes, df_haplotypes, lkp_haplotypes
    haplotypes_dir = os.path.join(release_dir, 'haplotypes')

    # no HDF5 link file, load up as dict for now
    callset_phased_hdf5_fn_template = os.path.join(
        haplotypes_dir, 'main', 'hdf5',
        'ag1000g.phase2.ar1.haplotypes.{chrom}.h5')
    callset_phased = dict()
    for chrom in '2L', '2R', '3L', '3R', 'X':
        fn = callset_phased_hdf5_fn_template.format(chrom=chrom)
        if os.path.exists(fn):
            callset_phased[chrom] = h5py.File(fn, mode='r')[chrom]

    # no haplotypes file, create here for now
    # TODO source this from file Nick has created
    if '3R' in callset_phased:
        phased_samples = callset_phased['3R']['samples'][:].astype('U')
        haplotype_labels = list(
            itertools.chain(*[[s + 'a', s + 'b'] for s in phased_samples]))
        tbl_haplotypes = (etl.empty().addcolumn(
            'label', haplotype_labels).addrownumbers(start=0).rename(
                'row', 'index'
            ).addfield('ox_code', lambda row: row.label[:-1]).hashleftjoin(
                tbl_samples, key='ox_code').addfield(
                    'label_aug', lambda row: '%s [%s, %s, %s, %s]' %
                    (row.label, row.country, row.location, row.m_s, row.sex)))
        lkp_haplotypes = tbl_haplotypes.recordlookupone('label')
        df_haplotypes = tbl_haplotypes.todataframe(index='index')
Example #24
def fromgff3(filename, region=None):
    """
    Extract feature rows from a GFF3 file, e.g.::

        >>> import petl as etl
        >>> # activate bio extensions
        ... import petlx.bio
        >>> table1 = etl.fromgff3('fixture/sample.gff')
        >>> table1.look(truncate=30)
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | seqid        | source  | type          | start | end     | score | strand | phase | attributes                     |
        +==============+=========+===============+=======+=========+=======+========+=======+================================+
        | 'apidb|MAL1' | 'ApiDB' | 'supercontig' |     1 |  643292 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL2' | 'ApiDB' | 'supercontig' |     1 |  947102 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL3' | 'ApiDB' | 'supercontig' |     1 | 1060087 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL4' | 'ApiDB' | 'supercontig' |     1 | 1204112 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'supercontig' |     1 | 1343552 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        ...

    A region query string of the form '[seqid]' or '[seqid]:[start]-[end]'
    may be given for the `region` argument. If given, requires the GFF3
    file to be position sorted, bgzipped and tabix indexed. Requires pysam to be
    installed. E.g.::

        >>> # extract from a specific genome region via tabix
        ... table2 = etl.fromgff3('fixture/sample.sorted.gff.gz',
        ...                       region='apidb|MAL5:1289593-1289595')
        >>> table2.look(truncate=30)
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | seqid        | source  | type          | start   | end     | score | strand | phase | attributes                     |
        +==============+=========+===============+=========+=========+=======+========+=======+================================+
        | 'apidb|MAL5' | 'ApiDB' | 'supercontig' |       1 | 1343552 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'exon'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'size': '2092', 'Parent': 'ap |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'gene'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'ID': 'apidb|MAL5_18S', 'web_ |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'rRNA'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'ID': 'apidb|rna_MAL5_18S-1', |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+

    """

    if region is None:

        # parse file as tab-delimited
        table = etl.fromtsv(filename)

    else:

        # extract via tabix
        table = etl.fromtabix(filename, region=region)

    return (table.pushheader(GFF3_HEADER).skipcomments('#')
            # ignore any row not 9 values long (e.g., trailing fasta)
            .rowlenselect(9)
            # parse attributes into a dict
            .convert('attributes', gff3_parse_attributes)
            # parse coordinates
            .convert(('start', 'end'), int))
Example #25
import time

import petl as etl
import IPython
import pandas as pd

_DEBUG = True
_TIME_TEST = True

# Test case on run time complement vs antijoin.
# normally these would be toggles but for testing we set both to true
_COMPLEMENT = True
_ANTI_JOIN = True

# csv = comma delimited, tsv = tab delimited
pre_etl_time = time.time()
a = etl.fromtsv('snpdata.csv')
post_etl_time = time.time()
b = etl.fromtsv('popdata.csv')

pre_df_time = time.time()
df_a = pd.read_csv('snpdata.csv', sep='\t', header=0)
post_df_time = time.time()

print("ETL time to load A file: {} Pandas time to load A file: {}".format(
    post_etl_time - pre_etl_time, post_df_time - pre_df_time))

df_b = pd.read_csv('popdata.csv', sep='\t', header=0)

header_a = etl.header(a)
header_b = etl.header(b)
if _DEBUG:
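    # The original listing is truncated here. What follows is a hedged sketch
    # (not from the original) of how the complement vs. anti-join timing
    # comparison described in the comments above might continue; the key column
    # 'snp_id' and the print formats are assumptions.
    print(header_a, header_b)

if _COMPLEMENT:
    t0 = time.time()
    # rows of A with no exact counterpart in B (full-row comparison)
    only_a = etl.complement(a, b)
    n_complement = etl.nrows(only_a)   # forces evaluation of the lazy pipeline
    print("complement: {} rows in {:.3f}s".format(n_complement, time.time() - t0))

if _ANTI_JOIN:
    t0 = time.time()
    # rows of A with no matching key in B
    only_a_keyed = etl.antijoin(a, b, key='snp_id')
    n_antijoin = etl.nrows(only_a_keyed)
    print("antijoin: {} rows in {:.3f}s".format(n_antijoin, time.time() - t0))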
Example #26
def convert_folder(base_source_dir,
                   base_target_dir,
                   tmp_dir,
                   tika=False,
                   ocr=False,
                   merge=False,
                   tsv_source_path=None,
                   tsv_target_path=None,
                   make_unique=True,
                   sample=False,
                   zip=False):
    # WAIT: Add an option in the GUI to choose whether to run OCR processing
    txt_target_path = base_target_dir + '_result.txt'
    json_tmp_dir = base_target_dir + '_tmp'
    converted_now = False
    errors = False
    originals = False

    if merge is False:  # TODO: Are both arguments needed?
        make_unique = False

    if tsv_source_path is None:
        tsv_source_path = base_target_dir + '.tsv'
    else:
        txt_target_path = os.path.splitext(
            tsv_source_path)[1][1:] + '_result.txt'

    if tsv_target_path is None:
        tsv_target_path = base_target_dir + '_processed.tsv'

    if os.path.exists(tsv_target_path):
        os.remove(tsv_target_path)

    Path(base_target_dir).mkdir(parents=True, exist_ok=True)

    # TODO: Does the mime type show directly whether a file is PDF/A, or must extra fields in the two checks below be consulted? Pre-check for Tika and Siegfried?

    # TODO: Is this tsv check needed here? The check is done before this function is called, so it may be unnecessary.
    if not os.path.isfile(tsv_source_path):
        if tika:
            run_tika(tsv_source_path, base_source_dir, json_tmp_dir, zip)
        else:
            run_siegfried(base_source_dir, tmp_dir, tsv_source_path, zip)

    # TODO: Add a check that the tsv file is not empty
    replace_text_in_file(tsv_source_path, '\0', '')

    table = etl.fromtsv(tsv_source_path)
    table = etl.rename(table, {
        'filename': 'source_file_path',
        'tika_batch_fs_relative_path': 'source_file_path',
        'filesize': 'file_size',
        'mime': 'mime_type',
        'Content_Type': 'mime_type',
        'Version': 'version'
    },
                       strict=False)

    thumbs_table = etl.select(
        table, lambda rec: Path(rec.source_file_path).name == 'Thumbs.db')
    if etl.nrows(thumbs_table) > 0:
        thumbs_paths = etl.values(thumbs_table, 'source_file_path')
        for path in thumbs_paths:
            if '/' not in path:
                path = os.path.join(base_source_dir, path)
            if os.path.isfile(path):
                os.remove(path)

        table = etl.select(
            table, lambda rec: Path(rec.source_file_path).name != 'Thumbs.db')

    table = etl.select(table, lambda rec: rec.source_file_path != '')
    table = etl.select(table, lambda rec: '#' not in rec.source_file_path)
    # WAIT: The line above is not a complete check for embedded documents, since # can actually occur in file names
    row_count = etl.nrows(table)

    file_count = sum([len(files) for r, d, files in os.walk(base_source_dir)])

    if row_count == 0:
        print('No files to convert. Exiting.')
        return 'Error', file_count
    elif file_count != row_count:
        print('Row count: ' + str(row_count))
        print('File count: ' + str(file_count))
        print("Files listed in '" + tsv_source_path +
              "' doesn't match files on disk. Exiting.")
        return 'Error', file_count
    elif not zip:
        print('Converting files..')

    # WAIT: Add a check of file size before and after conversion

    append_fields = ('version', 'norm_file_path', 'result',
                     'original_file_copy', 'id')
    table = add_fields(append_fields, table)

    cut_fields = ('0', '1', 'X_TIKA_EXCEPTION_runtime',
                  'X_TIKA_EXCEPTION_warn')
    table = remove_fields(cut_fields, table)

    header = etl.header(table)
    append_tsv_row(tsv_target_path, header)

    # Treat csv (detected from extension only) as plain text:
    table = etl.convert(table,
                        'mime_type',
                        lambda v, row: 'text/plain'
                        if row.id == 'x-fmt/18' else v,
                        pass_row=True)

    # Update for missing mime types where id is known:
    table = etl.convert(table,
                        'mime_type',
                        lambda v, row: 'application/xml'
                        if row.id == 'fmt/979' else v,
                        pass_row=True)

    if os.path.isfile(txt_target_path):
        os.remove(txt_target_path)

    data = etl.dicts(table)
    count = 0
    for row in data:
        count += 1
        count_str = ('(' + str(count) + '/' + str(file_count) + '): ')
        source_file_path = row['source_file_path']
        if '/' not in source_file_path:
            source_file_path = os.path.join(base_source_dir, source_file_path)

        mime_type = row['mime_type']
        # TODO: Doesn't work when Tika is used -> find out why
        if ';' in mime_type:
            mime_type = mime_type.split(';')[0]

        version = row['version']
        result = None
        old_result = row['result']

        if not mime_type:
            if os.path.islink(source_file_path):
                mime_type = 'n/a'

            # kind = filetype.guess(source_file_path)
            extension = os.path.splitext(source_file_path)[1][1:].lower()
            if extension == 'xml':
                mime_type = 'application/xml'

        if not zip:
            print_path = os.path.relpath(source_file_path,
                                         Path(base_source_dir).parents[1])
            print(count_str + '.../' + print_path + ' (' + mime_type + ')')

        if mime_type not in mime_to_norm.keys():
            # print("|" + mime_type + "|")

            errors = True
            converted_now = True
            result = 'Conversion not supported'
            append_txt_file(
                txt_target_path,
                result + ': ' + source_file_path + ' (' + mime_type + ')')
            row['norm_file_path'] = ''
            row['original_file_copy'] = ''
        else:
            keep_original = mime_to_norm[mime_type][0]

            if keep_original:
                originals = True

            if zip:
                keep_original = False

            function = mime_to_norm[mime_type][1]

            # Ensure unique file names in dir hierarchy:
            norm_ext = mime_to_norm[mime_type][2]
            if not norm_ext:
                norm_ext = 'none'

            if make_unique:
                norm_ext = (base64.b32encode(
                    bytes(
                        str(count), encoding='ascii'))).decode('utf8').replace(
                            '=', '').lower() + '.' + norm_ext
            target_dir = os.path.dirname(
                source_file_path.replace(base_source_dir, base_target_dir))
            normalized = file_convert(source_file_path,
                                      mime_type,
                                      function,
                                      target_dir,
                                      tmp_dir,
                                      None,
                                      norm_ext,
                                      version,
                                      ocr,
                                      keep_original,
                                      zip=zip)

            if normalized['result'] == 0:
                errors = True
                result = 'Conversion failed'
                append_txt_file(
                    txt_target_path,
                    result + ': ' + source_file_path + ' (' + mime_type + ')')
            elif normalized['result'] == 1:
                result = 'Converted successfully'
                converted_now = True
            elif normalized['result'] == 2:
                errors = True
                result = 'Conversion not supported'
                append_txt_file(
                    txt_target_path,
                    result + ': ' + source_file_path + ' (' + mime_type + ')')
            elif normalized['result'] == 3:
                if old_result not in ('Converted successfully',
                                      'Manually converted'):
                    result = 'Manually converted'
                    converted_now = True
                else:
                    result = old_result
            elif normalized['result'] == 4:
                converted_now = True
                errors = True
                result = normalized['error']
                append_txt_file(
                    txt_target_path,
                    result + ': ' + source_file_path + ' (' + mime_type + ')')
            elif normalized['result'] == 5:
                result = 'Not a document'

            if normalized['norm_file_path']:
                row['norm_file_path'] = relpath(normalized['norm_file_path'],
                                                base_target_dir)

            file_copy_path = normalized['original_file_copy']
            if file_copy_path:
                file_copy_path = relpath(file_copy_path, base_target_dir)
            row['original_file_copy'] = file_copy_path

        row['result'] = result
        row_values = list(row.values())

        # TODO: Fixed by adding escapechar='\\' in append_tsv_row -> will that cause problems later?
        # row_values = [r.replace('\n', ' ') for r in row_values if r is not None]
        append_tsv_row(tsv_target_path, row_values)

        if sample and count > 9:
            break

    if not sample:
        shutil.move(tsv_target_path, tsv_source_path)
    # TODO: Add an option so that when merge = true all files are copied to a top-level folder and empty subfolders are then deleted

    msg = None
    if sample:
        msg = 'Sample files converted.'
        if errors:
            msg = "Not all sample files were converted. See '" + txt_target_path + "' for details."
    else:
        if converted_now:
            msg = 'All files converted successfully.'
            if errors:
                msg = "Not all files were converted. See '" + txt_target_path + "' for details."
        else:
            msg = 'All files converted previously.'

    return msg, file_count, errors, originals  # TODO: Fix so this is instead used for a final summary when multiple folders have been converted