def test_teetsv_unicode(): t1 = ((u"name", u"id"), (u"Արամ Խաչատրյան", 1), (u"Johann Strauß", 2), (u"Вагиф Сәмәдоғлу", 3), (u"章子怡", 4)) f1 = NamedTemporaryFile(delete=False) f2 = NamedTemporaryFile(delete=False) (etl.wrap(t1).teetsv(f1.name, encoding="utf-8").selectgt("id", 1).totsv(f2.name, encoding="utf-8")) ieq(t1, etl.fromtsv(f1.name, encoding="utf-8").convertnumbers()) ieq(etl.wrap(t1).selectgt("id", 1), etl.fromtsv(f2.name, encoding="utf-8").convertnumbers())
def test_teetsv(): t1 = (("foo", "bar"), ("a", 2), ("b", 1), ("c", 3)) f1 = NamedTemporaryFile(delete=False) f2 = NamedTemporaryFile(delete=False) (etl.wrap(t1).teetsv(f1.name, encoding="ascii").selectgt("bar", 1).totsv(f2.name, encoding="ascii")) ieq(t1, etl.fromtsv(f1.name, encoding="ascii").convertnumbers()) ieq(etl.wrap(t1).selectgt("bar", 1), etl.fromtsv(f2.name, encoding="ascii").convertnumbers())
def init(release_dir): """Initialise data resources. Parameters ---------- release_dir : string Local filesystem path where data from the release are stored. """ # variation ########### global callset, callset_pass variation_dir = os.path.join(release_dir, 'variation') # main callset callset_zarr_fn = os.path.join(variation_dir, 'main', 'zarr2', 'ag1000g.phase1.ar3') if os.path.exists(callset_zarr_fn): callset = zarr.open_group(callset_zarr_fn, mode='r') # main callset, PASS variants only callset_pass_zarr_fn = os.path.join(variation_dir, 'main', 'zarr2', 'ag1000g.phase1.ar3.pass') if os.path.exists(callset_pass_zarr_fn): callset_pass = zarr.open_group(callset_pass_zarr_fn, mode='r') # haplotypes ############ global callset_phased, tbl_haplotypes, lkp_haplotypes, df_haplotypes haplotypes_dir = os.path.join(release_dir, 'haplotypes') # try HDF5 first callset_phased_h5_fn = os.path.join(haplotypes_dir, 'main', 'hdf5', 'ag1000g.phase1.ar3.1.haplotypes.h5') if os.path.exists(callset_phased_h5_fn): callset_phased = h5py.File(callset_phased_h5_fn, mode='r') # prefer Zarr if available # N.B., the Zarr data is not consistent with HDF5 or shapeit outputs, # it is based on a previous phasing run. # #callset_phased_zarr_fn = os.path.join(haplotypes_dir, 'main', 'zarr2', # 'ag1000g.phase1.ar3.1.haplotypes') #if os.path.exists(callset_phased_zarr_fn): # callset_phased = zarr.open_group(callset_phased_zarr_fn, mode='r') # haplotypes metadata haplotypes_fn = os.path.join(haplotypes_dir, 'haplotypes.meta.txt') if os.path.exists(haplotypes_fn): tbl_haplotypes = (etl.fromtsv(haplotypes_fn).convert( ('index', 'kt_2la', 'kt_2rb'), int)) lkp_haplotypes = tbl_haplotypes.recordlookupone('label') df_haplotypes = pandas.read_csv(haplotypes_fn, sep='\t', index_col='index')
def get_table(source=None, nrows=None, skip=None, fields=None, exclude=None,
              rownumbers=True, **petlargs):
    """
    :param source: full path filename of the delimited file
    :param nrows: number of rows to include in the table
    :param skip: number of rows to skip from the file
    :param fields: selected fields to extract from the file
    :param exclude: selected fields to be excluded from the file
    :param rownumbers: add a rowID column; True by default.
        This is similar to pandas.RangeIndex,
        see petl.transform.basics.addrownumbers().
        Notice: the skip and nrows parameters require that addrownumbers()
        is applied to the petl table.
        If `fields` is specified and `rowID` is not included in the list,
        the column will not be included in the petl table.
    :param petlargs: see petl.io.csv.fromcsv and petl.io.csv.fromtsv
    :return: petl table container

    Notice: petl makes extensive use of lazy evaluation and iterators;
    the file is not loaded into memory, instead a container/iterator is
    returned.

    Examples:
        etl.get_table('movies.csv', 20, 100,
                      ['rowID', 'movie_title', 'title_year']).lookall()
        etl.get_table('movies.csv', 20, 100,
                      exclude=['language', 'actor_1_name']).lookall()
    """
    # Get the extension of the filename
    table = None
    if source:
        ext = os.path.splitext(source)[1]
        # Create a lazy petl table container from the delimited file
        if ext == '.csv':
            table = petl.fromcsv(source, **petlargs)
        elif ext == '.tsv':
            table = petl.fromtsv(source, **petlargs)

    if rownumbers:
        table = table.addrownumbers(start=1, step=1, field='rowID')

    if skip and rownumbers:
        if nrows:
            table = table.select(
                lambda num: num.rowID > skip and num.rowID <= nrows + skip)
        else:
            table = table.select(lambda num: num.rowID > skip)

    if not skip and nrows and rownumbers:
        table = table.select(lambda num: num.rowID <= nrows)

    if fields:
        table = table.cut(*fields)

    if exclude:
        table = table.cutout(*exclude)

    return table
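# Hedged usage sketch for get_table() above, reusing the hypothetical
# 'movies.csv' from its docstring: skip the first 100 data rows, then take
# the next 20, keeping only the generated rowID plus two named columns.
movies = get_table('movies.csv', nrows=20, skip=100,
                   fields=['rowID', 'movie_title', 'title_year'])
print(movies.look())  # petl is lazy; look() only pulls the rows it displays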
def get_file_header(ftype, fname):
    ext = os.path.splitext(fname)[1][1:]
    if not ftype.lower() == ext:
        raise Exception(
            f'Failed: Filename extension does not match < ftype={ftype} >')
    if ftype == 'CSV':
        return petl.fromcsv(fname).head(0).tol()[0]
    elif ftype == 'TSV':
        return petl.fromtsv(fname).head(0).tol()[0]
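# Hedged example for get_file_header() above: 'ratings.tsv' is a placeholder
# path; the return value is the header row, i.e. the list of column names.
columns = get_file_header('TSV', 'ratings.tsv')
print(columns)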
def test_ZipSource():
    # setup
    table = [('foo', 'bar'), ('a', '1'), ('b', '2')]
    totsv(table, 'tmp/issue_241.tsv')
    z = zipfile.ZipFile('tmp/issue_241.zip', mode='w')
    z.write('tmp/issue_241.tsv', 'data.tsv')
    z.close()

    # test
    actual = fromtsv(ZipSource('tmp/issue_241.zip', 'data.tsv'))
    ieq(table, actual)
def test_fromtsv():
    f = NamedTemporaryFile(delete=False)
    writer = csv.writer(f, delimiter="\t")
    table = (("foo", "bar"), ("a", 1), ("b", 2), ("c", 2))
    for row in table:
        writer.writerow(row)
    f.close()

    actual = fromtsv(f.name)
    expect = (("foo", "bar"), ("a", "1"), ("b", "2"), ("c", "2"))
    ieq(expect, actual)
    ieq(expect, actual)  # verify can iterate twice
def test_issue_231():
    table = [['foo', 'bar'], ['a', '1'], ['b', '2']]
    t = cut(table, 'foo')
    totsv(t, 'tmp/issue_231.tsv')
    u = fromtsv('tmp/issue_231.tsv')
    ieq(t, u)
    tocsv(t, 'tmp/issue_231.csv')
    u = fromcsv('tmp/issue_231.csv')
    ieq(t, u)
    topickle(t, 'tmp/issue_231.pickle')
    u = frompickle('tmp/issue_231.pickle')
    ieq(t, u)
def test_zipsource():
    # setup
    tbl = [('foo', 'bar'), ('a', '1'), ('b', '2')]
    fn_tsv = NamedTemporaryFile().name
    etl.totsv(tbl, fn_tsv)
    fn_zip = NamedTemporaryFile().name
    z = zipfile.ZipFile(fn_zip, mode='w')
    z.write(fn_tsv, 'data.tsv')
    z.close()

    # test
    actual = etl.fromtsv(ZipSource(fn_zip, 'data.tsv'))
    ieq(tbl, actual)
def xls_tidy(xls, qvalue):
    d = etl.fromtsv(xls)
    sd = etl.select(d, lambda x: float(x.PepQValue) <= float(qvalue))
    psmsummary = sd

    ssd = etl.cut(sd, 'Peptide', 'Protein', 'PepQValue')
    # remove the mod info in peptide.
    ssd = etl.transform.regex.sub(ssd, 'Peptide',
                                  r'^[\w-]\.(.+)\.[\w-]$', r'\1')
    ssd = etl.transform.regex.sub(ssd, 'Peptide', r'[\d\.\+]+', r'')

    aggregation = OrderedDict()
    aggregation['SpecCount'] = len
    cssd = etl.aggregate(ssd, 'Peptide', aggregation)

    fssd = etl.groupselectfirst(ssd, key=('Peptide', 'Protein', 'PepQValue'))
    aggregation = OrderedDict()
    aggregation['Protein'] = 'Protein', etl.strjoin(';')
    aggregation['PepQValue'] = 'PepQValue', etl.strjoin(';')
    assd = etl.aggregate(fssd, 'Peptide', aggregation)

    pepsummary = etl.join(assd, cssd, key='Peptide')

    return (psmsummary, pepsummary)
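# Hedged usage sketch for xls_tidy() above: the input file name and the 0.01
# q-value cutoff are assumptions; both returned tables are ordinary petl
# containers, so they can be written out with totsv().
psms, peptides = xls_tidy('psms.tsv', 0.01)
etl.totsv(psms, 'psm_summary.tsv')
etl.totsv(peptides, 'peptide_summary.tsv')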
def condense(path=".", use_tsv=True): """ Couldn't be more pleased with this thing. utility method for moi - all I want it to do is generate a bunch of PETL objects that will allow me to extract columns. Should take some of the output in such a way that I can populate a database with patterns for rapid matching later. I do want to add my scoring mechanism in to the row/column generator when I can. """ objects = [] # I'll use os.listdir(path) to try and seek everything try: x = os.listdir(path) # get the state at the start. log_output(x) # make a note of what was found. for a in x: pair = [] try: if use_tsv: pair.append(petl.fromtsv(a)) else: pair.append(petl.fromcsv(a)) pair.append(os.path.basename(a)) log_output("Added petl object for %s" % (a)) objects.append(tuple(pair)) except Exception as ECHO: log_output(ECHO, "./error_log.log") log_output("Exception has occurred: %s" % (ECHO)) except Exception as eddy: log_output(ECHO, "./error_log.log") log_output("Exception has occurred: %s" % (eddy)) return objects
def init(release_dir, load_geneset=False):
    """Initialise data resources.

    Parameters
    ----------
    release_dir : string
        Local filesystem path where data from the release are stored.
    load_geneset : string
        If True, load geneset into memory.

    """

    # reference sequence
    ####################

    global genome_fn, genome
    genome_dir = os.path.join(release_dir, 'genome')
    genome_fn = os.path.join(genome_dir,
                             'Anopheles-gambiae-PEST_CHROMOSOMES_AgamP3.fa')
    if os.path.exists(genome_fn):
        genome = pyfasta.Fasta(genome_fn)

    # genome annotations
    ####################

    global geneset_agamp42_fn, geneset_agamp42
    geneset_dir = os.path.join(release_dir, 'geneset')
    geneset_agamp42_fn = os.path.join(
        geneset_dir,
        'Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.2.sorted.gff3.gz')
    if os.path.exists(geneset_agamp42_fn) and load_geneset:
        geneset_agamp42 = allel.FeatureTable.from_gff3(geneset_agamp42_fn)

    # variant callsets
    ##################

    global callset, callset_pass
    variation_dir = os.path.join(release_dir, 'variation')

    # main callset
    callset_h5_fn = os.path.join(variation_dir, 'main', 'hdf5',
                                 'ag1000g.phase1.ar3.h5')
    if os.path.exists(callset_h5_fn):
        callset = h5py.File(callset_h5_fn, mode='r')

    # main callset, PASS variants only
    callset_pass_h5_fn = os.path.join(variation_dir, 'main', 'hdf5',
                                      'ag1000g.phase1.ar3.pass.h5')
    if os.path.exists(callset_pass_h5_fn):
        callset_pass = h5py.File(callset_pass_h5_fn, mode='r')

    # accessibility
    ###############

    global accessibility
    accessibility_dir = os.path.join(release_dir, 'accessibility')
    accessibility_fn = os.path.join(accessibility_dir, 'accessibility.h5')
    if os.path.exists(accessibility_fn):
        accessibility = h5py.File(accessibility_fn, mode='r')

    # sample metadata
    #################

    global samples_fn, tbl_samples, lkp_samples, sample_ids, df_samples
    samples_dir = os.path.join(release_dir, 'samples')
    samples_fn = os.path.join(samples_dir, 'samples.all.txt')
    if os.path.exists(samples_fn):
        tbl_samples = (
            etl.fromtsv(samples_fn)
            .convert(('index', 'year', 'n_sequences', 'kt_2la', 'kt_2rb'),
                     int)
            .convert(('mean_coverage', 'latitude', 'longitude') +
                     tuple(range(20, 36)), float)
        )
        lkp_samples = tbl_samples.recordlookupone('ox_code')
        sample_ids = tbl_samples.values('ox_code').list()
        df_samples = pandas.read_csv(samples_fn, sep='\t', index_col='index')

    # extras
    ########

    global allele_counts, allele_counts_gq10, outgroup_alleles, \
        outgroup_allele_counts, outgroup_species
    extras_dir = os.path.join(release_dir, 'extras')

    # allele counts
    allele_counts_fn = os.path.join(extras_dir, 'allele_counts.h5')
    if os.path.exists(allele_counts_fn):
        allele_counts = h5py.File(allele_counts_fn, mode='r')
    allele_counts_gq10_fn = os.path.join(extras_dir, 'allele_counts.gq10.h5')
    if os.path.exists(allele_counts_gq10_fn):
        allele_counts_gq10 = h5py.File(allele_counts_gq10_fn, mode='r')

    # outgroup data
    outgroup_species = 'arab', 'meru', 'mela', 'quad', 'epir', 'chri'
    outgroup_alleles_fn = os.path.join(extras_dir, 'outgroup_alleles.h5')
    if os.path.exists(outgroup_alleles_fn):
        outgroup_alleles = h5py.File(outgroup_alleles_fn, mode='r')
    outgroup_allele_counts_fn = os.path.join(extras_dir,
                                             'outgroup_allele_counts.h5')
    if os.path.exists(outgroup_allele_counts_fn):
        outgroup_allele_counts = h5py.File(outgroup_allele_counts_fn,
                                           mode='r')
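# Hedged usage sketch for init() above: the release path is a placeholder.
# After a successful call, module-level globals such as tbl_samples and
# df_samples are populated whenever the corresponding files exist on disk.
init('/path/to/ag1000g.phase1.ar3')
print(tbl_samples.nrows())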
def fromgff3(filename, region=None):
    """
    Extract feature rows from a GFF3 file, e.g.::

        >>> import petl as etl
        >>> # activate bio extensions
        ... import petlx.bio
        >>> table1 = etl.fromgff3('fixture/sample.gff')
        >>> table1.look(truncate=30)
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | seqid        | source  | type          | start | end     | score | strand | phase | attributes                     |
        +==============+=========+===============+=======+=========+=======+========+=======+================================+
        | 'apidb|MAL1' | 'ApiDB' | 'supercontig' |     1 |  643292 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL2' | 'ApiDB' | 'supercontig' |     1 |  947102 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL3' | 'ApiDB' | 'supercontig' |     1 | 1060087 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL4' | 'ApiDB' | 'supercontig' |     1 | 1204112 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'supercontig' |     1 | 1343552 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+-------+---------+-------+--------+-------+--------------------------------+
        ...

    A region query string of the form '[seqid]' or '[seqid]:[start]-[end]'
    may be given for the `region` argument. If given, requires the GFF3
    file to be position sorted, bgzipped and tabix indexed. Requires pysam
    to be installed. E.g.::

        >>> # extract from a specific genome region via tabix
        ... table2 = etl.fromgff3('fixture/sample.sorted.gff.gz',
        ...                       region='apidb|MAL5:1289593-1289595')
        >>> table2.look(truncate=30)
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | seqid        | source  | type          | start   | end     | score | strand | phase | attributes                     |
        +==============+=========+===============+=========+=========+=======+========+=======+================================+
        | 'apidb|MAL5' | 'ApiDB' | 'supercontig' |       1 | 1343552 | '.'   | '+'    | '.'   | {'localization': 'nuclear', 'o |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'exon'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'size': '2092', 'Parent': 'ap |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'gene'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'ID': 'apidb|MAL5_18S', 'web_ |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+
        | 'apidb|MAL5' | 'ApiDB' | 'rRNA'        | 1289594 | 1291685 | '.'   | '+'    | '.'   | {'ID': 'apidb|rna_MAL5_18S-1', |
        +--------------+---------+---------------+---------+---------+-------+--------+-------+--------------------------------+

    """
    if region is None:
        # parse file as tab-delimited
        table = etl.fromtsv(filename)
    else:
        # extract via tabix
        table = etl.fromtabix(filename, region=region)
    return (
        table
        .pushheader(GFF3_HEADER)
        .skipcomments('#')
        # ignore any row not 9 values long (e.g., trailing fasta)
        .rowlenselect(9)
        # parse attributes into a dict
        .convert('attributes', gff3_parse_attributes)
        # parse coordinates
        .convert(('start', 'end'), int)
    )
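# Hedged follow-on example for fromgff3() above, using the same fixture file
# as its docstring: the returned petl table supports the usual transformations,
# e.g. keeping only gene features and their coordinates.
genes = fromgff3('fixture/sample.gff').selecteq('type', 'gene')
print(genes.cut('seqid', 'start', 'end').look())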
import petl as etl

readFile = etl.fromtsv("donedeal_data_sample.tsv")
tmpTable = etl.addfield(readFile, 'InKms', lambda rec: rec['mileage'])
tmpTable2File = etl.convert(tmpTable, 'InKms',
                            lambda v: int(float(v) * 1.6),
                            where=lambda r: r.mileageType == 'miles')
etl.totsv(tmpTable2File, 'donedeal_inKms.tsv')
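# Hedged check for the script above: peek at a few converted rows; the field
# names ('mileage', 'mileageType', 'InKms') are the ones the script assumes
# exist in donedeal_data_sample.tsv.
print(etl.look(etl.cut(tmpTable2File, 'mileage', 'mileageType', 'InKms')))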
import petl as etl

table1 = etl.fromtsv(r"D:\JOB\BI_Developer_Challenge\donedeal_data_sample.tsv")
table2 = etl.convert(table1, 'mileage', float)
table3 = etl.convert(table2, 'mileage', lambda v: v * 1.60934,
                     where=lambda r: r.mileageType == 'miles')
table4 = etl.convert(table3, 'mileageType', lambda v: 'km',
                     where=lambda r: r.mileageType in ('miles', 'kilometres'))
table4 = etl.convert(table4, 'mileageType', lambda v: 'NA',
                     where=lambda r: r.mileageType not in ('km',))
etl.totsv(table4, r"D:\JOB\BI_Developer_Challenge\donedeal_data_etl.tsv")
def init(release_dir, load_geneset=False, geneset_attributes=None):
    """Initialise data resources.

    Parameters
    ----------
    release_dir : string
        Local filesystem path where data from the release are stored.
    load_geneset : string
        If True, load geneset into memory.
    geneset_attributes : dict-like
        Attributes to load.

    """

    # reference sequence
    ####################

    global genome_agamp3, genome_agamp4, genome_dir
    genome_dir = os.path.join(release_dir, 'genome')
    genome_agamp3_dir = os.path.join(genome_dir, 'agamP3')
    genome_agamp3_fn = os.path.join(
        genome_agamp3_dir, 'Anopheles-gambiae-PEST_CHROMOSOMES_AgamP3.fa')
    if os.path.exists(genome_agamp3_fn):
        genome_agamp3 = pyfasta.Fasta(genome_agamp3_fn,
                                      key_fn=lambda v: v.split()[0])
    genome_agamp4_dir = os.path.join(genome_dir, 'agamP4')
    genome_agamp4_fn = os.path.join(
        genome_agamp4_dir, 'Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa')
    if os.path.exists(genome_agamp4_fn):
        genome_agamp4 = pyfasta.Fasta(genome_agamp4_fn,
                                      key_fn=lambda v: v.split()[0])

    # genome annotations
    ####################

    global geneset_agamp44_fn, geneset_agamp44, geneset_dir
    geneset_dir = os.path.join(release_dir, 'geneset')
    geneset_agamp44_fn = os.path.join(
        geneset_dir,
        'Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.4.sorted.gff3.gz')
    if load_geneset:
        geneset_agamp44 = allel.FeatureTable.from_gff3(
            geneset_agamp44_fn, attributes=geneset_attributes)

    # variant callsets
    ##################

    global callset, callset_pass, callset_pass_biallelic, variation_dir, \
        callset_snpeff_agamp42
    variation_dir = os.path.join(release_dir, 'variation')

    # main callset
    callset_h5_fn = os.path.join(variation_dir, 'main', 'hdf5', 'all',
                                 'ag1000g.phase2.ar1.h5')
    callset_lite_h5_fn = os.path.join(variation_dir, 'main', 'hdf5', 'lite',
                                      'ag1000g.phase2.ar1.lite.h5')
    callset_zarr_fn = os.path.join(variation_dir, 'main', 'zarr', 'all',
                                   'ag1000g.phase2.ar1')
    # preference: zarr > hdf5 > hdf5 (lite)
    if os.path.exists(callset_zarr_fn):
        callset = zarr.open_group(callset_zarr_fn, mode='r')
    elif os.path.exists(callset_h5_fn):
        callset = h5py.File(callset_h5_fn, mode='r')
    elif os.path.exists(callset_lite_h5_fn):
        callset = h5py.File(callset_lite_h5_fn, mode='r')

    # main callset, PASS variants only
    callset_pass_h5_fn = os.path.join(variation_dir, 'main', 'hdf5', 'pass',
                                      'ag1000g.phase2.ar1.pass.h5')
    callset_pass_lite_h5_fn = os.path.join(
        variation_dir, 'main', 'hdf5', 'lite',
        'ag1000g.phase2.ar1.pass.lite.h5')
    callset_pass_zarr_fn = os.path.join(variation_dir, 'main', 'zarr', 'pass',
                                        'ag1000g.phase2.ar1.pass')
    # preference: zarr > hdf5 > hdf5 (lite)
    if os.path.exists(callset_pass_zarr_fn):
        callset_pass = zarr.open_group(callset_pass_zarr_fn, mode='r')
    elif os.path.exists(callset_pass_h5_fn):
        callset_pass = h5py.File(callset_pass_h5_fn, mode='r')
    elif os.path.exists(callset_pass_lite_h5_fn):
        callset_pass = h5py.File(callset_pass_lite_h5_fn, mode='r')

    # main callset, PASS biallelic variants only
    callset_pass_biallelic_h5_fn = os.path.join(
        variation_dir, 'main', 'hdf5', 'biallelic',
        'ag1000g.phase2.ar1.pass.biallelic.h5')
    callset_pass_biallelic_lite_h5_fn = os.path.join(
        variation_dir, 'main', 'hdf5', 'lite',
        'ag1000g.phase2.ar1.pass.biallelic.lite.h5')
    callset_pass_biallelic_zarr_fn = os.path.join(
        variation_dir, 'main', 'zarr', 'biallelic',
        'ag1000g.phase2.ar1.pass.biallelic')
    # preference: zarr > hdf5 > hdf5 (lite)
    if os.path.exists(callset_pass_biallelic_zarr_fn):
        callset_pass_biallelic = zarr.open_group(
            callset_pass_biallelic_zarr_fn, mode='r')
    elif os.path.exists(callset_pass_biallelic_h5_fn):
        callset_pass_biallelic = h5py.File(callset_pass_biallelic_h5_fn,
                                           mode='r')
    elif os.path.exists(callset_pass_biallelic_lite_h5_fn):
        callset_pass_biallelic = h5py.File(callset_pass_biallelic_lite_h5_fn,
                                           mode='r')

    # SNPEFF annotations
    callset_snpeff_agamp42_h5_fn_template = os.path.join(
        variation_dir, 'main', 'hdf5', 'all_snpeff',
        'ag1000g.phase2.ar1.snpeff.AgamP4.2.{chrom}.h5')
    # work around broken link file
    callset_snpeff_agamp42 = dict()
    for chrom in '2L', '2R', '3L', '3R', 'X':
        fn = callset_snpeff_agamp42_h5_fn_template.format(chrom=chrom)
        if os.path.exists(fn):
            callset_snpeff_agamp42[chrom] = h5py.File(fn, mode='r')[chrom]

    # accessibility
    ###############

    global accessibility, accessibility_dir
    accessibility_dir = os.path.join(release_dir, 'accessibility')
    accessibility_fn = os.path.join(accessibility_dir, 'accessibility.h5')
    if os.path.exists(accessibility_fn):
        accessibility = h5py.File(accessibility_fn, mode='r')

    # sample metadata
    #################

    global tbl_samples, lkp_samples, sample_ids, df_samples, samples_dir
    samples_dir = os.path.join(release_dir, 'samples')
    samples_fn = os.path.join(samples_dir, 'samples.meta.txt')
    if os.path.exists(samples_fn):
        tbl_samples = (
            etl.fromtsv(samples_fn)
            .convert(('year', 'n_sequences'), int)
            .convert(('mean_coverage',), float)
        )
        lkp_samples = tbl_samples.recordlookupone('ox_code')
        sample_ids = tbl_samples.values('ox_code').list()
        df_samples = pandas.read_csv(samples_fn, sep='\t',
                                     index_col='ox_code')

    # extras
    ########

    global allele_counts
    extras_dir = os.path.join(release_dir, 'extras')

    # allele counts
    allele_counts_fn = os.path.join(extras_dir, 'allele_counts.h5')
    if os.path.exists(allele_counts_fn):
        allele_counts = h5py.File(allele_counts_fn, mode='r')

    # haplotypes
    ############

    global haplotypes_dir, callset_phased, tbl_haplotypes, df_haplotypes, \
        lkp_haplotypes
    haplotypes_dir = os.path.join(release_dir, 'haplotypes')

    # no HDF5 link file, load up as dict for now
    callset_phased_hdf5_fn_template = os.path.join(
        haplotypes_dir, 'main', 'hdf5',
        'ag1000g.phase2.ar1.haplotypes.{chrom}.h5')
    callset_phased = dict()
    for chrom in '2L', '2R', '3L', '3R', 'X':
        fn = callset_phased_hdf5_fn_template.format(chrom=chrom)
        if os.path.exists(fn):
            callset_phased[chrom] = h5py.File(fn, mode='r')[chrom]

    # no haplotypes file, create here for now
    # TODO source this from file Nick has created
    if '3R' in callset_phased:
        phased_samples = callset_phased['3R']['samples'][:].astype('U')
        haplotype_labels = list(itertools.chain(
            *[[s + 'a', s + 'b'] for s in phased_samples]))
        tbl_haplotypes = (
            etl.empty()
            .addcolumn('label', haplotype_labels)
            .addrownumbers(start=0)
            .rename('row', 'index')
            .addfield('ox_code', lambda row: row.label[:-1])
            .hashleftjoin(tbl_samples, key='ox_code')
            .addfield('label_aug', lambda row: '%s [%s, %s, %s, %s]' %
                      (row.label, row.country, row.location, row.m_s,
                       row.sex))
        )
        lkp_haplotypes = tbl_haplotypes.recordlookupone('label')
        df_haplotypes = tbl_haplotypes.todataframe(index='index')
import time

import petl as etl
import IPython
import pandas as pd

_DEBUG = True
_TIME_TEST = True
# Test case on run time: complement vs antijoin.
# Normally these would be toggles, but for testing we set both to True.
_COMPLEMENT = True
_ANTI_JOIN = True

# csv = comma delimited, tsv = tab delimited
pre_etl_time = time.time()
a = etl.fromtsv('snpdata.csv')
post_etl_time = time.time()
b = etl.fromtsv('popdata.csv')

pre_df_time = time.time()
df_a = pd.read_csv('snpdata.csv', sep='\t', header=0)
post_df_time = time.time()

print("ETL time to load A file: {} Pandas time to load A file: {}".format(
    post_etl_time - pre_etl_time, post_df_time - pre_df_time))

df_b = pd.read_csv('popdata.csv', sep='\t', header=0)

header_a = etl.header(a)
header_b = etl.header(b)

if _DEBUG:
def convert_folder(base_source_dir, base_target_dir, tmp_dir, tika=False,
                   ocr=False, merge=False, tsv_source_path=None,
                   tsv_target_path=None, make_unique=True, sample=False,
                   zip=False):
    # WAIT: Add a GUI option for choosing whether files should be OCR-processed
    txt_target_path = base_target_dir + '_result.txt'
    json_tmp_dir = base_target_dir + '_tmp'
    converted_now = False
    errors = False
    originals = False
    if merge is False:  # TODO: Are both arguments needed?
        make_unique = False
    if tsv_source_path is None:
        tsv_source_path = base_target_dir + '.tsv'
    else:
        txt_target_path = os.path.splitext(
            tsv_source_path)[1][1:] + '_result.txt'
    if tsv_target_path is None:
        tsv_target_path = base_target_dir + '_processed.tsv'

    if os.path.exists(tsv_target_path):
        os.remove(tsv_target_path)

    Path(base_target_dir).mkdir(parents=True, exist_ok=True)

    # TODO: Does mime show directly whether a file is PDF/A, or do the extra
    #       fields in the two tools below need to be checked? Pre-check for
    #       Tika and Siegfried?
    # TODO: Is this TSV check needed here? The check is done before this
    #       function is called, so it may be unnecessary.
    if not os.path.isfile(tsv_source_path):
        if tika:
            run_tika(tsv_source_path, base_source_dir, json_tmp_dir, zip)
        else:
            run_siegfried(base_source_dir, tmp_dir, tsv_source_path, zip)

    # TODO: Add a test that the TSV file is not empty
    replace_text_in_file(tsv_source_path, '\0', '')

    table = etl.fromtsv(tsv_source_path)
    table = etl.rename(table,
                       {
                           'filename': 'source_file_path',
                           'tika_batch_fs_relative_path': 'source_file_path',
                           'filesize': 'file_size',
                           'mime': 'mime_type',
                           'Content_Type': 'mime_type',
                           'Version': 'version'
                       },
                       strict=False)

    thumbs_table = etl.select(
        table, lambda rec: Path(rec.source_file_path).name == 'Thumbs.db')
    if etl.nrows(thumbs_table) > 0:
        thumbs_paths = etl.values(thumbs_table, 'source_file_path')
        for path in thumbs_paths:
            if '/' not in path:
                path = os.path.join(base_source_dir, path)
            if os.path.isfile(path):
                os.remove(path)

        table = etl.select(
            table, lambda rec: Path(rec.source_file_path).name != 'Thumbs.db')

    table = etl.select(table, lambda rec: rec.source_file_path != '')
    table = etl.select(table, lambda rec: '#' not in rec.source_file_path)
    # WAIT: The line above is not a complete check for embedded documents,
    #       since '#' can actually occur in file names

    row_count = etl.nrows(table)
    file_count = sum([len(files) for r, d, files in os.walk(base_source_dir)])

    if row_count == 0:
        print('No files to convert. Exiting.')
        return 'Error', file_count
    elif file_count != row_count:
        print('Row count: ' + str(row_count))
        print('File count: ' + str(file_count))
        print("Files listed in '" + tsv_source_path +
              "' doesn't match files on disk. Exiting.")
        return 'Error', file_count
    elif not zip:
        print('Converting files..')

    # WAIT: Add a check of file size before and after conversion
    append_fields = ('version', 'norm_file_path', 'result',
                     'original_file_copy', 'id')
    table = add_fields(append_fields, table)

    cut_fields = ('0', '1', 'X_TIKA_EXCEPTION_runtime',
                  'X_TIKA_EXCEPTION_warn')
    table = remove_fields(cut_fields, table)

    header = etl.header(table)
    append_tsv_row(tsv_target_path, header)

    # Treat csv (detected from extension only) as plain text:
    table = etl.convert(table, 'mime_type',
                        lambda v, row: 'text/plain'
                        if row.id == 'x-fmt/18' else v,
                        pass_row=True)
    # Update for missing mime types where id is known:
    table = etl.convert(table, 'mime_type',
                        lambda v, row: 'application/xml'
                        if row.id == 'fmt/979' else v,
                        pass_row=True)

    if os.path.isfile(txt_target_path):
        os.remove(txt_target_path)

    data = etl.dicts(table)
    count = 0
    for row in data:
        count += 1
        count_str = ('(' + str(count) + '/' + str(file_count) + '): ')
        source_file_path = row['source_file_path']
        if '/' not in source_file_path:
            source_file_path = os.path.join(base_source_dir,
                                            source_file_path)
        mime_type = row['mime_type']
        # TODO: Does not work when Tika has been used -> find out why
        if ';' in mime_type:
            mime_type = mime_type.split(';')[0]
        version = row['version']
        result = None
        old_result = row['result']

        if not mime_type:
            if os.path.islink(source_file_path):
                mime_type = 'n/a'

            # kind = filetype.guess(source_file_path)
            extension = os.path.splitext(source_file_path)[1][1:].lower()
            if extension == 'xml':
                mime_type = 'application/xml'

        if not zip:
            print_path = os.path.relpath(source_file_path,
                                         Path(base_source_dir).parents[1])
            print(count_str + '.../' + print_path + ' (' + mime_type + ')')

        if mime_type not in mime_to_norm.keys():
            # print("|" + mime_type + "|")
            errors = True
            converted_now = True
            result = 'Conversion not supported'
            append_txt_file(
                txt_target_path,
                result + ': ' + source_file_path + ' (' + mime_type + ')')
            row['norm_file_path'] = ''
            row['original_file_copy'] = ''
        else:
            keep_original = mime_to_norm[mime_type][0]
            if keep_original:
                originals = True

            if zip:
                keep_original = False

            function = mime_to_norm[mime_type][1]

            # Ensure unique file names in dir hierarchy:
            norm_ext = mime_to_norm[mime_type][2]
            if not norm_ext:
                norm_ext = 'none'

            if make_unique:
                norm_ext = (base64.b32encode(
                    bytes(str(count), encoding='ascii'))
                ).decode('utf8').replace('=', '').lower() + '.' + norm_ext

            target_dir = os.path.dirname(
                source_file_path.replace(base_source_dir, base_target_dir))
            normalized = file_convert(source_file_path, mime_type, function,
                                      target_dir, tmp_dir, None, norm_ext,
                                      version, ocr, keep_original, zip=zip)

            if normalized['result'] == 0:
                errors = True
                result = 'Conversion failed'
                append_txt_file(
                    txt_target_path,
                    result + ': ' + source_file_path + ' (' + mime_type + ')')
            elif normalized['result'] == 1:
                result = 'Converted successfully'
                converted_now = True
            elif normalized['result'] == 2:
                errors = True
                result = 'Conversion not supported'
                append_txt_file(
                    txt_target_path,
                    result + ': ' + source_file_path + ' (' + mime_type + ')')
            elif normalized['result'] == 3:
                if old_result not in ('Converted successfully',
                                      'Manually converted'):
                    result = 'Manually converted'
                    converted_now = True
                else:
                    result = old_result
            elif normalized['result'] == 4:
                converted_now = True
                errors = True
                result = normalized['error']
                append_txt_file(
                    txt_target_path,
                    result + ': ' + source_file_path + ' (' + mime_type + ')')
            elif normalized['result'] == 5:
                result = 'Not a document'

            if normalized['norm_file_path']:
                row['norm_file_path'] = relpath(normalized['norm_file_path'],
                                                base_target_dir)

            file_copy_path = normalized['original_file_copy']
            if file_copy_path:
                file_copy_path = relpath(file_copy_path, base_target_dir)
            row['original_file_copy'] = file_copy_path

        row['result'] = result
        row_values = list(row.values())

        # TODO: Fixed by adding escapechar='\\' in append_tsv_row -> will
        #       that cause problems later?
        # row_values = [r.replace('\n', ' ') for r in row_values if r is not None]
        append_tsv_row(tsv_target_path, row_values)

        if sample and count > 9:
            break

    if not sample:
        shutil.move(tsv_target_path, tsv_source_path)
    # TODO: Add an option so that if merge = True all files are copied to a
    #       top-level folder and empty subfolders are then deleted

    msg = None
    if sample:
        msg = 'Sample files converted.'
        if errors:
            msg = ("Not all sample files were converted. See '" +
                   txt_target_path + "' for details.")
    else:
        if converted_now:
            msg = 'All files converted successfully.'
            if errors:
                msg = ("Not all files were converted. See '" +
                       txt_target_path + "' for details.")
        else:
            msg = 'All files converted previously.'

    return msg, file_count, errors, originals
    # TODO: Fix so this is used instead for a final summary when several
    #       folders have been converted
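# Hedged usage sketch for convert_folder() above: all paths are placeholders.
# Note that the function returns two values on the early error paths and four
# values otherwise, so the result is captured as a single tuple here.
result = convert_folder('/data/archive/source', '/data/archive/target',
                        '/tmp/convert', tika=False, ocr=False)
print(result[0])  # status message ('Error' on the early-exit paths)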