def aggregate_collection(
    request: HttpRequest,
    collection_id: int,
) -> HttpResponse:
    """
    Value count computations could also be moved into a Celery task that
    prepares the answer and delivers it to the user later (via email or
    on a results page).
    """
    collection = get_object_or_404(StarWarsCollection, id=collection_id)
    table = etl.fromcsv(collection.filepath)
    aggregate_keys, parameters_settings = parse_parameters(
        request.GET.get(
            'current_parameters',
            '0000001001',
        ),
    )
    if len(aggregate_keys) == 1:
        # aggregate does not work correctly
        # if a list with 1 element is passed
        aggregate_keys = aggregate_keys[0]
    if len(aggregate_keys) == 0:
        # show no table if every option is disabled
        table = etl.empty()
    else:
        table = table.aggregate(key=aggregate_keys, aggregation=len)
    return render(
        request,
        'main/collection_aggregate.html',
        {
            'collection': collection,
            'parameters_settings': parameters_settings,
            'headers': etl.header(table),
            'data': etl.data(table),
        },
    )
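# The docstring above suggests offloading the aggregation to Celery. A
# minimal sketch of such a task, assuming a configured Celery app; the
# `notify_user_with_results` helper is hypothetical and not part of the
# original code.
from celery import shared_task


@shared_task
def aggregate_collection_task(collection_id, current_parameters):
    collection = StarWarsCollection.objects.get(id=collection_id)
    table = etl.fromcsv(collection.filepath)
    aggregate_keys, _ = parse_parameters(current_parameters)
    if len(aggregate_keys) == 1:
        aggregate_keys = aggregate_keys[0]
    if len(aggregate_keys) == 0:
        table = etl.empty()
    else:
        table = table.aggregate(key=aggregate_keys, aggregation=len)
    # Hypothetical delivery step: email the result or persist it for a
    # results page.
    notify_user_with_results(collection_id, etl.listoflists(table))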
def test_empty():
    actual = (
        etl.empty()
        .addcolumn('foo', ['a', 'b', 'c'])
        .addcolumn('bar', [1, 2, 2])
    )
    expect = (('foo', 'bar'),
              ('a', 1),
              ('b', 2),
              ('c', 2))
    ieq(expect, actual)
    # compare a second time to check the table can be iterated repeatedly
    ieq(expect, actual)
def save_characters_to_file(generated_file_path, characters_pages):
    etl.setheader(
        etl.empty(),
        settings.STAR_WARS_CHARACTERS_OUTPUT_FILE_HEADER_FIELDS,
    ).tocsv(generated_file_path)
    logger.info('Created file: %s', generated_file_path)
    for characters_page in characters_pages:
        etl.appendcsv(
            characters_page,
            generated_file_path,
            write_header=False,
        )
        logger.info('Added data to file: %s', generated_file_path)
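# A usage sketch for the streaming writer above. `characters_pages` can be
# any iterable of petl tables whose header matches the configured output
# fields; the sample data below is hypothetical.
def example_save_characters():
    chunks = [
        [{'name': 'Luke Skywalker', 'height': '172'}],
        [{'name': 'Leia Organa', 'height': '150'}],
    ]
    pages = (
        etl.fromdicts(chunk, header=['name', 'height'])
        for chunk in chunks
    )
    save_characters_to_file('/tmp/characters.csv', pages)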
list(d)

# records()
###########

import petl as etl
table = [['foo', 'bar'],
         ['a', 1],
         ['b', 2]]
d = etl.records(table)
d
list(d)

# rowgroupby()
##############

import petl as etl
table1 = [['foo', 'bar', 'baz'],
          ['a', 1, True],
          ['b', 3, True],
          ['b', 2]]
# group entire rows
for key, group in etl.rowgroupby(table1, 'foo'):
    print(key, list(group))
# group specific values
for key, group in etl.rowgroupby(table1, 'foo', 'bar'):
    print(key, list(group))

# empty()
#########

import petl as etl
table = (
    etl.empty()
    .addcolumn('foo', ['A', 'B'])
    .addcolumn('bar', [1, 2])
)
table
def init(release_dir, load_geneset=False, geneset_attributes=None):
    """Initialise data resources.

    Parameters
    ----------
    release_dir : string
        Local filesystem path where data from the release are stored.
    load_geneset : bool
        If True, load geneset into memory.
    geneset_attributes : dict-like
        Attributes to load.

    """

    # reference sequence
    ####################

    global genome_agamp3, genome_agamp4, genome_dir
    genome_dir = os.path.join(release_dir, 'genome')
    genome_agamp3_dir = os.path.join(genome_dir, 'agamP3')
    genome_agamp3_fn = os.path.join(
        genome_agamp3_dir, 'Anopheles-gambiae-PEST_CHROMOSOMES_AgamP3.fa')
    if os.path.exists(genome_agamp3_fn):
        genome_agamp3 = pyfasta.Fasta(genome_agamp3_fn,
                                      key_fn=lambda v: v.split()[0])
    genome_agamp4_dir = os.path.join(genome_dir, 'agamP4')
    genome_agamp4_fn = os.path.join(
        genome_agamp4_dir, 'Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa')
    if os.path.exists(genome_agamp4_fn):
        genome_agamp4 = pyfasta.Fasta(genome_agamp4_fn,
                                      key_fn=lambda v: v.split()[0])

    # genome annotations
    ####################

    global geneset_agamp44_fn, geneset_agamp44, geneset_dir
    geneset_dir = os.path.join(release_dir, 'geneset')
    geneset_agamp44_fn = os.path.join(
        geneset_dir,
        'Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.4.sorted.gff3.gz')
    if load_geneset:
        geneset_agamp44 = allel.FeatureTable.from_gff3(
            geneset_agamp44_fn, attributes=geneset_attributes)

    # variant callsets
    ##################

    global callset, callset_pass, callset_pass_biallelic, variation_dir, \
        callset_snpeff_agamp42
    variation_dir = os.path.join(release_dir, 'variation')

    # main callset
    callset_h5_fn = os.path.join(variation_dir, 'main', 'hdf5', 'all',
                                 'ag1000g.phase2.ar1.h5')
    callset_lite_h5_fn = os.path.join(variation_dir, 'main', 'hdf5', 'lite',
                                      'ag1000g.phase2.ar1.lite.h5')
    callset_zarr_fn = os.path.join(variation_dir, 'main', 'zarr', 'all',
                                   'ag1000g.phase2.ar1')
    # preference: zarr > hdf5 > hdf5 (lite)
    if os.path.exists(callset_zarr_fn):
        callset = zarr.open_group(callset_zarr_fn, mode='r')
    elif os.path.exists(callset_h5_fn):
        callset = h5py.File(callset_h5_fn, mode='r')
    elif os.path.exists(callset_lite_h5_fn):
        callset = h5py.File(callset_lite_h5_fn, mode='r')

    # main callset, PASS variants only
    callset_pass_h5_fn = os.path.join(variation_dir, 'main', 'hdf5', 'pass',
                                      'ag1000g.phase2.ar1.pass.h5')
    callset_pass_lite_h5_fn = os.path.join(
        variation_dir, 'main', 'hdf5', 'lite',
        'ag1000g.phase2.ar1.pass.lite.h5')
    callset_pass_zarr_fn = os.path.join(variation_dir, 'main', 'zarr', 'pass',
                                        'ag1000g.phase2.ar1.pass')
    # preference: zarr > hdf5 > hdf5 (lite)
    if os.path.exists(callset_pass_zarr_fn):
        callset_pass = zarr.open_group(callset_pass_zarr_fn, mode='r')
    elif os.path.exists(callset_pass_h5_fn):
        callset_pass = h5py.File(callset_pass_h5_fn, mode='r')
    elif os.path.exists(callset_pass_lite_h5_fn):
        callset_pass = h5py.File(callset_pass_lite_h5_fn, mode='r')

    # main callset, PASS biallelic variants only
    callset_pass_biallelic_h5_fn = os.path.join(
        variation_dir, 'main', 'hdf5', 'biallelic',
        'ag1000g.phase2.ar1.pass.biallelic.h5')
    callset_pass_biallelic_lite_h5_fn = os.path.join(
        variation_dir, 'main', 'hdf5', 'lite',
        'ag1000g.phase2.ar1.pass.biallelic.lite.h5')
    callset_pass_biallelic_zarr_fn = os.path.join(
        variation_dir, 'main', 'zarr', 'biallelic',
        'ag1000g.phase2.ar1.pass.biallelic')
    # preference: zarr > hdf5 > hdf5 (lite)
    if os.path.exists(callset_pass_biallelic_zarr_fn):
        callset_pass_biallelic = zarr.open_group(
            callset_pass_biallelic_zarr_fn, mode='r')
    elif os.path.exists(callset_pass_biallelic_h5_fn):
        callset_pass_biallelic = h5py.File(callset_pass_biallelic_h5_fn,
                                           mode='r')
    elif os.path.exists(callset_pass_biallelic_lite_h5_fn):
        callset_pass_biallelic = h5py.File(
            callset_pass_biallelic_lite_h5_fn, mode='r')

    # SNPEFF annotations
    callset_snpeff_agamp42_h5_fn_template = os.path.join(
        variation_dir, 'main', 'hdf5', 'all_snpeff',
        'ag1000g.phase2.ar1.snpeff.AgamP4.2.{chrom}.h5')
    # work around broken link file
    callset_snpeff_agamp42 = dict()
    for chrom in '2L', '2R', '3L', '3R', 'X':
        fn = callset_snpeff_agamp42_h5_fn_template.format(chrom=chrom)
        if os.path.exists(fn):
            callset_snpeff_agamp42[chrom] = h5py.File(fn, mode='r')[chrom]

    # accessibility
    ###############

    global accessibility, accessibility_dir
    accessibility_dir = os.path.join(release_dir, 'accessibility')
    accessibility_fn = os.path.join(accessibility_dir, 'accessibility.h5')
    if os.path.exists(accessibility_fn):
        accessibility = h5py.File(accessibility_fn, mode='r')

    # sample metadata
    #################

    global tbl_samples, lkp_samples, sample_ids, df_samples, samples_dir
    samples_dir = os.path.join(release_dir, 'samples')
    samples_fn = os.path.join(samples_dir, 'samples.meta.txt')
    if os.path.exists(samples_fn):
        tbl_samples = (
            etl.fromtsv(samples_fn)
            .convert(('year', 'n_sequences'), int)
            .convert(('mean_coverage',), float)
        )
        lkp_samples = tbl_samples.recordlookupone('ox_code')
        sample_ids = tbl_samples.values('ox_code').list()
        df_samples = pandas.read_csv(samples_fn, sep='\t',
                                     index_col='ox_code')

    # extras
    ########

    global allele_counts
    extras_dir = os.path.join(release_dir, 'extras')

    # allele counts
    allele_counts_fn = os.path.join(extras_dir, 'allele_counts.h5')
    if os.path.exists(allele_counts_fn):
        allele_counts = h5py.File(allele_counts_fn, mode='r')

    # haplotypes
    ############

    global haplotypes_dir, callset_phased, tbl_haplotypes, df_haplotypes, \
        lkp_haplotypes
    haplotypes_dir = os.path.join(release_dir, 'haplotypes')

    # no HDF5 link file, load up as dict for now
    callset_phased_hdf5_fn_template = os.path.join(
        haplotypes_dir, 'main', 'hdf5',
        'ag1000g.phase2.ar1.haplotypes.{chrom}.h5')
    callset_phased = dict()
    for chrom in '2L', '2R', '3L', '3R', 'X':
        fn = callset_phased_hdf5_fn_template.format(chrom=chrom)
        if os.path.exists(fn):
            callset_phased[chrom] = h5py.File(fn, mode='r')[chrom]

    # no haplotypes file, create here for now
    # TODO source this from file Nick has created
    if '3R' in callset_phased:
        phased_samples = callset_phased['3R']['samples'][:].astype('U')
        haplotype_labels = list(itertools.chain(
            *[[s + 'a', s + 'b'] for s in phased_samples]))
        tbl_haplotypes = (
            etl.empty()
            .addcolumn('label', haplotype_labels)
            .addrownumbers(start=0)
            .rename('row', 'index')
            .addfield('ox_code', lambda row: row.label[:-1])
            .hashleftjoin(tbl_samples, key='ox_code')
            .addfield('label_aug', lambda row: '%s [%s, %s, %s, %s]' % (
                row.label, row.country, row.location, row.m_s, row.sex))
        )
        lkp_haplotypes = tbl_haplotypes.recordlookupone('label')
        df_haplotypes = tbl_haplotypes.todataframe(index='index')
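# A minimal usage sketch for init(); the release path below is hypothetical,
# and each module-level global is only populated when the corresponding file
# exists on disk.
init('/data/ag1000g/phase2/AR1',
     load_geneset=True,
     geneset_attributes=['ID', 'Parent'])
print(sample_ids[:5])          # first few sample identifiers
print(sorted(callset_phased))  # chromosomes with phased haplotype data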
import yaml

parser = argparse.ArgumentParser(
    description='Parse an xml file to csv via a yaml config.')
parser.add_argument('-f', metavar='xml', help='path of input xml file')
parser.add_argument('-t', metavar='csv', help='path of output csv file')
parser.add_argument('-e', action='store_true',
                    help='add quotes to header to fit SQL pattern')
parser.add_argument('config', help='path of config yaml file')
args = parser.parse_args()

# use safe_load to avoid executing arbitrary tags from the config file
with open(args.config) as config_file:
    info = yaml.safe_load(config_file)
xml_file = args.f or info['xml']
csv_file = args.t or info['csv']

table = petl.empty()
# substitute namespace placeholders into the keys, if a namespace is given
keys = (
    eval(Template(str(info['keys'])).substitute(**info['namespace']))
    if 'namespace' in info else info['keys']
)
for key in keys:
    # collect data from each key
    table = table.cat(petl.fromxml(xml_file, key['anchor'], key['select']))
if 'pks' in info:
    table = table.mergeduplicates(
        info['pks'] if len(info['pks']) > 1 else info['pks'][0])
if 'orderBy' in info:
    table = table.sort(info['orderBy'])
if 'skip' in info:
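# For reference, a hypothetical config matching the fields the script above
# reads ('xml', 'csv', 'namespace', 'keys', 'pks', 'orderBy'); shown here as
# the Python dict yaml.safe_load would produce, with invented values.
example_info = {
    'xml': 'input.xml',
    'csv': 'output.csv',
    'namespace': {'ns': '{http://example.com/schema}'},
    'keys': [
        # each key names an anchor element and the fields to select from it;
        # ${ns} placeholders are filled in via string.Template above
        {'anchor': '${ns}record',
         'select': {'id': '${ns}id', 'name': '${ns}name'}},
    ],
    'pks': ['id'],
    'orderBy': 'id',
}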
def dicts2table(dicts):
    """Transform dicts into a ``petl.util.base.Table``."""
    return petl.wrap(petl.fromdicts(dicts)) if dicts else petl.empty()
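# Usage sketch: the guard above means an empty input yields petl.empty()
# rather than calling petl.fromdicts on no data.
rows = [{'foo': 'a', 'bar': 1}, {'foo': 'b', 'bar': 2}]
dicts2table(rows).nrows()  # 2
dicts2table([]).nrows()    # 0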
def _get_eval_modes(self):
    return etl.empty()
def extract(self, task, job_config):
    return petl.empty()
def _get_artifacts(self):
    # TODO: double check that this should be empty
    # TODO: see if there is any point in keeping teacher adv
    return etl.empty()
def _get_tasks(self):
    return etl.empty()
def _get_items(self):
    return etl.empty()
def _get_artifacts(self):
    return etl.empty()
def _get_answers(self):
    # NOTE: `comments` is loaded but never used; the method currently
    # returns an empty table regardless.
    comments = (
        etl.fromcsv(f'{self._dirc}/assessment_result.csv', delimiter=',')
        .listoflists()
    )
    return etl.empty()