class SyncColumn(WrapperTask):
    '''
    Sync every table touched by the columns whose ID contains ``keywords``;
    when no column matches, sync the tables whose ID contains it instead.
    '''

    keywords = Parameter()

    def requires(self):
        '''
        Yield one ``SyncData`` task per table to upload.

        Columns are searched first (case-insensitive substring match on the
        column ID). Tables are only searched when no column matched.

        :raises Exception: when neither a column nor a table ID matches.
        '''
        session = current_session()
        pattern = '%' + self.keywords + '%'

        matching_columns = session.query(OBSColumn).filter(
            OBSColumn.id.ilike(pattern))
        if matching_columns.count():
            for column in matching_columns:
                for column_table in column.tables:
                    yield SyncData(exact_id=column_table.table.id)
            return

        # No column matched: fall back to matching table IDs directly.
        matching_tables = session.query(OBSTable).filter(
            OBSTable.id.ilike(pattern))
        if not matching_tables.count():
            raise Exception('Unable to find any tables or columns with ID '
                            'that matched "{keywords}" via ILIKE'.format(
                                keywords=self.keywords))
        for table in matching_tables:
            yield SyncData(exact_id=table.id)
class ImportEnglandWalesLocal(TempTableTask):
    '''
    Load ``<table>DATA.CSV`` from the downloaded England & Wales bundle into
    a temporary table keyed on ``geographycode``.
    '''

    table = Parameter()

    def requires(self):
        return DownloadEnglandWalesLocal()

    def run(self):
        '''
        Create the output table from the CSV header, bulk-load the file with
        ``COPY`` and add the primary key.
        '''
        session = current_session()
        csv_path = os.path.join(self.input().path, self.table + 'DATA.CSV')

        # The first header field is the geography code; every remaining
        # field becomes a NUMERIC column.
        header_line = shell('head -n 1 {csv}'.format(csv=csv_path))
        numeric_columns = []
        for header in header_line.split(',')[1:]:
            numeric_columns.append('{} NUMERIC'.format(header))

        create_stmt = 'CREATE TABLE {output} (GeographyCode TEXT, {cols})'.format(
            output=self.output().table,
            cols=', '.join(numeric_columns)
        )
        session.execute(create_stmt)
        # Commit before the load: the COPY below runs in a separate psql
        # process, not in this session.
        session.commit()

        copy_cmd = "cat '{infile}' | psql -c 'COPY {output} FROM STDIN WITH CSV HEADER'".format(
            output=self.output().table,
            infile=csv_path,
        )
        shell(copy_cmd)

        primary_key_stmt = 'ALTER TABLE {output} ADD PRIMARY KEY (geographycode)'.format(
            output=self.output().table
        )
        session.execute(primary_key_stmt)
class TarballTask(Task):
    """
    Package the output of another task into an uncompressed tarball.

    The upstream task must produce a target exposing a local ``path`` (file
    or directory). The archive contains that path under its base name.
    """

    describe = "Package a task's output into an uncompressed tarball."
    upstream_task = TaskParameter(
        description="Task that produces a local file")
    output_path = Parameter(description="Where the output archive should go")

    def requires(self):
        return self.upstream_task

    def run(self):
        """
        Create the tar archive at ``output_path``.

        The archive is written to a temporary path and moved into place only
        once it is complete, so an interrupted run never leaves a partial
        archive that would make the task look complete.

        :raises FileNotFoundError: when the upstream path does not exist on
            the local filesystem.
        """
        input_path = self.input().path
        output = self.output()
        output_path = output.path

        if not os.path.exists(input_path):
            raise FileNotFoundError(
                "{input_path}: no such file or directory: should be a *local* file/dir to be archived"
                .format(input_path=input_path))

        logger.info(
            "Putting {input_path} into a tar located at {output_path}".format(
                input_path=input_path, output_path=output_path))

        with output.temporary_path() as tmp_path:
            with tarfile.open(tmp_path, "w") as tar:
                tar.add(input_path, arcname=os.path.basename(input_path))

        logger.info("{output_path}: tar created: size = {size} bytes".format(
            output_path=output_path, size=os.stat(output_path).st_size))

    def output(self):
        return LocalTarget(self.output_path)
class DownloadUK(Task):
    '''
    Download one Nomis bulk CSV per geography type and concatenate them into
    a single ``<table>.csv`` inside the output directory.

    ``GEO_TYPES`` is read from ``self`` but is not defined in this class.
    '''

    API_URL = 'https://www.nomisweb.co.uk/api/v01/dataset/def.sdmx.json?search={}*'
    DOWNLOAD_URL = 'https://www.nomisweb.co.uk/api/v01/dataset/{id}.bulk.csv?time=2011&measures=20100&geography={geo}'

    table = Parameter()

    def version(self):
        return 1

    def requires(self):
        '''
        Return a ``{geo: RepoFile}`` mapping, one download per geography.

        The dataset ID is resolved from the table name with a live request to
        the Nomis search API each time this method is called.
        '''
        # Query API, extract table ID from name
        response = requests.get(self.API_URL.format(self.table))
        keyfamily = response.json()['structure']['keyfamilies']['keyfamily'][0]
        api_id = keyfamily['id'].lower()

        return {
            geo: RepoFile(resource_id='{task_id}_{geo}'.format(task_id=self.task_id, geo=geo),
                          version=self.version(),
                          url=self.DOWNLOAD_URL.format(id=api_id, geo=geo))
            for geo in self.GEO_TYPES
        }

    def run(self):
        '''
        Concatenate the per-geography downloads, keeping only the header row
        of the first file.
        '''
        # Download for SA (EW,S) and OA (NI) in a single file
        with self.output().temporary_path() as tmp:
            with open(os.path.join(tmp, '{}.csv'.format(self.table)), 'wb') as outcsv:
                for position, geo in enumerate(self.GEO_TYPES):
                    with open(self.input()[geo].path, 'rb') as remote_file:
                        if position > 0:
                            # Every file after the first repeats the header.
                            next(remote_file)
                        outcsv.writelines(remote_file)

    def output(self):
        return DirectoryTarget(self)
class Sample(object):
    '''
    Lazy accessors for a sample record and its parent experiment record.

    Each record is fetched through ``client`` on first access and cached on
    the instance (``_sample`` / ``_experiment``).
    '''

    sample_id = Parameter()

    @property
    def sample(self):
        '''The ``fields`` of the sample record named ``sample_id``.'''
        if hasattr(self, '_sample'):
            return self._sample
        record = client.get_record_by_name(
            self.sample_id, cfg['AIRTABLE_SAMPLE_TABLE'])
        self._sample = record['fields']
        return self._sample

    @property
    def sample_folder(self):
        '''Relative folder for this sample: ``<experiment name>/<sample id>``.'''
        template = '{expt}/{sample}'
        # ``bucket`` is looked up and passed along, but the template does not
        # reference it.
        return template.format(bucket=cfg['S3_BUCKET'],
                               expt=self.experiment['Name'],
                               sample=self.sample_id)

    @property
    def experiment(self):
        '''The ``fields`` of the first experiment linked from the sample.'''
        if hasattr(self, '_experiment'):
            return self._experiment
        expt_key = self.sample['Experiment'][0]
        record = client.get_record(expt_key, cfg['AIRTABLE_EXPT_TABLE'])
        self._experiment = record['fields']
        return self._experiment
class StartIndexCollection(WrapperTask):
    """
    Entry-point wrapper that forwards its parameters to ``PruneIndexes``.

    ``complete()`` always returns False, so this wrapper is never considered
    done on its own.
    """

    collection_name = Parameter(default='dummy')
    data_folder = Parameter(default='/data')
    hadoop_jar = Parameter(
        default="/opt/searcher/scripts/nutchwax-job-0.11.0-SNAPSHOT.jar")
    hadoop_bin = Parameter(default="/opt/searcher/hadoop/bin/hadoop")
    lucene_jar = Parameter(
        default="/opt/searcher/scripts/lib/pwalucene-1.0.0-SNAPSHOT.jar")
    document_server = Parameter(default="p64.arquivo.pt")

    def complete(self):
        return False

    def requires(self):
        """Return the ``PruneIndexes`` task configured with this task's parameters."""
        prune_kwargs = dict(
            collection_name=self.collection_name,
            data_folder=self.data_folder,
            hadoop_jar=self.hadoop_jar,
            lucene_jar=self.lucene_jar,
            hadoop_bin=self.hadoop_bin,
            document_server=self.document_server,
        )
        return PruneIndexes(**prune_kwargs)
class StaticJSON(ExternalTask):
    """External JSON file expected at ``data/<filename>.json``."""

    filename = Parameter()

    def output(self):
        """Return the local target for the JSON file."""
        json_path = "data/{}.json".format(self.filename)
        return LocalTarget(json_path)
class AnalyzeModelResults(Task):  # pragma: no cover
    """
    Query the trained Word2Vec model for the words closest to a search term.

    The 30 most similar words returned by the model are written to a CSV
    file and then printed.
    """

    # Root directory under which result files are written
    RESULTS_ROOT = Parameter(
        default=os.path.abspath(os.path.join("data", "models")))

    # File name of the CSV holding the similarity results
    model_results_target_file = Parameter(default="model_results.csv")

    # Term whose nearest neighbours are looked up in the model
    search_term = Parameter(default="news")

    def requires(self):
        return self.clone(FeedToModel)

    def output(self):
        """
        Return the results target:
        ``<RESULTS_ROOT>/<channel_author>_<query>_captions/<results file>``.
        """
        # NOTE(review): ``channel_author`` and ``query`` are read from self
        # but are not declared in this class -- confirm where they come from.
        captions_dir = '_'.join(
            [str(self.channel_author), str(self.query), 'captions'])
        path_parts = [
            str(self.RESULTS_ROOT),
            captions_dir,
            str(self.model_results_target_file),
        ]
        return SuffixPreservingLocalTarget(os.path.join(*path_parts))

    def run(self):
        """Load the model, look up the similar words, store and print them."""
        # The model is loaded by file name; the open handle is only used to
        # obtain that name.
        with self.input().open(mode="r") as model_file:
            model = Word2Vec.load(model_file.name)

        term = str(self.search_term)
        neighbours = model.similar_by_word(term, topn=30)

        results = pd.DataFrame(neighbours, columns=["word", "probability"])

        with self.output().open(mode="w") as csv_file:
            results.to_csv(csv_file)

        self.print_results()

    def print_results(self):
        """Print the stored results CSV."""
        print(pd.read_csv(self.output().path))
class VaccineDataGlobalCleanupTask(Task):
    """Clean the global vaccine time-series data and store it as Parquet.

    Rows without a country or without a date are dropped, and missing values
    in the dose/people count columns are replaced by 0 before the columns are
    converted to integers.

    Parameters:
        subset: bool, True to process only the first partition, False to
            process the entire dataset. Default: True.
        data_root: str, base directory of the cleaned output.

    Output:
        Dask dataframe written as gzip-compressed Parquet.
    """

    subset = BoolParameter(default=True)
    data_root = Parameter(default="./data/vaccine/")

    # Upstream dependency, declared through the Requires/Requirement helpers
    requires = Requires()
    input_data = Requirement(VaccineDataGlobalTask)

    # Output location is derived from data_root and the subset flag
    output = TargetOutput(
        "{task.data_root}",
        ext="subset-{task.subset}/",
        target_class=ParquetTarget,
        flag="_SUCCESS",
        storage_options=None,
    )

    def run(self):
        """Read the upstream data, clean it and write the Parquet output."""
        count_columns = [
            "Doses_admin",
            "People_partially_vaccinated",
            "People_fully_vaccinated",
        ]

        # The count columns contain missing values, so they are read as
        # floats first and only cast to int after the gaps are filled.
        float_dtypes = {c: "float" for c in count_columns}
        frame = self.input()["input_data"].read_dask(
            parse_dates=["Date"], dtype=float_dtypes)

        if self.subset:
            frame = frame.get_partition(0)

        # Drop rows without a country, then rows without a valid date.
        frame = frame[frame.Country_Region.notnull()]
        frame = frame[frame.Date.notnull()]

        # The index is left as-is; no set_index on Country_Region is done.
        frame[count_columns] = frame[count_columns].fillna(0).astype(int)

        self.output().write_dask(frame, compression="gzip")
class BaseParams(metaclass=ABCMeta):
    """Abstract base declaring the shared ``resolution`` and ``survey`` parameters."""

    # Geography resolution to process; defaults to GEO_PR.
    resolution = Parameter(default=GEO_PR)
    # Survey to process; defaults to SURVEY_CEN.
    survey = Parameter(default=SURVEY_CEN)
class ImportAllSurveys(WrapperTask):
    """Wrapper requiring ``ImportData`` for every survey at one resolution."""

    resolution = Parameter(default=GEO_PR)

    def requires(self):
        """Lazily produce one ``ImportData`` task per entry in ``SURVEYS``."""
        return (ImportData(resolution=self.resolution, survey=survey_name)
                for survey_name in SURVEYS)
class SoiaMetricsFetcher(sqla.CopyToTable):
    """
    Copy metric datapoints for each SOIA (start, end) window into the
    ``soia_with_values`` table of the local SQLite database.

    For every distinct window in the ``soia`` table and every preloaded
    metric, one row is produced holding the datapoints that fall inside the
    window together with the metric's full datapoint list (both as JSON).
    """

    columns = [
        (["id", Integer()], {"autoincrement": True, "primary_key": True}),
        (["start", BigInteger()], {}),
        (["end", BigInteger()], {}),
        (["insert_date", BigInteger()], {}),
        (["path", Text()], {}),
        (["metric_anomaly", Text()], {}),
        (["metric_whole", Text()], {}),
    ]
    connection_string = "sqlite:///data/soia_email.db"
    table = "soia_with_values"

    path = Parameter()
    date = DateParameter()

    def requires(self):
        # NOTE(review): the email fetcher is given the current time rather
        # than ``self.date`` -- confirm this is intended.
        return SoiaEmailFetcher(date=datetime.now()), MetricFetcher(
            path_prefix=self.path)

    def copy(self, conn, ins_rows, table):
        """Insert ``ins_rows``, binding every column except ``id``."""
        bound_cols = {
            c: bindparam("_" + c.key)
            for c in table.columns if c.key != "id"
        }
        ins = table.insert().values(bound_cols)
        conn.execute(ins, ins_rows)

    def rows(self):
        """Yield the de-duplicated rows, stamped with the insertion time."""
        for start, end, path, metric, whole in deduplicated(
                self.generate_rows()):
            # "auto" fills the id slot; ``copy`` does not bind that column.
            yield "auto", start, end, datetime.now().strftime(
                '%s'), path, metric, whole

    def generate_rows(self):
        """
        Build ``(start, end, target, window_datapoints, all_datapoints)``
        tuples for every window that starts and ends within the last 14 days.

        :return: list of tuples; empty when there are no usable windows or no
            metrics.
        """
        _14_days_ago = int(
            (datetime.now() - timedelta(days=14)).strftime('%s'))

        _, preloaded_metrics = self.input()
        # Close the metrics file deterministically instead of leaking it.
        with preloaded_metrics.open('r') as metrics_file:
            metrics = json.loads(metrics_file.read())

        conn = sqlite3.connect('data/soia_email.db')
        try:
            cursor = conn.cursor()
            cursor.execute("select distinct start, end from soia;")
            rows = cursor.fetchall()
        finally:
            # Release the connection even when the query fails.
            conn.close()

        formed_rows = []
        for start, end in rows:
            if start < _14_days_ago or end < _14_days_ago:
                logging.warning(
                    f"date to early :C - {datetime.fromtimestamp(start)}, {datetime.fromtimestamp(end)}"
                )
            else:
                # NOTE(review): windows older than 14 days are logged and
                # skipped here -- confirm this nesting matches the intent.
                logging.info(
                    f"date good to go! - {datetime.fromtimestamp(start)}, {datetime.fromtimestamp(end)}"
                )
                for metric in metrics:
                    datapoints = metric['datapoints']
                    # Element 1 of each datapoint is compared against the
                    # window bounds.
                    shorter = [
                        tup for tup in datapoints if start <= tup[1] <= end
                    ]
                    formed_rows.append(
                        (start, end, metric['target'], json.dumps(shorter),
                         json.dumps(datapoints)))

        # Indexing an empty list used to raise IndexError here.
        if formed_rows:
            print(len(formed_rows[0]))
        return formed_rows
class ImagesForMeasure(Task):
    '''
    Generate a set of static images for a measure.

    For each (center, zoom, boundary) entry configured for the measure's
    country, a choropleth map is requested from the CARTO maps API. The
    resulting images are pasted side by side into a single PNG stored at
    ``catalog/img/<measure>.png``.
    '''

    MAP_URL = '{cartodb_url}/api/v1/map'.format(
        cartodb_url=os.environ['CARTODB_URL'])

    BASEMAP = {
        "type": "http",
        "options": {
            "urlTemplate": "http://{s}.basemaps.cartocdn.com/light_nolabels/{z}/{x}/{y}.png",
            "subdomains": "abcd",
        }
    }

    LABELS = {
        "type": "http",
        "options": {
            "urlTemplate": "http://{s}.basemaps.cartocdn.com/light_only_labels/{z}/{x}/{y}.png",
            "subdomains": "abcd",
        }
    }

    # country prefix -> list of (center, zoom, boundary id or None)
    CENTER_ZOOM_BOUNDS = {
        'es': [
            ((40.4139017, -3.7350414), 6, None,),
            ((40.4139017, -3.7350414), 8, None,),
            ((40.4139017, -3.7350414), 11, None,),
            ((40.4139017, -3.7050414), 13, None,),
        ],
        'mx': [
            ((22.979, -101.777), 4, 'mx.inegi.entidad',),
            ((19.316, -99.152), 7, 'mx.inegi.municipio',),
            ((19.441989391028706, -99.14474487304688), 11, 'mx.inegi.ageb',),
            ((19.441989391028706, -99.14474487304688), 13, 'mx.inegi.manzana',),
        ],
        'uk': [
            ((52.51622086393074, -1.197509765625), 5, None,),  # All England
            ((51.50190410761811, -0.120849609375), 9, None,),  # London
            ((52.47274306920925, -3.982543945312), 7, None,),  # Wales
            ((53.491313790532956, -2.9706787109375), 9, None,),  # Manchester
        ],
        'us': [
            ((37.996162679728116, -97.6904296875), 3,
             'us.census.tiger.state_clipped',),
            ((38.16911413556086, -114.884033203125), 5,
             'us.census.tiger.county_clipped',),
            ((37.75225820732333, -122.11584777832031), 9,
             'us.census.tiger.census_tract_clipped',),
            ((37.75225820732333, -122.44584777832031), 12,
             'us.census.tiger.block_group_clipped',),
        ],
    }

    # CartoCSS colour variables, keyed by the measure's unit tag id
    PALETTES = {
        'tags.people': '''
@5:#6c2167;
@4:#a24186;
@3:#ca699d;
@2:#e498b4;
@1:#f3cbd3;''',
        'tags.money': '''
@5:#1d4f60;
@4:#2d7974;
@3:#4da284;
@2:#80c799;
@1:#c4e6c3;''',
        'tags.households': '''
@5:#63589f;
@4:#9178c4;
@3:#b998dd;
@2:#dbbaed;
@1:#f3e0f7;''',
        'tags.housing': '''
@5:#2a5674;
@4:#45829b;
@3:#68abb8;
@2:#96d0d1;
@1:#d1eeea;''',
        'tags.ratio': '''
@5:#eb4a40;
@4:#f17854;
@3:#f59e72;
@2:#f9c098;
@1:#fde0c5;''',
        'tags.segmentation': '''
@1:#7F3C8D;
@2:#11A579;
@3:#3969AC;
@4:#F2B701;
@5:#E73F74;
@6:#80BA5A;
@7:#E68310;
@8:#008695;
@9:#CF1C90;
@10:#f97b72;
@11:#A5AA99;''',
    }

    measure = Parameter()
    force = BooleanParameter(default=False)

    def __init__(self, *args, **kwargs):
        '''
        When ``force`` is passed as a truthy keyword, delete any existing
        output image before the task is initialised.
        '''
        if kwargs.get('force'):
            # Parameters are not set on self yet (super().__init__ runs
            # below), so the measure is handed to output() explicitly.
            target_path = self.output(measure=kwargs['measure']).path
            try:
                os.unlink(target_path)
            except OSError:
                # Best effort: nothing to remove, or not removable.
                pass
        super(ImagesForMeasure, self).__init__(*args, **kwargs)

    def _generate_config(self, zoom, lon, lat, boundary=None):
        '''
        Build the map configuration (basemap + choropleth layer) for the
        measure at the given zoom and center.

        :param zoom: zoom level, returned unchanged in the config
        :param lon: first coordinate of the center, returned unchanged
        :param lat: second coordinate of the center, returned unchanged
        :param boundary: optional geometry id restricting the metadata lookup
        :return: dict with ``layers``, ``center`` and ``zoom`` keys
        :raises Exception: when no metadata exists for the measure or the
            stats query fails
        '''
        layers = []
        layers.append(self.BASEMAP)
        session = current_session()
        measure = session.query(OBSColumn).get(self.measure)

        mainquery = '''
SELECT numer_aggregate, numer_type,
       numer_colname, numer_geomref_colname,
       numer_tablename,
       geom_geomref_colname,
       geom_colname, geom_tablename,
       denom_colname, denom_tablename, denom_geomref_colname
FROM observatory.obs_meta
WHERE numer_id = '{measure}' {boundary_clause}
ORDER BY geom_weight DESC, numer_timespan DESC, geom_colname DESC;
'''
        query = mainquery.format(
            measure=self.measure,
            boundary_clause="AND geom_id = '{}'".format(boundary) if boundary else '')

        resp = session.execute(query)
        results = resp.fetchone()

        # how should we determine fallback resolution?
        if results is None:
            query = mainquery.format(measure=self.measure, boundary_clause="")
            resp = session.execute(query)
            results = resp.fetchone()

        if results is None:
            # Without this guard the unpacking below fails with an opaque
            # TypeError.
            raise Exception(
                'No metadata found in observatory.obs_meta for measure '
                '"{}"'.format(self.measure))

        numer_aggregate, numer_type, numer_colname, numer_geomref_colname, \
            numer_tablename, geom_geomref_colname, geom_colname, \
            geom_tablename, denom_colname, \
            denom_tablename, denom_geomref_colname = results

        if denom_colname:
            # Measure with a denominator: map the ratio numer / denom.
            cartosql = (
                "SELECT geom.cartodb_id, geom.{geom_colname} as the_geom, "
                "geom.the_geom_webmercator, "
                "numer.{numer_colname} / NULLIF(denom.{denom_colname}, 0) measure "
                "FROM {geom_tablename} as geom, {numer_tablename} as numer, "
                " {denom_tablename} as denom "
                "WHERE geom.{geom_geomref_colname} = numer.{numer_geomref_colname} "
                " AND numer.{numer_geomref_colname} = denom.{denom_geomref_colname} ")
            statssql = (
                "SELECT "
                'CDB_HeadsTailsBins(array_agg(distinct( '
                ' (numer.{numer_colname} / '
                ' NULLIF(denom.{denom_colname}, 0))::NUMERIC)), 4) as "headtails" '
                "FROM {geom_tablename} as geom, "
                " {numer_tablename} as numer, "
                " {denom_tablename} as denom "
                "WHERE geom.{geom_geomref_colname} = numer.{numer_geomref_colname} "
                " AND numer.{numer_geomref_colname} = denom.{denom_geomref_colname} ")
        elif numer_aggregate == 'sum':
            # Summable measure without denominator: normalise by area.
            cartosql = (
                "SELECT geom.cartodb_id, geom.{geom_colname} as the_geom, "
                "geom.the_geom_webmercator, "
                "numer.{numer_colname} / "
                " ST_Area(geom.the_geom) * 1000000.0 measure "
                "FROM {geom_tablename} as geom, {numer_tablename} as numer "
                "WHERE geom.{geom_geomref_colname} = numer.{numer_geomref_colname} ")
            statssql = (
                "SELECT CDB_HeadsTailsBins(array_agg(distinct( "
                ' (numer.{numer_colname} / ST_Area(geom.the_geom) '
                ' * 1000000.0)::NUMERIC)), 4) as "headtails" '
                "FROM {geom_tablename} as geom, "
                " {numer_tablename} as numer "
                "WHERE geom.{geom_geomref_colname} = numer.{numer_geomref_colname} ")
        else:
            # Anything else: map the raw value.
            cartosql = (
                "SELECT geom.cartodb_id, geom.{geom_colname} as the_geom, "
                " geom.the_geom_webmercator, "
                " numer.{numer_colname} measure "
                "FROM {geom_tablename} as geom, {numer_tablename} as numer "
                " WHERE geom.{geom_geomref_colname} = numer.{numer_geomref_colname} ")
            if numer_type.lower() == 'numeric':
                statssql = (
                    "SELECT "
                    'CDB_HeadsTailsBins(array_agg( '
                    ' distinct(numer.{numer_colname}::NUMERIC)), 4) as "headtails" '
                    "FROM {geom_tablename} as geom, "
                    " {numer_tablename} as numer "
                    "WHERE geom.{geom_geomref_colname} = numer.{numer_geomref_colname} ")
            else:
                # Categorical measure: the ten most frequent categories.
                statssql = '''
SELECT array_agg(category) categories FROM (
  SELECT row_number() over () catname, {numer_colname} as category, COUNT(*) cnt
  FROM {numer_tablename}
  GROUP BY {numer_colname}
  ORDER BY COUNT(*) DESC
  LIMIT 10
) foo'''

        # Both templates take the same substitutions.
        sql_params = dict(geom_colname=geom_colname,
                          numer_colname=numer_colname,
                          geom_tablename=geom_tablename,
                          numer_tablename=numer_tablename,
                          geom_geomref_colname=geom_geomref_colname,
                          numer_geomref_colname=numer_geomref_colname,
                          denom_colname=denom_colname,
                          denom_tablename=denom_tablename,
                          denom_geomref_colname=denom_geomref_colname)
        cartosql = cartosql.format(**sql_params)
        statssql = statssql.format(**sql_params)

        resp = query_cartodb(statssql)
        if resp.status_code != 200:
            raise Exception("Unable to obtain statssql: {}".format(resp.text))

        # Pick the palette for the measure's unit, defaulting to the ratio ramp.
        if measure.unit():
            ramp = self.PALETTES.get(measure.unit().id, self.PALETTES['tags.ratio'])
        else:
            ramp = self.PALETTES['tags.ratio']

        # Each rule is prepended, so rules appear in reverse bucket order.
        bucket_css = u''
        if numer_type.lower() == 'numeric':
            buckets = resp.json()['rows'][0]['headtails']
            for i, bucket in enumerate(buckets):
                bucket_css = u'''
[measure <= {bucket}] {{
   polygon-fill: @{i};
}}
'''.format(bucket=bucket, i=i + 1) + bucket_css
        else:
            buckets = resp.json()['rows'][0]['categories']
            for i, bucket in enumerate(buckets):
                bucket_css = u'''
[measure = "{bucket}"] {{
   polygon-fill: @{i};
}}
'''.format(bucket=bucket, i=i + 1) + bucket_css

        layers.append({
            'type': 'mapnik',
            'options': {
                'layer_name': geom_tablename,
                'cartocss': '''/** choropleth visualization */

{ramp}

#data {{
  polygon-opacity: 0.9;
  polygon-gamma: 0.5;
  line-color: #000000;
  line-width: 0.25;
  line-opacity: 0.2;
  line-comp-op: hard-light;
  polygon-fill: @{bucketlen};

  [measure=null]{{
     polygon-fill: #cacdce;
  }}
  {bucket_css}
}}'''.format(ramp=ramp,
             bucketlen=len(buckets) + 1,
             bucket_css=bucket_css),
                'cartocss_version': "2.1.1",
                'sql': cartosql,
                "table_name": "\"\"."
            }
        })
        # The LABELS layer is intentionally not added.
        return {
            'layers': layers,
            'center': [lon, lat],
            'zoom': zoom
        }

    def get_named_map(self, map_config):
        '''
        Instantiate the map on the maps API.

        :param map_config: list of layer definitions
        :return: decoded JSON response, guaranteed to contain ``layergroupid``
        :raises Exception: when the response carries no ``layergroupid``
        '''
        # Local import: pformat returns the formatted text, whereas pprint
        # prints it and returns None (which made the message read "None").
        from pprint import pformat

        config = {"version": "1.3.0", "layers": map_config}
        resp = requests.get(self.MAP_URL,
                            headers={'content-type': 'application/json'},
                            params={'config': json.dumps(config)}).json()
        if 'layergroupid' not in resp:
            raise Exception('Named map returned no layergroupid: {}'.format(
                pformat(resp)))
        return resp

    def run(self):
        '''
        Render one image per configured view and paste them side by side
        into the output PNG.
        '''
        self.output().makedirs()

        image_urls = []
        country = self.measure.split('.')[0]
        for center, zoom, boundary in self.CENTER_ZOOM_BOUNDS[country]:
            # The two center coordinates are forwarded in the order stored in
            # CENTER_ZOOM_BOUNDS.
            # NOTE(review): the stored pairs look like (lat, lon) even though
            # they are unpacked as ``lon, lat`` -- confirm.
            lon, lat = center

            if country == 'uk':
                image_size = (300, 700,)
            else:
                image_size = (500, 500,)

            config = self._generate_config(zoom, lon, lat, boundary)

            named_map = self.get_named_map(config['layers'])
            image_urls.append('{cartodb_url}/api/v1/map/static/center/'
                              '{layergroupid}/{zoom}/{center_lon}/{center_lat}/{x}/{y}.png'.format(
                                  cartodb_url=os.environ['CARTODB_URL'],
                                  layergroupid=named_map['layergroupid'],
                                  zoom=zoom,
                                  center_lon=lon,
                                  center_lat=lat,
                                  x=image_size[0],
                                  y=image_size[1],
                              ))

        url1 = image_urls.pop(0)
        LOGGER.info(url1)
        # NOTE(review): response bodies are bytes; this relies on the
        # StringIO in scope accepting them -- confirm.
        file1 = StringIO(requests.get(url1, stream=True).content)
        image1 = ImageOps.expand(Image.open(file1), border=10, fill='white')

        # Append every further image to the right of the accumulated result.
        for url2 in image_urls:
            LOGGER.info(url2)
            file2 = StringIO(requests.get(url2, stream=True).content)

            image2 = ImageOps.expand(Image.open(file2), border=10, fill='white')

            (width1, height1) = image1.size
            (width2, height2) = image2.size

            result_width = width1 + width2
            result_height = max(height1, height2)

            result = Image.new('RGB', (result_width, result_height))
            result.paste(im=image1, box=(0, 0))
            result.paste(im=image2, box=(width1, 0))

            image1 = result
        image1.save(self.output().path)

    def complete(self):
        '''
        Defer to the normal completeness check for supported countries;
        report unsupported countries as complete so they are skipped.
        '''
        country = self.measure.split('.')[0]
        if country in self.CENTER_ZOOM_BOUNDS:
            return super(ImagesForMeasure, self).complete()
        else:
            LOGGER.warning('No info to create images for %s', self.measure)
            return True

    def output(self, measure=None):
        '''
        Return the target of the combined image.

        :param measure: measure id to use instead of ``self.measure``; needed
            by ``__init__`` before parameters are set
        '''
        if measure is None:
            measure = self.measure
        return LocalTarget(os.path.join('catalog/img', measure + '.png'))
class Hisat(luigi.Task):
    """Map the QCed paired reads to the reference with hisat2.

    The SAM output is then converted to BAM and sorted with samtools.
    """

    fastqs = ListParameter()
    indexfile = Parameter()
    outsam = Parameter()
    map_dir = Parameter()
    workdir = Parameter()
    num_cpus = Parameter()
    sample = Parameter()
    min_introlen = luigi.IntParameter()
    max_introlen = luigi.IntParameter()
    rna_strandness = luigi.Parameter()
    kingdom = luigi.Parameter()

    def _bam_path(self):
        """Return the BAM path derived from ``outsam`` (``.sam`` -> ``.bam``)."""
        return self.outsam.split(".sam")[0] + ".bam"

    def output(self):
        """BAM file produced from the mapping (before sorting)."""
        return luigi.LocalTarget(self._bam_path())

    def run(self):
        """Run hisat2, then convert the SAM to BAM and sort it.

        For ``kingdom == "prokarya"`` spliced alignment is disabled; every
        other option is the same for all kingdoms.
        """
        options = [
            "-p", self.num_cpus,
            "-x", self.indexfile,
            "-1", self.fastqs[0],
            "-2", self.fastqs[1],
            "-S", self.outsam,
            "--min-intronlen", self.min_introlen,
            "--max-intronlen", self.max_introlen,
            "--rna-strandness", self.rna_strandness,
        ]
        if self.kingdom == "prokarya":
            options.append("--no-spliced-alignment")
        # NOTE(review): "2>" is handed over as a plain argument; whether it
        # acts as a stderr redirection depends on how ``hisat2`` is invoked --
        # confirm.
        options.extend([
            "--no-unal",
            "--un-conc", os.path.join(self.map_dir, "unaligned.fastq"),
            "2>", os.path.join(self.map_dir, "mapping.log"),
        ])
        hisat2_cmd = hisat2[options]
        hisat2_cmd()
        self.sam2bam()
        self.sort_bam()

    def sam2bam(self):
        """Convert SAM to BAM file (``-F 4`` filter applied)."""
        options = ["view", "-bS", "-F", "4", self.outsam, "-o",
                   self._bam_path()]
        samtools_cmd = samtools[options]
        samtools_cmd()

    def sort_bam(self):
        """Sort the BAM file into ``<name>_srt.bam``."""
        bam_file = self._bam_path()
        sorted_bam_file = bam_file.split(".bam")[0] + "_srt.bam"
        options = ["sort", bam_file, "-o", sorted_bam_file]
        samtools_cmd = samtools[options]
        samtools_cmd()
class MockTask(MixinNaiveBulkComplete, Task):
    """Test task whose completeness is driven by ``COMPLETE_TASKS``."""

    param_a = Parameter()
    param_b = Parameter(default="Not Mandatory")

    def complete(self):
        """Return True when ``param_a`` is listed in ``COMPLETE_TASKS``."""
        is_done = self.param_a in COMPLETE_TASKS
        return is_done
class InputModel(ExternalTask):
    """External model file expected at ``<MODEL_ROOT>/models/<model>``."""

    MODEL_ROOT = os.path.abspath('data')
    model = Parameter(default="rnn.pth")  # File name of the model

    def output(self):
        """Return the target for the model file, opened without any format conversion."""
        model_path = '/'.join((self.MODEL_ROOT, 'models', self.model))
        return SuffixPreservingLocalTarget(model_path, format=format.Nop)
class InputData(ExternalTask):
    """External data file expected at ``<IMAGE_ROOT>/input/<data>``."""

    IMAGE_ROOT = os.path.abspath('data')
    data = Parameter(default="names.txt")  # File name of the input data

    def output(self):
        """Return the target for the data file, opened without any format conversion."""
        data_path = '/'.join((self.IMAGE_ROOT, 'input', self.data))
        return SuffixPreservingLocalTarget(data_path, format=format.Nop)
class GenerateStaticImage(Task):
    '''
    Render a static 800x500 PNG of an existing visualization.

    The viz.json of ``viz`` is fetched, its visible layers are placed between
    a dark basemap and a labels layer, the map is instantiated on the maps
    API and the resulting image is downloaded with curl.
    '''

    BASEMAP = {
        "type": "http",
        "options": {
            # Dark Matter
            "urlTemplate": "http://{s}.basemaps.cartocdn.com/dark_nolabels/{z}/{x}/{y}.png",
            "subdomains": "abcd",
        }
    }

    LABELS = {
        "type": "http",
        "options": {
            "urlTemplate": "http://{s}.basemaps.cartocdn.com/dark_only_labels/{z}/{x}/{y}.png",
            "subdomains": "abcd",
        }
    }

    # Visualization id, e.g. 57d9408e-0351-11e6-9c12-0e787de82d45
    viz = Parameter()

    VIZ_URL = '{cartodb_url}/api/v2/viz/{{viz}}/viz.json'.format(
        cartodb_url=os.environ['CARTODB_URL'])
    MAP_URL = '{cartodb_url}/api/v1/map'.format(
        cartodb_url=os.environ['CARTODB_URL'])

    def viz_to_config(self):
        '''
        Fetch the viz.json and turn it into a map configuration.

        :return: dict with ``layers``, ``center``, ``bounds`` and ``zoom``
        :raises AssertionError: when the viz.json request does not return 200
        '''
        resp = requests.get(self.VIZ_URL.format(viz=self.viz))
        if resp.status_code != 200:
            # Raised explicitly (same exception type as the former bare
            # assert) so the check is not stripped under ``python -O``.
            raise AssertionError(
                'Unable to fetch viz.json for {}: HTTP {}'.format(
                    self.viz, resp.status_code))
        data = resp.json()

        layers = []
        layers.append(self.BASEMAP)
        for data_layer in data['layers']:
            if data_layer['type'] == 'layergroup':
                for layer in data_layer['options']['layer_definition']['layers']:
                    # Only layers explicitly marked visible are rendered.
                    if layer['visible'] is True:
                        layers.append({
                            'type': 'mapnik',
                            'options': layer['options']
                        })
        layers.append(self.LABELS)

        return {
            'layers': layers,
            'center': json.loads(data['center']),
            'bounds': data['bounds'],
            'zoom': data['zoom']
        }

    def get_named_map(self, map_config):
        '''
        Instantiate the map on the maps API.

        :param map_config: list of layer definitions
        :return: decoded JSON response, guaranteed to contain ``layergroupid``
        :raises Exception: when the response carries no ``layergroupid``
        '''
        # Local import: pformat returns the formatted text, whereas pprint
        # prints it and returns None (which made the message read "None").
        from pprint import pformat

        config = {"version": "1.3.0", "layers": map_config}
        resp = requests.get(self.MAP_URL,
                            headers={'content-type': 'application/json'},
                            params={'config': json.dumps(config)}).json()
        if 'layergroupid' not in resp:
            raise Exception('Named map returned no layergroupid: {}'.format(
                pformat(resp)))
        return resp

    def run(self):
        '''Build the map and download its static image to the output path.'''
        self.output().makedirs()
        config = self.viz_to_config()
        named_map = self.get_named_map(config['layers'])
        img_url = '{cartodb_url}/api/v1/map/static/center/' \
                  '{layergroupid}/{zoom}/{center_lon}/{center_lat}/800/500.png'.format(
                      cartodb_url=os.environ['CARTODB_URL'],
                      layergroupid=named_map['layergroupid'],
                      zoom=config['zoom'],
                      center_lon=config['center'][0],
                      center_lat=config['center'][1]
                  )
        LOGGER.info(img_url)
        shell('curl "{img_url}" > {output}'.format(img_url=img_url,
                                                   output=self.output().path))

    def output(self):
        return LocalTarget(
            os.path.join('catalog/source/img', self.task_id + '.png'))
class ImportAllResolutions(WrapperTask):
    """Wrapper requiring ``ImportData`` for every geography of one survey."""

    survey = Parameter(default=SURVEY_CEN)

    def requires(self):
        """Lazily produce one ``ImportData`` task per entry in ``GEOGRAPHIES``."""
        return (ImportData(resolution=geography, survey=self.survey)
                for geography in GEOGRAPHIES)
class PerformSpectralClustering(Task):
    """Cluster the article embeddings with K-Means and spectral clustering.

    The embedding CSV parts are read as one Dask dataframe, converted to a
    Dask array, clustered with both models, and each result is joined back
    onto the dataframe and written out as CSV. A small text file marks
    success.
    """

    num_clusters = IntParameter()
    word_vectors = Parameter()

    def requires(self):
        return [GenerateDocumentEmbeddings(model=self.word_vectors)]

    def output(self):
        """Return the success-flag target for this cluster count and model."""
        return LocalTarget(
            config.CLUSTERING_RESULTS_DIR /
            f'cluster_{self.num_clusters}_{self.word_vectors}.txt')

    def _join_and_dump(self, ddf, fit_result, out_dir):
        """Join ``fit_result`` onto ``ddf`` and write the result under ``out_dir``."""
        # NOTE(review): ``fit_result`` is whatever ``fit(X)`` returned and is
        # joined as if it held the cluster labels -- confirm that ``fit``
        # returns something joinable here.
        joined = ddf.join(fit_result)
        out_dir.mkdir(parents=True, exist_ok=True)
        dask.dataframe.to_csv(joined, out_dir)

    def run(self):
        """Run both clusterings and persist their results.

        :raises ValueError: when ``word_vectors`` is neither ``fasttext`` nor
            ``word2vec``
        """
        if self.word_vectors not in {"fasttext", "word2vec"}:
            raise ValueError(
                f'Expected fasttext or word2vec; got {self.word_vectors}')

        print(
            f'Initializing dask dataframe of word embeddings at {datetime.now()}'
        )
        embeddings_glob = (config.ARTICLE_EMBEDDINGS_DIR /
                           f'{self.word_vectors}_to_csv' / "*.part")
        ddf = dask.dataframe.read_csv(embeddings_glob)

        print(
            f'Dropping columns and converting to design matrix (dask array) at {datetime.now()}'
        )
        # Only the embedding dimensions are kept as features.
        features = ddf.drop(['Unnamed: 0', "id", "url", "title"], axis=1)
        X = features.to_dask_array(lengths=True)

        # K-Means
        print(f'Starting K-Means clustering at {datetime.now()}')
        kmeans_model = KMeans(n_clusters=self.num_clusters,
                              n_jobs=-1,
                              max_iter=config.K_MEANS_MAX_ITER)
        kmeans_fit = kmeans_model.fit(X)

        print(
            f'Joining K-means results and writing to disk at {datetime.now()}')
        self._join_and_dump(
            ddf, kmeans_fit,
            config.CLUSTERING_RESULTS_DIR / f'{self.word_vectors}_w_k_means')

        # Spectral clustering
        print(f'Starting Spectral clustering at {datetime.now()}')
        spectral_model = SpectralClustering(
            n_clusters=self.num_clusters,
            n_jobs=-1,
            persist_embedding=True,
            kmeans_params={"max_iter": config.K_MEANS_MAX_ITER})
        spectral_fit = spectral_model.fit(X)

        print(
            f'Joining Spectral results and writing to disk at {datetime.now()}'
        )
        self._join_and_dump(
            ddf, spectral_fit,
            config.CLUSTERING_RESULTS_DIR / f'{self.word_vectors}_w_spectral')

        # Finally write the success flag
        with self.output().open("w") as flag_file:
            flag_file.write(f'{self.word_vectors}: Success!')
class Survey(BaseParams, TableTask):
    '''
    Base table task for one topic of a survey at a given resolution.

    Subclasses provide ``requires()`` and ``timespan()``; this class selects
    the topic's columns and fills the output table from the inputs.
    '''

    topic = Parameter(default='t001')

    def version(self):
        return 6

    def requires(self):
        '''
        Subclasses must override this.
        '''
        raise NotImplementedError('Survey must define requires()')

    def timespan(self):
        '''
        Subclasses must override this.
        '''
        raise NotImplementedError('Survey must define timespan()')

    def columns(self):
        '''
        Return ``geo_code`` followed by every ``meta`` column whose ID suffix
        (the part after the last dot) starts with the topic, ignoring case.
        '''
        inputs = self.input()
        selected = OrderedDict()
        selected['geo_code'] = inputs['geometa']['geom_id']
        topic_prefix = self.topic.lower()
        for name, target in inputs['meta'].items():
            id_suffix = target._id.split('.')[-1].lower()
            if id_suffix.startswith(topic_prefix):
                selected[name] = target
        return selected

    def populate(self):
        '''
        Fill the output table. NHS data at DA or FSA resolution is derived
        from another geography; everything else is copied directly.
        '''
        if self.survey != SURVEY_NHS:
            self.populate_general()
        elif self.resolution == GEO_DA:
            self.populate_da_from_cd()
        elif self.resolution == GEO_FSA:
            self.populate_fsa_from_csd()
        else:
            self.populate_general()

    def populate_da_from_cd(self):
        '''
        Derive DA values from CD data, weighting each value by the ratio of
        the DA area to the CD area.
        '''
        session = current_session()
        target_names = [name for name in self.columns().keys()
                        if name is not None]

        # We reduce the number of decimals to reduce the size of the row to
        # avoid hitting the limit, which is 8Kb.
        # More info https://github.com/CartoDB/bigmetadata/issues/527
        weighted = ('round(cast(float8 ({colname} * (ST_Area(da.the_geom)/ST_Area(cd.the_geom))) '
                    'as numeric), 2) {colname}')
        select_exprs = ['da.geom_id']
        select_exprs.extend(weighted.format(colname=name)
                            for name in target_names if name != 'geo_code')

        template = (' INSERT INTO {output} ({out_colnames}) '
                    'SELECT {in_colnames} '
                    'FROM {da_geom} da '
                    'INNER JOIN {cd_geom} cd ON (cd.geom_id = left(da.geom_id,4)) '
                    'INNER JOIN {cd_data} data ON (cd.geom_id = data.geo_code) ')
        insert_query = template.format(
            output=self.output().table,
            da_geom=self.input()['geo'].table,
            cd_geom=self.input()['geo_source'].table,
            cd_data=self.input()['data_source'].table,
            in_colnames=', '.join(select_exprs),
            out_colnames=', '.join(target_names))

        LOGGER.debug(insert_query)
        session.execute(insert_query)

    def populate_fsa_from_csd(self):
        '''
        Derive FSA values from CSD data: each value is multiplied by the
        interpolation table's ``area_ratio`` and summed per FSA.
        '''
        session = current_session()
        target_names = [name for name in self.columns().keys()
                        if name is not None]
        value_names = [name for name in target_names if name != 'geo_code']

        aggregated = ['round(sum({x} * area_ratio)::numeric, 2) as {x}'.format(x=name)
                      for name in value_names]

        template = (' INSERT INTO {output} ({out_colnames}) '
                    'SELECT fsa_geom_id, {in_colnames_group} '
                    'FROM {csd_data} data_csd, {interpolation_table} interp '
                    'WHERE data_csd.geo_code = interp.csd_geom_id '
                    'GROUP BY fsa_geom_id ')
        insert_query = template.format(
            output=self.output().table,
            csd_data=self.input()['data_source'].table,
            interpolation_table=self.input()['geo_interpolation'].table,
            in_colnames_group=', '.join(aggregated),
            out_colnames=', '.join(target_names)
        )

        LOGGER.debug(insert_query)
        session.execute(insert_query)

    def populate_general(self):
        '''
        Copy the selected columns straight from the input data table,
        skipping columns the input table lacks and mapping the value ``-6``
        to NULL.
        '''
        session = current_session()
        columns = self.columns()
        candidate_out = list(columns.keys())
        in_table = self.input()['data']

        # Input column names are the suffix of each column ID; the first
        # column is always read from ``geo_code``.
        candidate_in = [target._id.split('.')[-1] for target in columns.values()]
        candidate_in[0] = 'geo_code'

        exists_template = ("SELECT 'exists' FROM information_schema.columns "
                           "WHERE table_schema = '{schema}' "
                           " AND table_name = '{tablename}' "
                           " AND column_name = '{colname}' "
                           " LIMIT 1")

        kept_out = []
        kept_in = []
        for out_name, in_name in zip(candidate_out, candidate_in):
            probe = exists_template.format(
                schema=in_table.schema,
                tablename=in_table.tablename.lower(),
                colname=in_name.lower())
            # remove columns that aren't in input table
            if session.execute(probe).fetchone() is None:
                continue
            kept_out.append(out_name)
            kept_in.append(
                "CASE {ic}::TEXT WHEN '-6' THEN NULL ELSE {ic} END".format(ic=in_name))

        insert_template = ('INSERT INTO {output} ({out_colnames}) '
                           'SELECT {in_colnames} FROM {input} ')
        cmd = insert_template.format(
            output=self.output().table,
            input=in_table.table,
            in_colnames=', '.join(kept_in),
            out_colnames=', '.join(kept_out))
        session.execute(cmd)
class TigerBlocksInterpolation(Task):
    '''
    Build the ``blocks_interpolation`` table for a TIGER year.

    Each row holds a block geoid, the geoid of its block group (the first 12
    characters of the block geoid), the block geometry and the ratio between
    the block area and the block group area, expressed as a percentage.
    '''

    year = Parameter()

    def requires(self):
        '''Depend on the shoreline-clipped block and block group tables.'''
        return {
            'shoreline_block': ShorelineClip(year=self.year, geography='block'),
            'shoreline_blockgroup': ShorelineClip(year=self.year, geography='block_group'),
        }

    def _find_tiger_tables(self, session):
        '''
        Look up the physical table names of the shoreline-clipped tables.

        :return: dict that may contain the keys ``'block'`` and
                 ``'block_group'``; a key is absent when no matching entry
                 for ``self.year`` exists in ``observatory.obs_table``.
        '''
        tiger_tables = {}
        tiger_tables_query = '''SELECT id,tablename
                                FROM observatory.obs_table
                                WHERE id ilike 'us.census.tiger.shoreline_clip_block%'
                             '''
        for tiger_table in session.execute(tiger_tables_query).fetchall():
            # 'block_group_<year>' must be tested first: the looser
            # 'block_<year>' pattern is only meant for plain block tables.
            if re.search('block_group_{}'.format(self.year), tiger_table['id']):
                tiger_tables['block_group'] = tiger_table['tablename']
            elif re.search('block_{}'.format(self.year), tiger_table['id']):
                tiger_tables['block'] = tiger_table['tablename']
        return tiger_tables

    def run(self):
        '''
        Create the output table, index it and fill in the percentages.

        When either source table cannot be found, an error is logged and
        nothing is created. (Previously the emptiness check was made on the
        result object, which is always truthy, so a missing table surfaced
        as a ``KeyError`` -- possibly after the table had been created.)
        '''
        session = current_session()
        with session.no_autoflush:
            tiger_tables = self._find_tiger_tables(session)
            if 'block' not in tiger_tables or 'block_group' not in tiger_tables:
                LOGGER.error('Cant retrieve tiger tables for block and block group')
                return

            output_table = self.output().table

            # Create the table with block/blockgroups and percentage field empty
            start_time = time.time()
            LOGGER.info("Start creating the interpolation table...")
            query = '''
                    CREATE TABLE {table_output} AS
                    SELECT geoid blockid, left(geoid,12) blockgroupid,
                           0::float percentage, the_geom block_geom
                    FROM "{schema_input}".{block_table} b
                    '''.format(schema_input='observatory',
                               block_table=tiger_tables['block'],
                               table_output=output_table)
            session.execute(query)
            end_time = time.time()
            LOGGER.info("Time creating the table {}".format((end_time - start_time)))

            # Creating indexes
            LOGGER.info("Start creating the indexes for the interpolation table...")
            start_time = time.time()
            indexes_query = '''
                    CREATE INDEX blocks_idx ON {table_output} (blockid);
                    CREATE INDEX block_groups_idx ON {table_output} (blockgroupid);
                    '''.format(table_output=output_table)
            session.execute(indexes_query)
            end_time = time.time()
            LOGGER.info("Indexes created in {}".format((end_time - start_time)))

            # Set the interpolation percentages in the table
            LOGGER.info("Start updating the table...")
            start_time = time.time()
            update_percentage_query = '''
                    UPDATE {table_output} b
                    SET percentage = (
                        SELECT (ST_Area(b.block_geom)/ST_Area(bg.the_geom))::float*100.00
                        FROM "{schema_input}".{bg_table} bg
                        WHERE b.blockgroupid = bg.geoid
                    )
                    '''.format(schema_input='observatory',
                               bg_table=tiger_tables['block_group'],
                               table_output=output_table)
            session.execute(update_percentage_query)
            session.commit()
            end_time = time.time()
            LOGGER.info("Time updating the table {}".format((end_time - start_time)))

    def output(self):
        '''Target ``tiger<year>.blocks_interpolation``.'''
        schema = 'tiger{year}'.format(year=self.year)
        return PostgresTarget(schema, 'blocks_interpolation')
class ETLAnalysis(Task):
    """Abstract Luigi task for analysing the vaccine data.

    Concrete analyses (by country, by year, by month, by week, ...) subclass
    this task and implement :meth:`perform_analysis`. Each analysis is meant
    to be its own Luigi task that computes its result and writes it to
    parquet; to display a result, read it back from the written parquet
    output.

    The task requires ``VaccineDataGlobalCleanupTask`` (exposed as the
    ``input_data`` requirement).

    Parameters:
        subset: bool, True to process just one partition, False to process
            the entire dataset, default: True
        analysis_path: str, base directory to store output files

    Output:
        A ``ParquetTarget`` built from the pattern
        ``{task.analysis_path}{task.sub_dir}`` with the extension
        ``subset-{task.subset}/`` and a ``_SUCCESS`` flag, written with gzip
        compression.
    """

    subset = BoolParameter(default=True)
    analysis_path = Parameter(default="./data/vaccine/")

    requires = Requires()
    input_data = Requirement(VaccineDataGlobalCleanupTask)

    # The output pattern refers to ``task.sub_dir``, which is not declared
    # here: every subclass is expected to define it.
    output = TargetOutput(
        "{task.analysis_path}{task.sub_dir}",
        ext="subset-{task.subset}/",
        target_class=ParquetTarget,
        flag="_SUCCESS",
    )

    def perform_analysis(self, df):
        """Compute the analysis for ``df``; must be implemented by subclasses."""
        raise NotImplementedError

    def run(self):
        """Read the needed columns, delegate to ``perform_analysis``, write the result.

        Only the columns listed below are loaded from the cleaned-up input.
        """
        wanted_columns = [
            "Country_Region",
            "Date",
            "Doses_admin",
            "People_partially_vaccinated",
            "People_fully_vaccinated",
            "Report_Date_String",
            "UID",
        ]
        cleaned_input = self.input()["input_data"]
        source_frame = cleaned_input.read_dask(columns=wanted_columns)

        # The subclass decides how the data is aggregated.
        result_frame = self.perform_analysis(source_frame)

        # Persist as gzip-compressed parquet, keeping the index.
        parquet_target = self.output()
        parquet_target.write_dask(result_frame,
                                  write_index=True,
                                  compression="gzip")
class SimplifiedUnionTigerWaterGeoms(SimplifiedTempTableTask):
    '''
    Simplified-table task whose only requirement is ``UnionTigerWaterGeoms``
    for the same year and geography.
    '''

    year = IntParameter()
    geography = Parameter()

    def requires(self):
        '''Return the ``UnionTigerWaterGeoms`` task matching this task's parameters.'''
        upstream_params = {'year': self.year, 'geography': self.geography}
        return UnionTigerWaterGeoms(**upstream_params)
class SoiaEmailFetcher(sqla.CopyToTable):
    """Scan a mailbox for duration announcements and copy them into SQLite.

    Every message in the mailbox is fetched, its payload is decoded and the
    patterns in ``regexes`` are tried in order. The first pattern that yields
    a parseable "start - end" pair contributes one ``(start, end)`` row of
    epoch-second strings.
    """

    email_address = Parameter()
    password = Parameter()
    date = DateParameter()

    columns = [(["id", Integer()], {
        "autoincrement": True,
        "primary_key": True
    }), (["start", BigInteger()], {}), (["end", BigInteger()], {}),
               (["insert_date", BigInteger()], {})]
    connection_string = "sqlite:///data/soia_email.db"
    table = "soia"

    # (pattern, substrings to strip from the matched text)
    regexes = [(r"<b>Duration:</b>.*<br>", ["<b>Duration:</b>", "<br>"]),
               (r"(\d{2,4}.){2,4}.*<o", ["<o"])]

    def rows(self):
        """Yield ``("auto", start, end, insert_timestamp)`` for each distinct interval."""
        for start, end in deduplicated(self.generate_rows()):
            yield "auto", start, end, datetime.now().strftime('%s')

    def copy(self, conn, ins_rows, table):
        """Insert ``ins_rows`` binding every column except ``id``."""
        bound_cols = dict((c, bindparam("_" + c.key)) for c in table.columns
                          if c.key != "id")
        ins = table.insert().values(bound_cols)
        conn.execute(ins, ins_rows)

    def generate_rows(self):
        """Return the list of ``(start, end)`` epoch-second string pairs found.

        Any error while talking to the server or parsing is logged and the
        rows collected so far are returned; the list is created up front so
        that an early failure returns an empty list instead of raising
        ``UnboundLocalError``.
        """
        soia_timestamps = []
        imap_client = create_imap_client(self.email_address, self.password)
        try:
            _, found = imap_client.search(None, "ALL")
            # split() without argument yields no ids for an empty mailbox,
            # where split(b" ") would produce a single bogus b"" id.
            for number in found[0].split():
                _, fetched = imap_client.fetch(number, '(RFC822)')
                message = email.message_from_string(fetched[0][1].decode())

                # get actual email content
                sent = dateparser.parse(message["Date"])
                content = message.get_payload()

                # handle base64 content
                try:
                    unbased_content = unbase64_content(content)
                except ValueError:
                    continue

                # iterate over regexes trying to match date
                for regex, replaces in self.regexes:
                    match = re.search(regex, unbased_content)
                    if match is None:
                        continue
                    dates = remove_occurances(match.group(), replaces)
                    start, end = dates.rsplit("-", maxsplit=1)
                    # A side without a space is taken to be a bare time:
                    # prefix it with the date the message was sent.
                    if " " not in end.strip():
                        end = f"{sent.year}-{sent.month}-{sent.day} {end}"
                    if " " not in start.strip():
                        start = f"{sent.year}-{sent.month}-{sent.day} {start}"
                    parsed_start = dateparser.parse(start)
                    parsed_end = dateparser.parse(end)
                    if parsed_end is None or parsed_start is None:
                        logger.warning("coudn't parse the following: %s",
                                       match)
                        continue
                    row = (parsed_start.strftime('%s'),
                           parsed_end.strftime('%s'))
                    logger.debug("Adding the following row: %s", row)
                    soia_timestamps.append(row)
                    break
        except Exception as err:
            logger.error("Something went terribly wrong! %s", err)
        finally:
            # Always log out, even when closing the mailbox fails.
            try:
                imap_client.close()
            finally:
                imap_client.logout()
        return soia_timestamps
class SumLevel(TableTask):
    '''
    Table of geometries plus land/water attributes for one TIGER summary
    level (``geography``) and ``year``.

    The source field names and source table name are looked up in
    ``SUMLEVELS`` under the ``geography`` key.
    '''

    geography = Parameter()
    year = IntParameter()

    @property
    def geoid(self):
        '''Name of the geoid field in the source table.'''
        level = SUMLEVELS[self.geography]
        return level['fields']['geoid']

    @property
    def aland(self):
        '''Name of the land-area field in the source table.'''
        level = SUMLEVELS[self.geography]
        return level['fields']['aland']

    @property
    def awater(self):
        '''Name of the water-area field in the source table.'''
        level = SUMLEVELS[self.geography]
        return level['fields']['awater']

    @property
    def input_tablename(self):
        '''Source table name: the summary level's table plus ``SIMPLIFIED_SUFFIX``.'''
        level = SUMLEVELS[self.geography]
        return level['table'] + SIMPLIFIED_SUFFIX

    def version(self):
        return 15

    def requires(self):
        '''Blocks read from ``SimplifyGeoByState``; every other level from ``SimplifiedDownloadTiger``.'''
        if self.geography == BLOCK:
            tiger_task = SimplifyGeoByState
        else:
            tiger_task = SimplifiedDownloadTiger
        tiger = tiger_task(geography=self.geography, year=self.year)

        requirements = {}
        requirements['attributes'] = Attributes()
        requirements['geoids'] = GeoidColumns(year=self.year)
        requirements['geoms'] = GeomColumns(year=self.year)
        requirements['data'] = tiger
        return requirements

    def columns(self):
        '''Output columns, in order: geoid, the_geom, aland, awater.'''
        inputs = self.input()
        level_key = self.geography + '_{}'.format(self.year)

        cols = OrderedDict()
        cols['geoid'] = inputs['geoids'][level_key + GEOID_SUMLEVEL_COLUMN]
        cols['the_geom'] = inputs['geoms'][level_key]
        cols['aland'] = inputs['attributes']['aland']
        cols['awater'] = inputs['attributes']['awater']
        return cols

    def table_timespan(self):
        return get_timespan(str(self.year))

    # TODO: https://github.com/CartoDB/bigmetadata/issues/435
    def targets(self):
        table_id = '.'.join([self.schema(), self.name()])
        return {
            OBSTable(id=table_id): GEOM_REF,
        }

    def populate(self):
        '''Copy geoid, geometry and area fields from the tiger<year> source table.'''
        session = current_session()

        source_table = '{inputschema}.{input_tablename}'.format(
            inputschema='tiger' + str(self.year),
            input_tablename=self.input_tablename,
        )
        # Source fields, positionally matching the output columns.
        source_fields = [self.geoid, 'geom', self.aland, self.awater]
        target_fields = list(self.columns().keys())

        statement = ('INSERT INTO {output} ({out_colnames}) '
                     'SELECT {in_colnames} '
                     'FROM {from_clause} ').format(
                         output=self.output().table,
                         in_colnames=', '.join(source_fields),
                         out_colnames=', '.join(target_fields),
                         from_clause=source_table)
        session.execute(statement)
class Measurements2CSV(Task):
    '''
    Export the measurements listed in ``measurements.json`` (next to this
    module) to a CSV file for one postcode geography level.

    ``geography`` selects the required census task and the ``geom_id`` sent
    to ``OBS_GetMeta``; ``file_name`` is the name of the CSV written under
    ``tmp/geographica/uk/``.
    '''

    geography = Parameter()
    file_name = Parameter()

    def __init__(self, *args, **kwargs):
        super(Measurements2CSV, self).__init__(*args, **kwargs)

    def requires(self):
        '''Require the census task matching ``geography`` (none for other values).'''
        requirements = {}
        if self.geography == 'GEO_PA':
            requirements['data'] = CensusPostcodeAreas()
        elif self.geography == 'GEO_PD':
            requirements['data'] = CensusPostcodeDistricts()
        elif self.geography == 'GEO_PS':
            requirements['data'] = CensusPostcodeSectors()
        return requirements

    def _get_config_data(self):
        '''Load and return the parsed ``measurements.json`` configuration.'''
        dir_path = os.path.dirname(os.path.realpath(__file__))
        with open(os.path.join(dir_path, 'measurements.json')) as f:
            return json.load(f)

    def run(self):
        '''
        Resolve the configured measurements to tables/columns and dump them.

        An error is logged and no file is written when either the metadata
        lookup or the measurements query returns no rows. (The row lists are
        checked rather than the result objects, which are always truthy.)
        '''
        session = current_session()
        measurements = self._get_config_data()
        for measure in measurements:
            measure['geom_id'] = GEOGRAPHY_LEVELS[self.geography]
        json_metadata = json.dumps(measurements)

        meta_rows = session.execute(self._get_meta_query(json_metadata)).fetchall()
        if not meta_rows:
            LOGGER.error(
                'No results for the defined measurements in the JSON file')
            return

        join_data = {'numer': {}}
        if self.geography == 'GEO_PA':
            colnames = ['geom.pa_id as geoid']
        else:
            colnames = ['geom.geographycode as geoid']
        for data in meta_rows:
            join_data['numer'][data['numer_table']] = {
                'table': 'observatory.{}'.format(data['numer_table']),
                'join_column': data['numer_join_col']
            }
            # All come from the same geometry tables so we use, by now, just one geometry
            # TODO Make it possible to have multiple geometry tables
            join_data['geom'] = {
                'table': 'observatory.{}'.format(data['geom_table']),
                'join_column': data['geom_join_col']
            }
            colnames.append(data['numer_col'])

        measurement_rows = session.execute(
            self._get_measurements_query(join_data, colnames)).fetchall()
        if not measurement_rows:
            LOGGER.error('No results for the queried measurements')
            return
        self._generate_csv_file(colnames, measurement_rows)

    def _get_meta_query(self, metadata):
        '''
        Build the ``OBS_GetMeta`` query for the JSON string ``metadata``.

        Single quotes in ``metadata`` are doubled so that the JSON can be
        embedded safely in the SQL string literal.
        '''
        escaped = metadata.replace("'", "''")
        return '''SELECT meta->>'numer_tablename' numer_table,
                         meta->>'numer_geomref_colname' numer_join_col,
                         meta->>'numer_colname' numer_col,
                         meta->>'geom_tablename' geom_table,
                         meta->>'geom_geomref_colname' geom_join_col,
                         meta->>'geom_colname' geom_col
                  FROM json_array_elements(cdb_observatory.OBS_GetMeta(
                          ST_MakeEnvelope(-179, 89, 179, -89, 4326), -- World bbox
                          '{}'::json, 1, 1, 1)) meta
               '''.format(escaped)

    def _get_measurements_query(self, join_data, colnames):
        '''Build the SELECT that left-joins every numerator table to the geometry table.'''
        geom = join_data['geom']
        joins = [
            'LEFT JOIN {table} ON (geom.{geomcol} = {table}.{numercol})'.format(
                table=join_table['table'],
                geomcol=geom['join_column'],
                numercol=join_table['join_column'])
            for join_table in join_data['numer'].values()
        ]
        return '''SELECT {cols}
                  FROM {geom} geom {joins}
               '''.format(cols=' ,'.join(colnames),
                          geom=geom['table'],
                          joins=' '.join(joins))

    def _generate_csv_file(self, headers, measurements):
        '''
        Write ``measurements`` to the output CSV.

        The first header is the aliased geoid select expression, so it is
        written as plain ``geoid``. On any failure the partially written
        file is removed and the error is re-raised, so the task fails
        instead of silently leaving no (or a broken) output.
        '''
        target = self.output()
        # Copy instead of mutating the caller's list.
        fieldnames = ['geoid'] + list(headers[1:])
        try:
            target.makedirs()
            with open(target.path, 'w+') as csvfile:
                writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
                writer.writeheader()
                for measurement in measurements:
                    writer.writerow(dict(measurement))
        except BaseException:
            if target.exists():
                target.remove()
            raise

    def output(self):
        csv_filename = 'tmp/geographica/uk/{}'.format(self.file_name)
        return LocalTarget(path=csv_filename, format='csv')
class StringTieScoresW(luigi.WrapperTask):
    """From Mapping to Counting step for Eukaryotic reference.

    NOTE(review): this class reads ``workdir``, ``fastq_dic``, ``numCPUs``,
    ``indexfile``, ``ref_file`` and ``bindir`` from ``self`` although only
    ``gff_file`` and ``kingdom`` are declared here -- confirm where those
    attributes are provided.
    """

    gff_file = Parameter()
    kingdom = Parameter()

    def _shared_kwargs(self, samp, map_dir, trim_dir, splice_file):
        """Return the StringTieScores arguments common to every yielded task."""
        return dict(
            fastq1=trim_dir + "/" + samp + ".1.trimmed.fastq",
            fastq2=trim_dir + "/" + samp + ".2.trimmed.fastq",
            numCPUs=self.numCPUs,
            indexfile=self.indexfile,
            spliceFile=splice_file,
            mappingLogFile=map_dir + "/mapping.log",
            unalned=map_dir + "/unligned.fastq",
            outsam=map_dir + "/" + samp + ".sam",
            bam_file=map_dir + "/" + samp + ".bam",
            sorted_bam_file=map_dir + "/" + samp + "_srt.bam",
            ref_file=self.ref_file,
            gff_file=self.gff_file,
            bindir=self.bindir,
            workdir=self.workdir)

    def requires(self):
        """A wrapper for running Stringtie scores on all samples.

        For kingdom 'prokarya' or 'eukarya' one task per GFF is yielded, and
        only when ``gff_file`` is a comma-separated list. For kingdom 'both'
        ``gff_file`` is read as ``<prokaryote gff>;<eukaryote gff>`` and two
        tasks are yielded per sample.
        """
        splice_list = [
            self.workdir + "/" + f for f in os.listdir(self.workdir)
            if f.endswith('.splice')
        ]
        # join() covers every case: several files -> comma separated, one
        # file -> that file, none -> '' (previously the name was left
        # unbound, failing with a NameError at the first yield).
        splice_file = ','.join(splice_list)

        # Only the sample names are needed; plain iteration over the
        # dictionary works on both Python 2 and 3.
        for samp in self.fastq_dic:
            map_dir = self.workdir + "/" + samp + "/mapping_results"
            trim_dir = self.workdir + "/" + samp + "/trimming_results"
            if not os.path.isdir(map_dir):
                os.makedirs(map_dir)

            if self.kingdom in ['prokarya', 'eukarya']:
                if ',' in self.gff_file:
                    gff_list = [
                        os.path.abspath(gff)
                        for gff in self.gff_file.split(",")
                    ]
                    for gff in gff_list:
                        gtf = self.workdir + "/" + gff.split("/")[-1].split(
                            ".gff")[0] + ".gtf"
                        gff_name = gtf.split(".gtf")[0].split("/")[-1]
                        prefix = map_dir + "/" + samp + "_" + gff_name
                        yield StringTieScores(
                            in_gtf=gtf,
                            out_gtf=prefix + "_sTie.gtf",
                            out_cover=prefix + "_covered_sTie.gtf",
                            out_abun=prefix + "_sTie.tab",
                            in_bam_file=map_dir + "/" + samp + "_srt.bam",
                            **self._shared_kwargs(samp, map_dir, trim_dir,
                                                  splice_file))
            elif self.kingdom == 'both':
                gff_parts = self.gff_file.split(";")
                prok_gtf = self.workdir + "/" + \
                    gff_parts[0].split("/")[-1].split(".gff")[0] + ".gtf"
                euk_gtf = self.workdir + "/" + \
                    gff_parts[1].split("/")[-1].split(".gff")[0] + ".gtf"
                for label, bam_name, gtf in (("prok", "prokarya", prok_gtf),
                                             ("euk", "eukarya", euk_gtf)):
                    prefix = map_dir + "/" + samp + "_" + label
                    yield StringTieScores(
                        gtf=gtf,
                        out_gtf=prefix + "_sTie.gtf",
                        out_cover=prefix + "_covered_sTie.gtf",
                        out_abun=prefix + "_sTie.tab",
                        in_bam_file=map_dir + "/" + bam_name + ".bam",
                        **self._shared_kwargs(samp, map_dir, trim_dir,
                                              splice_file))
class Anuario(TableTask):
    '''
    Load one resolution ('prov' or 'muni') of the yearly "Anuario" Excel
    workbook into the output table.

    This block uses Python 2 constructs (``iteritems``, ``xrange``,
    ``unicode``), which are kept as they are.
    '''

    resolution = Parameter()
    year = Parameter()

    @property
    def infilepath(self):
        '''Path of the workbook for this resolution; RuntimeError if the resolution is unknown.'''
        base = self.input()['data'].path
        short_year = str(self.year)[-2:]
        templates = {
            'prov': 'AE{year}_Provincial_Completo.xls',
            'muni': 'AE{year}_Municipal_Completo.xls',
        }
        if self.resolution not in templates:
            raise RuntimeError('Unknown resolution "{}"'.format(self.resolution))
        fname = templates[self.resolution].format(year=short_year)
        return os.path.join(base, fname)

    def version(self):
        return 4

    def timespan(self):
        return self.year

    def requires(self):
        requirements = {}
        requirements['data_columns'] = AnuarioColumns()
        requirements['geom_columns'] = GeomRefColumns()
        requirements['data'] = DownloadAnuario(year=self.year)
        return requirements

    def columns(self):
        '''The geometry reference column for this resolution, then all data columns.'''
        id_key = 'id_' + self.resolution
        cols = OrderedDict()
        cols[id_key] = self.input()['geom_columns'][id_key]
        cols.update(self.input()['data_columns'])
        return cols

    def populate(self):
        '''
        Insert one row per workbook row that belongs to this resolution.

        Header cells of the first row of every sheet are mapped to
        ``(sheet index, cell index)``; each output column that declares
        ``extra['source']['name']`` must be found there, otherwise an
        Exception is raised.
        '''
        book = open_workbook(self.infilepath)

        # determine mapping between column names and columns in excel
        columns = self.columns()
        header_positions = dict()
        position2name = OrderedDict()
        sheets = book.sheets()
        for sheet_idx, sheet in enumerate(sheets):
            for cell_idx, cell in enumerate(sheet.row(0)):
                header_positions[cell.value] = (sheet_idx, cell_idx)

        for out_colname, coltarget in columns.iteritems():
            col = coltarget._column
            has_source_name = (col.extra
                               and 'source' in col.extra
                               and 'name' in col.extra['source'])
            if not has_source_name:
                continue
            sourcename = coltarget._column.extra['source']['name']
            year = unicode(int(self.year) - 1)
            # Try the bare name, then the name suffixed with the previous year.
            # NOTE(review): the last two lookups use the same key as seen
            # here; kept as-is -- confirm whether they were meant to differ.
            position = (headers_get(header_positions, sourcename)
                        if False else header_positions.get(sourcename))
            if not position:
                position = header_positions.get(sourcename + u' ' + year)
            if not position:
                position = header_positions.get(sourcename + u' ' + year)
            if not position:
                raise Exception('Could not find column "{}" in Excel sheets'.format(
                    sourcename))
            position2name[position] = out_colname

        # insert data
        session = current_session()
        first_sheet = sheets[0]
        for i in xrange(1, first_sheet.nrows):
            row = first_sheet.row(i)
            geom_name = row[0].value.lower()
            geom_ref = row[1].value

            # exclude rows that are for a different resolution
            if 'total c.a.' in geom_name:
                wanted = self.resolution == 'cca'
            elif 'total prov.' in geom_name:
                wanted = self.resolution == 'prov'
            elif 'nombre municipio' in first_sheet.row(0)[0].value.lower():
                wanted = self.resolution == 'muni' and len(geom_ref) == 5
            else:
                raise RuntimeError('Unrecognized geom ref "{}"'.format(
                    geom_ref))
            if not wanted:
                continue

            values = [u"'" + geom_ref + u"'"]  # geo code
            for sheet_idx, cell_idx in position2name.keys():
                values.append(str(sheets[sheet_idx].row(i)[cell_idx].value))

            colnames = ['id_' + self.resolution]
            colnames.extend(position2name.values())

            stmt = 'INSERT INTO {output} ({colnames}) ' \
                   'VALUES ({values})'.format(
                       output=self.output().table,
                       colnames=', '.join(colnames),
                       values=', '.join(values),
                   )
            session.execute(stmt)
class QCEWColumns(ColumnsTask):
    '''
    Column definitions for one NAICS industry code: establishment count,
    average weekly wage, third-month employment level and the location
    quotient of each of the three.

    When the code has a parent code, the parent's columns are required and
    used as denominators for the two count columns.
    '''

    naics_code = Parameter()

    def version(self):
        # NOTE(review): four description texts gained a missing space below;
        # bump this if the catalog must be regenerated -- confirm.
        return 3

    def requires(self):
        requirements = {
            'sections': SectionTags(),
            'subsections': SubsectionTags(),
            'units': UnitTags(),
            'source': BLSSourceTags(),
            'license': LicenseTags(),
        }
        parent_code = get_parent_code(self.naics_code)
        if parent_code:
            requirements['parent'] = QCEWColumns(naics_code=parent_code)
        return requirements

    def columns(self):
        '''Return the OrderedDict of the six OBSColumn definitions for this code.'''
        cols = OrderedDict()
        code, name, description = self.naics_code, NAICS_CODES[
            self.naics_code], ''
        # Values substituted into every description template.
        text = dict(name=name, code=code, description=description)

        # This gives us easier access to the tags we defined as dependencies
        input_ = self.input()
        units = input_['units']
        sections = input_['sections']
        subsections = input_['subsections']
        parent = input_.get('parent')

        cols['qtrly_estabs'] = OBSColumn(
            id=underscore_slugify('qtrly_estabs_{}'.format(code)),
            type='Numeric',
            name='Establishments in {}'.format(name),
            description=(
                'Count of establishments in a given quarter in the {name} '
                'industry (NAICS {code}). '
                '{name} is {description}.'.format(**text)),
            weight=5,
            aggregate='sum',
            tags=[
                units['businesses'], sections['united_states'],
                subsections['commerce_economy']
            ],
            targets={parent['qtrly_estabs']: DENOMINATOR} if parent else {},
        )
        cols['avg_wkly_wage'] = OBSColumn(
            # Make sure the column ID is unique within this module
            # If left blank, will be taken from this column's key in the output OrderedDict
            id=underscore_slugify('avg_wkly_wage_{}'.format(code)),
            # The PostgreSQL type of this column. Generally Numeric for numbers and Text
            # for categories.
            type='Numeric',
            # Human-readable name. Will be used as header in the catalog
            name='Average weekly wage for {} establishments'.format(name),
            # Human-readable description. Will be used as content in the catalog.
            description=(
                'Average weekly wage for a given quarter in the {name} '
                'industry (NAICS {code}). '
                '{name} is {description}.'.format(**text)),
            # Ranking of importance, sometimes used to favor certain measures in auto-selection
            # Weight of 0 will hide this column from the user. We generally use between 0 and 10
            weight=5,
            # How this measure was derived, for example "sum", "median", "average", etc.
            # In cases of "sum", this means functions downstream can construct estimates
            # for arbitrary geographies
            aggregate='average',
            # Tags are our way of noting aspects of this measure like its unit, the country
            # it's relevant to, and which section(s) of the catalog it should appear in.
            tags=[
                units['money'], sections['united_states'],
                subsections['income']
            ],
            targets={cols['qtrly_estabs']: UNIVERSE},
        )
        cols['month3_emplvl'] = OBSColumn(
            id=underscore_slugify('month3_emplvl_{}'.format(code)),
            type='Numeric',
            name='Employees in {} establishments'.format(name),
            description=(
                'Number of employees in the third month of a given quarter with the {name} '
                'industry (NAICS {code}). {name} is {description}.'.format(**text)),
            weight=5,
            aggregate='sum',
            tags=[
                units['people'], sections['united_states'],
                subsections['employment']
            ],
            targets={parent['month3_emplvl']: DENOMINATOR} if parent else {},
        )
        cols['lq_avg_wkly_wage'] = OBSColumn(
            id=underscore_slugify('lq_avg_wkly_wage_{}'.format(code)),
            type='Numeric',
            name='Average weekly wage location quotient for {} establishments'.
            format(name),
            description=(
                'Location quotient of the average weekly wage for a given quarter relative to '
                'the U.S. (Rounded to the hundredths place) within the {name} industry (NAICS {code}). '
                '{name} is {description}.'.format(**text)),
            weight=3,
            aggregate=None,
            tags=[
                units['ratio'], sections['united_states'],
                subsections['income']
            ],
        )
        cols['lq_qtrly_estabs'] = OBSColumn(
            id=underscore_slugify('lq_qtrly_estabs_{}'.format(code)),
            type='Numeric',
            name='Location quotient of establishments in {}'.format(name),
            description=(
                'Location quotient of the quarterly establishment count relative to '
                'the U.S. (Rounded to the hundredths place) within the {name} industry (NAICS {code}). '
                '{name} is {description}.'.format(**text)),
            weight=3,
            aggregate=None,
            tags=[
                units['ratio'], sections['united_states'],
                subsections['commerce_economy']
            ],
        )
        cols['lq_month3_emplvl'] = OBSColumn(
            id=underscore_slugify('lq_month3_emplvl_{}'.format(code)),
            type='Numeric',
            name='Employment level location quotient in {} establishments'.
            format(name),
            description=(
                'Location quotient of the employment level for the third month of a given quarter '
                'relative to the U.S. (Rounded to the hundredths place) within the {name} '
                'industry (NAICS {code}). {name} is {description}.'.format(**text)),
            weight=3,
            aggregate=None,
            tags=[
                units['ratio'], sections['united_states'],
                subsections['employment']
            ],
        )

        # Every column carries the same source and license tags.
        source_tag = input_['source']['qcew']
        license_tag = input_['license']['no-restrictions']
        for col in cols.values():
            col.tags.append(source_tag)
            col.tags.append(license_tag)
        return cols