class InterpolateNHSDAFromCD(Task):
    topic = Parameter()
    resolution = Parameter()

    def requires(self):
        return {
            'nhs': NHS(resolution=GEO_CD, topic=self.topic, survey=SURVEY_NHS),
            'geo_cd': Geography(resolution=GEO_CD, year=2011),
            'geo_da': Geography(resolution=GEO_DA, year=2011)
        }
class SimplifiedImportGeometry(SimplifiedTempTableTask):
    resolution = Parameter()
    timestamp = Parameter()
    # X for Peninsula and Balearic Islands, Y for Canary Islands
    id_aux = Parameter()

    def requires(self):
        return ImportGeometry(resolution=self.resolution,
                              timestamp=self.timestamp,
                              id_aux=self.id_aux)
class LoadSocrataCSV(Task):
    domain = Parameter()
    dataset = Parameter()

    def requires(self):
        pass

    def target(self):
        pass
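# A hedged sketch of how the stub above might be completed, assuming the
# standard Socrata SODA CSV export endpoint (https://{domain}/resource/{id}.csv);
# the Download dependency and the tmp paths are illustrative assumptions:
class LoadSocrataCSVSketch(Task):
    domain = Parameter()
    dataset = Parameter()

    def requires(self):
        return Download(
            url='https://{domain}/resource/{dataset}.csv'.format(
                domain=self.domain, dataset=self.dataset),
            path='tmp/socrata/{dataset}.csv'.format(dataset=self.dataset))

    def target(self):
        return LocalTarget('tmp/socrata/{dataset}.csv'.format(
            dataset=self.dataset))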
class ImportData(CSV2TempTableTask):
    state = Parameter()
    tablename = Parameter()
    encoding = 'latin1'
    delimiter = ';'

    def requires(self):
        return DownloadData(state=self.state)

    def version(self):
        return 1

    def input_csv(self):
        if self.state.lower() == 'sp_capital':
            state_code = 'SP1'
        elif self.state.lower() == 'sp_exceto_a_capital':
            state_code = 'SP2'
        else:
            state_code = self.state.upper()

        # The provided CSV files are not well-formed, so we convert the
        # provided XLS files into CSV. All files are
        # {tablename}_{state}.xls (or XLS), except Basico-MG.xls
        df = pd.read_excel(self._datafile_path(self.tablename, state_code))
        if self.tablename != 'Basico':
            df = df.apply(pd.to_numeric, errors="coerce")

        # In the 20180416 version, SP_Capital-Domicilio02 files have wrong IDs
        # (all have been rounded to 55031E+14). But the order of sectors is
        # constant across all files of the same state, so we copy the IDs from
        # a different file. These IDs were correct in the 2017 version.
        if self.state.lower() == 'sp_capital' and self.tablename == 'Domicilio02':
            domicilio01 = pd.read_excel(
                self._datafile_path('Domicilio01', state_code))
            df['Cod_setor'] = domicilio01['Cod_setor']

        output_csv = os.path.join(
            self.input().path,
            '{tablename}_{state_code}.csv'.format(tablename=self.tablename,
                                                  state_code=state_code))
        df.to_csv(output_csv, index=False, sep=';', encoding=self.encoding)
        return output_csv

    def _datafile_path(self, tablename, state_code):
        filename = '{tablename}[-_]{state_code}.[xX][lL][sS]'.format(
            tablename=tablename, state_code=state_code)
        return glob.glob(os.path.join(self.input().path, '**', filename),
                         recursive=True)[0]
class TigerGeographyShapefileToSQL(TempTableTask):
    '''
    Take downloaded shapefiles and load them into Postgres
    '''
    year = Parameter()
    geography = Parameter()

    def requires(self):
        return UnzipTigerGeography(year=self.year, geography=self.geography)

    def run(self):
        shapefiles = shell('ls {dir}/*.shp'.format(
            dir=os.path.join('tmp', classpath(self), str(self.year),
                             self.geography))).strip().split('\n')

        cmd = 'ogrinfo {shpfile_path}'.format(shpfile_path=shapefiles[0])
        resp = shell(cmd)
        if 'Polygon' in resp:
            nlt = '-nlt MultiPolygon'
        else:
            nlt = ''

        # Create the table by importing one shapefile on its own
        cmd = 'PG_USE_COPY=yes PGCLIENTENCODING=latin1 ' \
              'ogr2ogr -f PostgreSQL "PG:dbname=$PGDATABASE active_schema={schema}" ' \
              '-t_srs "EPSG:4326" {nlt} -nln {tablename} ' \
              '-lco OVERWRITE=yes ' \
              '-lco SCHEMA={schema} {shpfile_path} '.format(
                  tablename=self.output().tablename,
                  schema=self.output().schema, nlt=nlt,
                  shpfile_path=shapefiles.pop())
        shell(cmd)

        # Append the remaining shapefiles in chunks of 500, 16 in parallel
        for i, shape_group in enumerate(grouper(shapefiles, 500)):
            shell('export PG_USE_COPY=yes PGCLIENTENCODING=latin1; '
                  'echo \'{shapefiles}\' | xargs -P 16 -I shpfile_path '
                  'ogr2ogr -f PostgreSQL "PG:dbname=$PGDATABASE '
                  'active_schema={schema}" -append '
                  '-t_srs "EPSG:4326" {nlt} -nln {tablename} '
                  'shpfile_path '.format(
                      shapefiles='\n'.join(
                          [shp for shp in shape_group if shp]),
                      tablename=self.output().tablename,
                      nlt=nlt,
                      schema=self.output().schema))
            # Approximate count: the final chunk may hold fewer than 500
            print('imported {} shapefiles'.format((i + 1) * 500))

        session = current_session()
        session.execute(
            'ALTER TABLE {qualified_table} RENAME COLUMN '
            'wkb_geometry TO geom'.format(qualified_table=self.output().table))
        # Spatial index
        session.execute(
            'CREATE INDEX ON {qualified_table} USING GIST (geom)'.format(
                qualified_table=self.output().table))
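# `grouper` is not defined alongside the task above; a minimal sketch based on
# the classic itertools recipe (an assumption about which helper is meant):
from itertools import zip_longest


def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks; the last chunk is padded with
    `fillvalue`, which is why the caller filters out falsy entries."""
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)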
class ImportAllTables(WrapperTask):
    year = IntParameter()
    resolution = Parameter()
    state = Parameter()

    def requires(self):
        for table in TABLES[self.year]:
            yield ImportData(resolution=self.resolution, state=self.state,
                             year=self.year, tablename=table)
class SAMindex(luigi.Task):
    """Create fasta indices (samtools faidx) and BED files from given fasta file(s)."""

    fasta = Parameter()
    workdir = Parameter()

    def output(self):
        """Expected BED file targets, one per reference fasta."""
        # str.split(',') also handles the single-fasta case, returning [fasta]
        fas = self.fasta.split(",")
        return [
            LocalTarget(
                os.path.join(self.workdir, "processes", "novel",
                             os.path.basename(fa) + ".bedfile"))
            for fa in fas
        ]

    def make_index(self, ref):
        """Index a fasta file with samtools faidx, moving the .fai into the novel folder."""
        index_options = ["faidx", ref]
        novel_folder = os.path.join(self.workdir, "processes", "novel")
        if not os.path.exists(novel_folder):
            os.makedirs(novel_folder)
        mv_options = [ref + ".fai", os.path.join(novel_folder)]
        samtools_cmd = samtools[index_options]
        mv_cmd = mv[mv_options]
        samtools_cmd()
        mv_cmd()
        fa_name = os.path.basename(ref)
        return os.path.join(self.workdir, "processes", "novel",
                            fa_name + ".fai")

    def create_bedfile(self, index_file):
        """Make a BED file (name, 0, length) from the .fai index file."""
        out_bedfile = index_file.split(".fai")[0] + ".bedfile"
        bedfile_opt = ['BEGIN {FS="\t"}; {print $1 FS "0" FS $2}', index_file]
        awk_cmd = (awk[bedfile_opt] > out_bedfile)
        awk_cmd()

    def requires(self):
        """Reference fasta file(s) must exist."""
        fas = self.fasta.split(",")
        return [RefFile(fa) for fa in fas]

    def run(self):
        """Index each fasta and derive its BED file."""
        fas = self.fasta.split(",")
        for fa in fas:
            ind_file = self.make_index(fa)
            self.create_bedfile(ind_file)
class AllMCCountries(WrapperTask):
    until_month = Parameter(default=None)
    month = Parameter(default=None)

    def requires(self):
        return [
            AllMCData(country=country, until_month=self.until_month,
                      month=self.month)
            for country in ['us', 'ca', 'uk', 'au']
        ]
class SimplifyGeometriesPostGIS(Task):
    schema = Parameter()
    table_input = Parameter()
    table_output = Parameter(default='')
    geomfield = Parameter(default=DEFAULT_GEOMFIELD)
    retainfactor = Parameter(default=DEFAULT_P_RETAIN_FACTOR_POSTGIS)

    def __init__(self, *args, **kwargs):
        super(SimplifyGeometriesPostGIS, self).__init__(*args, **kwargs)
        self.table_out = '{tablename}{suffix}'.format(
            tablename=self.table_input, suffix=SIMPL_SUFFIX)
        if self.table_output:
            self.table_out = self.table_output

    def run(self):
        session = CurrentSession().get()

        columns = session.execute(
            "SELECT column_name "
            "FROM information_schema.columns "
            "WHERE table_schema = '{schema}' "
            "AND table_name = '{table}'".format(
                schema=self.schema,
                table=self.table_input.lower())).fetchall()

        factor = simplification_factor(self.schema, self.table_input,
                                       self.geomfield, self.retainfactor)

        simplified_geomfield = \
            'ST_CollectionExtract(ST_MakeValid(ST_SimplifyVW({geomfield}, {factor})), 3) ' \
            '{geomfield}'.format(geomfield=self.geomfield, factor=factor)

        session.execute(
            'CREATE TABLE "{schema}".{table_out} '
            'AS SELECT {fields} '
            'FROM "{schema}".{table_in} '.format(
                schema=self.output().schema,
                table_in=self.table_input,
                table_out=self.output().tablename,
                fields=', '.join([
                    x[0] if x[0] != self.geomfield else simplified_geomfield
                    for x in columns
                ])))
        session.commit()

        session.execute(
            'CREATE INDEX {table_out}_{geomfield}_geo ON '
            '"{schema}".{table_out} USING GIST ({geomfield})'.format(
                table_out=self.output().tablename,
                geomfield=self.geomfield,
                schema=self.output().schema))

    def output(self):
        return PostgresTarget(self.schema, self.table_out)
class NHSMetaWrapper(MetaWrapper):
    resolution = Parameter()
    topic = Parameter()

    params = {
        'topic': ['t{:03d}'.format(i) for i in range(1, 30)],
        # NHS is not available at the DA or FSA level
        'resolution': (GEO_CT, GEO_PR, GEO_CD, GEO_CSD, GEO_CMA)
    }

    def tables(self):
        yield Geography(resolution=self.resolution, year=2011)
        yield NHS(resolution=self.resolution, topic=self.topic,
                  survey=SURVEY_NHS)
class ShapeData(DownloadFromUrl, ExternalTask):
    date = DateParameter()
    level = Parameter(default='VTD')
    ftp_date_format = Parameter(default='%Y%m%d')

    def output(self):
        return LocalTarget('../StateData/NC/shapefiles/SBE_{}_{}.zip'.format(
            self.level, self.date.strftime('%Y%m%d')))

    def url(self):
        return self.BASE_URL + 'ShapeFiles/{}/SBE_{}_{}.zip'.format(
            self.level, self.level, self.date.strftime(self.ftp_date_format))
class FDAConfig(luigi.WrapperTask):
    data_dir = luigi.Parameter(default='./data')
    tmp_dir = luigi.Parameter(default='./data/openfda-tmp')
    es_host = luigi.Parameter(default='localhost:9200')
    aws_profile = luigi.Parameter(default='openfda')
    disable_downloads = luigi.BoolParameter(default=False)
    snapshot_path = luigi.Parameter(
        default='elasticsearch-snapshots/opensearch')
    snapshot_bucket = luigi.Parameter(default='openfda-prod')
    role_arn = luigi.Parameter(
        default="arn:aws:iam::806972138196:role/OpenSearchS3Access")
class MainTask(WrapperTask):
    """Wrapper task to trigger the individual analysis task."""

    s3_data_path = Parameter()
    s3_output_path = Parameter()

    def requires(self):
        return {
            'data_download': DataDownloadTask(self.s3_data_path,
                                              self.s3_output_path)
        }
class MapPLUTOTmpTable(GeoFile2TempTableTask):
    borough = Parameter()
    release = Parameter()

    def requires(self):
        return DownloadUnzipMapPLUTO(borough=self.borough,
                                     release=self.release)

    def input_files(self):
        return os.path.join(self.input().path,
                            '{}MapPLUTO.shp'.format(self.borough.upper()))
class CensusMetaWrapper(MetaWrapper):
    resolution = Parameter()
    topic = Parameter()

    params = {
        'topic': ['t{:03d}'.format(i) for i in range(1, 11)],
        'resolution': (GEO_CT, GEO_PR, GEO_CD, GEO_CSD, GEO_CMA, GEO_DA,
                       GEO_FSA)
    }

    def tables(self):
        yield Geography(resolution=self.resolution, year=2011)
        yield Census(resolution=self.resolution, topic=self.topic,
                     survey=SURVEY_CEN)
class Download(Task):
    url = Parameter()   # type: str
    path = Parameter()  # type: str

    def run(self):
        # Download to a temp file first so a failed download doesn't
        # leave a partial file at the final path
        tmp_file = get_tmp_filename()
        shell(f'wget {self.url} -O {tmp_file}')
        shell(f'mv {tmp_file} {self.path}')
        shell(f'rm -f {tmp_file}')

    def output(self):
        return LocalTarget(self.path)
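# A minimal usage sketch, assuming Task above is a luigi.Task subclass;
# luigi.build is the standard programmatic entry point, and the URL and
# path below are illustrative values:
import luigi

if __name__ == '__main__':
    luigi.build(
        [Download(url='https://example.com/data.csv', path='/tmp/data.csv')],
        local_scheduler=True)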
class Split2ProkEukW(luigi.WrapperTask):
    """Group chromosomes into prokaryotic and eukaryotic."""

    fastq_dic = DictParameter()
    ref_file = Parameter()
    workdir = Parameter()

    def requires(self):
        """A pipeline that runs from mapping to count for euk and prok."""
        for samp, fastq in self.fastq_dic.items():
            map_dir = os.path.join(self.workdir, "processes", "mapping", samp)
            yield Split2ProkEuk(outsam=map_dir + "/" + samp + ".sam",
                                ref_file=self.ref_file,
                                workdir=self.workdir)
class DownloadData(RepoFileUnzipTask):
    year = IntParameter()
    resolution = Parameter()
    profile = Parameter()
    state = Parameter()

    def get_url(self):
        return URL.format(
            year=self.year,
            profile=self.profile,
            resolution=self.resolution,
            state=self.state,
        )
class WideZillow(CSV2TempTableTask):
    geography = Parameter()  # example: Zip
    hometype = Parameter()   # example: SingleFamilyResidence
    measure = Parameter()

    def requires(self):
        return DownloadZillow(geography=self.geography,
                              hometype=self.hometype,
                              measure=self.measure)

    def input_csv(self):
        return self.input().path
class ImportGeometry(Shp2TempTableTask):
    resolution = Parameter()
    timestamp = Parameter()

    def requires(self):
        return DownloadGeometry(seq='114023')

    def input_shp(self):
        path = os.path.join(
            'SIANE_CARTO_BASE_S_3M', 'anual', self.timestamp,
            'SE89_3_ADMIN_{resolution}_A_X.shp'.format(
                resolution=self.resolution.upper()))
        return os.path.join(self.input().path, path)
class ConfirmTableExists(Task):
    '''
    Confirm a table exists
    '''
    schema = Parameter(default='observatory')
    tablename = Parameter()

    def run(self):
        raise Exception('Table {} does not exist'.format(self.tablename))

    def output(self):
        return PostgresTarget(self.schema, self.tablename)
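# Note on the pattern above: Luigi only schedules run() when complete() is
# False, and complete() here delegates to PostgresTarget.exists(). So if the
# table exists, run() is never called; if it does not, run() raises and the
# missing table surfaces as a task failure. A minimal usage sketch
# (hypothetical table name, for illustration only):
check = ConfirmTableExists(tablename='obs_meta')
if not check.complete():
    print('table obs_meta is missing; running this task will raise')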
class CensusWrapper(MetaWrapper):
    resolution = Parameter()
    table = Parameter()

    params = {
        'resolution': set(RESOLUTIONS) - set(['servicios_area']),
        'table': list(DEMOGRAPHIC_TABLES.keys())
    }

    def tables(self):
        yield Geography(resolution=self.resolution)
        yield Census(resolution=self.resolution, table=self.table)
class CensusDBFromDA(ReverseCoupledInterpolationTask):
    topic = Parameter()
    segment = Parameter()

    def requires(self):
        deps = {
            'source_geom_columns': GeographyColumns(resolution=GEO_DA,
                                                    year=2016),
            'source_geom': Geography(resolution=GEO_DA, year=2016),
            'source_data_columns': CensusColumns(resolution=GEO_DA,
                                                 topic=self.topic),
            'source_data': CensusData(resolution=GEO_DA, topic=self.topic,
                                      segment=self.segment),
            'target_geom_columns': GeographyColumns(resolution=GEO_DB,
                                                    year=2016),
            'target_geom': Geography(resolution=GEO_DB, year=2016),
            'target_data_columns': CensusColumns(resolution=GEO_DB,
                                                 topic=self.topic),
        }
        return deps

    def table_timespan(self):
        return get_timespan('2016')

    def columns(self):
        cols = OrderedDict()
        input_ = self.input()
        cols['geom_id'] = input_['target_geom_columns']['geom_id']
        for colname, coltarget in input_['target_data_columns'].items():
            colid, segment = colname.split('_')
            if COLUMNS_DEFINITION[colid]['subsection'] in TOPICS[self.topic]:
                if segment == self.segment or self.segment == SEGMENT_ALL:
                    cols[colname] = coltarget
        return cols

    def get_interpolation_parameters(self):
        params = {
            'source_data_geoid': 'geom_id',
            'source_geom_geoid': 'geom_id',
            'target_data_geoid': 'geom_id',
            'target_geom_geoid': 'geom_id',
            'source_geom_geomfield': 'the_geom',
            'target_geom_geomfield': 'the_geom',
        }
        return params
class ArquivoIndexingExternalTask(ExternalProgramTask, ABC):
    data_collections_folder = Parameter(default="/data/collections")
    document_server = Parameter(default="p64.arquivo.pt")
    lucene_jar = Parameter(
        default="/opt/searcher/scripts/lib/pwalucene-1.0.0-SNAPSHOT.jar")
    # TODO this can be configured within the Luigi conf file
    hadoop_bin = Parameter(default="/opt/searcher/hadoop/bin/hadoop")
    collection_name = Parameter(default='dummy')
    data_folder = Parameter(default='/data')
    hadoop_jar = Parameter(
        default="/opt/searcher/scripts/nutchwax-job-0.11.0-SNAPSHOT.jar")

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.checkpoint_file = self.set_checkpoint()

    @abstractmethod
    def set_checkpoint(self):
        pass

    def on_success(self):
        # Touch the checkpoint file to mark this task as complete
        with open(self.checkpoint_file, mode='w'):
            pass
        super().on_success()

    def complete(self):
        return LocalTarget(self.checkpoint_file).exists()
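# A hypothetical concrete subclass, sketched to illustrate the abstract
# contract above; the class name, checkpoint path, and command are
# assumptions, not part of the original code:
class DummyCollectionIndexingTask(ArquivoIndexingExternalTask):

    def set_checkpoint(self):
        # The returned path doubles as the completion marker used by complete()
        return os.path.join(
            self.data_folder,
            '{}.indexed.checkpoint'.format(self.collection_name))

    def program_args(self):
        # ExternalProgramTask runs the command returned here; a placeholder
        return [self.hadoop_bin, 'version']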
class DownloadGDAC(Task):
    data_type = Parameter()
    cohort = Parameter()

    def run(self):
        raise NotImplementedError('Downloading and extracting GDAC is not yet '
                                  'tested. Point output to a pre-populated '
                                  'folder of extracted GDAC matrices.')
        # Unreachable until the NotImplementedError above is removed
        tcga.download_gdac(self.data_type, self.cohort)

    def output(self):
        fpath = f'{folder}/*{self.data_type}*./{self.cohort}'
        return ReferenceTarget(fpath)
class DownloadZillow(Task):
    geography = Parameter()
    hometype = Parameter()
    measure = Parameter()

    URL = 'http://files.zillowstatic.com/research/public/{geography}/{geography}_{measure}_{hometype}.csv'

    def version(self):
        return 1

    def requires(self):
        return RepoFile(resource_id=self.task_id,
                        version=self.version(),
                        url=self.url())

    def url(self):
        return self.URL.format(geography=self.geography,
                               hometype=self.hometype,
                               measure=self.measure)

    @property
    def last_time(self):
        if not hasattr(self, '_last_time'):
            last_time = shell(
                'curl -s {url} | head -n 1'.format(url=self.url()))
            self._last_time = last_time.strip().split(',')[-1].strip('"')
        return self._last_time

    def run(self):
        copyfile(self.input().path, self.output().path)

        # Fix a problem with Zillow 2018-11. A `ñ` is incorrectly encoded as
        # 0xB1; it should be 0xC3 0xB1 (in UTF-8). As far as I can see, 0xB1
        # is not `ñ` in any common encoding (tested all ISO-8859-X and UTF-X).
        #
        # Matching on 0x61 0xB1 gives context and makes this a little safer
        # in case they fix this in the future:
        #                       | E  | s  | p  | a  | ñ     | o  | l  | a  |
        # Original  (Espa.ola): | 45 | 73 | 70 | 61 | b1    | 6f | 6c | 61 |
        # Modified  (Española): | 45 | 73 | 70 | 61 | c3 b1 | 6f | 6c | 61 |
        with open(self.output().path, 'rb') as fin:
            contents = fin.read()
        contents = contents.replace(b'\x61\xB1', b'\x61\xC3\xB1')
        with open(self.output().path, 'wb') as fout:
            fout.write(contents)

    def output(self):
        return LocalTarget(
            os.path.join('tmp', classpath(self), self.task_id) + '_' +
            underscore_slugify(self.last_time) + '.csv')
class SumLevel4Geo(WrapperTask):
    '''
    Compute the sumlevel for a given geography
    '''
    year = Parameter()
    geography = Parameter()

    def requires(self):
        config = dict(SUMLEVELS.items()).get(self.geography)
        if config['fields']['name']:
            yield GeoNamesTable(year=self.year, geography=self.geography)
        yield SumLevel(year=self.year, geography=self.geography)
        yield ShorelineClip(year=self.year, geography=self.geography)
class DictPluckTask(ExternalTask):
    description = ("Pluck a single output from a task that produces a dict "
                   "of outputs")

    upstream_task = Parameter(
        description="A task that produces a dict output")
    key = Parameter(
        description="A key in the upstream task's output that should be "
                    "plucked")

    def requires(self):
        return self.upstream_task

    def output(self):
        return self.input()[self.key]
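# A minimal usage sketch; MultiOutputTask is a hypothetical task whose
# output() returns a dict of targets, of which only 'stats' is needed:
plucked = DictPluckTask(upstream_task=MultiOutputTask(), key='stats')
# plucked.output() resolves to MultiOutputTask().output()['stats']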
class ExtractParsedTextFromXMLTags(Task):
    """Accepts a Path to a single text file produced by the WikiExtractor.

    Reads every <doc> tag and maps the ID to the tokenized text before saving
    to a parquet file. Also appends to a corpus file used for training word
    embedding models downstream.
    """

    path_to_xml_fragments = Parameter()
    path_to_fasttext = Parameter()
    path_to_word2vec = Parameter()

    def requires(self):
        return None

    def output(self):
        # Mirror the subdirectory structure of the XML_DIR
        parquet_subdir = (config.WIKIPEDIA_PARQUET_DIR /
                          self.path_to_xml_fragments.parent.stem)
        parquet_subdir.mkdir(parents=True, exist_ok=True)
        return LocalTarget(parquet_subdir /
                           f'{self.path_to_xml_fragments.stem}.parquet')

    def run(self):
        with open(self.path_to_xml_fragments, "r") as input_f:
            # Needs a root element to parse as XML
            xml_str = '<root>\n' + input_f.read() + '\n</root>'

        # Handle malformed XML from WikiExtractor:
        # https://stackoverflow.com/a/9050454/8857601
        parser = XMLParser(encoding='utf-8', recover=True,
                           remove_blank_text=True)
        tree = parse(StringIO(xml_str), parser=parser)
        root = tree.getroot()

        # For dataframe construction below
        doc_rows = []

        # For each child tag under the newly-constructed root tag
        for doc_tag in root.findall('doc'):
            # Extract target information from this tag
            doc_id = doc_tag.attrib.get("id")
            doc_url = doc_tag.attrib.get("url")
            doc_title = doc_tag.attrib.get("title")
            doc_text = doc_tag.text.strip()
            tokenized_text = [word_tokenize(sent)
                              for sent in sent_tokenize(doc_text)]

            with open(config.LINE_SENTENCE_CORPUS_FILE, "a") as corpus_f:
                for sent in tokenized_text:
                    corpus_f.write(" ".join(sent) + "\n")

            doc_rows.append([doc_id, doc_url, doc_title, tokenized_text])

        # Construct a dataframe, then write the parquet file output
        df = pandas.DataFrame(
            doc_rows, columns=["id", "url", "title", "tokenized_text"])
        df.to_parquet(self.output().path)
class CensosMetaWrapper(MetaWrapper):
    resolution = Parameter()
    tablename = Parameter()

    params = {
        # 'resolution': GEOGRAPHIES,
        'resolution': [GEO_I],  # data only for setores_censitarios
        'tablename': TABLES
    }

    def tables(self):
        yield Geography(resolution=self.resolution)
        yield Censos(resolution=self.resolution, tablename=self.tablename)