Example #1
class InterpolateNHSDAFromCD(Task):
    topic = Parameter()
    resolution = Parameter()

    def requires(self):
        return {
            'nhs': NHS(resolution=GEO_CD, topic=self.topic, survey=SURVEY_NHS),
            'geo_cd': Geography(resolution=GEO_CD, year=2011),
            'geo_da': Geography(resolution=GEO_DA, year=2011)
        }
Example #2
class SimplifiedImportGeometry(SimplifiedTempTableTask):
    resolution = Parameter()
    timestamp = Parameter()
    id_aux = Parameter()  # 'X' for Peninsula and Balearic Islands, 'Y' for Canary Islands

    def requires(self):
        return ImportGeometry(resolution=self.resolution,
                              timestamp=self.timestamp,
                              id_aux=self.id_aux)
Example #3
class LoadSocrataCSV(Task):

    domain = Parameter()
    dataset = Parameter()

    def requires(self):
        pass

    def target(self):
        pass
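Both methods are stubs here (note the stub defines target() where Luigi's convention is output()). For reference, Socrata domains expose a public CSV export endpoint, so a hypothetical implementation might look like the sketch below; the class name, output location, and download plumbing are assumptions, and only the rows.csv URL pattern is Socrata's documented export API.

import os
import urllib.request

from luigi import LocalTarget, Parameter, Task


class LoadSocrataCSVSketch(Task):
    """Hypothetical implementation; the original stubs are left untouched."""

    domain = Parameter()
    dataset = Parameter()

    def url(self):
        # Socrata's standard CSV export endpoint for a dataset's "four-four" ID
        return 'https://{}/api/views/{}/rows.csv?accessType=DOWNLOAD'.format(
            self.domain, self.dataset)

    def run(self):
        os.makedirs(os.path.dirname(self.output().path), exist_ok=True)
        urllib.request.urlretrieve(self.url(), self.output().path)

    def output(self):
        return LocalTarget(os.path.join('tmp', self.domain,
                                        '{}.csv'.format(self.dataset)))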
Example #4
class ImportData(CSV2TempTableTask):

    state = Parameter()
    tablename = Parameter()
    encoding = 'latin1'
    delimiter = ';'

    def requires(self):
        return DownloadData(state=self.state)

    def version(self):
        return 1

    def input_csv(self):
        if self.state.lower() == 'sp_capital':
            state_code = 'SP1'
        elif self.state.lower() == 'sp_exceto_a_capital':
            state_code = 'SP2'
        else:
            state_code = self.state.upper()

        # The provided CSV files are not well-formed, so we convert the provided XLS files into CSV
        # All files are {tablename}_{state}.xls (or XLS), except Basico-MG.xls
        df = pd.read_excel(self._datafile_path(self.tablename, state_code))
        if self.tablename != 'Basico':
            df = df.apply(pd.to_numeric, errors="coerce")

        # In the 20180416 version, SP_Capital-Domicilio02 files have wrong IDs (all have been rounded to 55031E+14)
        # But the order of sectors is constant in all files of the same state, so we copy them from a different file
        # This used to be correct in 2017's version of the file
        if self.state.lower() == 'sp_capital' and self.tablename == 'Domicilio02':
            domicilio01 = pd.read_excel(
                self._datafile_path('Domicilio01', state_code))
            df['Cod_setor'] = domicilio01['Cod_setor']

        output_path = os.path.join(
            self.input().path,
            '{tablename}_{state_code}.csv'.format(tablename=self.tablename,
                                                  state_code=state_code))
        df.to_csv(output_path, index=False, sep=';', encoding=self.encoding)

        return output_path

    def _datafile_path(self, tablename, state_code):
        filename = '{tablename}[-_]{state_code}.[xX][lL][sS]'.format(
            tablename=tablename, state_code=state_code)

        return glob.glob(os.path.join(self.input().path, '**', filename),
                         recursive=True)[0]
Example #5
class TigerGeographyShapefileToSQL(TempTableTask):
    '''
    Take downloaded shapefiles and load them into Postgres
    '''

    year = Parameter()
    geography = Parameter()

    def requires(self):
        return UnzipTigerGeography(year=self.year, geography=self.geography)

    def run(self):
        shapefiles_dir = os.path.join('tmp', classpath(self), str(self.year),
                                      self.geography)
        shapefiles = shell(
            'ls {dir}/*.shp'.format(dir=shapefiles_dir)).strip().split('\n')

        cmd = 'ogrinfo {shpfile_path}'.format(shpfile_path=shapefiles[0])
        resp = shell(cmd)
        if 'Polygon' in resp:
            nlt = '-nlt MultiPolygon'
        else:
            nlt = ''

        cmd = 'PG_USE_COPY=yes PGCLIENTENCODING=latin1 ' \
              'ogr2ogr -f PostgreSQL "PG:dbname=$PGDATABASE active_schema={schema}" ' \
              '-t_srs "EPSG:4326" {nlt} -nln {tablename} ' \
              '-lco OVERWRITE=yes ' \
              '-lco SCHEMA={schema} {shpfile_path} '.format(
                  tablename=self.output().tablename,
                  schema=self.output().schema, nlt=nlt,
                  shpfile_path=shapefiles.pop())
        shell(cmd)

        # chunk into 500 shapefiles at a time.
        for i, shape_group in enumerate(grouper(shapefiles, 500)):
            shell('export PG_USE_COPY=yes PGCLIENTENCODING=latin1; '
                  'echo \'{shapefiles}\' | xargs -P 16 -I shpfile_path '
                  'ogr2ogr -f PostgreSQL "PG:dbname=$PGDATABASE '
                  'active_schema={schema}" -append '
                  '-t_srs "EPSG:4326" {nlt} -nln {tablename} '
                  'shpfile_path '.format(shapefiles='\n'.join(
                      [shp for shp in shape_group if shp]),
                                         tablename=self.output().tablename,
                                         nlt=nlt,
                                         schema=self.output().schema))
            print('imported {} shapefiles'.format((i + 1) * 500))

        session = current_session()
        # Rename the geometry column and add a spatial index
        session.execute(
            'ALTER TABLE {qualified_table} RENAME COLUMN '
            'wkb_geometry TO geom'.format(qualified_table=self.output().table))
        session.execute(
            'CREATE INDEX ON {qualified_table} USING GIST (geom)'.format(
                qualified_table=self.output().table))
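The grouper helper used above is not defined in the snippet; it is presumably the classic itertools recipe, which pads the final chunk with None (hence the `if shp` filter in the loop above). A minimal sketch:

from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    """Collect data into fixed-length chunks; the final chunk is padded
    with fillvalue, which the caller filters out."""
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)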
Example #6
class ImportAllTables(WrapperTask):
    year = IntParameter()
    resolution = Parameter()
    state = Parameter()

    def requires(self):
        for table in TABLES[self.year]:
            yield ImportData(resolution=self.resolution,
                             state=self.state,
                             year=self.year,
                             tablename=table)
Example #7
File: Map.py Project: mshakya/piret
class SAMindex(luigi.Task):
    """Create Hisat Indices from given fasta file."""

    fasta = Parameter()
    workdir = Parameter()

    def output(self):
        """BED file targets, one per input fasta."""
        if ',' in self.fasta:
            fas = self.fasta.split(",")
            return [
                LocalTarget(
                    os.path.join(self.workdir, "processes", "novel",
                                 os.path.basename(fa) + ".bedfile"))
                for fa in fas
            ]
        # Single-fasta case: the one expected bedfile
        return [
            LocalTarget(
                os.path.join(self.workdir, "processes", "novel",
                             os.path.basename(self.fasta) + ".bedfile"))
        ]

    def make_index(self, ref):
        """A function to make index from sam files."""
        index_options = ["faidx", ref]
        novel_folder = os.path.join(self.workdir, "processes", "novel")
        if not os.path.exists(novel_folder):
            os.makedirs(novel_folder)
        mv_options = [ref + ".fai", novel_folder]
        samtools_cmd = samtools[index_options]
        mv_cmd = mv[mv_options]
        samtools_cmd()
        mv_cmd()
        fa_name = os.path.basename(ref)
        return os.path.join(self.workdir, "processes", "novel",
                            fa_name + ".fai")

    def create_bedfile(self, index_file):
        """Makes bed file from the sam index file."""
        out_bedfile = index_file.split(".fai")[0] + ".bedfile"
        bedfile_opt = ['BEGIN {FS="\t"}; {print $1 FS "0" FS $2}', index_file]
        awk_cmd = ((awk[bedfile_opt]) > out_bedfile)
        awk_cmd()

    def requires(self):
        """Expected index output."""
        if ',' in self.fasta:
            fas = self.fasta.split(",")
            return [RefFile(fa) for fa in fas]
        else:
            return [RefFile(self.fasta)]

    def run(self):
        """Index each fasta with samtools faidx and write its BED file."""
        if ',' in self.fasta:
            fas = self.fasta.split(",")
            for fa in fas:
                ind_file = self.make_index(fa)
                self.create_bedfile(ind_file)
        else:
            ind_file = self.make_index(self.fasta)
            self.create_bedfile(ind_file)
Example #8
class AllMCCountries(WrapperTask):
    until_month = Parameter(default=None)
    month = Parameter(default=None)

    def requires(self):
        return [
            AllMCData(country=country,
                      until_month=self.until_month,
                      month=self.month)
            for country in ['us', 'ca', 'uk', 'au']
        ]
Example #9
class SimplifyGeometriesPostGIS(Task):
    schema = Parameter()
    table_input = Parameter()
    table_output = Parameter(default='')
    geomfield = Parameter(default=DEFAULT_GEOMFIELD)
    retainfactor = Parameter(default=DEFAULT_P_RETAIN_FACTOR_POSTGIS)

    def __init__(self, *args, **kwargs):
        super(SimplifyGeometriesPostGIS, self).__init__(*args, **kwargs)

        self.table_out = '{tablename}{suffix}'.format(
            tablename=self.table_input, suffix=SIMPL_SUFFIX)
        if self.table_output:
            self.table_out = self.table_output

    def run(self):
        session = CurrentSession().get()

        columns = session.execute(
            "SELECT column_name "
            "FROM information_schema.columns "
            "WHERE table_schema = '{schema}' "
            "AND table_name   = '{table}'".format(
                schema=self.schema,
                table=self.table_input.lower())).fetchall()

        factor = simplification_factor(self.schema, self.table_input,
                                       self.geomfield, self.retainfactor)

        simplified_geomfield = 'ST_CollectionExtract(ST_MakeValid(ST_SimplifyVW({geomfield}, {factor})), 3) ' \
                               '{geomfield}'.format(geomfield=self.geomfield, factor=factor)

        session.execute(
            'CREATE TABLE "{schema}".{table_out} '
            'AS SELECT {fields} '
            'FROM "{schema}".{table_in} '.format(
                schema=self.output().schema,
                table_in=self.table_input,
                table_out=self.output().tablename,
                fields=', '.join([
                    x[0] if x[0] != self.geomfield else simplified_geomfield
                    for x in columns
                ])))
        session.commit()
        session.execute(
            'CREATE INDEX {table_out}_{geomfield}_geo ON '
            '"{schema}".{table_out} USING GIST ({geomfield})'.format(
                table_out=self.output().tablename,
                geomfield=self.geomfield,
                schema=self.output().schema))

    def output(self):
        return PostgresTarget(self.schema, self.table_out)
Example #10
class NHSMetaWrapper(MetaWrapper):
    resolution = Parameter()
    topic = Parameter()

    params = {
        'topic': ['t{:03d}'.format(i) for i in range(1, 30)],
        'resolution': (GEO_CT, GEO_PR, GEO_CD, GEO_CSD, GEO_CMA)  # NHS not available at DA or FSA level
    }

    def tables(self):
        yield Geography(resolution=self.resolution, year=2011)
        yield NHS(resolution=self.resolution, topic=self.topic, survey=SURVEY_NHS)
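Neither this snippet nor the later MetaWrapper examples (#15, #22, #30) show how the params dict is consumed; presumably the wrapper enumerates the cross product of the listed values, instantiating one wrapper per combination. A minimal sketch of that expansion (the helper name expand_params is hypothetical):

from itertools import product

def expand_params(params):
    """Yield one kwargs dict per combination of the listed parameter values."""
    keys = list(params.keys())
    for values in product(*(params[key] for key in keys)):
        yield dict(zip(keys, values))

# For NHSMetaWrapper this would yield {'topic': 't001', 'resolution': GEO_CT},
# {'topic': 't001', 'resolution': GEO_PR}, ... 29 topics x 5 resolutions in all.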
Example #11
class ShapeData(DownloadFromUrl, ExternalTask):
    date = DateParameter()
    level = Parameter(default='VTD')
    ftp_date_format = Parameter(default='%Y%m%d')

    def output(self):
        return LocalTarget('../StateData/NC/shapefiles/SBE_{}_{}.zip'.format(
            self.level, self.date.strftime('%Y%m%d')))

    def url(self):
        return self.BASE_URL + 'ShapeFiles/{}/SBE_{}_{}.zip'.format(
            self.level, self.level, self.date.strftime(self.ftp_date_format))
Example #12
class FDAConfig(luigi.WrapperTask):
    data_dir = Parameter(default='./data')
    tmp_dir = Parameter(default='./data/openfda-tmp')
    es_host = Parameter(default='localhost:9200')
    aws_profile = luigi.Parameter(default='openfda')
    disable_downloads = luigi.BoolParameter(default=False)

    snapshot_path = luigi.Parameter(
        default='elasticsearch-snapshots/opensearch')
    snapshot_bucket = luigi.Parameter(default='openfda-prod')
    role_arn = luigi.Parameter(
        default="arn:aws:iam::806972138196:role/OpenSearchS3Access")
Example #13
class MainTask(WrapperTask):
    """
    Wrapper Task to trigger individual analysis task
    """
    s3_data_path = Parameter()
    s3_output_path = Parameter()

    def requires(self):
        return {
            'data_download':
            DataDownloadTask(self.s3_data_path, self.s3_output_path)
        }
Example #14
class MapPLUTOTmpTable(GeoFile2TempTableTask):

    borough = Parameter()
    release = Parameter()

    def requires(self):
        return DownloadUnzipMapPLUTO(borough=self.borough,
                                     release=self.release)

    def input_files(self):
        return os.path.join(self.input().path,
                            '{}MapPLUTO.shp'.format(self.borough.upper()))
Example #15
class CensusMetaWrapper(MetaWrapper):
    resolution = Parameter()
    topic = Parameter()

    params = {
        'topic': ['t{:03d}'.format(i) for i in range(1, 11)],
        'resolution': (GEO_CT, GEO_PR, GEO_CD, GEO_CSD, GEO_CMA, GEO_DA, GEO_FSA)
    }

    def tables(self):
        yield Geography(resolution=self.resolution, year=2011)
        yield Census(resolution=self.resolution, topic=self.topic, survey=SURVEY_CEN)
Example #16
class Download(Task):
    url = Parameter()  # type: str
    path = Parameter()  # type: str

    def run(self):
        tmp_file = get_tmp_filename()
        shell(f'wget {self.url} -O {tmp_file}')
        # The mv consumes the temporary file, so no separate cleanup is needed
        shell(f'mv {tmp_file} {self.path}')

    def output(self):
        return LocalTarget(self.path)
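get_tmp_filename is not shown in the snippet; a minimal sketch, under the assumption that it only needs to hand back a unique writable path:

import os
import tempfile

def get_tmp_filename():
    # Reserve a unique path and release the handle; the shell commands
    # in run() then write to and move that path.
    fd, path = tempfile.mkstemp()
    os.close(fd)
    return path

Downloading to a temporary file and then mv-ing it into place keeps output() from ever observing a half-written file; when both paths sit on the same filesystem, the mv is an atomic rename.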
Example #17
File: Map.py Project: mshakya/piret
class Split2ProkEukW(luigi.WrapperTask):
    """Group chromosomes to prokaryotic and eukaryotic."""
    fastq_dic = DictParameter()
    ref_file = Parameter()
    workdir = Parameter()

    def requires(self):
        """A pipeline that runs from mapping to count for euk and prok."""
        for samp, fastq in self.fastq_dic.items():
            map_dir = os.path.join(self.workdir, "processes", "mapping", samp)
            yield Split2ProkEuk(outsam=os.path.join(map_dir, samp + ".sam"),
                                ref_file=self.ref_file,
                                workdir=self.workdir)
Example #18
class DownloadData(RepoFileUnzipTask):
    year = IntParameter()
    resolution = Parameter()
    profile = Parameter()
    state = Parameter()

    def get_url(self):
        return URL.format(
            year=self.year,
            profile=self.profile,
            resolution=self.resolution,
            state=self.state,
        )
Example #19
class WideZillow(CSV2TempTableTask):

    geography = Parameter()  # example: Zip
    hometype = Parameter()  # example: SingleFamilyResidence
    measure = Parameter()

    def requires(self):
        return DownloadZillow(geography=self.geography,
                              hometype=self.hometype,
                              measure=self.measure)

    def input_csv(self):
        return self.input().path
Example #20
class ImportGeometry(Shp2TempTableTask):

    resolution = Parameter()
    timestamp = Parameter()

    def requires(self):
        return DownloadGeometry(seq='114023')

    def input_shp(self):
        path = os.path.join('SIANE_CARTO_BASE_S_3M', 'anual', self.timestamp,
                            'SE89_3_ADMIN_{resolution}_A_X.shp'.format(
                                resolution=self.resolution.upper()))
        return os.path.join(self.input().path, path)
Example #21
class ConfirmTableExists(Task):
    '''
    Confirm a table exists. Luigi only calls run() when output() reports
    the target missing, so reaching run() means the table does not exist
    and the task fails loudly.
    '''

    schema = Parameter(default='observatory')
    tablename = Parameter()

    def run(self):
        raise Exception('Table {} does not exist'.format(self.tablename))

    def output(self):
        return PostgresTarget(self.schema, self.tablename)
Example #22
class CensusWrapper(MetaWrapper):

    resolution = Parameter()
    table = Parameter()

    params = {
        'resolution': set(RESOLUTIONS) - set(['servicios_area']),
        'table': list(DEMOGRAPHIC_TABLES.keys())
    }

    def tables(self):
        yield Geography(resolution=self.resolution)
        yield Census(resolution=self.resolution, table=self.table)
Example #23
class CensusDBFromDA(ReverseCoupledInterpolationTask):
    topic = Parameter()
    segment = Parameter()

    def requires(self):
        deps = {
            'source_geom_columns': GeographyColumns(resolution=GEO_DA,
                                                    year=2016),
            'source_geom': Geography(resolution=GEO_DA, year=2016),
            'source_data_columns': CensusColumns(resolution=GEO_DA,
                                                 topic=self.topic),
            'source_data': CensusData(resolution=GEO_DA,
                                      topic=self.topic,
                                      segment=self.segment),
            'target_geom_columns': GeographyColumns(resolution=GEO_DB,
                                                    year=2016),
            'target_geom': Geography(resolution=GEO_DB, year=2016),
            'target_data_columns': CensusColumns(resolution=GEO_DB,
                                                 topic=self.topic),
        }

        return deps

    def table_timespan(self):
        return get_timespan('2016')

    def columns(self):
        cols = OrderedDict()
        input_ = self.input()
        cols['geom_id'] = input_['target_geom_columns']['geom_id']
        for colname, coltarget in input_['target_data_columns'].items():
            colid, segment = colname.split('_')
            if COLUMNS_DEFINITION[colid]['subsection'] in TOPICS[self.topic]:
                if segment == self.segment or self.segment == SEGMENT_ALL:
                    cols[colname] = coltarget
        return cols

    def get_interpolation_parameters(self):
        params = {
            'source_data_geoid': 'geom_id',
            'source_geom_geoid': 'geom_id',
            'target_data_geoid': 'geom_id',
            'target_geom_geoid': 'geom_id',
            'source_geom_geomfield': 'the_geom',
            'target_geom_geomfield': 'the_geom',
        }

        return params
Example #24
class ArquivoIndexingExternalTask(ExternalProgramTask, ABC):
    data_collections_folder = Parameter(default="/data/collections")
    document_server = Parameter(default="p64.arquivo.pt")
    lucene_jar = Parameter(
        default="/opt/searcher/scripts/lib/pwalucene-1.0.0-SNAPSHOT.jar")
    # TODO this can be configured within the Luigi conf file
    hadoop_bin = Parameter(default="/opt/searcher/hadoop/bin/hadoop")
    collection_name = Parameter(default='dummy')
    data_folder = Parameter(default='/data')
    hadoop_jar = Parameter(
        default="/opt/searcher/scripts/nutchwax-job-0.11.0-SNAPSHOT.jar")

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.checkpoint_file = self.set_checkpoint()

    @abstractmethod
    def set_checkpoint(self):
        pass

    def on_success(self):
        with open(self.checkpoint_file, mode='w'):
            pass
        super().on_success()

    def complete(self):
        return LocalTarget(self.checkpoint_file).exists()
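Because set_checkpoint is abstract, each concrete indexing task decides where its marker file lives; completion is tracked by that marker rather than by the external program's own outputs. A hypothetical minimal subclass (the path scheme and the placeholder command are assumptions, not from the source):

import os

class DummyCollectionIndexing(ArquivoIndexingExternalTask):
    """Hypothetical concrete task; path scheme and command are assumptions."""

    def set_checkpoint(self):
        # One marker file per collection; on_success() creates it,
        # complete() checks for it.
        return os.path.join(self.data_folder,
                            '{}.indexed'.format(self.collection_name))

    def program_args(self):
        # ExternalProgramTask executes this argv; a real task would launch
        # the hadoop indexing job here.
        return [self.hadoop_bin, 'version']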
Example #25
class DownloadGDAC(Task):

    data_type = Parameter()
    cohort = Parameter()

    def run(self):
        raise NotImplementedError('Downloading and extracting GDAC is not yet '
                                  'tested. Point output to a pre-populated '
                                  'folder of extracted GDAC matrices.')
        # Intentionally unreachable: documents the call that would
        # eventually perform the download.
        tcga.download_gdac(self.data_type, self.cohort)

    def output(self):
        fpath = f'{folder}/*{self.data_type}*./{self.cohort}'
        return ReferenceTarget(fpath)
Example #26
class DownloadZillow(Task):

    geography = Parameter()
    hometype = Parameter()
    measure = Parameter()

    URL = 'http://files.zillowstatic.com/research/public/{geography}/{geography}_{measure}_{hometype}.csv'

    def version(self):
        return 1

    def requires(self):
        return RepoFile(resource_id=self.task_id,
                        version=self.version(),
                        url=self.url())

    def url(self):
        return self.URL.format(geography=self.geography,
                               hometype=self.hometype,
                               measure=self.measure)

    @property
    def last_time(self):
        if not hasattr(self, '_last_time'):
            last_time = shell(
                'curl -s {url} | head -n 1'.format(url=self.url()))
            self._last_time = last_time.strip().split(',')[-1].strip('"')
        return self._last_time

    def run(self):
        copyfile(self.input().path, self.output().path)

        # Fix a problem with Zillow 2018-11. A `ñ` is incorrectly encoded as 0xB1, it should be 0xC3 0xB1 (in UTF-8)
        # As far as I can see, 0xB1 is not `ñ` in any common encoding (tested all ISO-8859-X and UTF-X)
        #
        # 0x61 0xB1 serves to give context and make this a little safe in case they fix this in the future
        #                     |  E |  s |  p |  a |  ñ    |  o |  l |  a
        # Original: Espa.ola  | 45 | 73 | 70 | 61 | b1    | 6f | 6c | 61
        # Modified: Española  | 45 | 73 | 70 | 61 | c3 b1 | 6f | 6c | 61
        contents = ''
        with open(self.output().path, 'rb') as fin:
            contents = fin.read()
        contents = contents.replace(b'\x61\xB1', b'\x61\xC3\xB1')
        with open(self.output().path, 'wb') as fout:
            fout.write(contents)

    def output(self):
        return LocalTarget(
            os.path.join('tmp', classpath(self), self.task_id) + '_' +
            underscore_slugify(self.last_time) + '.csv')
Example #27
class SumLevel4Geo(WrapperTask):
    '''
    Compute the sumlevel for a given geography
    '''

    year = Parameter()
    geography = Parameter()

    def requires(self):
        config = SUMLEVELS.get(self.geography)
        if config['fields']['name']:
            yield GeoNamesTable(year=self.year, geography=self.geography)
        yield SumLevel(year=self.year, geography=self.geography)
        yield ShorelineClip(year=self.year, geography=self.geography)
Example #28
class DictPluckTask(ExternalTask):

    description = "Pluck a single output from a task that produces a dict of outputs"

    upstream_task = Parameter(description="A task that produces a dict output")
    key = Parameter(
        description="A key in the upstream task's output that should be plucked")

    def requires(self):
        return self.upstream_task

    def output(self):
        return self.input()[self.key]
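A usage sketch, with a hypothetical upstream task: given a task whose output() is a dict of targets, DictPluckTask re-exposes exactly one of them, which is handy for downstream tasks that expect a single input target.

from luigi import LocalTarget, Task

# Hypothetical upstream task whose output() is a dict of targets
class TrainModel(Task):
    def output(self):
        return {
            'model': LocalTarget('model.bin'),
            'metrics': LocalTarget('metrics.json'),
        }

# Expose only the metrics target to downstream tasks
metrics_only = DictPluckTask(upstream_task=TrainModel(), key='metrics')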
Example #29
class ExtractParsedTextFromXMLTags(Task):
    """Accepts a Path to a single text file produced by the WikiExtractor. Reads every Document tag and
    maps the ID to the tokenized text before saving to a parquet file. Also augments a corpus file for the training
    of word embedding models downstream."""
    
    path_to_xml_fragments = Parameter()
    path_to_fasttext = Parameter()
    path_to_word2vec = Parameter()
    
    def requires(self):
        return None
    
    def output(self):
        # Parameters arrive as strings, so wrap them in Path before using
        # pathlib operations; mirror the subdirectory structure of the XML dir
        xml_path = Path(self.path_to_xml_fragments)
        parquet_subdir = config.WIKIPEDIA_PARQUET_DIR / xml_path.parent.stem
        parquet_subdir.mkdir(parents=True, exist_ok=True)
        return LocalTarget(parquet_subdir / f'{xml_path.stem}.parquet')
    
    def run(self):
        with open(self.path_to_xml_fragments, "r") as input_f:
            # Needs a root element to parse as XML
            xml_str = '<root>\n' + input_f.read() + '\n</root>'
            
            # Handle malformed XML from WikiExtractor: https://stackoverflow.com/a/9050454/8857601
            parser = XMLParser(encoding='utf-8', recover=True, remove_blank_text=True)
            tree = parse(StringIO(xml_str), parser=parser)
            root = tree.getroot()
        
        # For dataframe construction below
        doc_rows = []

        # For each child tag under the newly-constructed root tag
        for doc_tag in root.findall('doc'):
            # Extract target information from this tag
            doc_id = doc_tag.attrib.get("id")
            doc_url = doc_tag.attrib.get("url")
            doc_title = doc_tag.attrib.get("title")
            doc_text = (doc_tag.text or "").strip()  # some doc tags may be empty
            
            tokenized_text = [word_tokenize(sent) for sent in sent_tokenize(doc_text)]
            
            with open(config.LINE_SENTENCE_CORPUS_FILE, "a") as corpus_f:
                for sent in tokenized_text:
                    corpus_f.write(" ".join(sent) + "\n")
            
            doc_rows.append([doc_id, doc_url, doc_title, tokenized_text])
            
        # Construct a dataframe, then write the parquet file output
        df = pandas.DataFrame(doc_rows, columns=["id", "url", "title", "tokenized_text"])
        df.to_parquet(self.output().path)
Example #30
class CensosMetaWrapper(MetaWrapper):

    resolution = Parameter()
    tablename = Parameter()

    params = {
        #'resolution': GEOGRAPHIES,
        'resolution': [GEO_I],  # data only for setores_censitarios
        'tablename': TABLES
    }

    def tables(self):
        yield Geography(resolution=self.resolution)
        yield Censos(resolution=self.resolution, tablename=self.tablename)