class SyncColumn(WrapperTask):
    '''
    Upload tables relevant to updating a particular column by keyword.
    '''
    keywords = Parameter()

    def requires(self):
        session = current_session()
        cols = session.query(OBSColumn).filter(
            OBSColumn.id.ilike('%' + self.keywords + '%'))
        if cols.count():
            for col in cols:
                for coltable in col.tables:
                    yield SyncData(exact_id=coltable.table.id)
        else:
            tables = session.query(OBSTable).filter(
                OBSTable.id.ilike('%' + self.keywords + '%'))
            if tables.count():
                for table in tables:
                    yield SyncData(exact_id=table.id)
            else:
                raise Exception('Unable to find any tables or columns with ID '
                                'that matched "{keywords}" via ILIKE'.format(
                                    keywords=self.keywords))
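A minimal sketch of kicking this wrapper off from Python; the keyword value is illustrative, not from the source:

import luigi

# Hypothetical invocation: sync every table backing a column or table whose ID contains "employment".
luigi.build([SyncColumn(keywords='employment')], local_scheduler=True)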
Example #2
class ImportEnglandWalesLocal(TempTableTask):

    table = Parameter()

    def requires(self):
        return DownloadEnglandWalesLocal()

    def run(self):
        session = current_session()
        infile = os.path.join(self.input().path, self.table + 'DATA.CSV')
        headers = shell('head -n 1 {csv}'.format(csv=infile))
        cols = ['{} NUMERIC'.format(h) for h in headers.split(',')[1:]]
        session.execute('CREATE TABLE {output} (GeographyCode TEXT, {cols})'.format(
            output=self.output().table,
            cols=', '.join(cols)
        ))
        session.commit()
        shell("cat '{infile}' | psql -c 'COPY {output} FROM STDIN WITH CSV HEADER'".format(
            output=self.output().table,
            infile=infile,
        ))
        session.execute('ALTER TABLE {output} ADD PRIMARY KEY (geographycode)'.format(
            output=self.output().table
        ))
Example #3
class TarballTask(Task):
    """
    A task that puts another task's output (assuming it outputs a FileTarget) into a tarball.
    """

    describe = "Package a task's output into an uncompressed tarball."

    upstream_task = TaskParameter(
        description="Task that produces a local file")
    output_path = Parameter(description="Where the output archive should go")

    def requires(self):
        return self.upstream_task

    def run(self):
        input_path = self.input().path
        output_path = self.output().path

        if not os.path.exists(input_path):
            raise FileNotFoundError(
                "{input_path}: no such file or directory: should be a *local* file/dir to be archived"
                .format(input_path=input_path))

        logger.info(
            "Putting {input_path} into a tar located at {output_path}".format(
                input_path=input_path, output_path=output_path))

        with tarfile.open(self.output().path, "w") as tar:
            tar.add(input_path, arcname=os.path.basename(input_path))

        logger.info("{output_path}: tar created: size = {size} bytes".format(
            output_path=output_path, size=os.stat(output_path).st_size))

    def output(self):
        return LocalTarget(self.output_path)
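The core of run() is a plain stdlib tarfile call; a standalone sketch with illustrative paths:

import os
import tarfile

input_path = 'data/report.txt'    # local file or directory to archive
output_path = 'data/report.tar'   # "w" mode writes an uncompressed tarball

with tarfile.open(output_path, 'w') as tar:
    # Store the entry under its basename rather than its full path.
    tar.add(input_path, arcname=os.path.basename(input_path))

print('{}: {} bytes'.format(output_path, os.stat(output_path).st_size))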
class DownloadUK(Task):
    API_URL = 'https://www.nomisweb.co.uk/api/v01/dataset/def.sdmx.json?search={}*'
    DOWNLOAD_URL = 'https://www.nomisweb.co.uk/api/v01/dataset/{id}.bulk.csv?time=2011&measures=20100&geography={geo}'

    table = Parameter()

    def version(self):
        return 1

    def requires(self):
        requirements = {}
        # Query API, extract table ID from name
        meta = requests.get(self.API_URL.format(self.table)).json()
        api_id = (meta['structure']['keyfamilies']['keyfamily'][0]['id']).lower()
        for geo in self.GEO_TYPES:
            requirements[geo] = RepoFile(resource_id='{task_id}_{geo}'.format(task_id=self.task_id, geo=geo),
                                         version=self.version(),
                                         url=self.DOWNLOAD_URL.format(id=api_id, geo=geo))

        return requirements

    def run(self):
        # Download for SA (EW,S) and OA (NI) in a single file
        with self.output().temporary_path() as tmp, open(os.path.join(tmp, '{}.csv'.format(self.table)), 'wb') as outcsv:
            skip_header = False
            for geo in self.GEO_TYPES:
                with open(self.input()[geo].path, 'rb') as remote_file:
                    if skip_header:
                        next(remote_file)
                    else:
                        skip_header = True
                    for l in remote_file:
                        outcsv.write(l)

    def output(self):
        return DirectoryTarget(self)
Example #5
class Sample(object):

    sample_id = Parameter()

    @property
    def sample(self):
        if not hasattr(self, '_sample'):
            self._sample = client.get_record_by_name(
                self.sample_id, cfg['AIRTABLE_SAMPLE_TABLE'])['fields']
        return self._sample

    @property
    def sample_folder(self):
        return '{expt}/{sample}'.format(bucket=cfg['S3_BUCKET'],
                                        expt=self.experiment['Name'],
                                        sample=self.sample_id)

    @property
    def experiment(self):
        if not hasattr(self, '_experiment'):
            expt_key = self.sample['Experiment'][0]
            self._experiment = client.get_record(
                expt_key, cfg['AIRTABLE_EXPT_TABLE'])['fields']
        return self._experiment
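The hasattr-and-underscore pattern above is a hand-rolled lazy cache; on Python 3.8+ the same behaviour can be written with functools.cached_property. A sketch under that assumption (the class name and the injected client are illustrative):

from functools import cached_property

class SampleRecord:
    def __init__(self, sample_id, client, cfg):
        self.sample_id = sample_id
        self.client = client
        self.cfg = cfg

    @cached_property
    def sample(self):
        # Fetched once on first access, then memoized on the instance.
        return self.client.get_record_by_name(
            self.sample_id, self.cfg['AIRTABLE_SAMPLE_TABLE'])['fields']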
Example #6
class StartIndexCollection(WrapperTask):
    collection_name = Parameter(default='dummy')
    data_folder = Parameter(default='/data')
    hadoop_jar = Parameter(
        default="/opt/searcher/scripts/nutchwax-job-0.11.0-SNAPSHOT.jar")
    hadoop_bin = Parameter(default="/opt/searcher/hadoop/bin/hadoop")
    lucene_jar = Parameter(
        default="/opt/searcher/scripts/lib/pwalucene-1.0.0-SNAPSHOT.jar")
    document_server = Parameter(default="p64.arquivo.pt")

    def complete(self):
        return False

    def requires(self):
        return PruneIndexes(collection_name=self.collection_name,
                            data_folder=self.data_folder,
                            hadoop_jar=self.hadoop_jar,
                            lucene_jar=self.lucene_jar,
                            hadoop_bin=self.hadoop_bin,
                            document_server=self.document_server)
Example #7
class StaticJSON(ExternalTask):
    filename = Parameter()

    def output(self):
        return LocalTarget("data/{}.json".format(self.filename))
class AnalyzeModelResults(Task):  # pragma: no cover
    """
    Manages the analysis of the model results:
    1. The term of interest is fed to the trained model
    2. The model returns the top N most similar words

    Similarity is determined by the cosine similarity
    of the words' vector representations.
    """

    # Parameter representing the local data root
    RESULTS_ROOT = Parameter(
        default=os.path.abspath(os.path.join("data", "models")))

    # Parameter representing the model results target file
    model_results_target_file = Parameter(default="model_results.csv")

    # Parameter representing the term for which to retrieve
    # the top N similar words
    search_term = Parameter(default="news")

    def requires(self):
        return self.clone(FeedToModel)

    def output(self):

        # Represents the file path to store the video captions
        video_ids_path = str(self.channel_author) + '_' + str(
            self.query) + '_captions'

        paths_list = [
            str(self.RESULTS_ROOT),
            video_ids_path,
            str(self.model_results_target_file),
        ]

        return SuffixPreservingLocalTarget(reduce(os.path.join, paths_list))

    def run(self):
        # Loads the saved model
        with self.input().open(mode="r") as input_target:
            loaded_model = Word2Vec.load(input_target.name)

        # Determines the words that are most similar to the term of interest
        most_similar_words = loaded_model.similar_by_word(str(
            self.search_term),
                                                          topn=30)

        # Stores results to DataFrame
        words_df = pd.DataFrame(most_similar_words,
                                columns=["word", "probability"])

        # Writes results to target file
        with self.output().open(mode="w") as output_target:
            words_df.to_csv(output_target)

        # Displays results
        self.print_results()

    def print_results(self):
        print(pd.read_csv(self.output().path))
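A standalone sketch of the similarity lookup the docstring describes, assuming gensim 4.x (where similar_by_word lives on the wv attribute; older gensim releases exposed it on the model itself, as run() above relies on):

from gensim.models import Word2Vec

# Toy corpus; the real training data would be the tokenized captions.
sentences = [['breaking', 'news', 'today'], ['evening', 'news', 'update']]
model = Word2Vec(sentences, vector_size=50, min_count=1, epochs=50)

# Top-N most similar words by cosine similarity of their vectors.
print(model.wv.similar_by_word('news', topn=3))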
class VaccineDataGlobalCleanupTask(Task):
    """Luigi Task to clean Vaccine time series data. The input is from
    External Task that specifies files in GIT. The cleaning from below code handles
    removing rows with null date and doses administered values are non-zero.
    The default parameters can be overridden for testing and I have overridden for
    all test cases.

    Parameters:
        subset: bool, True to process one partition, False to process the entire dataset
                    default: True
        data_root: str, base directory to store cleaned output files

    Output:
        Dataframe stored in compressed Parquet format

    """

    # default parameters
    subset = BoolParameter(default=True)
    data_root = Parameter(default="./data/vaccine/")

    # External task completion is required, to work with GIT / CSVTarget
    requires = Requires()
    input_data = Requirement(VaccineDataGlobalTask)

    # TargetOutput returns ParquetTarget
    output = TargetOutput(
        "{task.data_root}",
        ext="subset-{task.subset}/",
        target_class=ParquetTarget,
        flag="_SUCCESS",
        storage_options=None,
    )

    def run(self):
        """
        Cleans vaccine data from the task input and stores the dataframe in Parquet format.

        :return:
            File content is stored in the data directory
        """

        # The columns ["Doses_admin", "People_partially_vaccinated", "People_fully_vaccinated"]
        # are all integers. However, given there are missing values, you must first
        # read them as floats, fill nan's as 0, then convert to int.
        # You can provide a dict of {col: dtype} when providing the dtype arg in places like
        # read_parquet and astype.
        number_columns = [
            "Doses_admin",
            "People_partially_vaccinated",
            "People_fully_vaccinated",
        ]
        # Ensure that the date column is parsed as a pandas datetime using parse_dates
        vdg_dask = self.input()["input_data"].read_dask(
            parse_dates=["Date"], dtype={c: "float"
                                         for c in number_columns})

        if self.subset:
            vdg_dask = vdg_dask.get_partition(0)

        # perform data cleaning
        # Remove any blank countries
        vdg_dask = vdg_dask[~vdg_dask.Country_Region.isnull()]
        # Filter out invalid dates
        vdg_dask = vdg_dask[~vdg_dask.Date.isnull()]

        # You should set the index to Country_Region and ensure the output reads back with meaningful divisions
        # vdg_dask = vdg_dask.set_index("Country_Region")
        vdg_dask[number_columns] = vdg_dask[number_columns].fillna(0).astype(
            int)

        # write_dask parquet file output with gzip compression.
        vdg_output = vdg_dask
        self.output().write_dask(vdg_output, compression="gzip")
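The read-as-float, fill, then cast pattern described in the comments above, as a standalone pandas sketch (the CSV path is illustrative):

import pandas as pd

number_columns = ['Doses_admin', 'People_partially_vaccinated', 'People_fully_vaccinated']

# Integer columns with gaps cannot be read as int directly, so read them as floats.
df = pd.read_csv('vaccine_data_global.csv',
                 parse_dates=['Date'],
                 dtype={c: 'float' for c in number_columns})

# Fill the gaps, then cast back to int via a {col: dtype} mapping.
df[number_columns] = df[number_columns].fillna(0).astype({c: 'int' for c in number_columns})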
Example #10
class BaseParams(metaclass=ABCMeta):
    resolution = Parameter(default=GEO_PR)
    survey = Parameter(default=SURVEY_CEN)
Example #11
class ImportAllSurveys(WrapperTask):
    resolution = Parameter(default=GEO_PR)

    def requires(self):
        for survey in SURVEYS:
            yield ImportData(resolution=self.resolution, survey=survey)
class SoiaMetricsFetcher(sqla.CopyToTable):

    columns = [(["id", Integer()], {
        "autoincrement": True,
        "primary_key": True
    }), (["start", BigInteger()], {}), (["end", BigInteger()], {}),
               (["insert_date", BigInteger()], {}), (["path", Text()], {}),
               (["metric_anomaly", Text()], {}), (["metric_whole",
                                                   Text()], {})]
    connection_string = "sqlite:///data/soia_email.db"
    table = "soia_with_values"

    path = Parameter()
    date = DateParameter()

    def requires(self):
        return SoiaEmailFetcher(date=datetime.now()), MetricFetcher(
            path_prefix=self.path)

    def copy(self, conn, ins_rows, table):
        bound_cols = dict((c, bindparam("_" + c.key)) for c in table.columns
                          if c.key != "id")
        ins = table.insert().values(bound_cols)
        conn.execute(ins, ins_rows)

    def rows(self):
        for start, end, path, metric, whole in deduplicated(
                self.generate_rows()):
            yield "auto", start, end, datetime.now().strftime(
                '%s'), path, metric, whole

    def generate_rows(self):

        now = int(datetime.now().strftime('%s'))
        _14_days_ago = int(
            (datetime.now() - timedelta(days=14)).strftime('%s'))
        _, preloaded_metrics = self.input()

        metrics = json.loads(preloaded_metrics.open('r').read())

        conn = sqlite3.connect('data/soia_email.db')
        c = conn.cursor()
        c.execute("select distinct start, end from soia;")
        rows = c.fetchall()
        conn.close()
        formed_rows = []
        for start, end in rows:
            if start < _14_days_ago or end < _14_days_ago:
                logging.warning(
                    f"date to early :C - {datetime.fromtimestamp(start)}, {datetime.fromtimestamp(end)}"
                )
            else:
                logging.info(
                    f"date good to go! - {datetime.fromtimestamp(start)}, {datetime.fromtimestamp(end)}"
                )
                for metric in metrics:
                    shorter = list(
                        filter(lambda tup: tup[1] >= start and tup[1] <= end,
                               metric['datapoints']))
                    formed_rows.append(
                        (start, end, metric['target'], json.dumps(shorter),
                         json.dumps(metric['datapoints'])))
                    print(len(formed_rows[0]))

        return formed_rows
Example #13
class ImagesForMeasure(Task):
    '''
    Generate a set of static images for a measure
    '''

    MAP_URL = '{cartodb_url}/api/v1/map'.format(
        cartodb_url=os.environ['CARTODB_URL'])

    BASEMAP = {
        "type": "http",
        "options": {
            "urlTemplate":
            "http://{s}.basemaps.cartocdn.com/light_nolabels/{z}/{x}/{y}.png",
            "subdomains": "abcd",
        }
    }

    LABELS = {
        "type": "http",
        "options": {
            "urlTemplate":
            "http://{s}.basemaps.cartocdn.com/light_only_labels/{z}/{x}/{y}.png",
            "subdomains": "abcd",
        }
    }
    CENTER_ZOOM_BOUNDS = {
        'es': [
            (
                (40.4139017, -3.7350414),
                6,
                None,
            ),
            (
                (40.4139017, -3.7350414),
                8,
                None,
            ),
            (
                (40.4139017, -3.7350414),
                11,
                None,
            ),
            (
                (40.4139017, -3.7050414),
                13,
                None,
            ),
        ],
        'mx': [
            (
                (22.979, -101.777),
                4,
                'mx.inegi.entidad',
            ),
            (
                (19.316, -99.152),
                7,
                'mx.inegi.municipio',
            ),
            (
                (19.441989391028706, -99.14474487304688),
                11,
                'mx.inegi.ageb',
            ),
            (
                (19.441989391028706, -99.14474487304688),
                13,
                'mx.inegi.manzana',
            ),
        ],
        'uk': [
            (
                (52.51622086393074, -1.197509765625),
                5,
                None,
            ),  # All England
            (
                (51.50190410761811, -0.120849609375),
                9,
                None,
            ),  # London
            (
                (52.47274306920925, -3.982543945312),
                7,
                None,
            ),  # Wales
            (
                (53.491313790532956, -2.9706787109375),
                9,
                None,
            ),  # Manchester
        ],
        'us': [
            (
                (37.996162679728116, -97.6904296875),
                3,
                'us.census.tiger.state_clipped',
            ),
            (
                (38.16911413556086, -114.884033203125),
                5,
                'us.census.tiger.county_clipped',
            ),
            (
                (37.75225820732333, -122.11584777832031),
                9,
                'us.census.tiger.census_tract_clipped',
            ),
            (
                (37.75225820732333, -122.44584777832031),
                12,
                'us.census.tiger.block_group_clipped',
            ),
        ],
    }

    PALETTES = {
        'tags.people':
        '''
@5:#6c2167;
@4:#a24186;
@3:#ca699d;
@2:#e498b4;
@1:#f3cbd3;''',
        'tags.money':
        '''
@5:#1d4f60;
@4:#2d7974;
@3:#4da284;
@2:#80c799;
@1:#c4e6c3;''',
        'tags.households':
        '''
@5:#63589f;
@4:#9178c4;
@3:#b998dd;
@2:#dbbaed;
@1:#f3e0f7;''',
        'tags.housing':
        '''
@5:#2a5674;
@4:#45829b;
@3:#68abb8;
@2:#96d0d1;
@1:#d1eeea;''',
        'tags.ratio':
        '''
@5:#eb4a40;
@4:#f17854;
@3:#f59e72;
@2:#f9c098;
@1:#fde0c5;''',
        'tags.segmentation':
        '''
@1:#7F3C8D;
@2:#11A579;
@3:#3969AC;
@4:#F2B701;
@5:#E73F74;
@6:#80BA5A;
@7:#E68310;
@8:#008695;
@9:#CF1C90;
@10:#f97b72;
@11:#A5AA99;''',
    }

    measure = Parameter()
    force = BooleanParameter(default=False)

    def __init__(self, *args, **kwargs):
        if kwargs.get('force'):
            target_path = self.output(measure=kwargs['measure']).path
            try:
                os.unlink(target_path)
            except OSError:
                pass
        super(ImagesForMeasure, self).__init__(*args, **kwargs)

    def _generate_config(self, zoom, lon, lat, boundary=None):
        layers = []
        layers.append(self.BASEMAP)
        session = current_session()
        measure = session.query(OBSColumn).get(self.measure)
        mainquery = '''
SELECT numer_aggregate, numer_type,
       numer_colname, numer_geomref_colname,
       numer_tablename,
       geom_geomref_colname,
       geom_colname, geom_tablename,
       denom_colname, denom_tablename, denom_geomref_colname
FROM observatory.obs_meta
WHERE numer_id = '{measure}' {boundary_clause}
ORDER BY geom_weight DESC, numer_timespan DESC, geom_colname DESC;
        '''
        query = mainquery.format(
            measure=self.measure,
            boundary_clause="AND geom_id = '{}'".format(boundary)
            if boundary else '')

        resp = session.execute(query)
        results = resp.fetchone()

        # how should we determine fallback resolution?
        if results is None:
            query = mainquery.format(measure=self.measure, boundary_clause="")
            resp = session.execute(query)
            results = resp.fetchone()

        numer_aggregate, numer_type, numer_colname, numer_geomref_colname, \
                numer_tablename, geom_geomref_colname, geom_colname, \
                geom_tablename, denom_colname, \
                denom_tablename, denom_geomref_colname = results

        if denom_colname:
            cartosql = "SELECT geom.cartodb_id, geom.{geom_colname} as the_geom, " \
                    "geom.the_geom_webmercator, " \
                    "numer.{numer_colname} / NULLIF(denom.{denom_colname}, 0) measure " \
                    "FROM {geom_tablename} as geom, {numer_tablename} as numer, " \
                    "     {denom_tablename} as denom " \
                    "WHERE geom.{geom_geomref_colname} = numer.{numer_geomref_colname} " \
                    "  AND numer.{numer_geomref_colname} = denom.{denom_geomref_colname} "
            statssql = "SELECT  " \
                    'CDB_HeadsTailsBins(array_agg(distinct( ' \
                    '      (numer.{numer_colname} / ' \
                    '      NULLIF(denom.{denom_colname}, 0))::NUMERIC)), 4) as "headtails" ' \
                    "FROM {geom_tablename} as geom, " \
                    "     {numer_tablename} as numer, " \
                    "     {denom_tablename} as denom " \
                    "WHERE geom.{geom_geomref_colname} = numer.{numer_geomref_colname} " \
                    "  AND numer.{numer_geomref_colname} = denom.{denom_geomref_colname} "
        elif numer_aggregate == 'sum':
            cartosql = "SELECT geom.cartodb_id, geom.{geom_colname} as the_geom, " \
                    "geom.the_geom_webmercator, " \
                    "numer.{numer_colname} / " \
                    "  ST_Area(geom.the_geom) * 1000000.0 measure " \
                    "FROM {geom_tablename} as geom, {numer_tablename} as numer " \
                    "WHERE geom.{geom_geomref_colname} = numer.{numer_geomref_colname} "
            statssql = "SELECT CDB_HeadsTailsBins(array_agg(distinct( " \
                    '  (numer.{numer_colname} / ST_Area(geom.the_geom) ' \
                    '      * 1000000.0)::NUMERIC)), 4) as "headtails" ' \
                    "FROM {geom_tablename} as geom, " \
                    "     {numer_tablename} as numer " \
                    "WHERE geom.{geom_geomref_colname} = numer.{numer_geomref_colname} "
        else:
            cartosql = "SELECT geom.cartodb_id, geom.{geom_colname} as the_geom, " \
                    "  geom.the_geom_webmercator, " \
                    "  numer.{numer_colname} measure " \
                    "FROM {geom_tablename} as geom, {numer_tablename} as numer " \
                    "  WHERE geom.{geom_geomref_colname} = numer.{numer_geomref_colname} "
            if numer_type.lower() == 'numeric':
                statssql = "SELECT " \
                        'CDB_HeadsTailsBins(array_agg( ' \
                        '  distinct(numer.{numer_colname}::NUMERIC)), 4) as "headtails" ' \
                        "FROM {geom_tablename} as geom, " \
                        "     {numer_tablename} as numer " \
                        "WHERE geom.{geom_geomref_colname} = numer.{numer_geomref_colname} "
            else:
                statssql = '''
                SELECT array_agg(category) categories FROM (
                SELECT row_number() over () catname, {numer_colname} as category, COUNT(*) cnt
                FROM {numer_tablename}
                GROUP BY {numer_colname} ORDER BY COUNT(*) DESC
                LIMIT 10
                ) foo'''

        cartosql = cartosql.format(geom_colname=geom_colname,
                                   numer_colname=numer_colname,
                                   geom_tablename=geom_tablename,
                                   numer_tablename=numer_tablename,
                                   geom_geomref_colname=geom_geomref_colname,
                                   numer_geomref_colname=numer_geomref_colname,
                                   denom_colname=denom_colname,
                                   denom_tablename=denom_tablename,
                                   denom_geomref_colname=denom_geomref_colname)
        statssql = statssql.format(geom_colname=geom_colname,
                                   numer_colname=numer_colname,
                                   geom_tablename=geom_tablename,
                                   numer_tablename=numer_tablename,
                                   geom_geomref_colname=geom_geomref_colname,
                                   numer_geomref_colname=numer_geomref_colname,
                                   denom_colname=denom_colname,
                                   denom_tablename=denom_tablename,
                                   denom_geomref_colname=denom_geomref_colname)

        resp = query_cartodb(statssql)
        if resp.status_code != 200:
            raise Exception("Unable to obtain statssql: {}".format(resp.text))

        if measure.unit():
            ramp = self.PALETTES.get(measure.unit().id,
                                     self.PALETTES['tags.ratio'])
        else:
            ramp = self.PALETTES['tags.ratio']

        bucket_css = u''
        if numer_type.lower() == 'numeric':
            buckets = resp.json()['rows'][0]['headtails']

            for i, bucket in enumerate(buckets):
                bucket_css = u'''
    [measure <= {bucket}] {{
       polygon-fill: @{i};
    }}
                '''.format(bucket=bucket, i=i + 1) + bucket_css
        else:
            buckets = resp.json()['rows'][0]['categories']
            for i, bucket in enumerate(buckets):
                bucket_css = u'''
    [measure = "{bucket}"] {{
       polygon-fill: @{i};
    }}
                '''.format(bucket=bucket, i=i + 1) + bucket_css

        layers.append({
            'type': 'mapnik',
            'options': {
                'layer_name':
                geom_tablename,
                'cartocss':
                '''/** choropleth visualization */

{ramp}

#data {{
  polygon-opacity: 0.9;
  polygon-gamma: 0.5;
  line-color: #000000;
  line-width: 0.25;
  line-opacity: 0.2;
  line-comp-op: hard-light;
  polygon-fill: @{bucketlen};

  [measure=null]{{
     polygon-fill: #cacdce;
  }}
  {bucket_css}
}}'''.format(ramp=ramp, bucketlen=len(buckets) + 1, bucket_css=bucket_css),
                'cartocss_version':
                "2.1.1",
                'sql':
                cartosql,
                "table_name":
                "\"\"."
            }
        })
        #layers.append(self.LABELS)
        return {
            'layers': layers,
            'center': [lon, lat],
            #'bounds': self.bounds,
            'zoom': zoom
        }

    def get_named_map(self, map_config):

        config = {"version": "1.3.0", "layers": map_config}
        resp = requests.get(self.MAP_URL,
                            headers={
                                'content-type': 'application/json'
                            },
                            params={
                                'config': json.dumps(config)
                            }).json()
        if 'layergroupid' not in resp:
            raise Exception('Named map returned no layergroupid: {}'.format(
                pprint(resp)))
        return resp

    def run(self):
        self.output().makedirs()

        image_urls = []
        country = self.measure.split('.')[0]
        for center, zoom, boundary in self.CENTER_ZOOM_BOUNDS[country]:
            lon, lat = center

            if country == 'uk':
                image_size = (
                    300,
                    700,
                )
            else:
                image_size = (
                    500,
                    500,
                )

            config = self._generate_config(zoom, lon, lat, boundary)

            named_map = self.get_named_map(config['layers'])
            image_urls.append('{cartodb_url}/api/v1/map/static/center/' \
                              '{layergroupid}/{zoom}/{center_lon}/{center_lat}/{x}/{y}.png'.format(
                                  cartodb_url=os.environ['CARTODB_URL'],
                                  layergroupid=named_map['layergroupid'],
                                  zoom=zoom,
                                  center_lon=lon,
                                  center_lat=lat,
                                  x=image_size[0],
                                  y=image_size[1],
                              ))

        url1 = image_urls.pop(0)
        LOGGER.info(url1)
        file1 = StringIO(requests.get(url1, stream=True).content)
        image1 = ImageOps.expand(Image.open(file1), border=10, fill='white')

        for url2 in image_urls:
            LOGGER.info(url2)
            file2 = StringIO(requests.get(url2, stream=True).content)

            image2 = ImageOps.expand(Image.open(file2),
                                     border=10,
                                     fill='white')

            (width1, height1) = image1.size
            (width2, height2) = image2.size

            result_width = width1 + width2
            result_height = max(height1, height2)

            result = Image.new('RGB', (result_width, result_height))
            result.paste(im=image1, box=(0, 0))
            result.paste(im=image2, box=(width1, 0))

            image1 = result
        image1.save(self.output().path)

    def complete(self):
        '''
        If we support this country, defer to the normal completeness check;
        otherwise log a warning and report the task as complete.
        '''
        country = self.measure.split('.')[0]
        if country in self.CENTER_ZOOM_BOUNDS:
            return super(ImagesForMeasure, self).complete()
        else:
            LOGGER.warn('No info to create images for %s', self.measure)
            return True

    def output(self, measure=None):
        if measure is None:
            measure = self.measure
        return LocalTarget(os.path.join('catalog/img', measure + '.png'))
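A sketch of triggering the task for a single measure; the measure ID is a hypothetical example of the '<country>....' form the task expects, and CARTODB_URL must be set in the environment:

import luigi

# force=True unlinks any previously generated image before regenerating it.
luigi.build([ImagesForMeasure(measure='us.census.acs.B01001001', force=True)],
            local_scheduler=True)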
Example #14
class Hisat(luigi.Task):
    """Mapping the QCed sequences to reference."""

    fastqs = ListParameter()
    indexfile = Parameter()
    outsam = Parameter()
    map_dir = Parameter()
    workdir = Parameter()
    num_cpus = Parameter()
    sample = Parameter()
    min_introlen = luigi.IntParameter()
    max_introlen = luigi.IntParameter()
    rna_strandness = luigi.Parameter()
    kingdom = luigi.Parameter()

    def output(self):
        """SAM file output of the mapping."""
        bam_file = self.outsam.split(".sam")[0] + ".bam"
        return luigi.LocalTarget(bam_file)

    def run(self):
        """Run hisat2."""
        if self.kingdom == "prokarya":
            hisat2_nosplice_option = [
                "-p", self.num_cpus, "-x", self.indexfile, "-1",
                self.fastqs[0], "-2", self.fastqs[1], "-S", self.outsam,
                "--min-intronlen", self.min_introlen, "--max-intronlen",
                self.max_introlen, "--rna-strandness", self.rna_strandness,
                "--no-spliced-alignment", "--no-unal", "--un-conc",
                os.path.join(self.map_dir, "unaligned.fastq"), "2>",
                os.path.join(self.map_dir, "mapping.log")
            ]
            hisat2_cmd = hisat2[hisat2_nosplice_option]
            hisat2_cmd()
            self.sam2bam()
            self.sort_bam()
        else:
            h2_splice_option = [
                "-p", self.num_cpus, "-x", self.indexfile, "-1",
                self.fastqs[0], "-2", self.fastqs[1], "-S", self.outsam,
                "--min-intronlen", self.min_introlen, "--max-intronlen",
                self.max_introlen, "--rna-strandness", self.rna_strandness,
                "--no-unal", "--un-conc",
                os.path.join(self.map_dir, "unaligned.fastq"), "2>",
                os.path.join(self.map_dir, "mapping.log")
            ]
            hisat2_cmd = hisat2[h2_splice_option]
            hisat2_cmd()
            self.sam2bam()
            self.sort_bam()

    def sam2bam(self):
        """Convert SAM to BAM file."""
        bam_file = self.outsam.split(".sam")[0] + ".bam"
        options = ["view", "-bS", "-F", "4", self.outsam, "-o", bam_file]
        samtools_cmd = samtools[options]
        samtools_cmd()

    def sort_bam(self):
        """Sort BAM file."""
        bam_file = self.outsam.split(".sam")[0] + ".bam"
        sorted_bam_file = bam_file.split(".bam")[0] + "_srt.bam"
        options = ["sort", bam_file, "-o", sorted_bam_file]
        samtools_cmd = samtools[options]
        samtools_cmd()
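The hisat2[...] and samtools[...] indexing style suggests plumbum local commands; a minimal sketch of that idiom under that assumption (the binary must be on PATH, and the file names are illustrative):

from plumbum import local

samtools = local['samtools']                                          # resolve the executable on PATH
cmd = samtools['view', '-bS', '-F', '4', 'aln.sam', '-o', 'aln.bam']  # bind arguments
print(cmd)                                                            # show the full command line
cmd()                                                                 # run it; raises on a non-zero exit code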
class MockTask(MixinNaiveBulkComplete, Task):
    param_a = Parameter()
    param_b = Parameter(default="Not Mandatory")

    def complete(self):
        return self.param_a in COMPLETE_TASKS
Example #16
class InputModel(ExternalTask):
    MODEL_ROOT = os.path.abspath('data')
    model = Parameter(default="rnn.pth")  # Filename of the model

    def output(self):
        return SuffixPreservingLocalTarget(os.path.join(self.MODEL_ROOT, 'models', self.model), format=format.Nop)
Example #17
class InputData(ExternalTask):
    IMAGE_ROOT = os.path.abspath('data')
    data = Parameter(default="names.txt")  # Filename of the model

    def output(self):
        return SuffixPreservingLocalTarget(os.path.join(self.IMAGE_ROOT, 'input', self.data), format=format.Nop)
Example #18
class GenerateStaticImage(Task):

    BASEMAP = {
        "type": "http",
        "options": {
            #"urlTemplate": "https://{s}.maps.nlp.nokia.com/maptile/2.1/maptile/newest/satellite.day/{z}/{x}/{y}/256/jpg?lg=eng&token=A7tBPacePg9Mj_zghvKt9Q&app_id=KuYppsdXZznpffJsKT24",
            #"subdomains": "1234",
            # Dark Matter
            "urlTemplate":
            "http://{s}.basemaps.cartocdn.com/dark_nolabels/{z}/{x}/{y}.png",
            "subdomains": "abcd",
            #"urlTemplate": "http://{s}.basemaps.cartocdn.com/dark_nolabels/{z}/{x}/{y}.png",
            #"subdomains": ["a", "b", "c"]
        }
    }

    LABELS = {
        "type": "http",
        "options": {
            "urlTemplate":
            "http://{s}.basemaps.cartocdn.com/dark_only_labels/{z}/{x}/{y}.png",
            "subdomains": "abcd",
        }
    }

    #57d9408e-0351-11e6-9c12-0e787de82d45

    viz = Parameter()
    VIZ_URL = '{cartodb_url}/api/v2/viz/{{viz}}/viz.json'.format(
        cartodb_url=os.environ['CARTODB_URL'])
    MAP_URL = '{cartodb_url}/api/v1/map'.format(
        cartodb_url=os.environ['CARTODB_URL'])

    def viz_to_config(self):
        resp = requests.get(self.VIZ_URL.format(viz=self.viz))

        assert resp.status_code == 200
        data = resp.json()
        layers = []
        layers.append(self.BASEMAP)
        for data_layer in data['layers']:
            if data_layer['type'] == 'layergroup':
                for layer in data_layer['options']['layer_definition'][
                        'layers']:
                    if layer['visible'] is True:
                        layers.append({
                            'type': 'mapnik',
                            'options': layer['options']
                        })
        layers.append(self.LABELS)
        return {
            'layers': layers,
            'center': json.loads(data['center']),
            'bounds': data['bounds'],
            'zoom': data['zoom']
        }

    def get_named_map(self, map_config):

        config = {"version": "1.3.0", "layers": map_config}
        resp = requests.get(self.MAP_URL,
                            headers={
                                'content-type': 'application/json'
                            },
                            params={
                                'config': json.dumps(config)
                            }).json()
        if 'layergroupid' not in resp:
            raise Exception('Named map returned no layergroupid: {}'.format(
                pprint(resp)))
        return resp

    def run(self):
        self.output().makedirs()
        config = self.viz_to_config()
        named_map = self.get_named_map(config['layers'])
        img_url = '{cartodb_url}/api/v1/map/static/center/' \
                '{layergroupid}/{zoom}/{center_lon}/{center_lat}/800/500.png'.format(
                    cartodb_url=os.environ['CARTODB_URL'],
                    layergroupid=named_map['layergroupid'],
                    zoom=config['zoom'],
                    center_lon=config['center'][0],
                    center_lat=config['center'][1]
                )
        LOGGER.info(img_url)
        shell('curl "{img_url}" > {output}'.format(img_url=img_url,
                                                   output=self.output().path))

    def output(self):
        return LocalTarget(
            os.path.join('catalog/source/img', self.task_id + '.png'))
Example #19
class ImportAllResolutions(WrapperTask):
    survey = Parameter(default=SURVEY_CEN)

    def requires(self):
        for resolution in GEOGRAPHIES:
            yield ImportData(resolution=resolution, survey=self.survey)
Example #20
class PerformSpectralClustering(Task):
    """Coalesce the Wikipedia article parquet files as a Dask Dataframe, and perform spectral clustering using
       the magic of Dask-ML."""

    num_clusters = IntParameter()
    word_vectors = Parameter()

    def requires(self):
        return [GenerateDocumentEmbeddings(model=self.word_vectors)]

    def output(self):
        return LocalTarget(
            config.CLUSTERING_RESULTS_DIR /
            f'cluster_{self.num_clusters}_{self.word_vectors}.txt')

    def run(self):
        if self.word_vectors not in {"fasttext", "word2vec"}:
            raise ValueError(
                f'Expected fasttext or word2vec; got {self.word_vectors}')

        print(
            f'Initializing dask dataframe of word embeddings at {datetime.now()}'
        )
        ddf = dask.dataframe.read_csv(config.ARTICLE_EMBEDDINGS_DIR /
                                      f'{self.word_vectors}_to_csv' / "*.part")

        print(
            f'Dropping columns and converting to design matrix (dask array) at {datetime.now()}'
        )
        X = ddf.drop(['Unnamed: 0', "id", "url", "title"], axis=1)
        X = X.to_dask_array(lengths=True)

        # Perform k-means clustering
        print(f'Starting K-Means clustering at {datetime.now()}')
        k_means_clustering_model = KMeans(n_clusters=self.num_clusters,
                                          n_jobs=-1,
                                          max_iter=config.K_MEANS_MAX_ITER)
        k_means_cluster_labels = k_means_clustering_model.fit(X)

        # Write k-means results to disk
        print(
            f'Joining K-means results and writing to disk at {datetime.now()}')
        k_means_results_ddf = ddf.join(k_means_cluster_labels)
        k_means_ddf_output_path = config.CLUSTERING_RESULTS_DIR / f'{self.word_vectors}_w_k_means'
        k_means_ddf_output_path.mkdir(parents=True, exist_ok=True)
        dask.dataframe.to_csv(k_means_results_ddf, k_means_ddf_output_path)

        # Perform spectral clustering
        print(f'Starting Spectral clustering at {datetime.now()}')
        spectral_clustering_model = SpectralClustering(
            n_clusters=self.num_clusters,
            n_jobs=-1,
            persist_embedding=True,
            kmeans_params={"max_iter": config.K_MEANS_MAX_ITER})
        spectral_cluster_labels = spectral_clustering_model.fit(X)

        # Write spectral results to disk
        print(
            f'Joining Spectral results and writing to disk at {datetime.now()}'
        )
        spectral_results_ddf = ddf.join(spectral_cluster_labels)
        spectral_ddf_output_path = config.CLUSTERING_RESULTS_DIR / f'{self.word_vectors}_w_spectral'
        spectral_ddf_output_path.mkdir(parents=True, exist_ok=True)
        dask.dataframe.to_csv(spectral_results_ddf, spectral_ddf_output_path)

        # And save the success flag
        with self.output().open("w") as f:
            # f.write(f'Clustering {self.word_vectors} k={self.num_clusters}: {silhouette_score_result}' + "\n")
            # f.write(spectral_clustering_model.get_params(deep=True))
            f.write(f'{self.word_vectors}: Success!')
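A sketch of running the task; the cluster count is arbitrary, and word_vectors must be 'fasttext' or 'word2vec' per the check in run():

import luigi

luigi.build([PerformSpectralClustering(num_clusters=20, word_vectors='fasttext')],
            local_scheduler=True)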
Example #21
class Survey(BaseParams, TableTask):

    topic = Parameter(default='t001')

    def version(self):
        return 6

    def requires(self):
        '''
        Subclasses must override this.
        '''
        raise NotImplementedError('Survey must define requires()')

    def timespan(self):
        '''
        Subclasses must override this.
        '''
        raise NotImplementedError('Survey must define timespan()')

    def columns(self):
        cols = OrderedDict()
        input_ = self.input()
        cols['geo_code'] = input_['geometa']['geom_id']
        for colname, coltarget in input_['meta'].items():
            if coltarget._id.split('.')[-1].lower().startswith(self.topic.lower()):
                cols[colname] = coltarget
        return cols

    def populate(self):
        if self.survey == SURVEY_NHS:
            if self.resolution == GEO_DA:
                self.populate_da_from_cd()
            elif self.resolution == GEO_FSA:
                self.populate_fsa_from_csd()
            else:
                self.populate_general()
        else:
            self.populate_general()

    def populate_da_from_cd(self):
        session = current_session()
        columns = self.columns()
        colnames = list(columns.keys())
        out_colnames = [oc for oc in colnames if oc is not None]
        in_colnames = ['da.geom_id']
        for colname in out_colnames:
            if colname != 'geo_code':
                # We reduce the number of decimals to shrink the row size and avoid hitting
                # the 8Kb limit. More info: https://github.com/CartoDB/bigmetadata/issues/527
                in_colnames.append('round(cast(float8 ({colname} * (ST_Area(da.the_geom)/ST_Area(cd.the_geom))) as numeric), 2) {colname}'.format(colname=colname))

        insert_query = '''
                INSERT INTO {output} ({out_colnames})
                SELECT {in_colnames} FROM {da_geom} da
                INNER JOIN {cd_geom} cd ON (cd.geom_id = left(da.geom_id,4))
                INNER JOIN {cd_data} data ON (cd.geom_id = data.geo_code)
                '''.format(output=self.output().table,
                           da_geom=self.input()['geo'].table,
                           cd_geom=self.input()['geo_source'].table,
                           cd_data=self.input()['data_source'].table,
                           in_colnames=', '.join(in_colnames),
                           out_colnames=', '.join(out_colnames))

        LOGGER.debug(insert_query)
        session.execute(insert_query)

    def populate_fsa_from_csd(self):
        session = current_session()
        columns = self.columns()
        colnames = list(columns.keys())
        out_colnames = [oc for oc in colnames if oc is not None]
        in_colnames = [x for x in out_colnames if x != 'geo_code']

        insert_query = '''
                INSERT INTO {output} ({out_colnames})
                SELECT fsa_geom_id, {in_colnames_group}
                  FROM {csd_data} data_csd,
                       {interpolation_table} interp
                 WHERE data_csd.geo_code = interp.csd_geom_id
                 GROUP BY fsa_geom_id
                '''.format(
                    output=self.output().table,
                    csd_data=self.input()['data_source'].table,
                    interpolation_table=self.input()['geo_interpolation'].table,
                    in_colnames_group=', '.join(['round(sum({x} * area_ratio)::numeric, 2) as {x}'.format(x=x) for x in in_colnames]),
                    out_colnames=', '.join(out_colnames)
                )

        LOGGER.debug(insert_query)
        session.execute(insert_query)

    def populate_general(self):
        session = current_session()
        columns = self.columns()
        out_colnames = list(columns.keys())
        in_table = self.input()['data']
        in_colnames = [ct._id.split('.')[-1] for ct in list(columns.values())]
        in_colnames[0] = 'geo_code'
        for i, in_c in enumerate(in_colnames):
            cmd =   "SELECT 'exists' FROM information_schema.columns " \
                    "WHERE table_schema = '{schema}' " \
                    "  AND table_name = '{tablename}' " \
                    "  AND column_name = '{colname}' " \
                    "  LIMIT 1".format(
                        schema=in_table.schema,
                        tablename=in_table.tablename.lower(),
                        colname=in_c.lower())
            # remove columns that aren't in input table
            if session.execute(cmd).fetchone() is None:
                in_colnames[i] = None
                out_colnames[i] = None
        in_colnames = [
            "CASE {ic}::TEXT WHEN '-6' THEN NULL ELSE {ic} END".format(ic=ic) for ic in in_colnames if ic is not None]
        out_colnames = [oc for oc in out_colnames if oc is not None]

        cmd = 'INSERT INTO {output} ({out_colnames}) ' \
              'SELECT {in_colnames} FROM {input} '.format(
                    output=self.output().table,
                    input=in_table.table,
                    in_colnames=', '.join(in_colnames),
                    out_colnames=', '.join(out_colnames))
        session.execute(cmd)
Example #22
class TigerBlocksInterpolation(Task):
    '''
    Task that creates a table with the block and block group geoids and the
    percentage that each block represents within its block group
    '''
    year = Parameter()

    def requires(self):
        return {
            'shoreline_block': ShorelineClip(year=self.year, geography='block'),
            'shoreline_blockgroup': ShorelineClip(year=self.year, geography='block_group'),
        }

    def run(self):
        session = current_session()
        with session.no_autoflush:
            tiger_tables = {}
            tiger_tables_query = '''SELECT id,tablename
                                    FROM observatory.obs_table
                                    WHERE id ilike 'us.census.tiger.shoreline_clip_block%'
                                 '''

            tiger_tables_result = session.execute(tiger_tables_query)
            if tiger_tables_result:
                for tiger_table in tiger_tables_result.fetchall():
                    if re.search('block_group_{}'.format(self.year), tiger_table['id']):
                        tiger_tables['block_group'] = tiger_table['tablename']
                    elif re.search('block_{}'.format(self.year), tiger_table['id']):
                        tiger_tables['block'] = tiger_table['tablename']

                # Create the table with block/blockgroups and percentage field empty
                start_time = time.time()
                LOGGER.info("Start creating the interpolation table...")
                query = '''
                        CREATE TABLE {table_output} AS
                        SELECT geoid blockid, left(geoid,12) blockgroupid, 0::float percentage, the_geom block_geom
                        FROM "{schema_input}".{block_table} b
                        '''.format(schema_input='observatory',
                                   block_table=tiger_tables['block'],
                                   table_output=self.output().table)
                session.execute(query)
                end_time = time.time()
                LOGGER.info("Time creating the table {}".format((end_time - start_time)))
                # Creating indexes
                LOGGER.info("Start creating the indexes for the interpolation table...")
                start_time = time.time()
                indexes_query = '''
                    CREATE INDEX blocks_idx ON {table_output} (blockid);
                    CREATE INDEX block_groups_idx ON {table_output} (blockgroupid);
                '''.format(table_output=self.output().table)
                session.execute(indexes_query)
                end_time = time.time()
                LOGGER.info("Indexes created in {}".format((end_time - start_time)))
                # Set the interpolation percentages in the table
                LOGGER.info("Start updating the table...")
                start_time = time.time()
                update_percentage_query = '''
                        UPDATE {table_output} b
                        SET percentage = (
                            SELECT (ST_Area(b.block_geom)/ST_Area(bg.the_geom))::float*100.00
                            FROM "{schema_input}".{bg_table} bg
                            WHERE b.blockgroupid = bg.geoid
                        )
                        '''.format(schema_input='observatory',
                                   bg_table=tiger_tables['block_group'],
                                   table_output=self.output().table)
                session.execute(update_percentage_query)
                session.commit()
                end_time = time.time()
                LOGGER.info("Time creating the table {}".format((end_time - start_time)))
            else:
                LOGGER.error('Cannot retrieve tiger tables for block and block group')

    def output(self):
        schema = 'tiger{year}'.format(year=self.year)
        return PostgresTarget(schema, 'blocks_interpolation')
class ETLAnalysis(Task):
    """Created an abstract class for conducting analysis of vaccine data
    at different levels - by country, by year, by month and by week.  This is a luigi
    task and sub-classed by the different levels of covid data analysis tasks.  The analysis
    abstract class requires Cleanup and the parquet files for performing
    the analysis and display.

    This abstract class has one analysis method to override / implement in their
    respective tasks.

    Each analysis should be a separate Luigi task, which computes its analysis and writes
    the result to parquet. To display to the terminal or answer a quiz, the output should
    be read back from the written parquet file.

    Parameters:
        subset: bool, True to process just one partition, False to process
            the entire dataset, default: True
        analysis_path: str, base directory to store output files

    Output:
        Dataframe stored in compressed Parquet format in
            {task.analysis_path}/{task.sub_dir}/subset-{task.subset}/
    """

    subset = BoolParameter(default=True)
    analysis_path = Parameter(default="./data/vaccine/")

    requires = Requires()
    input_data = Requirement(VaccineDataGlobalCleanupTask)

    # the output references a "sub_dir" parameter, which is expected to be defined
    # in a subclass
    output = TargetOutput(
        "{task.analysis_path}{task.sub_dir}",
        ext="subset-{task.subset}/",
        target_class=ParquetTarget,
        flag="_SUCCESS",
    )

    def perform_analysis(self, df):
        """ this method will be implemented by sub-classes. """
        raise NotImplementedError

    def run(self):
        """
        Reads the columns needed for the analysis (Country_Region, Date and the
        dose counts) and calls the subclass's perform_analysis method to do the calculations.
        """
        analysis_dataframe = self.input()["input_data"].read_dask(columns=[
            "Country_Region",
            "Date",
            "Doses_admin",
            "People_partially_vaccinated",
            "People_fully_vaccinated",
            "Report_Date_String",
            "UID",
        ])

        # invoke perform_analysis from the implemented sub-classes
        # only keeps the aggregated analysis columns produced by the subclass
        output_dataframe = self.perform_analysis(analysis_dataframe)
        # write_dask parquet file output with gzip compression.
        self.output().write_dask(output_dataframe,
                                 write_index=True,
                                 compression="gzip")
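A minimal sketch of what a concrete subclass might look like; the class name, sub_dir value and aggregation are illustrative, not from the source:

class AnalysisByCountry(ETLAnalysis):
    """Hypothetical analysis: latest cumulative dose totals per country."""

    # Consumed by the TargetOutput template "{task.analysis_path}{task.sub_dir}".
    sub_dir = "by_country"

    def perform_analysis(self, df):
        # Cumulative counts only grow, so the max per country is the latest total.
        return df.groupby("Country_Region")[["Doses_admin", "People_fully_vaccinated"]].max()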
Example #24
class SimplifiedUnionTigerWaterGeoms(SimplifiedTempTableTask):
    year = IntParameter()
    geography = Parameter()

    def requires(self):
        return UnionTigerWaterGeoms(year=self.year, geography=self.geography)
Example #25
class SoiaEmailFetcher(sqla.CopyToTable):
    email_address = Parameter()
    password = Parameter()
    date = DateParameter()

    columns = [(["id", Integer()], {
        "autoincrement": True,
        "primary_key": True
    }), (["start", BigInteger()], {}), (["end", BigInteger()], {}),
               (["insert_date", BigInteger()], {})]
    connection_string = "sqlite:///data/soia_email.db"
    table = "soia"
    regexes = [(r"<b>Duration:</b>.*<br>", ["<b>Duration:</b>", "<br>"]),
               (r"(\d{2,4}.){2,4}.*<o", ["<o"])]

    def rows(self):
        for start, end in deduplicated(self.generate_rows()):
            yield "auto", start, end, datetime.now().strftime('%s')

    def copy(self, conn, ins_rows, table):
        bound_cols = dict((c, bindparam("_" + c.key)) for c in table.columns
                          if c.key != "id")
        ins = table.insert().values(bound_cols)
        conn.execute(ins, ins_rows)

    def generate_rows(self):
        imap_client = create_imap_client(self.email_address, self.password)
        try:
            code, data = imap_client.search(None, "ALL")

            soia_timestamps = []
            # iterate over emails
            for number in data[0].split(b" "):
                code, data = imap_client.fetch(number, '(RFC822)')
                message = email.message_from_string(data[0][1].decode())

                # get actual email content
                date = dateparser.parse(message["Date"])
                content = message.get_payload()

                # handle base64 content
                try:
                    unbased_content = unbase64_content(content)
                except ValueError:
                    continue

                # iterate over regexes trying to match date
                for regex, replaces in self.regexes:
                    match = re.search(regex, unbased_content)
                    if match is not None:
                        dates = remove_occurances(match.group(), replaces)

                        start, end = dates.rsplit("-", maxsplit=1)
                        if " " not in end.strip():
                            end = f"{date.year}-{date.month}-{date.day} {end}"
                        if " " not in start.strip():
                            start = f"{date.year}-{date.month}-{date.day} {start}"

                        parsed_start = dateparser.parse(start)
                        parsed_end = dateparser.parse(end)
                        if parsed_end is None or parsed_start is None:
                            logger.warning("coudn't parse the following: %s",
                                           match)
                            continue
                        row = (parsed_start.strftime('%s'),
                               parsed_end.strftime('%s'))
                        logger.debug("Adding the following row: %s", row)
                        soia_timestamps.append(row)
                        break
        except Exception as err:
            logger.error("Something went terribly wrong! %s", err)

        finally:
            imap_client.close()
            imap_client.logout()
        return soia_timestamps
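A standalone sketch of the regex step in generate_rows(), applied to a toy email body; the HTML snippet is illustrative, and remove_occurances from the source is approximated here with str.replace:

import re

body = "Hi<br><b>Duration:</b>2023-01-05 10:00 - 12:30<br>Regards"
regex, replaces = r"<b>Duration:</b>.*<br>", ["<b>Duration:</b>", "<br>"]

match = re.search(regex, body)
if match is not None:
    dates = match.group()
    for token in replaces:
        dates = dates.replace(token, "")
    start, end = dates.rsplit("-", maxsplit=1)
    print(start.strip(), "|", end.strip())   # -> 2023-01-05 10:00 | 12:30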
Example #26
class SumLevel(TableTask):

    geography = Parameter()
    year = IntParameter()

    @property
    def geoid(self):
        return SUMLEVELS[self.geography]['fields']['geoid']

    @property
    def aland(self):
        return SUMLEVELS[self.geography]['fields']['aland']

    @property
    def awater(self):
        return SUMLEVELS[self.geography]['fields']['awater']

    @property
    def input_tablename(self):
        return SUMLEVELS[self.geography]['table'] + SIMPLIFIED_SUFFIX

    def version(self):
        return 15

    def requires(self):
        if self.geography == BLOCK:
            tiger = SimplifyGeoByState(geography=self.geography, year=self.year)
        else:
            tiger = SimplifiedDownloadTiger(geography=self.geography, year=self.year)
        return {
            'attributes': Attributes(),
            'geoids': GeoidColumns(year=self.year),
            'geoms': GeomColumns(year=self.year),
            'data': tiger,
        }

    def columns(self):
        input_ = self.input()
        return OrderedDict([
            ('geoid', input_['geoids'][self.geography + '_{}'.format(self.year) + GEOID_SUMLEVEL_COLUMN]),
            ('the_geom', input_['geoms'][self.geography + '_{}'.format(self.year)]),
            ('aland', input_['attributes']['aland']),
            ('awater', input_['attributes']['awater']),
        ])

    def table_timespan(self):
        return get_timespan(str(self.year))

    # TODO: https://github.com/CartoDB/bigmetadata/issues/435
    def targets(self):
        return {
            OBSTable(id='.'.join([self.schema(), self.name()])): GEOM_REF,
        }

    def populate(self):
        session = current_session()
        from_clause = '{inputschema}.{input_tablename}'.format(
            inputschema='tiger' + str(self.year),
            input_tablename=self.input_tablename,
        )
        in_colnames = [self.geoid, 'geom', self.aland, self.awater]

        out_colnames = list(self.columns().keys())
        session.execute('INSERT INTO {output} ({out_colnames}) '
                        'SELECT {in_colnames} '
                        'FROM {from_clause} '.format(
                            output=self.output().table,
                            in_colnames=', '.join(in_colnames),
                            out_colnames=', '.join(out_colnames),
                            from_clause=from_clause
                        ))
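
# For illustration only: roughly the statement populate() above renders, with
# hypothetical values standing in for what SUMLEVELS and self.output() would
# actually supply (the table names below are made up).
in_colnames = ['geoid', 'geom', 'aland', 'awater']
out_colnames = ['geoid', 'the_geom', 'aland', 'awater']
stmt = ('INSERT INTO {output} ({out_colnames}) '
        'SELECT {in_colnames} '
        'FROM {from_clause} '.format(
            output='observatory.obs_county_2015_example',
            out_colnames=', '.join(out_colnames),
            in_colnames=', '.join(in_colnames),
            from_clause='tiger2015.county_example_simplified'))
print(stmt)
# INSERT INTO observatory.obs_county_2015_example (geoid, the_geom, aland, awater)
# SELECT geoid, geom, aland, awater FROM tiger2015.county_example_simplified
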
class Measurements2CSV(Task):
    geography = Parameter()
    file_name = Parameter()

    def __init__(self, *args, **kwargs):
        super(Measurements2CSV, self).__init__(*args, **kwargs)

    def requires(self):
        requirements = {}
        if self.geography == 'GEO_PA':
            requirements['data'] = CensusPostcodeAreas()
        elif self.geography == 'GEO_PD':
            requirements['data'] = CensusPostcodeDistricts()
        elif self.geography == 'GEO_PS':
            requirements['data'] = CensusPostcodeSectors()

        return requirements

    def _get_config_data(self):
        dir_path = os.path.dirname(os.path.realpath(__file__))
        with (open('{}/{}'.format(dir_path, 'measurements.json'))) as f:
            return json.load(f)

    def run(self):
        session = current_session()
        measurements = self._get_config_data()
        for measure in measurements:
            measure['geom_id'] = GEOGRAPHY_LEVELS[self.geography]
        json_metadata = json.dumps(measurements)

        result = session.execute(self._get_meta_query(json_metadata))
        if result:
            join_data = {}
            join_data['numer'] = {}
            if self.geography == 'GEO_PA':
                colnames = ['geom.pa_id as geoid']
            else:
                colnames = ['geom.geographycode as geoid']

            for data in result.fetchall():
                join_data['numer'][data['numer_table']] = {
                    'table': 'observatory.{}'.format(data['numer_table']),
                    'join_column': data['numer_join_col']
                }
                # All measures come from the same geometry table, so for now we use just one geometry
                # TODO Make it possible to have multiple geometry tables
                join_data['geom'] = {
                    'table': 'observatory.{}'.format(data['geom_table']),
                    'join_column': data['geom_join_col']
                }
                colnames.append(data['numer_col'])

            measurement_result = session.execute(
                self._get_measurements_query(join_data, colnames))
            if measurement_result:
                measurements = measurement_result.fetchall()
                self._generate_csv_file(colnames, measurements)
            else:
                LOGGER.error('No results for the queried measurements')

        else:
            LOGGER.error(
                'No results for the defined measurements in the JSON file')

    def _get_meta_query(self, metadata):
        return '''SELECT meta->>'numer_tablename' numer_table, meta->>'numer_geomref_colname' numer_join_col,
                         meta->>'numer_colname' numer_col, meta->>'geom_tablename' geom_table,
                         meta->>'geom_geomref_colname' geom_join_col, meta->>'geom_colname' geom_col
                  FROM json_array_elements(cdb_observatory.OBS_GetMeta(
                       ST_MakeEnvelope(-179, 89, 179, -89, 4326), -- World bbox
                       '{}'::json, 1, 1, 1)) meta
            '''.format(metadata)

    def _get_measurements_query(self, join_data, colnames):
        joins = []
        for join_table in join_data['numer'].values():
            joins.append(
                'LEFT JOIN {table} ON (geom.{geomcol} = {table}.{numercol})'.
                format(table=join_table['table'],
                       geomcol=join_data['geom']['join_column'],
                       numercol=join_table['join_column']))
        return '''SELECT {cols}
                  FROM {geom} geom {joins}
               '''.format(cols=' ,'.join(colnames),
                          geom=join_data['geom']['table'],
                          joins=' '.join(joins))

    def _generate_csv_file(self, headers, measurements):
        try:
            self.output().makedirs()
            with (open(self.output().path, 'w+')) as csvfile:
                headers[0] = 'geoid'
                writer = csv.DictWriter(csvfile, fieldnames=headers)
                writer.writeheader()
                for measurement in measurements:
                    writer.writerow(dict(measurement))
        except BaseException:
            # Clean up any partial output, then let the error propagate.
            self.output().remove()
            raise

    def output(self):
        csv_filename = 'tmp/geographica/uk/{}'.format(self.file_name)
        return LocalTarget(path=csv_filename, format='csv')
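
# Standalone illustration of the query _get_measurements_query() assembles; the
# observatory table and column names here are invented for the example.
join_data = {
    'geom': {'table': 'observatory.geom_example',
             'join_column': 'geographycode'},
    'numer': {
        'numer_example': {'table': 'observatory.numer_example',
                          'join_column': 'geographycode'},
    },
}
colnames = ['geom.geographycode as geoid', 'total_population']
joins = [
    'LEFT JOIN {table} ON (geom.{geomcol} = {table}.{numercol})'.format(
        table=numer['table'],
        geomcol=join_data['geom']['join_column'],
        numercol=numer['join_column'])
    for numer in join_data['numer'].values()
]
print('SELECT {cols} FROM {geom} geom {joins}'.format(
    cols=', '.join(colnames),
    geom=join_data['geom']['table'],
    joins=' '.join(joins)))
# Prints (wrapped here for readability):
# SELECT geom.geographycode as geoid, total_population
#   FROM observatory.geom_example geom
#   LEFT JOIN observatory.numer_example
#     ON (geom.geographycode = observatory.numer_example.geographycode)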
Exemple #28
class StringTieScoresW(luigi.WrapperTask):
    """From Mapping to Counting step for Eukaryotic reference."""

    gff_file = Parameter()
    kingdom = Parameter()

    def requires(self):
        """A wrapper for running Stringtie scores on all samples."""
        splice_list = [
            self.workdir + "/" + f for f in os.listdir(self.workdir)
            if f.endswith('.splice')
        ]
        if len(splice_list) > 1:
            splice_file = ','.join(splice_list)
        elif len(splice_list) == 1:
            splice_file = splice_list[0]
        else:
            # Avoid a NameError below when no .splice files are present.
            splice_file = ''
        for samp, fastq in self.fastq_dic.iteritems():
            map_dir = self.workdir + "/" + samp + "/mapping_results"
            trim_dir = self.workdir + "/" + samp + "/trimming_results"
            if os.path.isdir(map_dir) is False:
                os.makedirs(map_dir)
            if self.kingdom in ['prokarya', 'eukarya']:
                if ',' in self.gff_file:
                    gff_list = [
                        os.path.abspath(gff)
                        for gff in self.gff_file.split(",")
                    ]
                    for gff in gff_list:
                        gtf = self.workdir + "/" + gff.split("/")[-1].split(
                            ".gff")[0] + ".gtf"
                        gff_name = gtf.split(".gtf")[0].split("/")[-1]
                        yield StringTieScores(
                            fastq1=trim_dir + "/" + samp + ".1.trimmed.fastq",
                            fastq2=trim_dir + "/" + samp + ".2.trimmed.fastq",
                            numCPUs=self.numCPUs,
                            indexfile=self.indexfile,
                            spliceFile=splice_file,
                            mappingLogFile=map_dir + "/mapping.log",
                            unalned=map_dir + "/unligned.fastq",
                            outsam=map_dir + "/" + samp + ".sam",
                            bam_file=map_dir + "/" + samp + ".bam",
                            sorted_bam_file=map_dir + "/" + samp + "_srt.bam",
                            ref_file=self.ref_file,
                            in_gtf=gtf,
                            gff_file=self.gff_file,
                            out_gtf=map_dir + "/" + samp + "_" + gff_name +
                            "_sTie.gtf",
                            out_cover=map_dir + "/" + samp + "_" + gff_name +
                            "_covered_sTie.gtf",
                            out_abun=map_dir + "/" + samp + "_" + gff_name +
                            "_sTie.tab",
                            in_bam_file=map_dir + "/" + samp + "_srt.bam",
                            bindir=self.bindir,
                            workdir=self.workdir)
            elif self.kingdom == 'both':
                prok_gtf = self.workdir + "/" + \
                    self.gff_file.split(";")[0].split("/")[-1].split(".gff")[0] + ".gtf"
                euk_gtf = self.workdir + "/" + \
                    self.gff_file.split(";")[1].split("/")[-1].split(".gff")[0] + ".gtf"
                yield StringTieScores(
                    fastq1=trim_dir + "/" + samp + ".1.trimmed.fastq",
                    fastq2=trim_dir + "/" + samp + ".2.trimmed.fastq",
                    numCPUs=self.numCPUs,
                    indexfile=self.indexfile,
                    spliceFile=splice_file,
                    mappingLogFile=map_dir + "/mapping.log",
                    unalned=map_dir + "/unligned.fastq",
                    outsam=map_dir + "/" + samp + ".sam",
                    bam_file=map_dir + "/" + samp + ".bam",
                    sorted_bam_file=map_dir + "/" + samp + "_srt.bam",
                    ref_file=self.ref_file,
                    gtf=prok_gtf,
                    out_gtf=map_dir + "/" + samp + "_prok_sTie.gtf",
                    out_cover=map_dir + "/" + samp + "_prok_covered_sTie.gtf",
                    out_abun=map_dir + "/" + samp + "_prok_sTie.tab",
                    in_bam_file=map_dir + "/prokarya.bam",
                    bindir=self.bindir,
                    workdir=self.workdir,
                    gff_file=self.gff_file)
                yield StringTieScores(
                    fastq1=trim_dir + "/" + samp + ".1.trimmed.fastq",
                    fastq2=trim_dir + "/" + samp + ".2.trimmed.fastq",
                    numCPUs=self.numCPUs,
                    indexfile=self.indexfile,
                    spliceFile=splice_file,
                    mappingLogFile=map_dir + "/mapping.log",
                    unalned=map_dir + "/unligned.fastq",
                    outsam=map_dir + "/" + samp + ".sam",
                    bam_file=map_dir + "/" + samp + ".bam",
                    sorted_bam_file=map_dir + "/" + samp + "_srt.bam",
                    ref_file=self.ref_file,
                    gtf=euk_gtf,
                    out_gtf=map_dir + "/" + samp + "_euk_sTie.gtf",
                    out_cover=map_dir + "/" + samp + "_euk_covered_sTie.gtf",
                    out_abun=map_dir + "/" + samp + "_euk_sTie.tab",
                    in_bam_file=map_dir + "/eukarya.bam",
                    bindir=self.bindir,
                    workdir=self.workdir,
                    gff_file=self.gff_file)
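
# What the string slicing around GFF/GTF names above boils down to, shown with
# os.path on a made-up workdir and annotation file:
import os

workdir = "/tmp/workdir"
gff = "/data/annotations/sample_org.gff"
gtf = os.path.join(workdir, os.path.basename(gff).rsplit(".gff", 1)[0] + ".gtf")
gff_name = os.path.splitext(os.path.basename(gtf))[0]
print(gtf)       # /tmp/workdir/sample_org.gtf
print(gff_name)  # sample_org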
Exemple #29
class Anuario(TableTask):

    resolution = Parameter()
    year = Parameter()

    @property
    def infilepath(self):
        base = self.input()['data'].path
        year = str(self.year)[-2:]
        if self.resolution == 'prov':
            fname = 'AE{year}_Provincial_Completo.xls'.format(year=year)
        elif self.resolution == 'muni':
            fname = 'AE{year}_Municipal_Completo.xls'.format(year=year)
        else:
            raise RuntimeError('Unknown resolution "{}"'.format(self.resolution))

        return os.path.join(base, fname)

    def version(self):
        return 4

    def timespan(self):
        return self.year

    def requires(self):
        return {
            'data_columns': AnuarioColumns(),
            'geom_columns': GeomRefColumns(),
            'data': DownloadAnuario(year=self.year),
        }

    def columns(self):
        cols = OrderedDict()
        cols['id_' + self.resolution] = \
                self.input()['geom_columns']['id_' + self.resolution]
        cols.update(self.input()['data_columns'])
        return cols

    def populate(self):
        book = open_workbook(self.infilepath)

        # determine mapping between column names and columns in excel
        columns = self.columns()
        headers = dict()
        colnum2name = OrderedDict()
        sheets = book.sheets()
        for sheetnum, sheet in enumerate(sheets):
            headers.update(dict([
                (cell.value, (sheetnum, cellnum))
                for cellnum, cell in enumerate(sheet.row(0))
            ]))
        for out_colname, coltarget in columns.iteritems():
            col = coltarget._column
            if not col.extra or 'source' not in col.extra or 'name' not in col.extra['source']:
                continue
            sourcename = coltarget._column.extra['source']['name']
            colnum = headers.get(sourcename)
            year = unicode(int(self.year) - 1)
            if not colnum:
                colnum = headers.get(sourcename + u'  ' + year)
            if not colnum:
                colnum = headers.get(sourcename + u' ' + year)
            if not colnum:
                raise Exception('Could not find column "{}" in Excel sheets'.format(
                    sourcename))
            colnum2name[colnum] = out_colname

        # insert data
        session = current_session()
        for i in xrange(1, sheets[0].nrows):
            geom_name = sheets[0].row(i)[0].value.lower()
            geom_ref = sheets[0].row(i)[1].value
            # exclude rows that are for a different resolution
            if 'total c.a.' in geom_name:
                if self.resolution != 'cca':
                    continue
            elif 'total prov.' in geom_name:
                if self.resolution != 'prov':
                    continue
            elif 'nombre municipio' in sheets[0].row(0)[0].value.lower():
                if self.resolution != 'muni' or len(geom_ref) != 5:
                    continue
            else:
                raise RuntimeError('Unrecognized geom ref "{}"'.format(
                    geom_ref))
            values = [u"'" + geom_ref + u"'"] # geo code
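            # colnum2name is keyed by (sheetnum, cellnum) tuples, so unpacking
            # each key below yields the sheet and the cell to read the value from.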
            values.extend([
                str(sheets[sheetnum].row(i)[colnum].value)
                for sheetnum, colnum in colnum2name.keys()
            ])
            colnames = ['id_' + self.resolution]
            colnames.extend(colnum2name.values())
            stmt = 'INSERT INTO {output} ({colnames}) ' \
                    'VALUES ({values})'.format(
                        output=self.output().table,
                        colnames=', '.join(colnames),
                        values=', '.join(values),
                    )
            session.execute(stmt)
class QCEWColumns(ColumnsTask):

    naics_code = Parameter()

    def version(self):
        return 3

    def requires(self):
        requirements = {
            'sections': SectionTags(),
            'subsections': SubsectionTags(),
            'units': UnitTags(),
            'source': BLSSourceTags(),
            'license': LicenseTags(),
        }
        parent_code = get_parent_code(self.naics_code)
        if parent_code:
            requirements['parent'] = QCEWColumns(naics_code=parent_code)

        return requirements

    def columns(self):
        cols = OrderedDict()
        code = self.naics_code
        name = NAICS_CODES[self.naics_code]
        description = ''

        # This gives us easier access to the tags we defined as dependencies
        input_ = self.input()
        units = input_['units']
        sections = input_['sections']
        subsections = input_['subsections']
        parent = input_.get('parent')
        cols['qtrly_estabs'] = OBSColumn(
            id=underscore_slugify('qtrly_estabs_{}'.format(code)),
            type='Numeric',
            name='Establishments in {}'.format(name),
            description=
            'Count of establishments in a given quarter in the {name} industry (NAICS {code}). '
            '{name} is {description}.'.format(name=name,
                                              code=code,
                                              description=description),
            weight=5,
            aggregate='sum',
            tags=[
                units['businesses'], sections['united_states'],
                subsections['commerce_economy']
            ],
            targets={parent['qtrly_estabs']: DENOMINATOR} if parent else {},
        )
        cols['avg_wkly_wage'] = OBSColumn(
            # Make sure the column ID is unique within this module
            # If left blank, will be taken from this column's key in the output OrderedDict
            id=underscore_slugify('avg_wkly_wage_{}'.format(code)),
            # The PostgreSQL type of this column.  Generally Numeric for numbers and Text
            # for categories.
            type='Numeric',
            # Human-readable name.  Will be used as header in the catalog
            name='Average weekly wage for {} establishments'.format(name),
            # Human-readable description.  Will be used as content in the catalog.
            description=
            'Average weekly wage for a given quarter in the {name} industry (NAICS {code}). '
            '{name} is {description}.'.format(name=name,
                                              code=code,
                                              description=description),
            # Ranking of importance, sometimes used to favor certain measures in auto-selection
            # Weight of 0 will hide this column from the user.  We generally use between 0 and 10
            weight=5,
            # How this measure was derived, for example "sum", "median", "average", etc.
            # In cases of "sum", this means functions downstream can construct estimates
            # for arbitrary geographies
            aggregate='average',
            # Tags are our way of noting aspects of this measure like its unit, the country
            # it's relevant to, and which section(s) of the catalog it should appear in.
            tags=[
                units['money'], sections['united_states'],
                subsections['income']
            ],
            targets={cols['qtrly_estabs']: UNIVERSE},
        )
        cols['month3_emplvl'] = OBSColumn(
            id=underscore_slugify('month3_emplvl_{}'.format(code)),
            type='Numeric',
            name='Employees in {} establishments'.format(name),
            description=
            'Number of employees in the third month of a given quarter with the {name} '
            'industry (NAICS {code}). {name} is {description}.'.format(
                name=name, code=code, description=description),
            weight=5,
            aggregate='sum',
            tags=[
                units['people'], sections['united_states'],
                subsections['employment']
            ],
            targets={parent['month3_emplvl']: DENOMINATOR} if parent else {},
        )
        cols['lq_avg_wkly_wage'] = OBSColumn(
            id=underscore_slugify('lq_avg_wkly_wage_{}'.format(code)),
            type='Numeric',
            name='Average weekly wage location quotient for {} establishments'.
            format(name),
            description=
            'Location quotient of the average weekly wage for a given quarter relative to '
            'the U.S. (Rounded to the hundredths place) within the {name} industry (NAICS {code}). '
            '{name} is {description}.'.format(name=name,
                                              code=code,
                                              description=description),
            weight=3,
            aggregate=None,
            tags=[
                units['ratio'], sections['united_states'],
                subsections['income']
            ],
        )
        cols['lq_qtrly_estabs'] = OBSColumn(
            id=underscore_slugify('lq_qtrly_estabs_{}'.format(code)),
            type='Numeric',
            name='Location quotient of establishments in {}'.format(name),
            description=
            'Location quotient of the quarterly establishment count relative to '
            'the U.S. (Rounded to the hundredths place) within the {name} industry (NAICS {code}). '
            '{name} is {description}.'.format(name=name,
                                              code=code,
                                              description=description),
            weight=3,
            aggregate=None,
            tags=[
                units['ratio'], sections['united_states'],
                subsections['commerce_economy']
            ],
        )
        cols['lq_month3_emplvl'] = OBSColumn(
            id=underscore_slugify('lq_month3_emplvl_{}'.format(code)),
            type='Numeric',
            name='Employment level location quotient in {} establishments'.
            format(name),
            description=
            'Location quotient of the employment level for the third month of a given quarter '
            'relative to the U.S. (Rounded to the hundredths place) within the {name} '
            'industry (NAICS {code}). {name} is {description}.'.format(
                name=name, code=code, description=description),
            weight=3,
            aggregate=None,
            tags=[
                units['ratio'], sections['united_states'],
                subsections['employment']
            ],
        )

        source = input_['source']['qcew']
        license = input_['license']['no-restrictions']
        for colname, col in cols.items():
            col.tags.append(source)
            col.tags.append(license)
        return cols
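
# A hedged example of materializing these column definitions on their own; the
# NAICS code value and the use of luigi's local scheduler are assumptions for
# illustration rather than part of the module above.
import luigi

if __name__ == '__main__':
    luigi.build([QCEWColumns(naics_code='23')], local_scheduler=True)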