Ejemplo n.º 1
0
 def __init__(self, **kwargs):
     """Optionally remove the previously uploaded catalog PDF from S3.

     When ``force`` is truthy the existing S3 object is deleted so the task
     re-runs; deletion is best-effort (a missing object is fine).
     """
     if kwargs.get('force'):
         try:
             shell('aws s3 rm s3://data-observatory/observatory.pdf')
         except Exception:
             # Best-effort cleanup, but don't swallow KeyboardInterrupt or
             # SystemExit the way the previous bare ``except:`` did.
             pass
     super(PDFCatalogToS3, self).__init__()
Ejemplo n.º 2
0
 def run(self):
     """Fetch the remote zip archive and extract it into the output path."""
     out_path = self.output().path
     self.output().makedirs()
     # Download next to the output path, then unzip into it.
     shell('wget -O {output}.zip {url}'.format(
         output=out_path, url=self.URL.format(seq=self.seq)))
     os.makedirs(out_path)
     shell('unzip -d {output} {output}.zip'.format(output=out_path))
Ejemplo n.º 3
0
 def run(self):
     """Download ``self.url()`` into the output path.

     On failure the partial file is removed and the error re-raised so the
     task is not marked complete with a broken output (matches the other
     download tasks in this codebase).
     """
     self.output().makedirs()
     try:
         shell('wget {url} -O {target}'.format(url=self.url(),
                                               target=self.output().path))
     except subprocess.CalledProcessError:
         # wget leaves a zero-byte/partial target behind; clean it up,
         # then propagate the failure instead of silently succeeding.
         shell('rm -f {target}'.format(target=self.output().path))
         raise
Ejemplo n.º 4
0
 def input_shp(self):
     """Yield the .dbf files to import for this resolution and table.

     Lists each entity (state) directory under scince/shps, skipping the
     'national' aggregate, and globs the matching .dbf files via ``ls``.
     """
     cmd = 'ls {input}/scince/shps/'.format(
         input=self.input().path
     )
     # handle differing file naming conventions between the geographies
     # and the census data
     if self.resolution == 'municipio':
         resolution = 'municipal'
     elif self.resolution == 'entidad':
         resolution = 'estatal'
     elif self.resolution == 'localidad_urbana_y_rural_amanzanada':
         resolution = 'loc_urb'
     else:
         resolution = self.resolution
     for ent in shell(cmd).strip().split('\n'):
         if ent.lower() == 'national':
             continue
         if self.table.lower().startswith('pob'):
             path = 'ls {input}/scince/shps/{ent}/{ent}_{resolution}*.dbf'
         else:
             # Double braces survive the first .format (table) and are
             # filled by the second .format call below.
             path = 'ls {{input}}/scince/shps/{{ent}}/tablas/' \
                 '{{ent}}_cpv2010_{{resolution}}*_{table}.dbf'.format(
                     table=DEMOGRAPHIC_TABLES[self.table])
         cmd = path.format(
             input=self.input().path,
             ent=ent,
             resolution=resolution,
         )
         for shp in shell(cmd).strip().split('\n'):
             yield shp
Ejemplo n.º 5
0
    def input_files(self):
        """Yield the .dbf files to import for this resolution and table.

        Lists each entity (state) directory under the SCINCE directory,
        skipping the 'national' aggregate, and globs matching .dbf files.
        """
        cmd = 'ls {input}/{scince}/shps/'.format(
            input=self.input().path,
            scince=SCINCE_DIRECTORY
        )
        # handle differing file naming conventions between the geographies
        # and the census data
        if self.resolution == 'municipio':
            resolution = 'municipal'
        elif self.resolution == 'entidad':
            resolution = 'estatal'
        elif self.resolution == 'localidad_urbana_y_rural_amanzanada':
            resolution = 'loc_urb'
        else:
            resolution = self.resolution
        for ent in shell(cmd).strip().split('\n'):
            if ent.lower() == 'national':
                continue
            if self.table.lower().startswith('pob'):
                path = 'ls {input}/{scince}/shps/{ent}/{ent}_{resolution}*.dbf'
            else:
                # Double braces survive the first .format (table) and are
                # filled by the second .format call below.
                path = 'ls {{input}}/{{scince}}/shps/{{ent}}/tablas/' \
                    '{{ent}}_cpv2010_{{resolution}}*_{table}.dbf'.format(
                        table=DEMOGRAPHIC_TABLES[self.table])
            cmd = path.format(
                input=self.input().path,
                ent=ent,
                resolution=resolution,
                scince=SCINCE_DIRECTORY,
            )

            for shp in shell(cmd).strip().split('\n'):
                yield shp
Ejemplo n.º 6
0
 def __init__(self, **kwargs):
     """Optionally remove the previously uploaded catalog PDF from S3.

     When ``force`` is truthy the existing S3 object is deleted so the task
     re-runs; deletion is best-effort (a missing object is fine).
     """
     if kwargs.get('force'):
         try:
             shell('aws s3 rm s3://data-observatory/observatory.pdf')
         except Exception:
             # Best-effort cleanup, but don't swallow KeyboardInterrupt or
             # SystemExit the way the previous bare ``except:`` did.
             pass
     super(PDFCatalogToS3, self).__init__()
 def run(self):
     """Download the output, removing any partial file when the fetch fails."""
     self.output().makedirs()
     try:
         self.download()
     except subprocess.CalledProcessError:
         # Clean up the partial download before propagating the failure.
         shell('rm -f {target}'.format(target=self.output().path))
         raise
Ejemplo n.º 8
0
    def run(self):
        """Load every Who's on First geometry at this resolution into Postgres.

        Streams each record through ogr2ogr: the first record overwrites the
        table, subsequent records append. If the primary download fails, the
        same command is retried against the GitHub LFS mirror.
        """
        resp = requests.get(self.URL.format(resolution=self.resolution))
        # Re-encode the body with the charset advertised in the response
        # headers before handing it to DictReader (Python 2 era code).
        encoded = resp.text.encode(
            resp.headers['Content-Type'].split('charset=')[1])
        reader = DictReader(encoded.split('\r\n'))

        for i, line in enumerate(reader):
            # TODO would be much, much faster in parallel...
            url = 'https://whosonfirst.mapzen.com/data/{path}'.format(
                path=line['path'])
            lfs_url = 'https://github.com/whosonfirst/whosonfirst-data/raw/master/data/{path}'.format(
                path=line['path'])
            cmd = 'wget \'{url}\' -O - | ogr2ogr -{operation} ' \
                    '-nlt MULTIPOLYGON -nln \'{table}\' ' \
                    '-f PostgreSQL PG:"dbname=$PGDATABASE ' \
                    'active_schema={schema}" /vsistdin/'.format(
                        url=url,
                        schema=self.output().schema,
                        table=self.output().tablename,
                        operation='append' if i > 0 else 'overwrite -lco OVERWRITE=yes'
                    )
            try:
                shell(cmd)
            except subprocess.CalledProcessError:
                # Fall back to the LFS mirror for records missing upstream.
                cmd = cmd.replace(url, lfs_url)
                shell(cmd)
Ejemplo n.º 9
0
 def download(self):
     """Fetch the EuroGlobalMap 7z archive, sending the required referer."""
     self.output().makedirs()
     referer = 'http://www.eurogeographics.org/content/euroglobalmap-opendata?sid=10868'
     cmd = "wget -O {output}.7z --referer='{referer}' '{url}'".format(
         output=self.output().path,
         referer=referer,
         url=self.URL,
     )
     shell(cmd)
Ejemplo n.º 10
0
 def populate(self):
     """Stream each gzipped input CSV into the output table via psql COPY."""
     for infile in self.input()['data']:
         # gunzip each CSV into the table
         cmd = r"gunzip -c '{input}' | psql -c '\copy {tablename} FROM STDIN " \
               r"WITH CSV HEADER'".format(input=infile.path,
                                          tablename=self.output().table)
         print cmd
         shell(cmd)
Ejemplo n.º 11
0
 def input_shp(self):
     """Yield every shapefile for each entity at the configured resolution."""
     base = '{input}/encuesta_intercensal_2015/shps'.format(
         input=self.input().path)
     for entity in shell('ls {base}/'.format(base=base)).strip().split('\n'):
         listing = shell('ls {base}/{ent}/{ent}_{resolution}*.shp'.format(
             base=base, ent=entity, resolution=self.resolution))
         for shp in listing.strip().split('\n'):
             yield shp
Ejemplo n.º 12
0
def cdrc_downloader(url, output_path):
    """Download a CDRC-protected *url* to *output_path*.

    Requires the CDRC auth cookie in the ``CDRC_COOKIE`` environment
    variable; raises ValueError with setup instructions when it is missing.
    """
    cookie = os.environ.get('CDRC_COOKIE')
    if cookie is None:
        raise ValueError(
            'This task requires a CDRC cookie. Put it in the `.env` file\n'
            'e.g: CDRC_COOKIE=\'auth_tkt="00000000000000000username!userid_type:unicode"\''
        )
    shell('wget --header=\'Cookie: {cookie}\' -O {output} {url}'.format(
        output=output_path, url=url, cookie=cookie))
Ejemplo n.º 13
0
 def populate(self):
     """Stream each gzipped input CSV into the output table via psql COPY."""
     for infile in self.input()['data']:
         # gunzip each CSV into the table
         cmd = r"gunzip -c '{input}' | psql -c '\copy {tablename} FROM STDIN " \
               r"WITH CSV HEADER'".format(input=infile.path,
                                          tablename=self.output().table)
         print cmd
         shell(cmd)
Ejemplo n.º 14
0
 def merge(self):
     """Merge per-state 2016 meshblock shapefiles into one output shapefile.

     The first state creates the output; every subsequent state is appended.
     """
     output_dir = os.path.dirname(self.output().path)
     for index, key in enumerate(self.URLS.keys()):
         flags = '' if index == 0 else '-update -append '
         shell('ogr2ogr -f "ESRI Shapefile" {flags}{output_file} {output}/{key}/MB_2016_{key_upper}.shp'.format(
             flags=flags, output_file=self.output().path, output=output_dir,
             key=key, key_upper=key.upper()))
Ejemplo n.º 15
0
def test_download_unzip_task():
    '''
    Download unzip task should download remote assets and unzip them locally.
    '''
    task = TestDownloadUnzipTask()
    # Start from a clean slate: drop any leftover output from prior runs.
    if task.output().exists():
        shell('rm -r {path}'.format(path=task.output().path))
    assert_false(task.output().exists())
    runtask(task)
    assert_true(task.output().exists())
Ejemplo n.º 16
0
 def output(self):
     """Return the S3 target for the release dump, clearing it when forced."""
     dump_path = self.input().path \
         .replace('tmp/carto/Dump_', 'do-release-') \
         .replace('.dump', '/obs.dump')
     s3_path = 's3://cartodb-observatory-data/{path}'.format(path=dump_path)
     LOGGER.info(s3_path)
     target = S3Target(s3_path)
     if self.force:
         # Delete the remote object once, then reset the flag so repeated
         # output() calls do not keep deleting.
         shell('aws s3 rm {output}'.format(output=s3_path))
         self.force = False
     return target
Ejemplo n.º 17
0
def test_download_unzip_task():
    '''
    Download unzip task should download remote assets and unzip them locally.
    '''
    task = TestRepoFileUnzipTask()
    # Start from a clean slate: drop any leftover output from prior runs.
    if task.output().exists():
        shell('rm -r {path}'.format(path=task.output().path))
    assert_false(task.output().exists())
    runtask(task)
    assert_true(task.output().exists())
 def rename_files(self):
     """Rename opaque downloaded blob names to proper shapefile components.

     The download delivers the .dbf/.prj/.shp/.shx parts under generated
     names; map them to US_tract_clusters_new.* so the shapefile set is
     complete and discoverable.
     """
     rename = {
         'ustractclustersnew-41530.bin' : 'US_tract_clusters_new.dbf',
         'ustractclustersnew-41538.txt' : 'US_tract_clusters_new.prj',
         'ustractclustersnew-41563.bin' : 'US_tract_clusters_new.shp',
         'ustractclustersnew-41555.bin' : 'US_tract_clusters_new.shx'
     }
     for old_name, new_name in rename.iteritems():
         shell('mv {folder}/{old_name} {folder}/{new_name}'.format(
             old_name=old_name,
             new_name=new_name,
             folder=self.decompressed_folder()))
Ejemplo n.º 19
0
 def input_shp(self):
     """Yield every shapefile for each entity at the configured resolution."""
     base = '{input}/encuesta_intercensal_2015/shps'.format(
         input=self.input().path)
     for entity in shell('ls {base}/'.format(base=base)).strip().split('\n'):
         listing = shell('ls {base}/{ent}/{ent}_{resolution}*.shp'.format(
             base=base, ent=entity, resolution=self.resolution))
         for shp in listing.strip().split('\n'):
             yield shp
Ejemplo n.º 20
0
 def run(self):
     """Record this dump id in obs_dump_version, then pg_dump the schema."""
     insert_sql = ('INSERT INTO observatory.obs_dump_version (dump_id) '
                   "VALUES ('{task_id}')".format(task_id=self.task_id))
     session = current_session()
     try:
         self.output().makedirs()
         session.execute(insert_sql)
         session.commit()
         shell('pg_dump -Fc -Z0 -x -n observatory -f {output}'.format(
             output=self.output().path))
     except Exception as err:
         # Undo the version row if directory creation or the dump fails.
         session.rollback()
         raise err
Ejemplo n.º 21
0
 def run(self):
     """Record this dump id in obs_dump_version, then pg_dump the schema."""
     insert_sql = ('INSERT INTO observatory.obs_dump_version (dump_id) '
                   "VALUES ('{task_id}')".format(task_id=self.task_id))
     session = current_session()
     try:
         self.output().makedirs()
         session.execute(insert_sql)
         session.commit()
         shell('pg_dump -Fc -Z0 -x -n observatory -f {output}'.format(
             output=self.output().path))
     except Exception as err:
         # Undo the version row if directory creation or the dump fails.
         session.rollback()
         raise err
Ejemplo n.º 22
0
 def __init__(self, *args, **kwargs):
     """Prepare catalog source dirs, wiping generated RST when forced."""
     super(GenerateRST, self).__init__(*args, **kwargs)
     if self.force:
         shell('rm -rf catalog/source/*/*')
     for setup_cmd in ('cp -R catalog/img catalog/source/',
                       'mkdir -p catalog/img_thumb',
                       'cp -R catalog/img_thumb catalog/source/'):
         shell(setup_cmd)
Ejemplo n.º 23
0
 def output(self):
     """Return the S3 target for this release's obs.dump.

     Derives the S3 key from the input dump path; when ``force`` is set the
     existing remote object is deleted once and the flag cleared so later
     output() calls do not delete again.
     """
     path = self.input().path.replace('tmp/carto/Dump_', 'do-release-')
     path = path.replace('.dump', '/obs.dump')
     path = 's3://cartodb-observatory-data/{path}'.format(
         path=path
     )
     print path
     target = S3Target(path)
     if self.force:
         shell('aws s3 rm {output}'.format(
             output=path
         ))
         self.force = False
     return target
Ejemplo n.º 24
0
 def __init__(self, *args, **kwargs):
     """Prepare catalog source dirs, wiping generated RST when forced."""
     super(GenerateRST, self).__init__(*args, **kwargs)
     if self.force:
         shell('rm -rf catalog/source/*/*')
     for setup_cmd in ('cp -R catalog/img catalog/source/',
                       'mkdir -p catalog/img_thumb',
                       'cp -R catalog/img_thumb catalog/source/'):
         shell(setup_cmd)
Ejemplo n.º 25
0
 def run(self):
     """Fetch an 800x500 static map PNG for this viz into the output path."""
     self.output().makedirs()
     config = self.viz_to_config()
     named_map = self.get_named_map(config['layers'])
     img_url = ('{cartodb_url}/api/v1/map/static/center/'
                '{layergroupid}/{zoom}/{center_lon}/{center_lat}/800/500.png').format(
                    cartodb_url=os.environ['CARTODB_URL'],
                    layergroupid=named_map['layergroupid'],
                    zoom=config['zoom'],
                    center_lon=config['center'][0],
                    center_lat=config['center'][1])
     LOGGER.info(img_url)
     shell('curl "{img_url}" > {output}'.format(
         img_url=img_url, output=self.output().path))
Ejemplo n.º 26
0
 def run(self):
     """Fetch an 800x500 static map PNG for this viz into the output path."""
     self.output().makedirs()
     config = self.viz_to_config()
     named_map = self.get_named_map(config['layers'])
     img_url = '{cartodb_url}/api/v1/map/static/center/' \
             '{layergroupid}/{zoom}/{center_lon}/{center_lat}/800/500.png'.format(
                 cartodb_url=os.environ['CARTODB_URL'],
                 layergroupid=named_map['layergroupid'],
                 zoom=config['zoom'],
                 center_lon=config['center'][0],
                 center_lat=config['center'][1]
             )
     print img_url
     shell('curl "{img_url}" > {output}'.format(img_url=img_url,
                                                output=self.output().path))
Ejemplo n.º 27
0
 def complete(self):
     """Return True when at least one .shp exists in the directory."""
     pattern = os.path.join(self.directory, '*.shp')
     try:
         listing = shell('ls {}'.format(pattern))
     except subprocess.CalledProcessError:
         # ls exits non-zero when nothing matches the glob.
         return False
     return listing != ''
Ejemplo n.º 28
0
 def input_files(self):
     """Yield every input file matching the configured extension."""
     listing = shell('ls {input}/*.{extension}'.format(
         input=self.input().path,
         extension=self.extension))
     for geofile in listing.strip().split('\n'):
         yield geofile
Ejemplo n.º 29
0
 def input_files(self):
     """Yield input shapefiles; mesh blocks use the input path directly."""
     if self.resolution == GEO_MB:
         yield self.input().path
         return
     listing = shell('ls {input}/*.shp'.format(input=self.input().path))
     for shp in listing.strip().split('\n'):
         yield shp
Ejemplo n.º 30
0
 def run(self):
     """Import each LC*DATA.CSV via dynamic deps and record its table name."""
     listing = shell("ls {input}/LC*DATA.CSV".format(input=self.input().path))
     out = self.output().open("w")
     for csvpath in listing.strip().split("\n"):
         table = os.path.split(csvpath)[-1].split("DATA.CSV")[0]
         # luigi dynamic dependency: yields the import task, resumes with it.
         data = yield ImportEnglandWalesLocal(table=table)
         out.write("{table}\n".format(table=data.table))
     out.close()
Ejemplo n.º 31
0
    def _get_filename(self):
        """Return the state's zip file name scraped from the remote listing."""
        pipeline = ' | '.join([
            'curl -s {url}'.format(url=self.URL),
            'awk \'{print $9}\'',
            'grep -i ^{state}_[0-9]*\.zip$'.format(state=self.state),
        ])
        return shell(pipeline)
Ejemplo n.º 32
0
 def run(self):
     """Create the output table from the CSV header row, then COPY the data."""
     session = current_session()
     infile = os.path.join(self.input().path, self.table + "DATA.CSV")
     header_line = shell("head -n 1 {csv}".format(csv=infile))
     # Every column after the leading GeographyCode is numeric census data.
     coldefs = ["{} NUMERIC".format(name) for name in header_line.split(",")[1:]]
     session.execute(
         "CREATE TABLE {output} (GeographyCode TEXT, {cols})".format(
             output=self.output().table, cols=", ".join(coldefs)))
     session.commit()
     shell(
         "cat '{infile}' | psql -c 'COPY {output} FROM STDIN WITH CSV HEADER'".format(
             output=self.output().table, infile=infile))
     session.execute(
         "ALTER TABLE {output} ADD PRIMARY KEY (geographycode)".format(
             output=self.output().table))
Ejemplo n.º 33
0
 def run(self):
     """Create the output table from the CSV header row, then COPY the data."""
     session = current_session()
     infile = os.path.join(self.input().path, self.table + 'DATA.CSV')
     header_line = shell('head -n 1 {csv}'.format(csv=infile))
     # Every column after the leading GeographyCode is numeric census data.
     coldefs = ['{} NUMERIC'.format(name) for name in header_line.split(',')[1:]]
     session.execute('CREATE TABLE {output} (GeographyCode TEXT, {cols})'.format(
         output=self.output().table, cols=', '.join(coldefs)))
     session.commit()
     shell("cat '{infile}' | psql -c 'COPY {output} FROM STDIN WITH CSV HEADER'".format(
         output=self.output().table, infile=infile))
     session.execute('ALTER TABLE {output} ADD PRIMARY KEY (geographycode)'.format(
         output=self.output().table))
Ejemplo n.º 34
0
 def run(self):
     """Copy each topic CSV into its table via dynamic deps; log table names."""
     listing = shell('ls {input}/*.csv'.format(input=self.input().path))
     out = self.output().open('w')
     for csvpath in listing.strip().split('\n'):
         topic = os.path.split(csvpath)[-1].split('.csv')[0]
         # luigi dynamic dependency: yields the copy task, resumes with it.
         data = yield CopyDataToTable(resolution=self.resolution,
                                      survey=self.survey, topic=topic)
         out.write('{table}\n'.format(table=data.table))
     out.close()
Ejemplo n.º 35
0
 def run(self):
     """Split the survey CSVs into per-topic files under the output path."""
     listing = shell('ls {input}/{survey_code}-{geo_code}*.[cC][sS][vV]'.format(
         input=self.input().path,
         survey_code=SURVEY_CODES[self.survey],
         geo_code=GEOGRAPHY_CODES[self.resolution]))
     os.makedirs(self.output().path)
     StatCanParser().parse_csv_to_files(listing.strip().split('\n'),
                                        self.output().path)
Ejemplo n.º 36
0
 def run(self):
     """Split the survey CSVs into per-topic files under the output path."""
     pattern = 'ls {input}/{survey_code}-{geo_code}*.[cC][sS][vV]'
     listing = shell(pattern.format(
         input=self.input().path,
         survey_code=SURVEY_CODES[self.survey],
         geo_code=GEOGRAPHY_CODES[self.resolution]))
     os.makedirs(self.output().path)
     StatCanParser().parse_csv_to_files(listing.strip().split('\n'),
                                        self.output().path)
Ejemplo n.º 37
0
    def run(self):
        """Create the topic table from the CSV header row, then COPY the rows."""
        infile = os.path.join(self.input().path, self.topic + '.csv')
        header_line = shell('head -n 1 {csv}'.format(csv=infile))
        # Every column after the leading Geo_Code is a numeric measure.
        coldefs = ['{} NUMERIC'.format(name) for name in header_line.split(',')[1:]]

        session = current_session()
        session.execute('CREATE TABLE {output} (Geo_Code TEXT, {cols})'.format(
            output=self.output().table, cols=', '.join(coldefs)))
        session.commit()
        shell("cat '{infile}' | psql -c 'COPY {output} FROM STDIN WITH CSV HEADER'".format(
            output=self.output().table, infile=infile))
        session.execute('ALTER TABLE {output} ADD PRIMARY KEY (geo_code)'.format(
            output=self.output().table))
Ejemplo n.º 38
0
 def run(self):
     """Copy each topic CSV into its table via dynamic deps; log table names."""
     listing = shell('ls {input}/*.csv'.format(input=self.input().path))
     out = self.output().open('w')
     for csvpath in listing.strip().split('\n'):
         topic = os.path.split(csvpath)[-1].split('.csv')[0]
         # luigi dynamic dependency: yields the copy task, resumes with it.
         data = yield CopyDataToTable(resolution=self.resolution,
                                      survey=self.survey, topic=topic)
         out.write('{table}\n'.format(table=data.table))
     out.close()
Ejemplo n.º 39
0
 def run(self):
     """Import each LC*DATA.CSV via dynamic deps and record its table name."""
     listing = shell(
         'ls {input}/LC*DATA.CSV'.format(input=self.input().path))
     out = self.output().open('w')
     for csvpath in listing.strip().split('\n'):
         tablecode = os.path.split(csvpath)[-1].split('DATA.CSV')[0]
         # luigi dynamic dependency: yields the import task, resumes with it.
         data = yield ImportEnglandWalesLocal(table=tablecode)
         out.write('{table}\n'.format(table=data.table))
     out.close()
Ejemplo n.º 40
0
    def run(self):
        """(Re)create the output table and bulk-load each gzipped CSV input."""
        # make the table
        session = current_session()
        session.execute('''
DROP TABLE IF EXISTS {tablename};
CREATE TABLE {tablename} (
    {columns}
);
                       '''.format(tablename=self.output().table,
                                  columns=self.columns()))
        session.commit()

        for infile in self.input():
            print infile.path
            # gunzip each CSV into the table
            cmd = r"gunzip -c '{input}' | psql -c '\copy {tablename} FROM STDIN " \
                  r"WITH CSV HEADER'".format(input=infile.path, tablename=self.output().table)
            shell(cmd)
Ejemplo n.º 41
0
 def input_files(self):
     '''
     We don't know the precise name of the file inside the zip archive
     beforehand, so use find to track it down.
     '''
     # Quote the -iname pattern so the shell does not glob-expand it against
     # files in the current directory before find ever sees it.
     return shell(
         "find '{dirpath}' -iname '*_{resolution}_*_{aux}.shp' | grep {timestamp}"
         .format(dirpath=self.input().path,
                 timestamp=self.timestamp,
                 aux=self.id_aux,
                 resolution=self.resolution)).strip()
Ejemplo n.º 42
0
    def run(self):
        """(Re)create the output table and bulk-load each gzipped CSV input."""
        # make the table
        session = current_session()
        session.execute('''
DROP TABLE IF EXISTS {tablename};
CREATE TABLE {tablename} (
    {columns}
);
                       '''.format(tablename=self.output().table,
                                  columns=self.columns()))
        session.commit()

        for infile in self.input():
            print infile.path
            # gunzip each CSV into the table
            cmd = r"gunzip -c '{input}' | psql -c '\copy {tablename} FROM STDIN " \
                  r"WITH CSV HEADER'".format(input=infile.path, tablename=self.output().table)
            shell(cmd)
    def populate(self):
        """Load the Spielman-Singleton CSV and translate segment ids to names.

        After the COPY, the X10 and X55 segment identifiers are rewritten in
        place to their human-readable cluster names using the mappings on
        SpielmanSingletonColumns.
        """
        table_name = self.output().table
        shell(r"psql -c '\copy {table} FROM {file_path} WITH CSV HEADER'".format(
            table=table_name,
            file_path=self.input()['data_file'].path
        ))
        for name, segment_id in SpielmanSingletonColumns.x10_mapping.iteritems():
            current_session().execute("update {table} set X10 = '{name}' "
                                      "where X10 ='{segment_id}'; ".format(
                                          table=table_name,
                                          name=name,
                                          segment_id=segment_id
                                      ))

        for name, segment_id in SpielmanSingletonColumns.x55_mapping.iteritems():
            current_session().execute("update {table} set X55 = '{name}' "
                                      "where X55 ='{segment_id}'; ".format(
                                          table=table_name,
                                          name=name,
                                          segment_id=segment_id
                                      ))
Ejemplo n.º 44
0
    def download(self):
        """Fetch the per-commune (overseas) IRIS workbook for this theme."""
        # theme -> (remote directory, workbook file name)
        sources = {
            'population': ('infra-population-12/',
                           'base-ic-evol-struct-pop-2012-com.xls'),
            'housing': ('infra-logement-12/',
                        'base-ic-logement-2012-com.xls'),
            'education': ('infra-formation-12/',
                          'base-ic-diplomes-formation-2012-com.xls'),
            'household': ('infra-famille-12/',
                          'base-ic-couples-familles-menages-2012-com.xls'),
            'employment': ('infra-activite-resident-12/',
                           'base-ic-activite-residents-2012-com.xls'),
        }
        directory, workbook = sources.get(self.table_theme, (None, None))
        url = self.URL_base + directory + workbook

        shell('wget -P {output} {url}'.format(output=self.output().path,
                                              url=url))
Ejemplo n.º 45
0
    def download(self):
        """Fetch the IRIS-level zip archive for this theme."""
        # theme -> (remote directory, archive file name)
        sources = {
            'population': ('infra-population-12/',
                           'infra-population-2012.zip'),
            'housing': ('infra-logement-12/',
                        'infra-logement-2012.zip'),
            'education': ('infra-formation-12/',
                          'infra-formation-2012.zip'),
            'household': ('infra-famille-12/',
                          'infra-famille-2012.zip'),
            'employment': ('infra-activite-resident-12/',
                           'infra-activite-resident-2012.zip'),
        }
        directory, archive = sources.get(self.table_theme, (None, None))
        url = self.URL_base + directory + archive

        shell('wget -O {output}.zip {url}'.format(output=self.output().path,
                                                  url=url))
Ejemplo n.º 46
0
 def run(self):
     """Split survey CSVs into per-topic files, skipping ignored suffixes."""
     listing = shell('ls {input}/{survey_code}-{geo_code}*.[cC][sS][vV]'.format(
         input=self.input().path,
         survey_code=SURVEY_CODES[self.survey],
         geo_code=GEOGRAPHY_CODES[self.resolution]))
     in_csv_files = []
     for candidate in listing.strip().split('\n'):
         if self._is_ignored_suffix(candidate):
             LOGGER.warning('Ignoring file %s' % candidate)
         else:
             in_csv_files.append(candidate)
     os.makedirs(self.output().path)
     StatCanParser(self.DIVISION_SPLITTED[self.survey][self.resolution]).parse_csv_to_files(
         in_csv_files, self.output().path)
Ejemplo n.º 47
0
    def run(self):
        """Import all year/geography shapefiles into one PostGIS table.

        The first shapefile creates (overwrites) the table; the rest are
        appended in parallel batches of 500 via xargs. Finally the geometry
        column is renamed and spatially indexed.
        """
        shapefiles = shell('ls {dir}/*.shp'.format(
            dir=os.path.join('tmp', classpath(self), str(self.year), self.geography)
        )).strip().split('\n')

        # Seed the table with the last shapefile (pop()) so -append works
        # for everything else.
        cmd = 'PG_USE_COPY=yes PGCLIENTENCODING=latin1 ' \
                'ogr2ogr -f PostgreSQL "PG:dbname=$PGDATABASE active_schema={schema}" ' \
                '-t_srs "EPSG:4326" -nlt MultiPolygon -nln {tablename} ' \
                '-lco OVERWRITE=yes ' \
                '-lco SCHEMA={schema} {shpfile_path} '.format(
                    tablename=self.output().tablename,
                    schema=self.output().schema,
                    shpfile_path=shapefiles.pop())
        shell(cmd)

        # chunk into 500 shapefiles at a time.
        for i, shape_group in enumerate(grouper(shapefiles, 500)):
            shell(
                'export PG_USE_COPY=yes PGCLIENTENCODING=latin1; '
                'echo \'{shapefiles}\' | xargs -P 16 -I shpfile_path '
                'ogr2ogr -f PostgreSQL "PG:dbname=$PGDATABASE '
                'active_schema={schema}" -append '
                '-t_srs "EPSG:4326" -nlt MultiPolygon -nln {tablename} '
                'shpfile_path '.format(
                    shapefiles='\n'.join([shp for shp in shape_group if shp]),
                    tablename=self.output().tablename,
                    schema=self.output().schema))
            print 'imported {} shapefiles'.format((i + 1) * 500)

        session = current_session()
        # Spatial index
        session.execute('ALTER TABLE {qualified_table} RENAME COLUMN '
                        'wkb_geometry TO geom'.format(
                            qualified_table=self.output().table))
        session.execute('CREATE INDEX ON {qualified_table} USING GIST (geom)'.format(
            qualified_table=self.output().table))
Ejemplo n.º 48
0
    def run(self):
        """Import all year/geography shapefiles into one PostGIS table.

        The first shapefile creates (overwrites) the table; the rest are
        appended in parallel batches of 500 via xargs. Finally the geometry
        column is renamed and spatially indexed.
        """
        shapefiles = shell('ls {dir}/*.shp'.format(
            dir=os.path.join('tmp', classpath(self), str(self.year), self.geography)
        )).strip().split('\n')

        # Seed the table with the last shapefile (pop()) so -append works
        # for everything else.
        cmd = 'PG_USE_COPY=yes PGCLIENTENCODING=latin1 ' \
                'ogr2ogr -f PostgreSQL "PG:dbname=$PGDATABASE active_schema={schema}" ' \
                '-t_srs "EPSG:4326" -nlt MultiPolygon -nln {tablename} ' \
                '-lco OVERWRITE=yes ' \
                '-lco SCHEMA={schema} {shpfile_path} '.format(
                    tablename=self.output().tablename,
                    schema=self.output().schema,
                    shpfile_path=shapefiles.pop())
        shell(cmd)

        # chunk into 500 shapefiles at a time.
        for i, shape_group in enumerate(grouper(shapefiles, 500)):
            shell(
                'export PG_USE_COPY=yes PGCLIENTENCODING=latin1; '
                'echo \'{shapefiles}\' | xargs -P 16 -I shpfile_path '
                'ogr2ogr -f PostgreSQL "PG:dbname=$PGDATABASE '
                'active_schema={schema}" -append '
                '-t_srs "EPSG:4326" -nlt MultiPolygon -nln {tablename} '
                'shpfile_path '.format(
                    shapefiles='\n'.join([shp for shp in shape_group if shp]),
                    tablename=self.output().tablename,
                    schema=self.output().schema))
            print 'imported {} shapefiles'.format((i + 1) * 500)

        session = current_session()
        # Spatial index
        session.execute('ALTER TABLE {qualified_table} RENAME COLUMN '
                        'wkb_geometry TO geom'.format(
                            qualified_table=self.output().table))
        session.execute('CREATE INDEX ON {qualified_table} USING GIST (geom)'.format(
            qualified_table=self.output().table))
Ejemplo n.º 49
0
    def run(self):
        """Load every Who's on First geometry at this resolution into Postgres.

        Streams each record through ogr2ogr: the first record overwrites the
        table, subsequent records append. If the primary download fails, the
        same command is retried against the GitHub LFS mirror.
        """
        resp = requests.get(self.URL.format(resolution=self.resolution))
        # Re-encode the body with the charset advertised in the response
        # headers before handing it to DictReader (Python 2 era code).
        encoded = resp.text.encode(resp.headers['Content-Type'].split('charset=')[1])
        reader = DictReader(encoded.split('\r\n'))

        for i, line in enumerate(reader):
            # TODO would be much, much faster in parallel...
            url = 'https://whosonfirst.mapzen.com/data/{path}'.format(path=line['path'])
            lfs_url = 'https://github.com/whosonfirst/whosonfirst-data/raw/master/data/{path}'.format(
                path=line['path'])
            cmd = 'wget \'{url}\' -O - | ogr2ogr -{operation} ' \
                    '-nlt MULTIPOLYGON -nln \'{table}\' ' \
                    '-f PostgreSQL PG:"dbname=$PGDATABASE ' \
                    'active_schema={schema}" /vsistdin/'.format(
                        url=url,
                        schema=self.output().schema,
                        table=self.output().tablename,
                        operation='append' if i > 0 else 'overwrite -lco OVERWRITE=yes'
                    )
            try:
                shell(cmd)
            except subprocess.CalledProcessError:
                # Fall back to the LFS mirror for records missing upstream.
                cmd = cmd.replace(url, lfs_url)
                shell(cmd)
Ejemplo n.º 50
0
    def run(self):
        """Remove remote obs_ tables no longer referenced in obs_table.

        Scrapes the CartoDB datasets pages (start..end) for obs_<sha1> table
        names and drops any that are absent from the observatory metadata.
        """
        resp = query_cartodb('SELECT tablename FROM obs_table')
        tablenames = set([r['tablename'] for r in resp.json()['rows']])
        remote_tables = []
        for page in range(self.start, self.end + 1):
            remote_tables.extend(
                shell("curl -s '{cartodb_url}/datasets?page={page}' "
                      "| grep -Eo 'obs_[0-f]{{40}}' | uniq".format(
                          cartodb_url=os.environ['CARTODB_URL'],
                          page=page)).strip().split('\n'))
        for table in remote_tables:
            # NOTE(review): 'keeping' is logged for every table, including
            # those about to be removed — the log line looks misplaced.
            LOGGER.info('keeping %s', table)
            if table not in tablenames:
                LOGGER.info('removing %s', table)
                try:
                    CartoDBTarget(table).remove()
                except Exception as err:
                    LOGGER.warn(err)
Ejemplo n.º 51
0
 def run(self):
     """Download the source file to the output path."""
     self.output().makedirs()
     shell("wget '{url}' -O {output}".format(url=self.url(),
                                             output=self.output().path))
Ejemplo n.º 52
0
 def run(self):
     """Recreate the tiger{year} schema and load TIGER data into it."""
     schema = 'tiger{year}'.format(year=self.year)
     for statement in ('DROP SCHEMA IF EXISTS "{schema}" CASCADE',
                       'CREATE SCHEMA "{schema}"'):
         shell("psql -c '{stmt}'".format(
             stmt=statement.format(schema=schema)))
     self.load_from_url(self.url_template.format(year=self.year))
Ejemplo n.º 53
0
 def download(self):
     """Download the zip archive using a hard-coded CDRC session cookie.

     NOTE(review): the auth_tkt cookie value is hard-coded (and sent twice);
     it will expire and is a credential embedded in source — it should come
     from configuration (cf. the CDRC_COOKIE env-var approach used elsewhere
     in this codebase).
     """
     shell('wget --header=\'Cookie: auth_tkt="96a4778a0e3366127d4a47cf19a9c7d65751e5a9talos!userid_type:unicode"; auth_tkt="96a4778a0e3366127d4a47cf19a9c7d65751e5a9talos!userid_type:unicode";\' -O {output}.zip {url}'.format(
         output=self.output().path,
         url=self.URL))
Ejemplo n.º 54
0
 def output(self):
     """Yield a LocalTarget for each shapefile in the directory.

     Fix: ``shell`` returns the listing as one string, so iterating it
     directly yielded a target per *character*; split on newlines to get
     one path per file (matches the sibling ``output`` implementations).
     """
     shps = shell('ls {}'.format(os.path.join(self.directory, '*.shp')))
     for path in shps.strip().split('\n'):
         yield LocalTarget(path)
Ejemplo n.º 55
0
 def run(self):
     """Unzip every archive under the directory, skipping existing files."""
     unzip_all = ("cd {path} && find -iname '*.zip' -print0 "
                  "| xargs -0 -n1 unzip -n -q ").format(path=self.directory)
     shell(unzip_all)
Ejemplo n.º 56
0
 def output(self):
     """Yield a LocalTarget per zip file in the geography directory.

     Fix: strip the trailing newline from the ``ls`` output before
     splitting, otherwise the final empty string produced a bogus
     empty-path target (matches the other listing helpers here).
     """
     filenames = shell('ls {}'.format(os.path.join(
         self.directory, self.geography, '*.zip'))).strip().split('\n')
     for path in filenames:
         yield LocalTarget(path)
Ejemplo n.º 57
0
 def run(self):
     """Mirror every zip file under the source URL into the directory."""
     wget_flags = ('--recursive --continue --accept=*.zip '
                   '--no-parent --cut-dirs=3 --no-host-directories '
                   '--directory-prefix={directory} ').format(
                       directory=self.directory)
     shell('wget ' + wget_flags + '{url}'.format(url=self.url))
Ejemplo n.º 58
0
 def run(self):
     """Copy the input object to the output location on S3."""
     src = self.input().path
     dst = self.output().path
     shell('aws s3 cp {input} {output}'.format(input=src, output=dst))
Ejemplo n.º 59
0
'''
Test ACS columns
'''

from tasks.util import shell

# TODO clean this up in a more general init script

try:
    shell('createdb test')
except Exception:
    # Best-effort: the test database probably already exists. Narrowed from
    # a bare except so KeyboardInterrupt/SystemExit still propagate.
    pass

from nose.tools import assert_equals, with_setup, assert_false, assert_true

from tasks.meta import (OBSColumnTable, OBSColumn, OBSColumnToColumn, OBSTable,
                        OBSTag, OBSColumnTag, Base)

from tasks.us.census.lodes import WorkplaceAreaCharacteristicsColumns

from tests.util import runtask, setup, teardown


@with_setup(setup, teardown)
def test_wac_columns_run():
    # Smoke test: the LODES WAC columns task should run to completion.
    runtask(WorkplaceAreaCharacteristicsColumns())

Ejemplo n.º 60
0
 def complete(self):
     """Return True when at least one .shp exists in the directory."""
     pattern = os.path.join(self.directory, '*.shp')
     try:
         listing = shell('ls {}'.format(pattern))
     except subprocess.CalledProcessError:
         # ls exits non-zero when nothing matches the glob.
         return False
     return listing != ''