def __init__(self, **kwargs):
    if kwargs.get('force'):
        # Best-effort removal of any previous upload; ignore a failure
        # (e.g. the object does not exist yet).
        try:
            shell('aws s3 rm s3://data-observatory/observatory.pdf')
        except subprocess.CalledProcessError:
            pass
    # Pass kwargs through so task parameters like `force` survive.
    super(PDFCatalogToS3, self).__init__(**kwargs)
def run(self):
    self.output().makedirs()
    # Download the archive next to the output directory, then unzip into it.
    shell('wget -O {output}.zip {url}'.format(
        output=self.output().path,
        url=self.URL.format(seq=self.seq)))
    os.makedirs(self.output().path)
    shell('unzip -d {output} {output}.zip'.format(output=self.output().path))
def run(self):
    self.output().makedirs()
    try:
        shell('wget {url} -O {target}'.format(url=self.url(),
                                              target=self.output().path))
    except subprocess.CalledProcessError:
        # Clean up the partial download; note the error is not re-raised here.
        shell('rm -f {target}'.format(target=self.output().path))
def input_shp(self):
    cmd = 'ls {input}/scince/shps/'.format(
        input=self.input().path
    )
    # Handle differing file naming conventions between the geographies
    # and the census data.
    if self.resolution == 'municipio':
        resolution = 'municipal'
    elif self.resolution == 'entidad':
        resolution = 'estatal'
    elif self.resolution == 'localidad_urbana_y_rural_amanzanada':
        resolution = 'loc_urb'
    else:
        resolution = self.resolution
    for ent in shell(cmd).strip().split('\n'):
        if ent.lower() == 'national':
            continue
        if self.table.lower().startswith('pob'):
            path = 'ls {input}/scince/shps/{ent}/{ent}_{resolution}*.dbf'
        else:
            # Double braces survive the first .format() so the remaining
            # placeholders can be filled in below.
            path = 'ls {{input}}/scince/shps/{{ent}}/tablas/' \
                   '{{ent}}_cpv2010_{{resolution}}*_{table}.dbf'.format(
                       table=DEMOGRAPHIC_TABLES[self.table])
        cmd = path.format(
            input=self.input().path,
            ent=ent,
            resolution=resolution,
        )
        for shp in shell(cmd).strip().split('\n'):
            yield shp
def input_files(self):
    cmd = 'ls {input}/{scince}/shps/'.format(
        input=self.input().path,
        scince=SCINCE_DIRECTORY
    )
    # Handle differing file naming conventions between the geographies
    # and the census data.
    if self.resolution == 'municipio':
        resolution = 'municipal'
    elif self.resolution == 'entidad':
        resolution = 'estatal'
    elif self.resolution == 'localidad_urbana_y_rural_amanzanada':
        resolution = 'loc_urb'
    else:
        resolution = self.resolution
    for ent in shell(cmd).strip().split('\n'):
        if ent.lower() == 'national':
            continue
        if self.table.lower().startswith('pob'):
            path = 'ls {input}/{scince}/shps/{ent}/{ent}_{resolution}*.dbf'
        else:
            path = 'ls {{input}}/{{scince}}/shps/{{ent}}/tablas/' \
                   '{{ent}}_cpv2010_{{resolution}}*_{table}.dbf'.format(
                       table=DEMOGRAPHIC_TABLES[self.table])
        cmd = path.format(
            input=self.input().path,
            ent=ent,
            resolution=resolution,
            scince=SCINCE_DIRECTORY,
        )
        for shp in shell(cmd).strip().split('\n'):
            yield shp
def run(self):
    self.output().makedirs()
    try:
        self.download()
    except subprocess.CalledProcessError:
        # Remove the partial download before re-raising.
        shell('rm -f {target}'.format(target=self.output().path))
        raise
def run(self):
    resp = requests.get(self.URL.format(resolution=self.resolution))
    encoded = resp.text.encode(
        resp.headers['Content-Type'].split('charset=')[1])
    reader = DictReader(encoded.split('\r\n'))
    for i, line in enumerate(reader):
        # TODO would be much, much faster in parallel...
        url = 'https://whosonfirst.mapzen.com/data/{path}'.format(
            path=line['path'])
        lfs_url = 'https://github.com/whosonfirst/whosonfirst-data/raw/master/data/{path}'.format(
            path=line['path'])
        # Stream each file straight into PostGIS: the first file creates
        # the table, subsequent files append to it.
        cmd = 'wget \'{url}\' -O - | ogr2ogr -{operation} ' \
              '-nlt MULTIPOLYGON -nln \'{table}\' ' \
              '-f PostgreSQL PG:"dbname=$PGDATABASE ' \
              'active_schema={schema}" /vsistdin/'.format(
                  url=url,
                  schema=self.output().schema,
                  table=self.output().tablename,
                  operation='append' if i > 0 else 'overwrite -lco OVERWRITE=yes'
              )
        try:
            shell(cmd)
        except subprocess.CalledProcessError:
            # Fall back to the GitHub LFS mirror if the primary host fails.
            cmd = cmd.replace(url, lfs_url)
            shell(cmd)
def download(self):
    self.output().makedirs()
    referer = 'http://www.eurogeographics.org/content/euroglobalmap-opendata?sid=10868'
    shell("wget -O {output}.7z --referer='{referer}' '{url}'".format(
        output=self.output().path,
        referer=referer,
        url=self.URL,
    ))
def populate(self):
    for infile in self.input()['data']:
        # gunzip each CSV into the table
        cmd = r"gunzip -c '{input}' | psql -c '\copy {tablename} FROM STDIN " \
              r"WITH CSV HEADER'".format(input=infile.path,
                                         tablename=self.output().table)
        print(cmd)
        shell(cmd)
def input_shp(self):
    cmd = 'ls {input}/encuesta_intercensal_2015/shps/'.format(
        input=self.input().path)
    for ent in shell(cmd).strip().split('\n'):
        cmd = 'ls {input}/encuesta_intercensal_2015/shps/{ent}/{ent}_{resolution}*.shp'.format(
            input=self.input().path,
            ent=ent,
            resolution=self.resolution)
        for shp in shell(cmd).strip().split('\n'):
            yield shp
def cdrc_downloader(url, output_path):
    if 'CDRC_COOKIE' not in os.environ:
        raise ValueError(
            'This task requires a CDRC cookie. Put it in the `.env` file\n'
            'e.g: CDRC_COOKIE=\'auth_tkt="00000000000000000username!userid_type:unicode"\''
        )
    shell('wget --header=\'Cookie: {cookie}\' -O {output} {url}'.format(
        output=output_path,
        url=url,
        cookie=os.environ['CDRC_COOKIE']))
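# Usage sketch for cdrc_downloader() above. The URL and output path here are
# hypothetical, purely for illustration; CDRC_COOKIE must already be exported
# (normally via `.env`), e.g. CDRC_COOKIE='auth_tkt="...username!userid_type:unicode"'.
def example_cdrc_download():
    cdrc_downloader(
        url='https://data.cdrc.ac.uk/system/files/example_dataset.zip',
        output_path='tmp/cdrc/example_dataset.zip')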
def merge(self):
    first = True
    output_dir = os.path.dirname(self.output().path)
    for key in self.URLS.keys():
        if first:
            # The first state's shapefile creates the merged output...
            shell('ogr2ogr -f "ESRI Shapefile" {output_file} '
                  '{output}/{key}/MB_2016_{key_upper}.shp'.format(
                      output_file=self.output().path,
                      output=output_dir,
                      key=key,
                      key_upper=key.upper()))
            first = False
        else:
            # ...and every subsequent one is appended to it.
            shell('ogr2ogr -f "ESRI Shapefile" -update -append {output_file} '
                  '{output}/{key}/MB_2016_{key_upper}.shp'.format(
                      output_file=self.output().path,
                      output=output_dir,
                      key=key,
                      key_upper=key.upper()))
def test_download_unzip_task():
    '''
    Download unzip task should download remote assets and unzip
    them locally.
    '''
    task = TestDownloadUnzipTask()
    if task.output().exists():
        shell('rm -r {}'.format(task.output().path))
    assert_false(task.output().exists())
    runtask(task)
    assert_true(task.output().exists())
def output(self):
    path = self.input().path.replace('tmp/carto/Dump_', 'do-release-')
    path = path.replace('.dump', '/obs.dump')
    path = 's3://cartodb-observatory-data/{path}'.format(path=path)
    LOGGER.info(path)
    target = S3Target(path)
    if self.force:
        shell('aws s3 rm {output}'.format(output=path))
        self.force = False
    return target
def test_download_unzip_task():
    '''
    Download unzip task should download remote assets and unzip
    them locally.
    '''
    task = TestRepoFileUnzipTask()
    if task.output().exists():
        shell('rm -r {}'.format(task.output().path))
    assert_false(task.output().exists())
    runtask(task)
    assert_true(task.output().exists())
def rename_files(self):
    rename = {
        'ustractclustersnew-41530.bin': 'US_tract_clusters_new.dbf',
        'ustractclustersnew-41538.txt': 'US_tract_clusters_new.prj',
        'ustractclustersnew-41563.bin': 'US_tract_clusters_new.shp',
        'ustractclustersnew-41555.bin': 'US_tract_clusters_new.shx'
    }
    for old_name, new_name in rename.items():
        shell('mv {folder}/{old_name} {folder}/{new_name}'.format(
            old_name=old_name,
            new_name=new_name,
            folder=self.decompressed_folder()))
def run(self):
    session = current_session()
    try:
        self.output().makedirs()
        # Record this dump before writing it, so the dump file itself
        # contains its own version row.
        session.execute(
            'INSERT INTO observatory.obs_dump_version (dump_id) '
            "VALUES ('{task_id}')".format(task_id=self.task_id))
        session.commit()
        shell('pg_dump -Fc -Z0 -x -n observatory -f {output}'.format(
            output=self.output().path))
    except Exception as err:
        session.rollback()
        raise err
def __init__(self, *args, **kwargs):
    super(GenerateRST, self).__init__(*args, **kwargs)
    if self.force:
        shell('rm -rf catalog/source/*/*')
    shell('cp -R catalog/img catalog/source/')
    shell('mkdir -p catalog/img_thumb')
    shell('cp -R catalog/img_thumb catalog/source/')
def output(self):
    path = self.input().path.replace('tmp/carto/Dump_', 'do-release-')
    path = path.replace('.dump', '/obs.dump')
    path = 's3://cartodb-observatory-data/{path}'.format(
        path=path
    )
    print(path)
    target = S3Target(path)
    if self.force:
        shell('aws s3 rm {output}'.format(
            output=path
        ))
        self.force = False
    return target
def run(self):
    self.output().makedirs()
    config = self.viz_to_config()
    named_map = self.get_named_map(config['layers'])
    img_url = '{cartodb_url}/api/v1/map/static/center/' \
              '{layergroupid}/{zoom}/{center_lon}/{center_lat}/800/500.png'.format(
                  cartodb_url=os.environ['CARTODB_URL'],
                  layergroupid=named_map['layergroupid'],
                  zoom=config['zoom'],
                  center_lon=config['center'][0],
                  center_lat=config['center'][1]
              )
    LOGGER.info(img_url)
    shell('curl "{img_url}" > {output}'.format(img_url=img_url,
                                               output=self.output().path))
def run(self):
    self.output().makedirs()
    config = self.viz_to_config()
    named_map = self.get_named_map(config['layers'])
    img_url = '{cartodb_url}/api/v1/map/static/center/' \
              '{layergroupid}/{zoom}/{center_lon}/{center_lat}/800/500.png'.format(
                  cartodb_url=os.environ['CARTODB_URL'],
                  layergroupid=named_map['layergroupid'],
                  zoom=config['zoom'],
                  center_lon=config['center'][0],
                  center_lat=config['center'][1]
              )
    print(img_url)
    shell('curl "{img_url}" > {output}'.format(img_url=img_url,
                                               output=self.output().path))
def complete(self):
    try:
        exists = shell('ls {}'.format(os.path.join(self.directory, '*.shp')))
        return exists != ''
    except subprocess.CalledProcessError:
        # `ls` exits non-zero when no shapefiles exist yet.
        return False
def input_files(self):
    cmd = 'ls {input}/*.{extension}'.format(
        input=self.input().path,
        extension=self.extension
    )
    for geofile in shell(cmd).strip().split('\n'):
        yield geofile
def input_files(self):
    if self.resolution == GEO_MB:
        yield self.input().path
    else:
        cmd = 'ls {input}/*.shp'.format(input=self.input().path)
        for shp in shell(cmd).strip().split('\n'):
            yield shp
def run(self): infiles = shell("ls {input}/LC*DATA.CSV".format(input=self.input().path)) fhandle = self.output().open("w") for infile in infiles.strip().split("\n"): table = os.path.split(infile)[-1].split("DATA.CSV")[0] data = yield ImportEnglandWalesLocal(table=table) fhandle.write("{table}\n".format(table=data.table)) fhandle.close()
def _get_filename(self):
    # List the remote directory, take the file-name column from the
    # listing, and keep only this state's zip file.
    cmd = 'curl -s {url}'.format(url=self.URL)
    cmd += ' | '
    cmd += 'awk \'{print $9}\''
    cmd += ' | '
    cmd += 'grep -i ^{state}_[0-9]*\.zip$'.format(state=self.state)
    return shell(cmd)
def run(self):
    session = current_session()
    infile = os.path.join(self.input().path, self.table + "DATA.CSV")
    headers = shell("head -n 1 {csv}".format(csv=infile))
    # Every column except the leading GeographyCode is numeric.
    cols = ["{} NUMERIC".format(h) for h in headers.split(",")[1:]]
    session.execute(
        "CREATE TABLE {output} (GeographyCode TEXT, {cols})".format(
            output=self.output().table,
            cols=", ".join(cols)
        )
    )
    session.commit()
    shell(
        "cat '{infile}' | psql -c 'COPY {output} FROM STDIN WITH CSV HEADER'".format(
            output=self.output().table,
            infile=infile
        )
    )
    session.execute("ALTER TABLE {output} ADD PRIMARY KEY (geographycode)".format(
        output=self.output().table))
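# Illustration only (hypothetical column codes): if the first line of
# LC1117EWDATA.CSV were
#     GeographyCode,LC1117EW0001,LC1117EW0002
# then run() above would generate and execute roughly:
#     CREATE TABLE <output> (GeographyCode TEXT,
#                            LC1117EW0001 NUMERIC, LC1117EW0002 NUMERIC)
# before bulk-loading the file with psql's COPY ... FROM STDIN.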
def run(self):
    infiles = shell('ls {input}/*.csv'.format(
        input=self.input().path))
    fhandle = self.output().open('w')
    for infile in infiles.strip().split('\n'):
        topic = os.path.split(infile)[-1].split('.csv')[0]
        data = yield CopyDataToTable(resolution=self.resolution,
                                     survey=self.survey,
                                     topic=topic)
        fhandle.write('{table}\n'.format(table=data.table))
    fhandle.close()
def run(self):
    infiles = shell('ls {input}/{survey_code}-{geo_code}*.[cC][sS][vV]'.format(
        input=self.input().path,
        survey_code=SURVEY_CODES[self.survey],
        geo_code=GEOGRAPHY_CODES[self.resolution]
    ))
    in_csv_files = infiles.strip().split('\n')
    os.makedirs(self.output().path)
    StatCanParser().parse_csv_to_files(in_csv_files, self.output().path)
def run(self):
    infile = os.path.join(self.input().path, self.topic + '.csv')
    headers = shell('head -n 1 {csv}'.format(csv=infile))
    cols = ['{} NUMERIC'.format(h) for h in headers.split(',')[1:]]
    session = current_session()
    session.execute('CREATE TABLE {output} (Geo_Code TEXT, {cols})'.format(
        output=self.output().table,
        cols=', '.join(cols)
    ))
    session.commit()
    shell("cat '{infile}' | psql -c 'COPY {output} FROM STDIN WITH CSV HEADER'".format(
        output=self.output().table,
        infile=infile,
    ))
    session.execute('ALTER TABLE {output} ADD PRIMARY KEY (geo_code)'.format(
        output=self.output().table
    ))
def run(self):
    # make the table
    session = current_session()
    session.execute('''
        DROP TABLE IF EXISTS {tablename};
        CREATE TABLE {tablename} (
            {columns}
        );
    '''.format(tablename=self.output().table,
               columns=self.columns()))
    session.commit()
    for infile in self.input():
        print(infile.path)
        # gunzip each CSV into the table
        cmd = r"gunzip -c '{input}' | psql -c '\copy {tablename} FROM STDIN " \
              r"WITH CSV HEADER'".format(input=infile.path,
                                         tablename=self.output().table)
        shell(cmd)
def input_files(self):
    '''
    We don't know the precise name of the file inside the zip archive
    beforehand, so use find to track it down.
    '''
    # Quote the -iname pattern so the shell doesn't glob it before find runs.
    return shell(
        "find '{dirpath}' -iname '*_{resolution}_*_{aux}.shp' | grep {timestamp}"
        .format(dirpath=self.input().path,
                timestamp=self.timestamp,
                aux=self.id_aux,
                resolution=self.resolution)).strip()
def populate(self):
    table_name = self.output().table
    shell(r"psql -c '\copy {table} FROM {file_path} WITH CSV HEADER'".format(
        table=table_name,
        file_path=self.input()['data_file'].path
    ))
    # Replace numeric segment IDs with their human-readable names.
    for name, segment_id in SpielmanSingletonColumns.x10_mapping.items():
        current_session().execute("update {table} set X10 = '{name}' "
                                  "where X10 ='{segment_id}'; ".format(
                                      table=table_name,
                                      name=name,
                                      segment_id=segment_id
                                  ))
    for name, segment_id in SpielmanSingletonColumns.x55_mapping.items():
        current_session().execute("update {table} set X55 = '{name}' "
                                  "where X55 ='{segment_id}'; ".format(
                                      table=table_name,
                                      name=name,
                                      segment_id=segment_id
                                  ))
def download(self):
    themes = {
        'population': 'infra-population-12/',
        'housing': 'infra-logement-12/',
        'education': 'infra-formation-12/',
        'household': 'infra-famille-12/',
        'employment': 'infra-activite-resident-12/'
    }
    iris_overseas = {
        'population': 'base-ic-evol-struct-pop-2012-com.xls',
        'housing': 'base-ic-logement-2012-com.xls',
        'education': 'base-ic-diplomes-formation-2012-com.xls',
        'household': 'base-ic-couples-familles-menages-2012-com.xls',
        'employment': 'base-ic-activite-residents-2012-com.xls'
    }
    URL = self.URL_base + themes.get(self.table_theme) + iris_overseas.get(
        self.table_theme)
    shell('wget -P {output} {url}'.format(output=self.output().path, url=URL))
def download(self):
    themes = {
        'population': 'infra-population-12/',
        'housing': 'infra-logement-12/',
        'education': 'infra-formation-12/',
        'household': 'infra-famille-12/',
        'employment': 'infra-activite-resident-12/'
    }
    iris = {
        'population': 'infra-population-2012.zip',
        'housing': 'infra-logement-2012.zip',
        'education': 'infra-formation-2012.zip',
        'household': 'infra-famille-2012.zip',
        'employment': 'infra-activite-resident-2012.zip'
    }
    URL = self.URL_base + themes.get(self.table_theme) + iris.get(
        self.table_theme)
    shell('wget -O {output}.zip {url}'.format(output=self.output().path,
                                              url=URL))
def run(self):
    infiles = shell('ls {input}/{survey_code}-{geo_code}*.[cC][sS][vV]'.format(
        input=self.input().path,
        survey_code=SURVEY_CODES[self.survey],
        geo_code=GEOGRAPHY_CODES[self.resolution]
    ))
    in_csv_files = []
    for in_csv_file in infiles.strip().split('\n'):
        if not self._is_ignored_suffix(in_csv_file):
            in_csv_files.append(in_csv_file)
        else:
            LOGGER.warning('Ignoring file %s', in_csv_file)
    os.makedirs(self.output().path)
    StatCanParser(self.DIVISION_SPLITTED[self.survey][self.resolution]).parse_csv_to_files(
        in_csv_files, self.output().path)
def run(self):
    shapefiles = shell('ls {dir}/*.shp'.format(
        dir=os.path.join('tmp', classpath(self), str(self.year), self.geography)
    )).strip().split('\n')
    # Import the first shapefile on its own to create the table...
    cmd = 'PG_USE_COPY=yes PGCLIENTENCODING=latin1 ' \
          'ogr2ogr -f PostgreSQL "PG:dbname=$PGDATABASE active_schema={schema}" ' \
          '-t_srs "EPSG:4326" -nlt MultiPolygon -nln {tablename} ' \
          '-lco OVERWRITE=yes ' \
          '-lco SCHEMA={schema} {shpfile_path} '.format(
              tablename=self.output().tablename,
              schema=self.output().schema,
              shpfile_path=shapefiles.pop())
    shell(cmd)
    # ...then append the rest, chunked 500 shapefiles at a time, with
    # 16 parallel ogr2ogr processes per chunk.
    for i, shape_group in enumerate(grouper(shapefiles, 500)):
        shell(
            'export PG_USE_COPY=yes PGCLIENTENCODING=latin1; '
            'echo \'{shapefiles}\' | xargs -P 16 -I shpfile_path '
            'ogr2ogr -f PostgreSQL "PG:dbname=$PGDATABASE '
            'active_schema={schema}" -append '
            '-t_srs "EPSG:4326" -nlt MultiPolygon -nln {tablename} '
            'shpfile_path '.format(
                shapefiles='\n'.join([shp for shp in shape_group if shp]),
                tablename=self.output().tablename,
                schema=self.output().schema))
        print('imported {} shapefiles'.format((i + 1) * 500))
    session = current_session()
    # Rename the geometry column and add a spatial index.
    session.execute('ALTER TABLE {qualified_table} RENAME COLUMN '
                    'wkb_geometry TO geom'.format(
                        qualified_table=self.output().table))
    session.execute('CREATE INDEX ON {qualified_table} USING GIST (geom)'.format(
        qualified_table=self.output().table))
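# The grouper() helper used above is not defined in this excerpt; a plausible
# definition is the standard itertools recipe, which pads the final chunk with
# None -- hence the `if shp` filter when each group is joined:
from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    '''Collect data into fixed-length chunks: grouper('ABCDE', 2) -> AB CD E<None>.'''
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)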
def run(self):
    resp = query_cartodb('SELECT tablename FROM obs_table')
    tablenames = set([r['tablename'] for r in resp.json()['rows']])
    remote_tables = []
    for page in range(self.start, self.end + 1):
        remote_tables.extend(
            shell("curl -s '{cartodb_url}/datasets?page={page}' "
                  "| grep -Eo 'obs_[0-f]{{40}}' | uniq".format(
                      cartodb_url=os.environ['CARTODB_URL'],
                      page=page)).strip().split('\n'))
    # Drop any remote table that no longer appears in obs_table; keep the rest.
    for table in remote_tables:
        if table not in tablenames:
            LOGGER.info('removing %s', table)
            try:
                CartoDBTarget(table).remove()
            except Exception as err:
                LOGGER.warn(err)
        else:
            LOGGER.info('keeping %s', table)
def run(self):
    self.output().makedirs()
    shell('wget \'{url}\' -O {output}'.format(url=self.url(),
                                              output=self.output().path))
def run(self):
    schema = 'tiger{year}'.format(year=self.year)
    shell("psql -c 'DROP SCHEMA IF EXISTS \"{schema}\" CASCADE'".format(schema=schema))
    shell("psql -c 'CREATE SCHEMA \"{schema}\"'".format(schema=schema))
    url = self.url_template.format(year=self.year)
    self.load_from_url(url)
def download(self):
    # The auth_tkt cookie value is hard-coded; note it appears twice in the header.
    shell('wget --header=\'Cookie: '
          'auth_tkt="96a4778a0e3366127d4a47cf19a9c7d65751e5a9talos!userid_type:unicode"; '
          'auth_tkt="96a4778a0e3366127d4a47cf19a9c7d65751e5a9talos!userid_type:unicode";\' '
          '-O {output}.zip {url}'.format(
              output=self.output().path,
              url=self.URL))
def output(self):
    shps = shell('ls {}'.format(os.path.join(self.directory, '*.shp')))
    # Iterate over the listed paths, not the characters of the raw string.
    for path in shps.strip().split('\n'):
        yield LocalTarget(path)
def run(self):
    # Unzip every archive under the directory in place, skipping existing
    # files (-n) and keeping quiet (-q).
    cmd = "cd {path} && find -iname '*.zip' -print0 | xargs -0 -n1 unzip -n -q ".format(
        path=self.directory)
    shell(cmd)
def output(self):
    # Strip the trailing newline so we don't yield an empty path.
    filenames = shell('ls {}'.format(os.path.join(
        self.directory, self.geography, '*.zip'))).strip().split('\n')
    for path in filenames:
        yield LocalTarget(path)
def run(self):
    shell('wget --recursive --continue --accept=*.zip '
          '--no-parent --cut-dirs=3 --no-host-directories '
          '--directory-prefix={directory} '
          '{url}'.format(directory=self.directory, url=self.url))
def run(self):
    shell('aws s3 cp {input} {output}'.format(
        input=self.input().path,
        output=self.output().path
    ))
'''
Test ACS columns
'''

from tasks.util import shell

# TODO clean this up in a more general init script
try:
    shell('createdb test')
except:
    pass

from nose.tools import assert_equals, with_setup, assert_false, assert_true

from tasks.meta import (OBSColumnTable, OBSColumn, OBSColumnToColumn,
                        OBSTable, OBSTag, OBSColumnTag, Base)
from tasks.us.census.lodes import WorkplaceAreaCharacteristicsColumns
from tests.util import runtask, setup, teardown


@with_setup(setup, teardown)
def test_wac_columns_run():
    runtask(WorkplaceAreaCharacteristicsColumns())