def rows_from_source(raw_source):
    source = raw_source.copy()
    try:
        f_source = io.BytesIO(source['source'])
        byteslike = True
    except (TypeError, AttributeError, KeyError):
        byteslike = False
    if byteslike:
        source['source'] = f_source
        stream = tabulator.Stream(**source, encoding='utf-8')
    else:
        stream = tabulator.Stream(source, headers=1, encoding='utf-8')
    stream.open()
    # This will get the first row
    try:
        hs = next(stream.iter(extended=True))[1]
    except StopIteration:
        # nothing in the stream
        hs = []
    # Reset the pointer to the beginning
    stream.reset()
    o_headers = get_ordered_headers(hs)
    result = OrderedDict()
    for (row_num, headers, vals) in stream.iter(extended=True):
        data = dict(zip(headers, vals))
        o_data = OrderedDict((h, data.get(h, '')) for h in o_headers)
        result[row_num] = o_data
    return (o_headers, result)
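# A usage sketch for rows_from_source. get_ordered_headers is defined
# elsewhere in the module; a minimal stand-in is stubbed here so the
# example runs on its own, and the CSV bytes are illustrative:
import io
from collections import OrderedDict

import tabulator


def get_ordered_headers(headers):
    # Stub: the real implementation reorders headers into a canonical order
    return list(headers)


csv_bytes = b'name,amount\nalice,10\nbob,20\n'
headers, rows = rows_from_source(
    {'source': csv_bytes, 'format': 'csv', 'headers': 1})
print(headers)                 # ['name', 'amount']
for row_num, row in rows.items():
    print(row_num, dict(row))  # e.g. 2 {'name': 'alice', 'amount': '10'}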
def load(logger, dp, url, res_name):
    try:
        s = tabulator.Stream(url)
        s.open()
        logger.info('OPENED %s', url)  # was logging.info; use the passed-in logger
        rows = slower(s.iter())  # slower() is a module helper, presumably throttling the iterator
        # rows = s.iter()
        canary = list(islice(rows, 0, 5))
        maxlen = max(len(r) for r in canary)
        headers = ['Col%d' % i for i in range(1, maxlen + 1)]
        dp['resources'].append({
            PROP_STREAMING: True,
            'name': res_name,
            'path': '.',
            'schema': {
                'fields': [{'name': h, 'type': 'string'} for h in headers]
            }
        })
        rows = chain(canary, rows)
        rows = map(lambda row: dict(zip(headers, row)), rows)

        def aux(rows_):
            yield from rows_  # islice(rows_, 10)

        return aux(rows)
    except Exception:  # was a bare except
        logger.error('Failed to load from source')
        raise
def get_tabular_stream(self, url: str, **kwargs: Any) -> tabulator.Stream:
    """Get Tabulator stream.

    Args:
        url (str): URL or path to download
        **kwargs:
            headers (Union[int, List[int], List[str]]): Number of row(s) containing headers or list of headers
            file_type (Optional[str]): Type of file. Defaults to inferring.
            delimiter (Optional[str]): Delimiter used for values in each row. Defaults to inferring.

    Returns:
        tabulator.Stream: Tabulator Stream object
    """
    self.close_response()
    file_type = kwargs.get("file_type")
    if file_type is not None:
        kwargs["format"] = file_type
        del kwargs["file_type"]
    if "http_session" not in kwargs:
        kwargs["http_session"] = self.session
    try:
        self.response = tabulator.Stream(url, **kwargs)
        self.response.open()
        return self.response
    except Exception as e:
        raise DownloadError(f"Getting tabular stream for {url} failed!") from e
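# A minimal usage sketch (assumes a wrapper class exposing get_tabular_stream
# as above and managing self.session / self.response; the instance name and
# URL are illustrative):
downloader = Download()  # hypothetical wrapper instance
stream = downloader.get_tabular_stream(
    'https://example.com/data.csv',
    headers=1,          # first row holds the column names
    file_type='csv',    # remapped internally to tabulator's `format` kwarg
)
for row in stream.iter(keyed=True):
    print(row)          # dicts keyed by header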
def cli(source, limit, **options):
    """Command-line interface

    ```
    Usage: tabulator [OPTIONS] SOURCE

    Options:
      --headers INTEGER
      --scheme TEXT
      --format TEXT
      --encoding TEXT
      --limit INTEGER
      --version          Show the version and exit.
      --help             Show this message and exit.
    ```
    """
    options = {key: value for key, value in options.items() if value is not None}
    with tabulator.Stream(source, **options) as stream:
        cast = str
        if six.PY2:
            cast = unicode  # noqa
        if stream.headers:
            click.echo(click.style(', '.join(map(cast, stream.headers)), bold=True))
        for count, row in enumerate(stream, start=1):
            click.echo(','.join(map(cast, row)))
            if count == limit:
                break
def get_tabular_stream(self, url, **kwargs):
    # type: (str, Any) -> tabulator.Stream
    """Get Tabulator stream.

    Args:
        url (str): URL to download
        **kwargs:
            headers (Union[int, List[int], List[str]]): Number of row(s) containing headers or list of headers
            file_type (Optional[str]): Type of file. Defaults to inferring.
            delimiter (Optional[str]): Delimiter used for values in each row. Defaults to inferring.

    Returns:
        tabulator.Stream: Tabulator Stream object
    """
    self.close_response()
    file_type = kwargs.get('file_type')
    if file_type is not None:
        kwargs['format'] = file_type
        del kwargs['file_type']
    try:
        self.response = tabulator.Stream(url, **kwargs)
        self.response.open()
        return self.response
    except TabulatorException as e:
        raisefrom(DownloadError, 'Getting tabular stream for %s failed!' % url, e)
def get_csv_as_utf_8_byte_stream(
        cls,
        full_url,
        delimiter=",",
        quotechar='"',
        encoding=None,
):
    try:
        with tabulator.Stream(
            full_url,
            headers=1,
            ignore_blank_headers=True,
            delimiter=delimiter,
            quotechar=quotechar,
            encoding=encoding,
            format='csv',
            post_parse=[cls.check_and_clean_up_row],
        ) as csv_file:
            file_contents_utf_8 = BytesIO()
            writer = unicodecsv.writer(
                file_contents_utf_8,
                encoding='utf-8',
                delimiter=delimiter,
                quotechar=quotechar,
            )
            writer.writerow(csv_file.headers)
            for row in csv_file.iter():
                writer.writerow(row)
            file_contents_utf_8.seek(0)
            return file_contents_utf_8
    except (TabulatorException, csv.Error) as e:
        error_message = str(e)
        app.logger.error(error_message)
        raise e
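# A hedged call sketch (the owning class name is hypothetical, and a
# @classmethod decorator is implied by the `cls` parameter above):
utf8_stream = CsvExporter.get_csv_as_utf_8_byte_stream(
    'https://example.com/exports/data.csv', delimiter=';')
with open('data_utf8.csv', 'wb') as f:
    f.write(utf8_stream.read())   # the whole file, re-encoded as UTF-8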
def opener():
    _params = dict(headers=1)
    skip_rows = __resource.get('skip_rows', 0)
    format = __resource.get("format")
    if format == "txt":
        # datapackage-pipelines processing requires having a header row
        # for txt format we add a single "data" column
        _params["headers"] = ["data"]
        _params["custom_parsers"] = {"txt": TXTParser}
        _params["allow_html"] = True
    else:
        if format is None:
            _, format = tabulator.helpers.detect_scheme_and_format(__url)
        try:
            parser_cls = tabulator.helpers.import_attribute(
                tabulator.config.PARSERS[format])
        except KeyError:
            logging.error("Unknown format %r", format)
            raise
        _params.update(
            dict(x for x in __resource.items()
                 if x[0] in parser_cls.options))
        _params.update(
            dict(x for x in __resource.items()
                 if x[0] in {'headers', 'scheme', 'encoding', 'sample_size',
                             'allow_html', 'force_strings', 'force_parse'}))
    _params['format'] = format
    constants = _resource.get('constants', {})
    constant_headers = list(constants.keys())
    constant_values = [constants.get(k) for k in constant_headers]
    _stream = tabulator.Stream(
        __url, **_params,
        post_parse=[row_skipper(skip_rows),
                    suffix_remover(format),
                    add_constants(constant_headers, constant_values)])
    try:
        _stream.open()
        _headers = dedupe(_stream.headers + constant_headers)
        _schema = __resource.get('schema')
        if _schema is not None:
            _schema = Schema(_schema)
        return _schema, _headers, _stream, _stream.close
    except tabulator.exceptions.TabulatorException as e:
        logging.warning("Error while opening resource from url %s: %r",
                        _url, e)
        _stream.close()
        if not _ignore_missing:
            raise
        return {}, [], [], lambda: None
def rows_from_source(raw_source):
    source = dict(raw_source)
    source['source'] = io.BytesIO(source['source'])
    stream = tabulator.Stream(**source)
    stream.open()
    result = OrderedDict(
        (row_num, OrderedDict((k, v) for (k, v) in zip(headers, vals) if k))
        for (row_num, headers, vals) in stream.iter(extended=True))
    return (stream.headers, result)
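# A usage sketch for this simpler variant: every stream option, including
# the raw bytes under 'source', travels in the one dict (values shown are
# illustrative):
headers, rows = rows_from_source(
    {'source': b'a,b\n1,2\n', 'format': 'csv', 'headers': 1})
print(headers)                 # ['a', 'b']
for row_num, row in rows.items():
    print(row_num, dict(row))  # e.g. 2 {'a': '1', 'b': '2'}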
def _get_registry(self, registry_path_or_url):
    '''dict: Return the registry as dict with profiles keyed by id.'''
    table = tabulator.Stream(registry_path_or_url, headers=1).open()
    try:
        registry = dict([(o['id'], o) for o in table.read(keyed=True)])
        return registry
    except KeyError as e:
        msg = ('Registry at "{path}" has no "id" column.').format(
            path=registry_path_or_url)
        six.raise_from(ValueError(msg), e)
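# The registry can be any tabular source with an 'id' column. A standalone
# sketch of the same lookup logic (the local registry.csv and its extra
# columns are illustrative):
import tabulator

# registry.csv:
#   id,title
#   tabular-data-package,Tabular Data Package
table = tabulator.Stream('registry.csv', headers=1).open()
registry = {row['id']: row for row in table.read(keyed=True)}
print(registry.get('tabular-data-package'))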
def cli(source, limit, **options):
    """Command-line interface

    ```
    Usage: tabulator [OPTIONS] SOURCE

    Options:
      --headers INTEGER
      --scheme TEXT
      --format TEXT
      --encoding TEXT
      --limit INTEGER
      --sheet TEXT/INTEGER (excel)
      --fill-merged-cells BOOLEAN (excel)
      --preserve-formatting BOOLEAN (excel)
      --adjust-floating-point-error BOOLEAN (excel)
      --table TEXT (sql)
      --order_by TEXT (sql)
      --resource TEXT/INTEGER (datapackage)
      --property TEXT (json)
      --keyed BOOLEAN (json)
      --version                      Show the version and exit.
      --help                         Show this message and exit.
    ```
    """
    # Normalize options
    options = {key: value for key, value in options.items() if value is not None}
    try:
        options['sheet'] = int(options.get('sheet'))
        options['resource'] = int(options.get('resource'))
    except Exception:
        pass
    # Read the table
    try:
        with tabulator.Stream(source, **options) as stream:
            cast = str
            if six.PY2:
                cast = unicode  # noqa
            if stream.headers:
                click.echo(click.style(', '.join(map(cast, stream.headers)), bold=True))
            for count, row in enumerate(stream, start=1):
                click.echo(','.join(map(cast, row)))
                if count == limit:
                    break
    except exceptions.TabulatorException as exception:
        click.echo('[error] %s' % str(exception))
        exit(1)
def extracted(self):
    """
    An iterator of data from the upload

    This default implementation does not transform the data at all.
    Complex data sources, like spreadsheets with data in cells that
    aren't arranged in tables, will need to override it.
    """
    stream = tabulator.Stream(io.BytesIO(self.upload.raw),
                              format=self.upload.file_type)
    stream.open()
    return stream
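# A hypothetical override for a source whose table does not start at the
# first row (the base class name and the banner-row count are assumptions
# for illustration):
class BannerReportSource(UploadSource):  # hypothetical base class with self.upload
    def extracted(self):
        stream = tabulator.Stream(io.BytesIO(self.upload.raw),
                                  format=self.upload.file_type)
        stream.open()
        rows = stream.iter()
        for _ in range(3):      # assumed: skip three decorative banner rows
            next(rows, None)
        return rows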
def cli(source, limit, **options):
    """https://github.com/frictionlessdata/tabulator-py#cli
    """
    options = {key: value for key, value in options.items() if value is not None}
    with tabulator.Stream(source, **options) as stream:
        cast = str
        if six.PY2:
            cast = unicode  # noqa
        if stream.headers:
            click.echo(click.style(', '.join(map(cast, stream.headers)), bold=True))
        for count, row in enumerate(stream, start=1):
            click.echo(', '.join(map(cast, row)))
            if count == limit:
                break
def test_dump_to_path_use_titles():
    from dataflows import Flow, dump_to_path, set_type
    import tabulator

    Flow(
        [{'hello': 'world', 'hola': 'mundo'}, {'hello': 'עולם', 'hola': 'عالم'}],
        *(set_type(name, resources=['res_1'], title=title)
          for name, title in (('hello', 'שלום'), ('hola', 'aloha'))),
        dump_to_path('data/dump_with_titles', use_titles=True)
    ).process()

    with tabulator.Stream('data/dump_with_titles/res_1.csv') as stream:
        assert stream.read() == [['שלום', 'aloha'],
                                 ['world', 'mundo'],
                                 ['עולם', 'عالم']]
def test_stream(self):
    """
    Given 2 remote pages of a JSON-API resource
    When I parse it with tabulator
    Then I get back a single item list with the full json in it
    """
    url = "https://raw.githubusercontent.com/strets123/frictionless-pres/master/data/smdataset%3Fpage%5Bnumber%5D%3D0"
    with tabulator.Stream(
        url,
        format="json-api",
        custom_parsers={"json-api": jsonapi_parser.JSONAPIParser},
        property='data',
    ) as stream:
        for index, item in enumerate(stream):
            self.assertTrue(isinstance(item[0], dict))
            self.assertIn("attributes", item[0])
            self.assertIn("id", item[0])
            self.assertIn("links", item[0])
            self.assertEqual(len(item), 1)
def transpose(sheet):
    stream = tabulator.Stream(sheet).open()
    cells = list(stream.iter())
    num_rows = len(cells)
    num_cols = len(cells[0])
    headers = None
    outputed = 0
    for i in range(num_cols):
        row = [(cells_row[i] if len(cells_row) > i else None)
               for cells_row in cells]
        if any(row):
            if i == 0:
                headers = row
                all_headers.update(headers)
            else:
                outputed += 1
                yield dict(zip(headers, row))
        else:
            break
    print(sheet, num_rows, '->', outputed)
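# A usage sketch for the transpose generator (assumes `all_headers` is a
# module-level set the generator updates as a side effect; the workbook
# path is illustrative):
all_headers = set()
for record in transpose('data/wide_sheet.xlsx'):
    print(record)   # one dict per source column, keyed by the first column's labels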
def stream(self):
    if self._stream is None:
        source = copy.deepcopy(self.config._unflatten().get('source', {}))
        structure = self._structure_params()
        try:
            path = source.pop('path')
            if not path:
                return None
            logger.info('Opening stream %s', path)
            if 'workbook_cache' in source:
                source['workbook_cache'] = _workbook_cache
            self._stream = tabulator.Stream(
                path, **source, **structure,
                http_session=self.http_session()).open()
            for k in source.keys():
                self.config.get('source.' + k)
            for k in structure.keys():
                self.config.get('structure.' + k)
        except Exception:
            logger.exception('Failed to open URL, source=%r, structure=%r',
                             source, structure)
            raise
    return self._stream
def process_build(build, jenkins_user_token):
    build_url = '{}api/json'.format(build['url'])
    build = jenkins_driver.curl(*jenkins_user_token, build_url)
    try:
        build_timestamp = datetime.datetime.utcfromtimestamp(
            build["timestamp"] / 1000)
    except Exception:
        build_timestamp = None
    if build_timestamp:
        output_url = '{}artifact/output.csv'.format(build['url'])
        try:
            with tabulator.Stream(
                output_url, headers=1,
                http_session=jenkins_driver.get_session(*jenkins_user_token)
            ) as stream:
                for row in stream.iter(keyed=True):
                    row['timestamp'] = build_timestamp
                    yield row
        except tabulator.exceptions.HTTPError:
            print('failed to get build artifact', build)
    else:
        print('failed to get build timestamp', build)
def stream(self):
    """
    An iterator of data from the upload

    This default implementation does not transform the data at all.
    Complex data sources, like spreadsheets with data in cells that
    aren't arranged in tables, will need to override it.
    """
    stream = tabulator.Stream(io.BytesIO(self.upload.raw),
                              format=self.upload.file_type,
                              **UPLOAD_SETTINGS['STREAM_ARGS'])
    stream.open()
    if UPLOAD_SETTINGS['OLD_HEADER_ROW'] is not None:
        if not UPLOAD_SETTINGS['HEADERS']:
            raise exceptions.ImproperlyConfigured(
                "use DATA_INGEST['OLD_HEADER_ROW'] only with DATA_INGEST['HEADERS']")
        for row in range(UPLOAD_SETTINGS['OLD_HEADER_ROW']):
            next(stream)  # discard rows before header
    return stream
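# A hypothetical settings fragment consistent with the checks above; the
# error message suggests UPLOAD_SETTINGS mirrors a DATA_INGEST dict in
# Django settings, and only the keys this method reads are shown:
DATA_INGEST = {
    'STREAM_ARGS': {'sheet': 1},     # passed straight through to tabulator.Stream
    'OLD_HEADER_ROW': 2,             # rows to discard before the real header
    'HEADERS': ['name', 'amount'],   # required whenever OLD_HEADER_ROW is set
}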
def opener():
    _params = dict(headers=1)
    _params.update(
        dict(x for x in __resource.items()
             if x[0] not in {'path', 'name', 'schema', 'mediatype', 'skip_rows'}))
    skip_rows = __resource.get('skip_rows', 0)
    _stream = tabulator.Stream(__url, **_params,
                               post_parse=[row_skipper(skip_rows)])
    try:
        _stream.open()
        _headers = dedupe(_stream.headers)
        _schema = __resource.get('schema')
        if _schema is not None:
            _schema = Schema(_schema)
        return _schema, _headers, _stream, _stream.close
    except tabulator.exceptions.TabulatorException as e:
        logging.warning("Error while opening resource from url %s: %r",
                        _url, e)
        if not _ignore_missing:
            raise
        return {}, [], [], lambda: None
def import_manifests(source_files):
    """Loops through the source files and streams them into a dataframe,
    then converts the dataframe to a list of manifest dicts.
    """
    expected_headers = [
        'name', 'metapath', 'namespace', 'title', 'id', '_id', 'description',
        'version', 'shortTitle', 'label', 'notes', 'keywords', 'image',
        'publisher', 'webpage', 'authors', 'date', 'edition', 'contentType',
        'country', 'language', 'citation'
    ]
    # Set up the storage functions for pandas dataframes; every field in the
    # Sources schema is a string
    storage = Storage()
    storage.create('data', {
        'primaryKey': 'name',
        'fields': [{'name': h, 'type': 'string'} for h in expected_headers]
    })
    path = os.path.join('app', current_app.config['UPLOAD_FOLDER'])
    error_list = []
    print('source_files')
    print(source_files)
    for item in source_files:
        if item.endswith('.xlsx') or item.endswith('.xls'):
            options = {'format': 'xlsx', 'sheet': 1, 'headers': 1}
        else:
            options = {'headers': 1}
        filepath = os.path.join(path, item)
        with tabulator.Stream(filepath, **options) as stream:
            # The original wrapped a bare `stream.headers == [...]` comparison
            # in try/except, which silently discarded the result; compare
            # explicitly instead
            if stream.headers != expected_headers:
                col_order = ', '.join(expected_headers)
                error_list.append(
                    'Error: The table headings in ' + item + ' do not match '
                    'the Sources schema. Please use the headings ' + col_order +
                    ' in that order.')
        with tabulator.Stream(filepath, **options) as stream:
            try:
                storage.write('data', stream)
            except Exception:
                error_list.append('Error: Could not stream tabular data.')
        os.remove(filepath)
    manifests = []
    data_dict = storage['data'].to_dict('index')
    print(data_dict)
    for key, values in data_dict.items():
        properties = {k: v for k, v in values.items() if v is not None}
        properties = {k: v.replace('\\n', '\n') for k, v in properties.items()}
        properties['name'] = key
        properties['namespace'] = 'we1sv2.0'
        properties['metapath'] = 'Sources'
        if validate_manifest(properties):
            manifests.append(properties)
        else:
            error_list.append('Could not produce a valid manifest for '
                              '<code>' + key + '</code>.')
    # Now we're ready to insert into the database
    print(manifests)
    for manifest in manifests:
        db_errors = create_record(manifest)
        error_list = error_list + db_errors
    return manifests, error_list
def opener():
    _params = dict(headers=1)
    format = __resource.get("format")
    if format == "txt":
        # datapackage-pipelines processing requires having a header row
        # for txt format we add a single "data" column
        _params["headers"] = ["data"]
        _params["custom_parsers"] = {"txt": TXTParser}
        _params["allow_html"] = True
    else:
        if format is None:
            _, format = tabulator.helpers.detect_scheme_and_format(__url)
        if format in tabulator.config.SUPPORTED_COMPRESSION:
            format = None
        else:
            try:
                parser_cls = tabulator.helpers.import_attribute(
                    tabulator.config.PARSERS[format])
            except KeyError:
                logging.error("Unknown format %r", format)
                raise
            _params.update(
                dict(x for x in __resource.items()
                     if x[0] in parser_cls.options))
        _params.update(
            dict(x for x in __resource.items()
                 if x[0] in {'headers', 'scheme', 'encoding', 'sample_size',
                             'allow_html', 'force_strings', 'force_parse',
                             'skip_rows', 'compression'}))
    if isinstance(_params.get('skip_rows'), int):
        # Backwards compatibility
        _params['skip_rows'] = list(range(1, _params.get('skip_rows') + 1))
    if format is not None:
        _params['format'] = format
    if http_headers:
        http_session = requests.Session()
        http_session.headers = http_headers
        _params['http_session'] = http_session
    constants = _resource.get('constants', {})
    constant_headers = list(constants.keys())
    constant_values = [constants.get(k) for k in constant_headers]
    _stream = tabulator.Stream(
        __url, **_params,
        post_parse=[suffix_remover(format),
                    add_constants(constant_headers, constant_values, _columns)])
    retry = 0
    backoff = 2
    while True:
        try:
            _stream.open()
            _headers = dedupe(_stream.headers)
            __columns = len(_headers)
            _headers = dedupe(_headers + constant_headers)
            _schema = __resource.get('schema')
            if _schema is not None:
                _schema = Schema(_schema)
            return _schema, _headers, __columns, _stream, _stream.close
        except tabulator.exceptions.TabulatorException as e:
            logging.warning("Error while opening resource from url %s: %r",
                            _url, e)
            _stream.close()
            retry += 1
            if retry <= 3:
                logging.warning("Retrying after %d seconds (%d/3)",
                                backoff, retry)
                time.sleep(backoff)
                backoff *= 2
                continue
            else:
                if not _ignore_missing:
                    raise
                return {}, [], 0, [], lambda: None
download = tempfile.NamedTemporaryFile(delete=False, suffix=os.path.basename(url))
shutil.copyfileobj(requests.get(url, stream=True).raw, download)
download.close()
download = download.name
content = open(download, 'rb').read()
os.unlink(download)
# Strip Excel-style ="..." quoting so the values parse as plain strings
content = content.replace(b'\n="', b'\n"')
content = content.replace(b',="', b',"')
out.write(content)
out.flush()
logging.info('downloaded from %s %d bytes: %r', url, len(content), content[:1000])
datapackage['resources'].append(resource)
stream = \
    tabulator.Stream('file://' + out.name, force_strings=True,
                     **parameters.get('tabulator', {}))\
    .open()
resource['schema'] = {
    'fields': [{'name': h, 'type': 'string'} for h in stream.headers]
}
ctx.resource_iterator = itertools.chain(res_iter, [stream.iter(keyed=True)])
def get_csv_sample(cls, url, delimiter=",", quotechar='"',
                   number_of_lines_sample=4, encoding=None):
    bucket = app.config['s3']['bucket_url']
    full_url = os.path.join(bucket, url)
    try:
        with tabulator.Stream(
            full_url,
            headers=1,
            sample_size=number_of_lines_sample,
            ignore_blank_headers=True,
            delimiter=delimiter,
            quotechar=quotechar,
            format='csv',
            force_parse=True,
            encoding=encoding,
            post_parse=[cls.check_and_clean_up_row],
        ) as csv_file:
            sample = csv_file.sample
            contents = csv_file.read(
                limit=app.config['app']['csv_sample_infer_lines'])
            headers = csv_file.headers
            if not headers:
                raise csv.Error(
                    'no headers found. The first line of the csv should '
                    'contain the column headers.')
            invalid_headings = list(
                filter(lambda x: not re.match("^[a-z][a-z0-9_]*$", x), headers))
            if invalid_headings:
                joined_invalid_headings = '"' + '", "'.join(invalid_headings) + '"'
                raise csv.Error(
                    f"column headers must start with a letter and "
                    f"may only contain lowercase letters, numbers, and underscores. "
                    f"Invalid headers: {joined_invalid_headings}")
            if not sample or not contents:
                raise csv.Error("no data found")
            column_types = cls.get_column_types(headers, contents)
            sample = list(zip(cls.make_unique_headers(headers),
                              column_types,
                              list(map(list, zip(*sample)))))
            return sample, None
    except (TabulatorException, csv.Error) as e:
        error_logger = app.logger.info if isinstance(e, csv.Error) else app.logger.error
        error_message = 'Unable to process CSV file: '
        if isinstance(e, EncodingError):
            error_message += f'the CSV file could not be opened. (Technical details: {str(e)})'
        else:
            error_message += str(e)
        error_logger(error_message)
        return [], error_message
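# For illustration, the header rule above in isolation: names must start
# with a lowercase letter and contain only lowercase letters, digits, and
# underscores (the sample headers are made up):
import re

HEADER_RE = re.compile(r'^[a-z][a-z0-9_]*$')
for header in ['net_amount', 'Total', '2020_total', 'qty2']:
    print(header, bool(HEADER_RE.match(header)))
# net_amount True, Total False, 2020_total False, qty2 True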
import tabulator
from datapackage_pipelines.wrapper import process
import re

tag_areas = tabulator.Stream(
    'https://docs.google.com/spreadsheets/d/1k4iSVsX79-VMZCYoHFRL5Q6GAMP6M1t2hIFv7k4E8eM/edit#gid=0',
    headers=1)
tag_areas.open()
tag_areas = list(tag_areas.iter())
tag_areas = [[x.strip() for x in y[0].split(',')] for y in tag_areas]
tag_res = [re.compile(r'\b(' + '|'.join('({})'.format(tag) for tag in tag_area) + r')\b')
           for tag_area in tag_areas]
tag_areas = list(zip(tag_res, tag_areas))


def process_row(row, _1, spec, _2, params, _3):
    resource = params['resource']
    target_field = params['target-field']
    source_fields = params['source-fields']
    if spec['name'] == resource:
        tags = set()
        for field in source_fields:
            value = row[field]
            if value:
                for tag_re, tag_area in tag_areas:
                    if tag_re.search(value):
                        repl = tag_re.sub(' '.join(tag_area), value)
                        tags.add(repl)
                        break
import tabulator
import logging
from datapackage_pipelines.wrapper import process

kinds = {
    'central': 'מכרז מרכזי',         # "central tender"
    'office': 'מכרז משרדי',          # "office tender"
    'exemptions': 'רכש פטור ממכרז',  # "procurement exempt from tender"
}

tender_conversion_table = tabulator.Stream(
    'https://docs.google.com/spreadsheets/d/1JkZooBwoxKPlrXWWwUtNHLMdxc-iG0n5tdGZ23I8fYY/edit#gid=1250185114',
    headers=1).open().iter(keyed=True)
tender_conversion_table = dict(
    ((x['tender_type'].strip(), x['decision'].strip(),
      x['has_awardees'].strip().lower() == 'true',
      x['has_active_awardees'].strip().lower() == 'true'),
     dict((k, x[k].strip()) for k in [
         'simple_decision', 'simple_decision_long', 'extended_status',
         'tip1', 'tip1_link', 'tip2', 'tip2_link'
     ]))
    for x in tender_conversion_table)

exemption_conversion_table = tabulator.Stream(
    'https://docs.google.com/spreadsheets/d/1lM2Afiw2JMJzHMIvWt3XylWLy2PLmVGbqaR9laBXFNo/edit#gid=1604640775',
    headers=1).open().iter(keyed=True)
exemption_conversion_table = dict(
    ((x['regulation'].strip(), x['decision'].strip(),
      x['has_awardees'].strip().lower() == 'true',
      x['has_active_awardees'].strip().lower() == 'true'),
     dict((k, x[k].strip()) for k in [
#!/usr/bin/env python
import tabulator
import json
import os

base_dir = os.path.dirname(__file__)

tooltips = tabulator.Stream(
    'https://docs.google.com/spreadsheets/d/1ztsoslfvEiQS1jSTrivU6chvyAySIIV9tad0L1ZKGj8/edit#gid=0',
    headers=1).open()
tooltips = list(tooltips.iter())
tooltips = sorted(tooltips, key=lambda x: len(x[0]), reverse=True)

json.dump(tooltips,
          open(os.path.join(base_dir, 'tooltips.json'), 'w'),
          ensure_ascii=False, indent=2)
import json
import logging
import datetime

import tabulator
from datapackage import Package  # assumed source of Package, used below
from fuzzywuzzy import process as fw_process

logging.info('LOADING LAMAS DATA')
lamas_data = Package('/var/datapackages/lamas-municipal-data/datapackage.json')
districts = dict(
    (x['entity_name'], x['district_2015'])
    for x in lamas_data.resources[0].iter(keyed=True)
)

logging.info('LOADING FOA IMPROVEMENT')
foa_improvement = tabulator.Stream(
    'https://docs.google.com/spreadsheets/d/17kX25p_M59h6VoDB90BeNVSmrel_9ipxUyko8SD6eKY/edit?usp=sharing',
    headers=1)
foa_improvement = foa_improvement.open().iter(keyed=True)
foa_improvement = dict((x.pop('original'), x) for x in foa_improvement)

regional_towns = Package('/var/datapackages/lamas-municipality-locality-map/datapackage.json')
cache = {}
for town, municipality in regional_towns.resources[0].iter():
    district = None
    if municipality in cache:
        district = cache[municipality]
    else:
        best = fw_process.extract(municipality, districts.keys())
        if len(best) > 0:
            score = best[0][1]
            best = best[0][0]
import json
import re
from pathlib import Path

import tabulator

fp = re.compile(r'\w+', re.UNICODE)


def fingerprint(x):
    return ''.join(fp.findall(x.upper()))


if __name__ == '__main__':
    # How to get places.csv from a nominatim server?
    #
    # kubectl exec -it nominatim-5d6889fd59-lxps7 -- su nominatim -c "psql -c \"\\copy (select hstore_to_json(name) from place where type in ('village', 'town', 'city', 'locality')) to stdout with csv\"" > places.csv

    # How to get official list of towns?
    #
    # https://data.gov.il/dataset/citiesandsettelments

    langs = {}
    # Column names: 'שם_ישוב' = settlement name (Hebrew),
    # 'שם_ישוב_לועזי' = settlement name in Latin script
    s = tabulator.Stream('tools/yeshuvim.csv', headers=1)
    s.open()
    for item in s.iter(keyed=True):
        langs.setdefault('he', []).append(
            item['שם_ישוב'].strip().replace('(', 'XXX').replace(')', '(')
            .replace('XXX', ')').replace('  ', ' '))
        langs.setdefault('en', []).append(
            item['שם_ישוב_לועזי'].strip().title().replace('  ', ' '))
    s = tabulator.Stream('tools/places.csv')
    s.open()
    for item in s.iter():
        if len(item) == 0:
            continue
        item = json.loads(item[0])
        for k, v in item.items():
            if k.startswith('name'):
def iter(self):
    '''Lazily-iterates over rows in data.

    This method is useful when you don't want to load all data in memory at
    once.

    Returns:
        iter: An iterator that yields each row in this resource.

    Raises:
        ValueError: If the data isn't tabular, if the resource has no
            data, or if its specified encoding is incorrect
        IOError: If there was some problem opening the data file (e.g. it
            doesn't exist or we don't have permissions to read it).
    '''
    result = None
    inline_data = self.descriptor.get('data')
    if self.local_data_path and os.path.isfile(self.local_data_path):
        data_path_or_url = self.local_data_path
    else:
        data_path_or_url = self.remote_data_path
    if inline_data:
        inline_data = self._parse_inline_data()
        result = iter(inline_data)
    elif data_path_or_url:
        encoding = self.descriptor.get('encoding')
        dialect = self.descriptor.get('dialect', {})
        options = {}
        if dialect:
            options['format'] = 'csv'
            if 'delimiter' in dialect:
                options['delimiter'] = dialect['delimiter']
            if 'lineTerminator' in dialect:
                # https://github.com/frictionlessdata/datapackage-py/issues/58
                # tabulator doesn't support lineTerminator because
                # it's not supported by Python builtin csv parser
                lineterm = dialect['lineTerminator']
                if lineterm not in ['\r\n', '\r', '\n']:
                    message = 'Line terminator "%s" is not supported' % lineterm
                    warnings.warn(message, UserWarning)
        try:
            table = tabulator.Stream(data_path_or_url, headers=1,
                                     encoding=encoding, **options).open()
            result = self._iter_from_tabulator(
                table, self.descriptor.get('schema'))
        except tabulator.exceptions.TabulatorException as e:
            msg = 'Data at \'{0}\' isn\'t in a known tabular data format'
            six.raise_from(ValueError(msg.format(data_path_or_url)), e)
    if result is None:
        if self.descriptor.get('path'):
            # FIXME: This is a hack to throw an IOError when local data
            # exists but couldn't be loaded for some reason. If "path"
            # existed and there were no issues opening it, "result" would
            # never be None.
            raise IOError('Resource\'s data couldn\'t be loaded.')
        raise ValueError('Resource has no data')
    return result
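# A minimal usage sketch (assumes a datapackage-py style Resource; the
# descriptor contents and constructor shape are assumptions for the example):
from datapackage import Resource

resource = Resource({'path': 'data/cities.csv',
                     'dialect': {'delimiter': ';'}})
for row in resource.iter():   # rows stream lazily, nothing loaded up front
    print(row)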
parameters, datapackage, res_iter = ingest()

url = parameters.get('url')
resource = parameters.get('resource')
resource[PROP_STREAMING] = True

content = requests.get(url).content
# Strip Excel-style ="..." quoting so the values parse as plain strings
content = content.replace(b'\n="', b'\n"')
content = content.replace(b',="', b',"')

out = tempfile.NamedTemporaryFile(suffix='.csv', delete=False)
out.write(content)
out.flush()  # make sure the bytes hit disk before tabulator reads the file
logging.info('downloaded from %s %d bytes: %r', url, len(content), content[:1000])

datapackage['resources'].append(resource)

stream = \
    tabulator.Stream('file://' + out.name,
                     **parameters.get('tabulator', {}))\
    .open()
resource['schema'] = {
    'fields': [{'name': h, 'type': 'string'} for h in stream.headers]
}

spew(datapackage, itertools.chain(res_iter, [stream.iter(keyed=True)]))