import io
from collections import OrderedDict

import tabulator


def rows_from_source(raw_source):
    source = raw_source.copy()
    try:
        f_source = io.BytesIO(source['source'])
        byteslike = True
    except (TypeError, AttributeError, KeyError):
        byteslike = False

    if byteslike:
        source['source'] = f_source
        stream = tabulator.Stream(**source, encoding='utf-8')
    else:
        stream = tabulator.Stream(source, headers=1, encoding='utf-8')

    stream.open()

    # This will get the first row
    try:
        hs = next(stream.iter(extended=True))[1]
    # nothing in the stream
    except StopIteration:
        hs = []
    # Reset the pointer to the beginning
    stream.reset()
    o_headers = get_ordered_headers(hs)

    result = OrderedDict()
    for (row_num, headers, vals) in stream.iter(extended=True):
        data = dict(zip(headers, vals))
        o_data = OrderedDict((h, data.get(h, '')) for h in o_headers)
        result[row_num] = o_data

    return (o_headers, result)
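
A minimal usage sketch for the helper above (illustrative values only; it assumes `get_ordered_headers` is available in the same module and that the extra keys are valid `tabulator.Stream` arguments):

# Hypothetical input: 'source' holds the raw file bytes, the other keys are
# forwarded to tabulator.Stream ('format' is needed because BytesIO has no extension).
raw_source = {
    'source': b'name,age\r\nalice,34\r\nbob,29\r\n',
    'format': 'csv',
    'headers': 1,
}
ordered_headers, rows = rows_from_source(raw_source)
for row_num, row in rows.items():
    print(row_num, dict(row))   # e.g. 2 {'name': 'alice', 'age': '34'}
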
def load(logger, dp, url, res_name):
    try:
        s = tabulator.Stream(url)
        s.open()
        logger.info('OPENED %s', url)
        rows = slower(s.iter())
        # rows = s.iter()
        canary = list(islice(rows, 0, 5))
        maxlen = max(len(r) for r in canary)
        headers = ['Col%d' % i for i in range(1, maxlen + 1)]
        dp['resources'].append({
            PROP_STREAMING: True,
            'name': res_name,
            'path': '.',
            'schema': {
                'fields': [{
                    'name': h,
                    'type': 'string'
                } for h in headers]
            }
        })
        rows = chain(canary, rows)
        rows = map(lambda row: dict(zip(headers, row)), rows)

        def aux(rows_):
            yield from rows_  # islice(rows_,10)

        return aux(rows)
    except Exception:
        logger.error('Failed to load from source')
        raise
Example #3
    def get_tabular_stream(self, url: str, **kwargs: Any) -> tabulator.Stream:
        """Get Tabulator stream.

        Args:
            url (str): URL or path to download
            **kwargs:
            headers (Union[int, List[int], List[str]]): Number of row(s) containing headers or list of headers
            file_type (Optional[str]): Type of file. Defaults to inferring.
            delimiter (Optional[str]): Delimiter used for values in each row. Defaults to inferring.

        Returns:
            tabulator.Stream: Tabulator Stream object

        """
        self.close_response()
        file_type = kwargs.get("file_type")
        if file_type is not None:
            kwargs["format"] = file_type
            del kwargs["file_type"]
        if "http_session" not in kwargs:
            kwargs["http_session"] = self.session
        try:
            self.response = tabulator.Stream(url, **kwargs)
            self.response.open()
            return self.response
        except Exception as e:
            raise DownloadError(
                f"Getting tabular stream for {url} failed!"
            ) from e
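
A hedged usage sketch for the method above; `downloader` stands for an instance of whatever class defines it (it matches the Download class in hdx-python-utilities), and the URL is illustrative:

downloader = Download()   # hypothetical instance of the class defining get_tabular_stream
stream = downloader.get_tabular_stream(
    'https://example.com/data.csv',   # illustrative URL
    headers=1,
    file_type='csv',                  # remapped to tabulator's 'format' keyword above
)
for row in stream.iter(keyed=True):   # dicts keyed by the header row
    print(row)
downloader.close_response()           # the stream is left open, so close it explicitly
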
def cli(source, limit, **options):
    """Command-line interface

    ```
    Usage: tabulator [OPTIONS] SOURCE

    Options:
      --headers INTEGER
      --scheme TEXT
      --format TEXT
      --encoding TEXT
      --limit INTEGER
      --version          Show the version and exit.
      --help             Show this message and exit.
    ```

    """
    options = {key: value for key, value in options.items() if value is not None}
    with tabulator.Stream(source, **options) as stream:
        cast = str
        if six.PY2:
            cast = unicode  # noqa
        if stream.headers:
            click.echo(click.style(', '.join(map(cast, stream.headers)), bold=True))
        for count, row in enumerate(stream, start=1):
            click.echo(','.join(map(cast, row)))
            if count == limit:
                break
Example #5
    def get_tabular_stream(self, url, **kwargs):
        # type: (str, Any) -> tabulator.Stream
        """Get Tabulator stream.

        Args:
            url (str): URL to download
            **kwargs:
            headers (Union[int, List[int], List[str]]): Number of row(s) containing headers or list of headers
            file_type (Optional[str]): Type of file. Defaults to inferring.
            delimiter (Optional[str]): Delimiter used for values in each row. Defaults to inferring.

        Returns:
            tabulator.Stream: Tabulator Stream object

        """
        self.close_response()
        file_type = kwargs.get('file_type')
        if file_type is not None:
            kwargs['format'] = file_type
            del kwargs['file_type']
        try:
            self.response = tabulator.Stream(url, **kwargs)
            self.response.open()
            return self.response
        except TabulatorException as e:
            raisefrom(DownloadError,
                      'Getting tabular stream for %s failed!' % url, e)
Example #6
 def get_csv_as_utf_8_byte_stream(
     cls,
     full_url,
     delimiter=",",
     quotechar='"',
     encoding=None,
 ):
     try:
         with tabulator.Stream(
                 full_url,
                 headers=1,
                 ignore_blank_headers=True,
                 delimiter=delimiter,
                 quotechar=quotechar,
                 encoding=encoding,
                 format='csv',
                 post_parse=[cls.check_and_clean_up_row],
         ) as csv_file:
             file_contents_utf_8 = BytesIO()
             writer = unicodecsv.writer(
                 file_contents_utf_8,
                 encoding='utf-8',
                 delimiter=delimiter,
                 quotechar=quotechar,
             )
             writer.writerow(csv_file.headers)
             for row in csv_file.iter():
                 writer.writerow(row)
         file_contents_utf_8.seek(0)
         return file_contents_utf_8
     except (TabulatorException, csv.Error) as e:
         error_message = str(e)
         app.logger.error(error_message)
         raise e
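
A small sketch of calling the classmethod above; the class name and URL are placeholders:

# Hypothetical caller: normalise a remote CSV to UTF-8 bytes in memory.
utf8_stream = CsvSource.get_csv_as_utf_8_byte_stream(   # CsvSource is a placeholder class name
    'https://example.com/latin1.csv',                   # illustrative URL
    encoding='latin-1',                                  # encoding of the source; output is UTF-8
)
print(utf8_stream.readline())                            # header row, UTF-8 encoded
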
Example #7
        def opener():
            _params = dict(headers=1)
            skip_rows = __resource.get('skip_rows', 0)
            format = __resource.get("format")
            if format == "txt":
                # datapackage-pipelines processing requires having a header row
                # for txt format we add a single "data" column
                _params["headers"] = ["data"]
                _params["custom_parsers"] = {"txt": TXTParser}
                _params["allow_html"] = True
            else:
                if format is None:
                    _, format = tabulator.helpers.detect_scheme_and_format(
                        __url)
                try:
                    parser_cls = tabulator.helpers.import_attribute(
                        tabulator.config.PARSERS[format])
                except KeyError:
                    logging.error("Unknown format %r", format)
                    raise
                _params.update(
                    dict(x for x in __resource.items()
                         if x[0] in parser_cls.options))
                _params.update(
                    dict(
                        x for x in __resource.items() if x[0] in {
                            'headers', 'scheme', 'encoding', 'sample_size',
                            'allow_html', 'force_strings', 'force_parse'
                        }))

            _params['format'] = format

            constants = _resource.get('constants', {})
            constant_headers = list(constants.keys())
            constant_values = [constants.get(k) for k in constant_headers]
            _stream = tabulator.Stream(__url,
                                       **_params,
                                       post_parse=[
                                           row_skipper(skip_rows),
                                           suffix_remover(format),
                                           add_constants(
                                               constant_headers,
                                               constant_values)
                                       ])
            try:
                _stream.open()
                _headers = dedupe(_stream.headers + constant_headers)
                _schema = __resource.get('schema')
                if _schema is not None:
                    _schema = Schema(_schema)
                return _schema, _headers, _stream, _stream.close
            except tabulator.exceptions.TabulatorException as e:
                logging.warning("Error while opening resource from url %s: %r",
                                _url, e)
                _stream.close()
                if not _ignore_missing:
                    raise
                return {}, [], [], lambda: None
Example #8
def rows_from_source(raw_source):
    source = dict(raw_source)
    source['source'] = io.BytesIO(source['source'])
    stream = tabulator.Stream(**source)
    stream.open()
    result = OrderedDict(
        (row_num, OrderedDict((k, v) for (k, v) in zip(headers, vals) if k))
        for (row_num, headers, vals) in stream.iter(extended=True))
    return (stream.headers, result)
Example #9
 def _get_registry(self, registry_path_or_url):
     '''dict: Return the registry as dict with profiles keyed by id.'''
     table = tabulator.Stream(registry_path_or_url, headers=1).open()
     try:
         registry = dict([(o['id'], o) for o in table.read(keyed=True)])
         return registry
     except KeyError as e:
         msg = ('Registry at "{path}" has no "id" column.').format(
             path=registry_path_or_url)
         six.raise_from(ValueError(msg), e)
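
For context, the same pattern on an inline table (tabulator also accepts a list of rows as a source); the profile ids here are only illustrative:

import tabulator

table = tabulator.Stream(
    [['id', 'title'],
     ['tabular-data-package', 'Tabular Data Package'],
     ['fiscal-data-package', 'Fiscal Data Package']],
    headers=1,
).open()
registry = {row['id']: row for row in table.read(keyed=True)}
# -> {'tabular-data-package': {...}, 'fiscal-data-package': {...}}
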
Example #10
def cli(source, limit, **options):
    """Command-line interface

    ```
    Usage: tabulator [OPTIONS] SOURCE

    Options:
      --headers INTEGER
      --scheme TEXT
      --format TEXT
      --encoding TEXT
      --limit INTEGER
      --sheet TEXT/INTEGER (excel)
      --fill-merged-cells BOOLEAN (excel)
      --preserve-formatting BOOLEAN (excel)
      --adjust-floating-point-error BOOLEAN (excel)
      --table TEXT (sql)
      --order_by TEXT (sql)
      --resource TEXT/INTEGER (datapackage)
      --property TEXT (json)
      --keyed BOOLEAN (json)
      --version          Show the version and exit.
      --help             Show this message and exit.
    ```

    """

    # Normalize options
    options = {
        key: value
        for key, value in options.items() if value is not None
    }
    try:
        options['sheet'] = int(options.get('sheet'))
        options['resource'] = int(options.get('resource'))
    except Exception:
        pass

    # Read the table
    try:
        with tabulator.Stream(source, **options) as stream:
            cast = str
            if six.PY2:
                cast = unicode  # noqa
            if stream.headers:
                click.echo(
                    click.style(', '.join(map(cast, stream.headers)),
                                bold=True))
            for count, row in enumerate(stream, start=1):
                click.echo(','.join(map(cast, row)))
                if count == limit:
                    break
    except exceptions.TabulatorException as exception:
        click.echo('[error] %s' % str(exception))
        exit(1)
    def extracted(self):
        """
        An iterator of data from the upload

        This default implementation does not transform the data at all.
        Complex data sources, like spreadsheets with data in cells that aren't arranged
        in tables, will need to override it.
        """

        stream = tabulator.Stream(io.BytesIO(self.upload.raw),
                                  format=self.upload.file_type)
        stream.open()
        return stream
Example #12
def cli(source, limit, **options):
    """https://github.com/frictionlessdata/tabulator-py#cli
    """
    options = {key: value for key, value in options.items() if value is not None}
    with tabulator.Stream(source, **options) as stream:
        cast = str
        if six.PY2:
            cast = unicode  # noqa
        if stream.headers:
            click.echo(click.style(', '.join(map(cast, stream.headers)), bold=True))
        for count, row in enumerate(stream, start=1):
            click.echo(', '.join(map(cast, row)))
            if count == limit:
                break
Example #13
def test_dump_to_path_use_titles():
    from dataflows import Flow, dump_to_path, set_type
    import tabulator

    Flow(
        [{'hello': 'world', 'hola': 'mundo'}, {'hello': 'עולם', 'hola': 'عالم'}],
        *(set_type(name, resources=['res_1'], title=title) for name, title
          in (('hello', 'שלום'), ('hola', 'aloha'))),
        dump_to_path('data/dump_with_titles', use_titles=True)
    ).process()

    with tabulator.Stream('data/dump_with_titles/res_1.csv') as stream:
        assert stream.read() == [['שלום',   'aloha'],
                                 ['world',  'mundo'],
                                 ['עולם',   'عالم']]
Example #14
 def test_stream(self):
     """
     Given 2 remote pages of a JSON-API resource
     When I parse it with tabulator
     Then I get back a single item list with the full json in it
     """
     url = "https://raw.githubusercontent.com/strets123/frictionless-pres/master/data/smdataset%3Fpage%5Bnumber%5D%3D0"
     with tabulator.Stream(
             url,
             format="json-api",
             custom_parsers={"json-api": jsonapi_parser.JSONAPIParser},
             property='data',
     ) as stream:
         for index, item in enumerate(stream):
             self.assertTrue(isinstance(item[0], dict))
             self.assertIn("attributes", item[0])
             self.assertIn("id", item[0])
             self.assertIn("links", item[0])
             self.assertEqual(len(item), 1)
Example #15
def transpose(sheet):
    stream = tabulator.Stream(sheet).open()
    cells = list(stream.iter())
    num_rows = len(cells)
    num_cols = len(cells[0])
    headers = None
    outputed = 0
    for i in range(num_cols):
        row = [(cells_row[i] if len(cells_row) > i else None)
               for cells_row in cells]
        if any(row):
            if i == 0:
                headers = row
                all_headers.update(headers)
            else:
                outputed += 1
                yield dict(zip(headers, row))
        else:
            break
    print(sheet, num_rows, '->', outputed)
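
A sketch of driving the generator above; it relies on a module-level `all_headers` set (declared here for the example), and the sheet path is illustrative:

import tabulator  # the function above needs this import as well

all_headers = set()   # updated as a side effect inside transpose()

# Hypothetical wide sheet: field names in the first column, one record per column.
for record in transpose('data/wide_sheet.xlsx'):
    print(record)
print('headers seen so far:', sorted(all_headers))
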
Example #16
 def stream(self):
     if self._stream is None:
         source = copy.deepcopy(self.config._unflatten().get('source', {}))
         structure = self._structure_params()
         try:
             path = source.pop('path')
             if not path:
                 return None
             logger.info('Opening stream %s', path)
             if 'workbook_cache' in source:
                 source['workbook_cache'] = _workbook_cache
             self._stream = tabulator.Stream(path, **source, **structure, http_session=self.http_session()).open()
             for k in source.keys():
                 self.config.get('source.' + k)
             for k in structure.keys():
                 self.config.get('structure.' + k)
         except Exception:
             logger.exception('Failed to open URL, source=%r, structure=%r', source, structure)
             raise
     return self._stream
def process_build(build, jenkins_user_token):
    build_url = '{}api/json'.format(build['url'])
    build = jenkins_driver.curl(*jenkins_user_token, build_url)
    try:
        build_timestamp = datetime.datetime.utcfromtimestamp(
            build["timestamp"] / 1000)
    except Exception:
        build_timestamp = None
    if build_timestamp:
        output_url = '{}artifact/output.csv'.format(build['url'])
        try:
            with tabulator.Stream(output_url,
                                  headers=1,
                                  http_session=jenkins_driver.get_session(
                                      *jenkins_user_token)) as stream:
                for row in stream.iter(keyed=True):
                    row['timestamp'] = build_timestamp
                    yield row
        except tabulator.exceptions.HTTPError:
            print('failed to get build artifact', build)
    else:
        print('failed to get build timestamp', build)
Example #18
    def stream(self):
        """
        An iterator of data from the upload

        This default implementation does not transform the data at all.
        Complex data sources, like spreadsheets with data in cells that aren't arranged
        in tables, will need to override it.
        """

        stream = tabulator.Stream(io.BytesIO(self.upload.raw),
                                  format=self.upload.file_type,
                                  **UPLOAD_SETTINGS['STREAM_ARGS'])
        stream.open()
        if UPLOAD_SETTINGS['OLD_HEADER_ROW'] is not None:
            if not UPLOAD_SETTINGS['HEADERS']:
                raise exceptions.ImproperlyConfigured(
                    "use DATA_INGEST['OLD_HEADER_ROW'] only with DATA_INGEST['HEADERS']"
                )
            for row in range(UPLOAD_SETTINGS['OLD_HEADER_ROW']):
                next(stream)  # discard rows before header

        return stream
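
For reference, a hypothetical shape for the settings this method reads; only the keys the code touches are shown, and the values are purely illustrative:

UPLOAD_SETTINGS = {
    'STREAM_ARGS': {'headers': 1},   # extra keyword arguments forwarded to tabulator.Stream
    'HEADERS': None,                 # explicit header list, used when the file's own header row is ignored
    'OLD_HEADER_ROW': None,          # number of leading rows to discard when HEADERS is set
}
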
Example #19
 def opener():
     _params = dict(headers=1)
     _params.update(
         dict(x for x in __resource.items()
              if x[0] not in {'path', 'name', 'schema',
                              'mediatype', 'skip_rows'}))
     skip_rows = __resource.get('skip_rows', 0)
     _stream = tabulator.Stream(__url, **_params,
                                post_parse=[row_skipper(skip_rows)])
     try:
         _stream.open()
         _headers = dedupe(_stream.headers)
         _schema = __resource.get('schema')
         if _schema is not None:
             _schema = Schema(_schema)
         return _schema, _headers, _stream, _stream.close
     except tabulator.exceptions.TabulatorException as e:
         logging.warning("Error while opening resource from url %s: %r",
                         _url, e)
         if not _ignore_missing:
             raise
         return {}, [], [], lambda: None
Example #20
def import_manifests(source_files):
    """
	Loops through the source files and streams them into a dataframe, then converts
	the dataframe to a list of manifest dicts.
	"""
    # Set up the storage functions for pandas dataframes
    storage = Storage()
    storage.create(
        'data', {
            'primaryKey':
            'name',
            'fields': [{
                'name': 'name',
                'type': 'string'
            }, {
                'name': 'metapath',
                'type': 'string'
            }, {
                'name': 'namespace',
                'type': 'string'
            }, {
                'name': 'title',
                'type': 'string'
            }, {
                'name': 'id',
                'type': 'string'
            }, {
                'name': '_id',
                'type': 'string'
            }, {
                'name': 'description',
                'type': 'string'
            }, {
                'name': 'version',
                'type': 'string'
            }, {
                'name': 'shortTitle',
                'type': 'string'
            }, {
                'name': 'label',
                'type': 'string'
            }, {
                'name': 'notes',
                'type': 'string'
            }, {
                'name': 'keywords',
                'type': 'string'
            }, {
                'name': 'image',
                'type': 'string'
            }, {
                'name': 'publisher',
                'type': 'string'
            }, {
                'name': 'webpage',
                'type': 'string'
            }, {
                'name': 'authors',
                'type': 'string'
            }, {
                'name': 'date',
                'type': 'string'
            }, {
                'name': 'edition',
                'type': 'string'
            }, {
                'name': 'contentType',
                'type': 'string'
            }, {
                'name': 'country',
                'type': 'string'
            }, {
                'name': 'language',
                'type': 'string'
            }, {
                'name': 'citation',
                'type': 'string'
            }]
        })
    path = os.path.join('app', current_app.config['UPLOAD_FOLDER'])
    error_list = []
    print('source_files')
    print(source_files)
    for item in source_files:
        if item.endswith('.xlsx') or item.endswith('.xls'):
            options = {'format': 'xlsx', 'sheet': 1, 'headers': 1}
        else:
            options = {'headers': 1}
        filepath = os.path.join(path, item)
        with tabulator.Stream(filepath, **options) as stream:
            try:
                # The original bare comparison discarded its result; assert it so a
                # header mismatch actually triggers the error message below.
                assert stream.headers == [
                    'name', 'metapath', 'namespace', 'title', 'id', '_id',
                    'description', 'version', 'shortTitle', 'label', 'notes',
                    'keywords', 'image', 'publisher', 'webpage', 'authors',
                    'date', 'edition', 'contentType', 'country', 'language',
                    'citation'
                ]
            except AssertionError:
                col_order = 'name, metapath, namespace, title, id, _id, description, version, shortTitle, label, notes, keywords, image, publisher, webpage, authors, date, edition, contentType, country, language, citation'
                error_list.append(
                    'Error: The table headings in ' + item +
                    ' do not match the Sources schema. Please use the headings '
                    + col_order + ' in that order.')
        with tabulator.Stream(filepath, **options) as stream:
            try:
                storage.write('data', stream)
            except Exception:
                error_list.append('Error: Could not stream tabular data.')
    os.remove(filepath)
    manifests = []
    properties = {}
    data_dict = storage['data'].to_dict('index')
    print(data_dict)
    for key, values in data_dict.items():
        properties = {k: v for k, v in values.items() if v is not None}
        properties = {k: v.replace('\\n', '\n') for k, v in properties.items()}
        properties['name'] = key
        properties['namespace'] = 'we1sv2.0'
        properties['metapath'] = 'Sources'
        if validate_manifest(properties):
            manifests.append(properties)
        else:
            error_list.append('Could not produce a valid manifest for <code>' +
                              key + '</code>.')
    # Now we're ready to insert into the database
    print(manifests)
    for manifest in manifests:
        db_errors = create_record(manifest)
        error_list = error_list + db_errors
    return manifests, error_list
Example #21
        def opener():
            _params = dict(headers=1)
            format = __resource.get("format")
            if format == "txt":
                # datapackage-pipelines processing requires having a header row
                # for txt format we add a single "data" column
                _params["headers"] = ["data"]
                _params["custom_parsers"] = {"txt": TXTParser}
                _params["allow_html"] = True
            else:
                if format is None:
                    _, format = tabulator.helpers.detect_scheme_and_format(
                        __url)
                if format in tabulator.config.SUPPORTED_COMPRESSION:
                    format = None
                else:
                    try:
                        parser_cls = tabulator.helpers.import_attribute(
                            tabulator.config.PARSERS[format])
                    except KeyError:
                        logging.error("Unknown format %r", format)
                        raise
                    _params.update(
                        dict(x for x in __resource.items()
                             if x[0] in parser_cls.options))
                _params.update(
                    dict(
                        x for x in __resource.items() if x[0] in {
                            'headers', 'scheme', 'encoding', 'sample_size',
                            'allow_html', 'force_strings', 'force_parse',
                            'skip_rows', 'compression'
                        }))
                if isinstance(_params.get('skip_rows'),
                              int):  # Backwards compatibility
                    _params['skip_rows'] = list(
                        range(1,
                              _params.get('skip_rows') + 1))

            if format is not None:
                _params['format'] = format

            if http_headers:
                http_session = requests.Session()
                http_session.headers = http_headers
                _params['http_session'] = http_session

            constants = _resource.get('constants', {})
            constant_headers = list(constants.keys())
            constant_values = [constants.get(k) for k in constant_headers]
            _stream = tabulator.Stream(__url,
                                       **_params,
                                       post_parse=[
                                           suffix_remover(format),
                                           add_constants(
                                               constant_headers,
                                               constant_values, _columns)
                                       ])
            retry = 0
            backoff = 2
            while True:
                try:
                    _stream.open()
                    _headers = dedupe(_stream.headers)
                    __columns = len(_headers)
                    _headers = dedupe(_headers + constant_headers)
                    _schema = __resource.get('schema')
                    if _schema is not None:
                        _schema = Schema(_schema)
                    return _schema, _headers, __columns, _stream, _stream.close
                except tabulator.exceptions.TabulatorException as e:
                    logging.warning(
                        "Error while opening resource from url %s: %r", _url,
                        e)
                    _stream.close()
                    retry += 1
                    if retry <= 3:
                        logging.warning("Retrying after %d seconds (%d/3)",
                                        backoff, retry)
                        time.sleep(backoff)
                        backoff *= 2
                        continue
                    else:
                        if not _ignore_missing:
                            raise
                        return {}, [], 0, [], lambda: None
        download = tempfile.NamedTemporaryFile(delete=False,
                                               suffix=os.path.basename(url))
        shutil.copyfileobj(requests.get(url, stream=True).raw, download)
        download.close()
        download = download.name
        content = open(download, 'rb').read()
        os.unlink(download)

        content = content.replace(b'\n="', b'\n"')
        content = content.replace(b',="', b',"')

        out.write(content)
        out.flush()

        logging.info('downloaded from %s %d bytes: %r', url, len(content),
                     content[:1000])

        datapackage['resources'].append(resource)

        stream = \
            tabulator.Stream('file://'+out.name, force_strings=True, **parameters.get('tabulator', {}))\
            .open()
        resource['schema'] = {
            'fields': [{
                'name': h,
                'type': 'string'
            } for h in stream.headers]
        }
        ctx.resource_iterator = itertools.chain(res_iter,
                                                [stream.iter(keyed=True)])
Example #23
    def get_csv_sample(cls,
                       url,
                       delimiter=",",
                       quotechar='"',
                       number_of_lines_sample=4,
                       encoding=None):
        bucket = app.config['s3']['bucket_url']
        full_url = os.path.join(bucket, url)
        try:
            with tabulator.Stream(
                    full_url,
                    headers=1,
                    sample_size=number_of_lines_sample,
                    ignore_blank_headers=True,
                    delimiter=delimiter,
                    quotechar=quotechar,
                    format='csv',
                    force_parse=True,
                    encoding=encoding,
                    post_parse=[cls.check_and_clean_up_row],
            ) as csv_file:
                sample = csv_file.sample
                contents = csv_file.read(
                    limit=app.config['app']['csv_sample_infer_lines'])
                headers = csv_file.headers

            if not headers:
                raise csv.Error(
                    'no headers found. The first line of the csv should '
                    'contain the column headers.')

            invalid_headings = list(
                filter(lambda x: not re.match("^[a-z][a-z0-9_]*$", x),
                       headers))
            if invalid_headings:
                joined_invalid_headings = '"' + '", "'.join(
                    invalid_headings) + '"'
                raise csv.Error(
                    f"column headers must start with a letter and "
                    f"may only contain lowercase letters, numbers, and underscores. Invalid "
                    f"headers: {joined_invalid_headings}")

            if not sample or not contents:
                raise csv.Error("no data found")

            column_types = cls.get_column_types(headers, contents)
            sample = list(
                zip(cls.make_unique_headers(headers), column_types,
                    list(map(list, zip(*sample)))))
            return sample, None

        except (TabulatorException, csv.Error) as e:
            error_logger = app.logger.info if isinstance(
                e, csv.Error) else app.logger.error
            error_message = 'Unable to process CSV file: '

            if isinstance(e, EncodingError):
                error_message += f'the CSV file could not be opened. (Technical details: {str(e)})'
            else:
                error_message += str(e)

            error_logger(error_message)

            return [], error_message
import tabulator

from datapackage_pipelines.wrapper import process
import re

tag_areas = tabulator.Stream('https://docs.google.com/spreadsheets/d/1k4iSVsX79-VMZCYoHFRL5Q6GAMP6M1t2hIFv7k4E8eM/edit#gid=0',
                        headers=1)
tag_areas.open()
tag_areas = list(tag_areas.iter())
tag_areas = [[x.strip() for x in y[0].split(',')] for y in tag_areas]
tag_res = [re.compile(r'\b(' + 
                      '|'.join('({})'.format(tag) for tag in tag_area) + 
                      r')\b')
           for tag_area in tag_areas]
tag_areas = list(zip(tag_res, tag_areas))


def process_row(row, _1, spec, _2, params, _3):
    resource = params['resource']
    target_field = params['target-field']
    source_fields = params['source-fields']
    if spec['name'] == resource:
        tags = set()
        for field in source_fields:
            value = row[field]
            if value:
                for tag_re, tag_area in tag_areas:
                    if tag_re.search(value):
                        repl = tag_re.sub(' '.join(tag_area), value)
                        tags.add(repl)
                        break
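
A tiny illustration of the tagging logic above with made-up tag areas (the real ones come from the Google Sheet):

import re

tag_areas = [['health', 'hospital'], ['education', 'school']]   # illustrative tag areas
tag_res = [re.compile(r'\b(' +
                      '|'.join('({})'.format(tag) for tag in tag_area) +
                      r')\b')
           for tag_area in tag_areas]
tag_areas = list(zip(tag_res, tag_areas))

value = 'budget for the local hospital'
for tag_re, tag_area in tag_areas:
    if tag_re.search(value):
        print(tag_re.sub(' '.join(tag_area), value))   # budget for the local health hospital
        break
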
Example #25
import tabulator
import logging

from datapackage_pipelines.wrapper import process

kinds = {
    'central': 'מכרז מרכזי',
    'office': 'מכרז משרדי',
    'exemptions': 'רכש פטור ממכרז',
}

tender_conversion_table = tabulator.Stream(
    'https://docs.google.com/spreadsheets/d/1JkZooBwoxKPlrXWWwUtNHLMdxc-iG0n5tdGZ23I8fYY/edit#gid=1250185114',
    headers=1).open().iter(keyed=True)
tender_conversion_table = dict(
    ((x['tender_type'].strip(), x['decision'].strip(),
      x['has_awardees'].strip().lower() == 'true',
      x['has_active_awardees'].strip().lower() == 'true'),
     dict((k, x[k].strip()) for k in [
         'simple_decision', 'simple_decision_long', 'extended_status', 'tip1',
         'tip1_link', 'tip2', 'tip2_link'
     ])) for x in tender_conversion_table)

exemption_conversion_table = tabulator.Stream(
    'https://docs.google.com/spreadsheets/d/1lM2Afiw2JMJzHMIvWt3XylWLy2PLmVGbqaR9laBXFNo/edit#gid=1604640775',
    headers=1).open().iter(keyed=True)
exemption_conversion_table = dict(
    ((x['regulation'].strip(), x['decision'].strip(),
      x['has_awardees'].strip().lower() == 'true',
      x['has_active_awardees'].strip().lower() == 'true'),
     dict((k, x[k].strip()) for k in [
#!/usr/bin/env python
import tabulator
import json
import os

base_dir = os.path.dirname(__file__)

tooltips = tabulator.Stream(
    'https://docs.google.com/spreadsheets/d/1ztsoslfvEiQS1jSTrivU6chvyAySIIV9tad0L1ZKGj8/edit#gid=0',
    headers=1).open()
tooltips = list(tooltips.iter())
tooltips = sorted(tooltips, key=lambda x: len(x[0]), reverse=True)
json.dump(tooltips,
          open(os.path.join(base_dir, 'tooltips.json'), 'w'),
          ensure_ascii=False,
          indent=2)
import json
import logging
import datetime
import tabulator

from datapackage import Package
from fuzzywuzzy import process as fw_process

logging.info('LOADING LAMAS DATA')
lamas_data = Package('/var/datapackages/lamas-municipal-data/datapackage.json')
districts = dict(
    (x['entity_name'], x['district_2015'])
    for x in lamas_data.resources[0].iter(keyed=True)
)

logging.info('LOADING FOA IMPROVEMENT')
foa_improvement = tabulator.Stream('https://docs.google.com/spreadsheets/d/17kX25p_M59h6VoDB90BeNVSmrel_9ipxUyko8SD6eKY/edit?usp=sharing', headers=1)
foa_improvement = foa_improvement.open().iter(keyed=True)
foa_improvement = dict((x.pop('original'), x) for x in foa_improvement)


regional_towns = Package('/var/datapackages/lamas-municipality-locality-map/datapackage.json')
cache = {}
for town, municipality in regional_towns.resources[0].iter():
    district = None
    if municipality in cache:
        district = cache[municipality]
    else:
        best = fw_process.extract(municipality, districts.keys())
        if len(best)>0:
            score = best[0][1]
            best = best[0][0]
Example #28
import json
import re
from pathlib import Path

import tabulator

fp = re.compile(r'\w+', re.UNICODE)


def fingerprint(x):
    return ''.join(fp.findall(x.upper()))


if __name__ == '__main__':
    # How to get places.csv from a nominatim server?
    # # kubectl exec -it nominatim-5d6889fd59-lxps7 -- su nominatim -c "psql -c \"\\copy (select hstore_to_json(name) from place where type in ('village', 'town', 'city', 'locality')) to stdout with csv\"" > places.csv
    # How to get official list of towns?
    # # https://data.gov.il/dataset/citiesandsettelments
    langs = {}
    s = tabulator.Stream('tools/yeshuvim.csv', headers=1)
    s.open()
    for item in s.iter(keyed=True):
        # Swap '(' and ')' in the Hebrew name (via a temporary 'XXX' marker)
        # and collapse double spaces.
        langs.setdefault('he', []).append(item['שם_ישוב'].strip().replace(
            '(', 'XXX').replace(')', '(').replace('XXX',
                                                  ')').replace('  ', ' '))
        langs.setdefault('en', []).append(
            item['שם_ישוב_לועזי'].strip().title().replace('  ', ' '))

    s = tabulator.Stream('tools/places.csv')
    s.open()
    for item in s.iter():
        if len(item) == 0: continue
        item = json.loads(item[0])
        for k, v in item.items():
            if k.startswith('name'):
    def iter(self):
        '''Lazily-iterates over rows in data.

        This method is useful when you don't want to load all data in memory at
        once.

        Returns:
            iter: An iterator that yields each row in this resource.

        Raises:
            ValueError: If the data isn't tabular, if the resource has
                no data, or if its specified encoding is incorrect
            IOError: If there was some problem opening the data file (e.g. it
                doesn't exist or we don't have permissions to read it).
        '''
        result = None
        inline_data = self.descriptor.get('data')
        if self.local_data_path and os.path.isfile(self.local_data_path):
            data_path_or_url = self.local_data_path
        else:
            data_path_or_url = self.remote_data_path

        if inline_data:
            inline_data = self._parse_inline_data()
            result = iter(inline_data)
        elif data_path_or_url:
            encoding = self.descriptor.get('encoding')
            dialect = self.descriptor.get('dialect', {})
            options = {}
            if dialect:
                options['format'] = 'csv'
            if 'delimiter' in dialect:
                options['delimiter'] = dialect['delimiter']
            if 'lineTerminator' in dialect:
                # https://github.com/frictionlessdata/datapackage-py/issues/58
                # tabulator doesn't support lineTerminator because
                # it's not supported by Python builtin csv parser
                lineterm = dialect['lineTerminator']
                if lineterm not in ['\r\n', '\r', '\n']:
                    message = 'Line terminator "%s" is not supported' % lineterm
                    warnings.warn(message, UserWarning)
            try:
                table = tabulator.Stream(data_path_or_url,
                                         headers=1,
                                         encoding=encoding,
                                         **options).open()
                result = self._iter_from_tabulator(
                    table, self.descriptor.get('schema'))
            except tabulator.exceptions.TabulatorException as e:
                msg = 'Data at \'{0}\' isn\'t in a known tabular data format'
                six.raise_from(ValueError(msg.format(data_path_or_url)), e)

        if result is None:
            if self.descriptor.get('path'):
                # FIXME: This is a hack to throw an IOError when local data
                # exists but couldn't be loaded for some reason. If "path"
                # existed and there were no issues opening it, "result" would
                # never be None.
                raise IOError('Resource\'s data couldn\'t be loaded.')

            raise ValueError('Resource has no data')

        return result
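
A hedged usage sketch for the iterator above, assuming the datapackage-py API this snippet belongs to (the descriptor path is illustrative):

import datapackage

dp = datapackage.DataPackage('datapackage.json')   # illustrative descriptor path
for row in dp.resources[0].iter():
    print(row)   # one row of the resource's tabular data at a time
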
Example #30
import itertools
import logging
import tempfile

import requests
import tabulator

from datapackage_pipelines.wrapper import ingest, spew
from datapackage_pipelines.utilities.resources import PROP_STREAMING

parameters, datapackage, res_iter = ingest()

url = parameters.get('url')
resource = parameters.get('resource')
resource[PROP_STREAMING] = True

content = requests.get(url).content
content = content.replace(b'\n="', b'\n"')
content = content.replace(b',="', b',"')

out = tempfile.NamedTemporaryFile(suffix='.csv', delete=False)
out.write(content)

logging.info('downloaded from %s %d bytes: %r', url, len(content),
             content[:1000])

datapackage['resources'].append(resource)

stream = \
    tabulator.Stream('file://'+out.name, **parameters.get('tabulator', {}))\
    .open()
resource['schema'] = {
    'fields': [{
        'name': h,
        'type': 'string'
    } for h in stream.headers]
}

spew(datapackage, itertools.chain(res_iter, [stream.iter(keyed=True)]))