Ejemplo n.º 1
0
def get_merge_rules(schema=None):
    """
    Return merge rules as key-value pairs, in which the key is a JSON path as a tuple, and the value is a list of
    merge properties whose values are `true`.
    """
    # Default to the release schema URL for the most recent tag.
    if not schema:
        schema = get_release_schema_url(get_tags()[-1])
    if isinstance(schema, dict):
        # An in-memory schema must have its $ref pointers resolved before walking it.
        dereferenced = jsonref.JsonRef.replace_refs(schema)
        return _get_merge_rules_from_dereferenced_schema(dereferenced)
    return _get_merge_rules_from_url_or_path(schema)
Ejemplo n.º 2
0
    def process_item(self, item, spider):
        """
        Unflatten a CSV or XLSX item's data into JSON, using the release schema
        matching the spider's OCDS version, and rename the file to ``.json``.

        :raises NotImplementedError: if the file extension is unsupported, or if
            no schema tag matches the spider's OCDS version
        """
        if not spider.unflatten or not isinstance(item, (File, FileItem)):
            return item

        input_name = item["file_name"]
        if input_name.endswith(".csv"):
            input_format = "csv"
        elif input_name.endswith(".xlsx"):
            input_format = "xlsx"
        else:
            raise NotImplementedError(
                f"the file '{input_name}' has no extension or is not CSV or XLSX, "
                f"obtained from: {item['url']}"
            )
        # The output file keeps the base name, with a .json extension.
        item["file_name"] = os.path.splitext(item["file_name"])[0] + ".json"

        # Find the latest schema tag matching the spider's version, e.g. '1__1' for '1.1'.
        version_prefix = spider.ocds_version.replace(".", "__")
        try:
            tag = next(t for t in reversed(get_tags()) if t.startswith(version_prefix))
        except StopIteration:
            raise NotImplementedError(f"no schema found for '{version_prefix}'") from None
        schema = get_release_schema_url(tag)

        with tempfile.TemporaryDirectory() as directory:
            input_path = os.path.join(directory, input_name)
            output_name = os.path.join(directory, item["file_name"])
            # flattentool expects a directory of CSV files, but a single XLSX file.
            unflatten_input = directory if input_format == "csv" else input_path

            with open(input_path, "wb") as f:
                f.write(item["data"])

            with warnings.catch_warnings():
                # flattentool uses UserWarning, so we can't set a specific category
                warnings.filterwarnings("ignore")

                unflatten(
                    unflatten_input,
                    root_list_path="releases",
                    root_id="ocid",
                    schema=schema,
                    input_format=input_format,
                    output_name=output_name,
                    **spider.unflatten_args,
                )

            with open(output_name, "r") as f:
                item["data"] = f.read()

        return item
Ejemplo n.º 3
0
def test_get_tags():
    """The earliest tags returned by get_tags() are the known published versions."""
    expected = [
        '0__3__2',
        '0__3__3',
        '1__0__0',
        '1__0__1',
        '1__0__2',
        '1__0__3',
        '1__1__0',
        '1__1__1',
        '1__1__2',
        '1__1__3',
        '1__1__4',
    ]
    assert get_tags()[:len(expected)] == expected
Ejemplo n.º 4
0
    def process_item(self, item, spider):
        """
        Unflatten a CSV or XLSX item's data into JSON, using the release schema
        matching the spider's OCDS version, and rename the file to ``.json``.

        :raises NotImplementedError: if the file extension is unsupported, or if
            no schema tag matches the spider's OCDS version
        """
        # Pass through untouched unless unflattening is enabled for file items.
        if not spider.unflatten or not isinstance(item, (File, FileItem)):
            return item

        input_name = item['file_name']
        if input_name.endswith('.csv'):
            # Swap the extension: the output file will contain JSON.
            item['file_name'] = item['file_name'][:-4] + '.json'
            input_format = 'csv'
        elif input_name.endswith('.xlsx'):
            item['file_name'] = item['file_name'][:-5] + '.json'
            input_format = 'xlsx'
        else:
            raise NotImplementedError(f"the file '{input_name}' has no extension or is not CSV or XLSX, "
                                      f"obtained from: {item['url']}")

        # Tags use '__' in place of '.', e.g. '1.1' matches tags like '1__1__4'.
        spider_ocds_version = spider.ocds_version.replace('.', '__')
        # Iterate latest-first so the newest matching schema tag wins.
        for tag in reversed(get_tags()):
            if tag.startswith(spider_ocds_version):
                schema = get_release_schema_url(tag)
                break
        else:
            raise NotImplementedError(f"no schema found for '{spider_ocds_version}'")

        with tempfile.TemporaryDirectory() as directory:
            input_path = os.path.join(directory, input_name)
            output_name = os.path.join(directory, item['file_name'])
            # flattentool expects a directory of CSV files, but a single XLSX file.
            if input_format == 'csv':
                input_name = directory
            elif input_format == 'xlsx':
                input_name = input_path

            with open(input_path, 'wb') as f:
                f.write(item['data'])

            unflatten(
                input_name,
                root_list_path='releases',
                root_id='ocid',
                schema=schema,
                input_format=input_format,
                output_name=output_name,
                **spider.unflatten_args
            )

            # Replace the flat data with the unflattened JSON text.
            with open(output_name, 'r') as f:
                item['data'] = f.read()

        return item
Ejemplo n.º 5
0
def test_output_package_no_streaming():
    """Packaging two real release packages yields the expected record package."""
    filenames = ('realdata/release-package-1.json',
                 'realdata/release-package-2.json')
    data = [json.loads(read(filename)) for filename in filenames]

    with Packager() as packager:
        packager.package['version'] = '1.1'
        packager.add(data)

        # Find the latest schema tag matching the package version, e.g. '1__1__*'.
        prefix = packager.version.replace('.', '__') + '__'
        tag = next(t for t in reversed(get_tags()) if t.startswith(prefix))
        schema = get_release_schema_url(tag)

        actual = next(packager.output_package(Merger(schema)))

    assert actual == json.loads(read('realdata/record-package_package.json'))
Ejemplo n.º 6
0
def merge(data,
          uri='',
          publisher=None,
          published_date='',
          version=DEFAULT_VERSION,
          schema=None,
          return_versioned_release=False,
          return_package=False,
          use_linked_releases=False,
          streaming=False):
    """
    Merges release packages and individual releases.

    By default, yields compiled releases. If ``return_versioned_release`` is ``True``, yields versioned releases. If
    ``return_package`` is ``True``, wraps the compiled releases (and versioned releases if ``return_versioned_release``
    is ``True``) in a record package.

    If ``return_package`` is set and ``publisher`` isn't set, the output record package will have the same publisher as
    the last input release package.

    :param data: an iterable of release packages and individual releases
    :param str uri: if ``return_package`` is ``True``, the record package's ``uri``
    :param dict publisher: if ``return_package`` is ``True``, the record package's ``publisher``
    :param str published_date: if ``return_package`` is ``True``, the record package's ``publishedDate``
    :param str version: if ``return_package`` is ``True``, the record package's ``version``
    :param dict schema: the URL, path or dict of the patched release schema to use; if not set, it is derived from the
        version (and extensions) of the input packages
    :param bool return_versioned_release: if ``return_package`` is ``True``, include versioned releases in the record
        package; otherwise, yield versioned releases instead of compiled releases
    :param bool return_package: wrap the compiled releases in a record package
    :param bool use_linked_releases: if ``return_package`` is ``True``, use linked releases instead of full releases,
        if the input is a release package
    :param bool streaming: if ``return_package`` is ``True``, set the package's records to a generator (this only works
        if the calling code exhausts the generator before ``merge`` returns)
    :raises InconsistentVersionError: if the versions are inconsistent across packages to merge
    :raises MissingOcidKeyError: if the release is missing an ``ocid`` field
    """
    with Packager() as packager:
        packager.add(data)

        # Derive the schema from the input packages' version (tags use '__' for '.'),
        # taking the latest matching tag, and patch it with any declared extensions.
        if not schema and packager.version:
            prefix = packager.version.replace('.', '__') + '__'
            tag = next(tag for tag in reversed(get_tags())
                       if tag.startswith(prefix))
            schema = get_release_schema_url(tag)

            if packager.package['extensions']:
                builder = ProfileBuilder(tag,
                                         list(packager.package['extensions']))
                schema = builder.patched_release_schema()

        merger = Merger(schema)

        if return_package:
            # Overwrite the package metadata collected from the inputs, except the
            # publisher, which falls back to the last input package's publisher.
            packager.package['uri'] = uri
            packager.package['publishedDate'] = published_date
            packager.package['version'] = version
            if publisher:
                packager.package['publisher'] = publisher

            yield from packager.output_package(
                merger,
                return_versioned_release=return_versioned_release,
                use_linked_releases=use_linked_releases,
                streaming=streaming)
        else:
            yield from packager.output_releases(
                merger, return_versioned_release=return_versioned_release)
Ejemplo n.º 7
0
def _get_tags():
    """
    Return all tags, sorted in reverse order, caching the result for an hour.
    """
    # Pass a callable so that get_tags() (a potentially expensive call) is only
    # evaluated on a cache miss; passing the value directly would compute it on
    # every invocation, defeating the purpose of the cache.
    return cache.get_or_set('git_tags', lambda: sorted(get_tags(), reverse=True), 3600)