Example #1
def test_filteroverdo_wraps_exceptions():
    record = ('<record>'
              '  <datafield tag="269" ind1=" " ind2=" ">'
              '    <subfield code="c">Ceci n’est pas une dâte</subfield>'
              '  </datafield>'
              '  <datafield tag="980" ind1=" " ind2=" ">'
              '    <subfield code="a">HEP</subfield>'
              '  </datafield>'
              '</record>')  # synthetic data

    with pytest.raises(DoJsonError) as exc:
        marcxml2record(record)
    assert 'Error in rule "preprint_date" for field "269__"' in str(exc.value)
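
Every example on this page funnels a MARCXML string through the same entry point. For reference, here is a minimal, self-contained sketch of that call; the import paths assume the inspire_dojson package and may differ between the projects shown here.

from inspire_dojson import marcxml2record
from inspire_dojson.errors import DoJsonError

snippet = (
    '<record>'
    '  <datafield tag="245" ind1=" " ind2=" ">'
    '    <subfield code="a">A title</subfield>'
    '  </datafield>'
    '  <datafield tag="980" ind1=" " ind2=" ">'
    '    <subfield code="a">HEP</subfield>'
    '  </datafield>'
    '</record>'
)

try:
    record = marcxml2record(snippet)  # dict with the converted HEP metadata
except DoJsonError:
    record = None  # a conversion rule failed; the message names rule and field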
Example #2
def merged_records(app):
    merged_snippet = (
        '<record>'
        '  <controlfield tag="001">111</controlfield>'
        '  <datafield tag="245" ind1=" " ind2=" ">'
        '    <subfield code="a">merged</subfield>'
        '  </datafield>'
        '  <datafield tag="981" ind1=" " ind2=" ">'
        '    <subfield code="a">222</subfield>'
        '  </datafield>'
        '  <datafield tag="980" ind1=" " ind2=" ">'
        '    <subfield code="a">HEP</subfield>'
        '  </datafield>'
        '</record>'
    )

    deleted_snippet = (
        '<record>'
        '  <controlfield tag="001">222</controlfield>'
        '  <datafield tag="245" ind1=" " ind2=" ">'
        '    <subfield code="a">deleted</subfield>'
        '  </datafield>'
        '  <datafield tag="970" ind1=" " ind2=" ">'
        '    <subfield code="d">111</subfield>'
        '  </datafield>'
        '  <datafield tag="980" ind1=" " ind2=" ">'
        '    <subfield code="a">HEP</subfield>'
        '    <subfield code="c">DELETED</subfield>'
        '  </datafield>'
        '</record>'
    )

    merged_record = marcxml2record(merged_snippet)
    merged_record['$schema'] = 'http://localhost:5000/schemas/records/hep.json'

    deleted_record = marcxml2record(deleted_snippet)
    deleted_record['$schema'] = 'http://localhost:5000/schemas/records/hep.json'

    with db.session.begin_nested():
        merged_uuid = record_insert_or_replace(merged_record).id
        deleted_uuid = record_insert_or_replace(deleted_record).id
    db.session.commit()

    es.indices.refresh('records-hep')

    yield

    _delete_merged_records('lit', 111, 222, merged_uuid, deleted_uuid)
Example #3
def merged_records(app):
    merged_snippet = (
        '<record>'
        '  <controlfield tag="001">111</controlfield>'
        '  <datafield tag="245" ind1=" " ind2=" ">'
        '    <subfield code="a">merged</subfield>'
        '  </datafield>'
        '  <datafield tag="981" ind1=" " ind2=" ">'
        '    <subfield code="a">222</subfield>'
        '  </datafield>'
        '  <datafield tag="980" ind1=" " ind2=" ">'
        '    <subfield code="a">HEP</subfield>'
        '  </datafield>'
        '</record>'
    )

    deleted_snippet = (
        '<record>'
        '  <controlfield tag="001">222</controlfield>'
        '  <datafield tag="245" ind1=" " ind2=" ">'
        '    <subfield code="a">deleted</subfield>'
        '  </datafield>'
        '  <datafield tag="970" ind1=" " ind2=" ">'
        '    <subfield code="d">111</subfield>'
        '  </datafield>'
        '  <datafield tag="980" ind1=" " ind2=" ">'
        '    <subfield code="a">HEP</subfield>'
        '    <subfield code="c">DELETED</subfield>'
        '  </datafield>'
        '</record>'
    )

    merged_record = marcxml2record(merged_snippet)
    merged_record['$schema'] = 'http://localhost:5000/schemas/records/hep.json'

    deleted_record = marcxml2record(deleted_snippet)
    deleted_record['$schema'] = 'http://localhost:5000/schemas/records/hep.json'

    with db.session.begin_nested():
        merged_uuid = _create_record(merged_record).id
        deleted_uuid = _create_record(deleted_record).id
    db.session.commit()

    es.indices.refresh('records-hep')

    yield

    _delete_merged_records('lit', 111, 222, merged_uuid, deleted_uuid)
Example #4
def update(recid):
    """View for INSPIRE author update form."""
    data = {}
    if recid:
        try:
            url = get_legacy_url_for_recid(recid) + '/export/xm'
            xml = requests.get(url)
            record_regex = re.compile(
                r"\<record\>.*\<\/record\>", re.MULTILINE + re.DOTALL)
            xml_content = record_regex.search(xml.content).group()
            data = marcxml2record(xml_content)
            convert_for_form(data)
        except requests.exceptions.RequestException:
            pass
        data["control_number"] = recid
    else:
        return redirect(url_for("inspirehep_authors.new"))
    form = AuthorUpdateForm(data=data, is_update=True)
    ctx = {
        "action": url_for('.submitupdate'),
        "name": "authorUpdateForm",
        "id": "authorUpdateForm",
    }

    return render_template('authors/forms/update_form.html', form=form, **ctx)
Example #5
def migrate_and_insert_record(raw_record, skip_files=False):
    """Migrate a record and insert it if valid, or log otherwise."""
    try:
        json_record = marcxml2record(raw_record)
        recid = json_record['control_number']
    except Exception as e:
        LOGGER.exception('Migrator DoJSON Error')
        recid = _get_recid(raw_record)
        _store_migrator_error(recid, raw_record, e)
        return None

    if '$schema' in json_record:
        ensure_valid_schema(json_record)

    try:
        record = record_insert_or_replace(json_record, skip_files=skip_files)
    except ValidationError as e:
        pattern = u'Migrator Validator Error: {}, Value: %r, Record: %r'
        LOGGER.error(pattern.format('.'.join(e.schema_path)), e.instance, recid)
        _store_migrator_error(recid, raw_record, e)
    except Exception as e:
        LOGGER.exception('Migrator Record Insert Error')
        _store_migrator_error(recid, raw_record, e)
    else:
        prod_record = InspireProdRecords(recid=recid)
        prod_record.marcxml = raw_record
        prod_record.valid = True
        db.session.merge(prod_record)
        return record
Example #6
def core_record():
    """Provide record fixtures."""
    record_oai_arxiv_plots = pkg_resources.resource_string(
        __name__,
        os.path.join(
            '../fixtures',
            'oai_arxiv_core_record.xml'
        )
    )
    # Convert to MARCXML, then dict, then HEP JSON
    record_oai_arxiv_plots_marcxml = convert(
        record_oai_arxiv_plots,
        "oaiarXiv2marcxml.xsl"
    )
    json_data = marcxml2record(record_oai_arxiv_plots_marcxml)

    categories = {'core': [], 'non-core': []}
    for eprint in json_data.get('arxiv_eprints', []):
        categories['core'].extend(eprint.get('categories', []))

    if 'preprint_date' in json_data:
        json_data['preprint_date'] = datetime.date.today().isoformat()

    assert categories

    return json_data, categories
Example #7
    def _author_list(obj, eng):
        arxiv_id = get_arxiv_id(obj.data)
        filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
        try:
            tarball = obj.files[filename]
        except KeyError:
            obj.log.info(
                'Skipping author list extraction, no tarball with name "%s" found'
                % filename)
            return

        with TemporaryDirectory(prefix='author_list') as scratch_space, \
                retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
            try:
                file_list = untar(tarball_file, scratch_space)
            except InvalidTarball:
                obj.log.info(
                    'Invalid tarball %s for arxiv_id %s',
                    tarball.file.uri,
                    arxiv_id,
                )
                return

            obj.log.info('Extracted tarball to: {0}'.format(scratch_space))
            xml_files_list = [
                path for path in file_list if path.endswith('.xml')
            ]
            obj.log.info('Found xmlfiles: {0}'.format(xml_files_list))

            extracted_authors = []
            for xml_file in xml_files_list:
                with open(xml_file, 'r') as xml_file_fd:
                    xml_content = xml_file_fd.read()

                match = REGEXP_AUTHLIST.findall(xml_content)
                if match:
                    obj.log.info('Found a match for author extraction')
                    try:
                        authors_xml = convert(xml_content, stylesheet)
                    except XMLSyntaxError:
                        # Probably the %auto-ignore comment exists, so we skip the
                        # first line. See: inspirehep/inspire-next/issues/2195
                        authors_xml = convert(
                            xml_content.split('\n', 1)[1],
                            stylesheet,
                        )

                    extracted_authors.extend(
                        marcxml2record(authors_xml).get('authors', []))

            if extracted_authors:
                for author in extracted_authors:
                    author['full_name'] = decode_latex(author['full_name'])

                obj.data['authors'] = extracted_authors
Example #8
def reporterrors(output):
    """Reports in a friendly way all failed records and corresponding motivation."""
    click.echo("Reporting broken records into {0}".format(output))
    errors = {}
    results = LegacyRecordsMirror.query.filter(
        LegacyRecordsMirror.valid == False)  # noqa: ignore=F712
    results_length = results.count()
    with click.progressbar(results.yield_per(100),
                           length=results_length) as bar:
        for obj in bar:
            marc_record = create_record(obj.marcxml, keep_singletons=False)
            collection = get_collection(marc_record)
            if 'DELETED' in collection:
                continue
            recid = int(marc_record['001'])
            try:
                json_record = marcxml2record(obj.marcxml)
            except Exception as err:
                tb = u''.join(traceback.format_tb(sys.exc_info()[2]))
                errors.setdefault((collection, 'dojson', tb), []).append(recid)
                continue

            ensure_valid_schema(json_record)

            try:
                validate(json_record)
            except jsonschema.exceptions.ValidationError as err:
                exc = [
                    row for row in str(err).splitlines()
                    if row.startswith('Failed validating')
                ][0]
                details = u'\n'.join(
                    dropwhile(lambda x: not x.startswith('On instance'),
                              str(err).splitlines()))
                errors.setdefault((collection, 'validation', exc), []).append(
                    (recid, details))
                continue

    with open(output, "w") as out:
        csv_writer = csv.writer(out)
        for (collection, stage, error), elements in errors.iteritems():
            if stage == 'dojson':
                csv_writer.writerow(
                    (collection, stage, error,
                     '\n'.join('http://inspirehep.net/record/{}'.format(recid)
                               for recid in elements)))
            else:
                for recid, details in elements:
                    csv_writer.writerow(
                        (collection, stage, error,
                         'http://inspirehep.net/record/{}'.format(recid),
                         details))
    click.echo("Dumped errors into {}".format(output))
Example #9
    def _author_list(obj, eng):
        arxiv_id = LiteratureReader(obj.data).arxiv_id
        filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
        try:
            tarball = obj.files[filename]
        except KeyError:
            obj.log.info(
                'Skipping author list extraction, no tarball with name "%s" found' % filename
            )
            return

        with TemporaryDirectory(prefix='author_list') as scratch_space, \
                retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
            try:
                file_list = untar(tarball_file, scratch_space)
            except InvalidTarball:
                obj.log.info(
                    'Invalid tarball %s for arxiv_id %s',
                    tarball.file.uri,
                    arxiv_id,
                )
                return

            obj.log.info('Extracted tarball to: {0}'.format(scratch_space))
            xml_files_list = [path for path in file_list if path.endswith('.xml')]
            obj.log.info('Found xmlfiles: {0}'.format(xml_files_list))

            extracted_authors = []
            for xml_file in xml_files_list:
                with open(xml_file, 'r') as xml_file_fd:
                    xml_content = xml_file_fd.read()

                match = REGEXP_AUTHLIST.findall(xml_content)
                if match:
                    obj.log.info('Found a match for author extraction')
                    try:
                        authors_xml = convert(xml_content, stylesheet)
                    except XMLSyntaxError:
                        # Probably the %auto-ignore comment exists, so we skip the
                        # first line. See: inspirehep/inspire-next/issues/2195
                        authors_xml = convert(
                            xml_content.split('\n', 1)[1],
                            stylesheet,
                        )

                    extracted_authors.extend(marcxml2record(authors_xml).get('authors', []))

            if extracted_authors:
                for author in extracted_authors:
                    author['full_name'] = decode_latex(author['full_name'])

                obj.data['authors'] = extracted_authors
Example #10
    def _parsed_items_from_marcxml(
            self,
            marcxml_records,
            base_url="",
            hostname="",
            url_schema=None,
            ftp_params=None,
            url=""
    ):
        app = Flask('hepcrawl')
        app.config.update(self.settings.getdict('MARC_TO_HEP_SETTINGS', {}))
        file_name = url.split('/')[-1]

        with app.app_context():
            parsed_items = []
            for xml_record in marcxml_records:
                try:
                    record = marcxml2record(xml_record)
                    parsed_item = ParsedItem(record=record, record_format='hep')
                    parsed_item.ftp_params = ftp_params
                    parsed_item.file_name = file_name

                    files_to_download = [
                        self._get_full_uri(
                            current_url=document['url'],
                            base_url=base_url,
                            schema=url_schema,
                            hostname=hostname,
                        )
                        for document in parsed_item.record.get('documents', [])
                        if self._has_to_be_downloaded(document['url'])
                    ]
                    parsed_item.file_urls = files_to_download

                    self.logger.info('Got the following attached documents to download: %s' % files_to_download)
                    self.logger.info('Got item: %s' % parsed_item)

                    parsed_items.append(parsed_item)

                except Exception as e:
                    tb = ''.join(traceback.format_tb(sys.exc_info()[2]))
                    error_parsed_item = ParsedItem.from_exception(
                        record_format='hep',
                        exception=repr(e),
                        traceback=tb,
                        source_data=xml_record,
                        file_name=file_name
                    )
                    parsed_items.append(error_parsed_item)

            return parsed_items
Example #11
    def _parsed_items_from_marcxml(self, marcxml_records, base_url="", url=""):
        self.logger.info('parsing record')
        app = Flask('hepcrawl')
        app.config.update(self.settings.getdict('MARC_TO_HEP_SETTINGS', {}))
        file_name = url.split('/')[-1].split("?")[0]

        with app.app_context():
            parsed_items = []
            for xml_record in marcxml_records:
                try:
                    record = marcxml2record(xml_record)
                    parsed_item = ParsedItem(record=record,
                                             record_format='hep')
                    parsed_item.file_name = file_name
                    new_documents = []
                    files_to_download = []
                    self.logger.info("Parsed document: %s", parsed_item.record)
                    self.logger.info("Record have documents: %s", "documents"
                                     in parsed_item.record)
                    for document in parsed_item.record.get('documents', []):
                        if self._is_local_path(document['url']):
                            document['url'] = self._get_full_uri(
                                document['url'])
                            self.logger.info("Updating document %s", document)
                        else:
                            files_to_download.append(document['url'])
                        new_documents.append(document)

                    if new_documents:
                        parsed_item.record['documents'] = new_documents

                    parsed_item.file_urls = files_to_download
                    self.logger.info(
                        'Got the following attached documents to download: %s',
                        files_to_download)
                    self.logger.info('Got item: %s', parsed_item)

                    parsed_items.append(parsed_item)

                except Exception as e:
                    tb = ''.join(traceback.format_tb(sys.exc_info()[2]))
                    error_parsed_item = ParsedItem.from_exception(
                        record_format='hep',
                        exception=repr(e),
                        traceback=tb,
                        source_data=xml_record,
                        file_name=file_name)
                    parsed_items.append(error_parsed_item)

            return parsed_items
Example #12
        def _get_crawl_result(xml_record):
            app = Flask('hepcrawl')
            app.config.update(self.settings.getdict('MARC_TO_HEP_SETTINGS',
                                                    {}))
            with app.app_context():
                item = ParsedItem(record={}, record_format='hep')
                try:
                    item.record = marcxml2record(xml_record)
                except Exception as e:
                    item.exception = repr(e)
                    item.traceback = traceback.format_tb(sys.exc_info()[2])
                    item.source_data = xml_record

            return item
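
Several of the crawler examples here (Examples 10 to 12) build a throwaway Flask app before converting, because the conversion rules may read settings from the current application context. A minimal sketch of that pattern, under the same inspire_dojson assumption as above; the config key shown is purely hypothetical.

from flask import Flask

from inspire_dojson import marcxml2record

app = Flask('hepcrawl')
app.config.update({'SERVER_NAME': 'localhost:5000'})  # hypothetical setting

xml_record = (
    '<record>'
    '  <datafield tag="980" ind1=" " ind2=" ">'
    '    <subfield code="a">HEP</subfield>'
    '  </datafield>'
    '</record>'
)

with app.app_context():  # settings become visible to the conversion rules
    record = marcxml2record(xml_record)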
Example #13
def generate_record():
    """Provide record fixture."""
    record_oai_arxiv_plots = pkg_resources.resource_string(
        __name__, os.path.join('../fixtures',
                               'oai_arxiv_record_with_plots.xml'))

    # Convert to MARCXML, then dict, then HEP JSON
    record_oai_arxiv_plots_marcxml = convert(record_oai_arxiv_plots,
                                             "oaiarXiv2marcxml.xsl")
    json_data = marcxml2record(record_oai_arxiv_plots_marcxml)

    if 'preprint_date' in json_data:
        json_data['preprint_date'] = datetime.date.today().isoformat()

    return json_data
Example #14
    def _author_list(obj, eng):
        arxiv_id = get_arxiv_id(obj.data)
        filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
        tarball = obj.files[filename]

        if tarball:
            with TemporaryDirectory(prefix='author_list') as scratch_space:
                tarball_file = retrieve_uri(
                    tarball.file.uri,
                    outdir=scratch_space,
                )
                try:
                    file_list = untar(tarball_file, scratch_space)
                except InvalidTarball:
                    obj.log.info(
                        'Invalid tarball %s for arxiv_id %s',
                        tarball.file.uri,
                        arxiv_id,
                    )
                    return
                obj.log.info('Extracted tarball to: {0}'.format(scratch_space))

                xml_files_list = [
                    path for path in file_list if path.endswith('.xml')
                ]
                obj.log.info('Found xmlfiles: {0}'.format(xml_files_list))

                for xml_file in xml_files_list:
                    with open(xml_file, 'r') as xml_file_fd:
                        xml_content = xml_file_fd.read()

                    match = REGEXP_AUTHLIST.findall(xml_content)
                    if match:
                        obj.log.info('Found a match for author extraction')
                        try:
                            authors_xml = convert(xml_content, stylesheet)
                        except XMLSyntaxError:
                            # Probably the %auto-ignore comment exists, so we skip the
                            # first line. See: inspirehep/inspire-next/issues/2195
                            authors_xml = convert(
                                xml_content.split('\n', 1)[1],
                                stylesheet,
                            )
                        authorlist_record = marcxml2record(authors_xml)
                        obj.data.update(authorlist_record)
                        break
Example #15
def migrate_and_insert_record(raw_record, skip_files=False):
    """Convert a marc21 record to JSON and insert it into the DB."""
    error = None

    try:
        json_record = marcxml2record(raw_record)
        if '$schema' in json_record:
            json_record['$schema'] = url_for(
                'invenio_jsonschemas.get_schema',
                schema_path='records/{0}'.format(json_record['$schema']),
            )
    except Exception:
        # Without a converted record there is no recid to attach the error
        # to, so bail out before touching InspireProdRecords.
        LOGGER.exception('Migrator DoJSON Error')
        return None
    else:
        recid = json_record['control_number']
        prod_record = InspireProdRecords(recid=recid)
        prod_record.marcxml = raw_record

    try:
        if not error:
            record = record_insert_or_replace(json_record,
                                              skip_files=skip_files)
    except ValidationError as e:
        # Aggregate logs by part of schema being validated.
        pattern = u'Migrator Validator Error: {}, Value: %r, Record: %r'
        LOGGER.error(pattern.format('.'.join(e.schema_path)), e.instance,
                     recid)
        error = e
    except Exception as e:
        # Receivers can always cause exceptions and we could dump the entire
        # chunk because of a single broken record.
        LOGGER.exception('Migrator Record Insert Error')
        error = e

    if error:
        # Invalid record, will not get indexed.
        error_str = u'{0}: Record {1}: {2}'.format(type(error), recid, error)
        prod_record.valid = False
        prod_record.errors = error_str
        db.session.merge(prod_record)
        return None
    else:
        prod_record.valid = True
        db.session.merge(prod_record)
        return record
Example #16
def get_record_from_legacy(record_id=None):
    data = {}
    try:
        url = get_legacy_url_for_recid(record_id) + '/export/xm'
        xml = requests.get(url)
        record_regex = re.compile(
            r"\<record\>.*\<\/record\>", re.MULTILINE + re.DOTALL)
        xml_content = record_regex.search(xml.content).group()
        data = marcxml2record(xml_content)
    except requests.exceptions.RequestException:
        current_app.logger.error(
            'Failed to get record {} from legacy.'.format(record_id),
        )
    except Exception:
        current_app.logger.error(
            'Error parsing the record {} from legacy.'.format(record_id),
        )
    return data
Example #17
def core_record():
    """Provide record fixture."""
    record_oai_arxiv_plots = pkg_resources.resource_string(
        __name__, os.path.join('../fixtures', 'oai_arxiv_core_record.xml'))
    # Convert to MARCXML, then dict, then HEP JSON
    record_oai_arxiv_plots_marcxml = convert(record_oai_arxiv_plots,
                                             "oaiarXiv2marcxml.xsl")
    json_data = marcxml2record(record_oai_arxiv_plots_marcxml)

    categories = {'core': [], 'non-core': []}
    for eprint in json_data.get('arxiv_eprints', []):
        categories['core'].extend(eprint.get('categories', []))

    if 'preprint_date' in json_data:
        json_data['preprint_date'] = datetime.date.today().isoformat()

    assert categories

    return json_data, categories
Example #18
def migrate_record_from_mirror(prod_record, skip_files=False):
    """Migrate a mirrored legacy record into an Inspire record.

    Args:
        prod_record(LegacyRecordsMirror): the mirrored record to migrate.
        skip_files(bool): flag indicating whether the files in the record
            metadata should be copied over from legacy and attach to the
            record.

    Returns:
        dict: the migrated record metadata, which is also inserted into the database.
    """
    try:
        json_record = marcxml2record(prod_record.marcxml)
    except Exception as exc:
        LOGGER.exception('Migrator DoJSON Error')
        prod_record.error = exc
        db.session.merge(prod_record)
        return None

    if '$schema' in json_record:
        ensure_valid_schema(json_record)

    try:
        with db.session.begin_nested():
            record = InspireRecord.create_or_update(json_record,
                                                    skip_files=skip_files)
            record.commit()
    except ValidationError as exc:
        pattern = u'Migrator Validator Error: {}, Value: %r, Record: %r'
        LOGGER.error(pattern.format('.'.join(exc.schema_path)), exc.instance,
                     prod_record.recid)
        prod_record.error = exc
        db.session.merge(prod_record)
    except Exception as exc:
        LOGGER.exception('Migrator Record Insert Error')
        prod_record.error = exc
        db.session.merge(prod_record)
    else:
        prod_record.valid = True
        db.session.merge(prod_record)
        return record
Example #19
def parse_received_package(file_data, package_name):
    """Parses received MarcXML data, also applies the needed mappings."""

    # Delete XML header if exists. Dojson library will call lxml.etree.parse on a decoded string,
    # which results in 'ValueError: Unicode strings with encoding declaration are not supported.'
    file_data = file_data.replace('<?xml version="1.0" encoding="UTF-8"?>', '')
    try:
        obj = marcxml2record(file_data)
    except Exception as e:
        logger.error('Marcxml parsing failed for package %s: %s' % (package_name, e))
        raise InvalidUsage("MARCXML is not valid.")

    obj['$schema'] = url_for('invenio_jsonschemas.get_schema', schema_path="hep.json")

    if 'self' in obj:
        del obj['self']

    _add_additional_info(obj)

    return obj
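
The header-stripping comment above describes a real lxml constraint, reproducible independently of dojson:

from lxml import etree

xml = u'<?xml version="1.0" encoding="UTF-8"?><record/>'
try:
    etree.fromstring(xml)
except ValueError as exc:
    # "Unicode strings with encoding declaration are not supported. ..."
    print(exc)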
Example #20
def migrate_record_from_mirror(prod_record, skip_files=False):
    """Migrate a mirrored legacy record into an Inspire record.

    Args:
        prod_record(LegacyRecordsMirror): the mirrored record to migrate.
        skip_files(bool): flag indicating whether the files in the record
            metadata should be copied over from legacy and attach to the
            record.

    Returns:
        dict: the migrated record metadata, which is also inserted into the database.
    """
    try:
        json_record = marcxml2record(prod_record.marcxml)
    except Exception as exc:
        LOGGER.exception('Migrator DoJSON Error')
        prod_record.error = exc
        db.session.merge(prod_record)
        return None

    if '$schema' in json_record:
        ensure_valid_schema(json_record)

    try:
        with db.session.begin_nested():
            record = InspireRecord.create_or_update(json_record, skip_files=skip_files)
            record.commit()
    except ValidationError as exc:
        pattern = u'Migrator Validator Error: {}, Value: %r, Record: %r'
        LOGGER.error(pattern.format('.'.join(exc.schema_path)), exc.instance, prod_record.recid)
        prod_record.error = exc
        db.session.merge(prod_record)
    except Exception as exc:
        LOGGER.exception('Migrator Record Insert Error')
        prod_record.error = exc
        db.session.merge(prod_record)
    else:
        prod_record.valid = True
        db.session.merge(prod_record)
        return record
Example #21
def parse_received_package(file_data, package_name):
    """Parses received MarcXML data, also applies the needed mappings."""

    # Delete XML header if exists. Dojson library will call lxml.etree.parse on a decoded string,
    # which results in 'ValueError: Unicode strings with encoding declaration are not supported.'
    file_data = file_data.replace('<?xml version="1.0" encoding="UTF-8"?>', '')
    try:
        obj = marcxml2record(file_data)
    except Exception as e:
        logger.error('Marcxml parsing failed for package %s: %s' %
                     (package_name, e))
        raise InvalidUsage("MARCXML is not valid.")

    obj['$schema'] = url_for('invenio_jsonschemas.get_schema',
                             schema_path="hep.json")

    if 'self' in obj:
        del obj['self']

    _add_additional_info(obj)

    return obj
Example #22
def deleted_record(app):
    snippet = ('<record>'
               '  <controlfield tag="001">111</controlfield>'
               '  <datafield tag="245" ind1=" " ind2=" ">'
               '    <subfield code="a">deleted</subfield>'
               '  </datafield>'
               '  <datafield tag="980" ind1=" " ind2=" ">'
               '    <subfield code="a">HEP</subfield>'
               '    <subfield code="c">DELETED</subfield>'
               '  </datafield>'
               '</record>')

    record = marcxml2record(snippet)
    record['$schema'] = 'http://localhost:5000/schemas/records/hep.json'

    with db.session.begin_nested():
        _create_record(record)
    db.session.commit()

    yield

    _delete_record('lit', 111)
Example #23
def already_harvested_on_legacy_record():
    """Provide record fixture."""
    record_oai_arxiv_plots = pkg_resources.resource_string(
        __name__,
        os.path.join(
            '../fixtures',
            'oai_arxiv_record_already_on_legacy.xml'
        )
    )
    # Convert to MARCXML, then dict, then HEP JSON
    record_oai_arxiv_plots_marcxml = convert(
        record_oai_arxiv_plots,
        "oaiarXiv2marcxml.xsl"
    )
    json_data = marcxml2record(record_oai_arxiv_plots_marcxml)

    categories = []
    for eprint in json_data.get('arxiv_eprints', []):
        categories.extend(eprint.get('categories', []))

    assert categories

    return json_data, categories
Example #24
def deleted_record(app):
    snippet = (
        '<record>'
        '  <controlfield tag="001">111</controlfield>'
        '  <datafield tag="245" ind1=" " ind2=" ">'
        '    <subfield code="a">deleted</subfield>'
        '  </datafield>'
        '  <datafield tag="980" ind1=" " ind2=" ">'
        '    <subfield code="a">HEP</subfield>'
        '    <subfield code="c">DELETED</subfield>'
        '  </datafield>'
        '</record>'
    )

    record = marcxml2record(snippet)
    record['$schema'] = 'http://localhost:5000/schemas/records/hep.json'

    with db.session.begin_nested():
        _create_record(record)
    db.session.commit()

    yield

    _delete_record('lit', 111)
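
In these fixtures the 980__c value DELETED is presumably what flags the converted record as deleted; a small check of that assumed mapping, under the same inspire_dojson import assumption as above.

from inspire_dojson import marcxml2record

snippet = (
    '<record>'
    '  <controlfield tag="001">111</controlfield>'
    '  <datafield tag="980" ind1=" " ind2=" ">'
    '    <subfield code="a">HEP</subfield>'
    '    <subfield code="c">DELETED</subfield>'
    '  </datafield>'
    '</record>'
)

record = marcxml2record(snippet)
assert record.get('deleted') is True  # assumed mapping of 980__c DELETED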
Example #25
def migrate_record_from_mirror(prod_record,
                               disable_orcid_push=True,
                               disable_citation_update=True):
    """Migrate a mirrored legacy record into an Inspire record.
    Args:
        prod_record(LegacyRecordsMirror): the mirrored record to migrate.
    Returns:
        dict: the migrated record metadata, which is also inserted into the database.
    """
    logger = LOGGER.bind(recid=prod_record.recid)
    try:
        json_record = marcxml2record(prod_record.marcxml)
    except Exception as exc:
        logger.exception("Error converting from marcxml")
        prod_record.error = exc
        db.session.merge(prod_record)
        return None

    if "$schema" in json_record:
        ensure_valid_schema(json_record)

        pid_type = PidStoreBase.get_pid_type_from_schema(
            json_record.get("$schema"))
        if pid_type in current_app.config.get("MIGRATION_PID_TYPE_BLACKLIST"):
            prod_record.error = Exception(
                f"Record {prod_record.recid} has blacklisted pid_type: {pid_type}"
            )
            db.session.merge(prod_record)
            return

    try:
        with db.session.begin_nested():
            cls = InspireRecord.get_class_for_record(json_record)
            for deleted_record in cls.get_linked_records_from_dict_field(
                    json_record, "deleted_records"):
                deleted_record.pidstore_handler(
                    deleted_record.id, deleted_record).delete_external_pids()
            record = cls.create_or_update(
                json_record,
                disable_orcid_push=disable_orcid_push,
                disable_citation_update=disable_citation_update,
            )
    except ValidationError as exc:
        path = ".".join(exc.schema_path)
        logger.warn(
            "Migrator validator error",
            path=path,
            value=exc.instance,
            recid=prod_record.recid,
        )
        prod_record.error = exc
        db.session.merge(prod_record)
    except PIDValueError as exc:
        message = f"pid_type:'{exc.pid_type}', pid_value:'{exc.pid_value}'"
        logger.exception("PIDValueError while migrate from mirror",
                         msg=message)
        exc.args = (message, )
        prod_record.error = exc
        db.session.merge(prod_record)
    except Exception as exc:
        logger.exception("Error while migrating record into mirror")
        prod_record.error = exc
        db.session.merge(prod_record)
    else:
        prod_record.valid = True
        db.session.merge(prod_record)
        return record
Example #26
def migrate_record_from_mirror(prod_record,
                               disable_external_push=True,
                               disable_relations_update=True):
    """Migrate a mirrored legacy record into an Inspire record.
    Args:
        prod_record(LegacyRecordsMirror): the mirrored record to migrate.
    Returns:
        dict: the migrated record metadata, which is also inserted into the database.
    """
    logger = LOGGER.bind(recid=prod_record.recid)
    try:
        json_record = marcxml2record(prod_record.marcxml)
    except NotSupportedError as exc:
        logger.warning(str(exc), recid=prod_record.recid)
        prod_record.valid = True
        db.session.merge(prod_record)
        return
    except Exception as exc:
        logger.exception("Error converting from marcxml")
        prod_record.error = exc
        db.session.merge(prod_record)
        return

    if "$schema" in json_record:
        ensure_valid_schema(json_record)

        pid_type = PidStoreBase.get_pid_type_from_schema(
            json_record.get("$schema"))
        if pid_type in current_app.config.get("MIGRATION_PID_TYPE_BLACKLIST"):
            prod_record.error = Exception(
                f"Record {prod_record.recid} has blacklisted pid_type: {pid_type}"
            )
            db.session.merge(prod_record)
            return

    try:
        with db.session.begin_nested():
            cls = InspireRecord.get_class_for_record(json_record)
            original_urls = replace_afs_file_locations_with_local(json_record)
            record = cls.create_or_update(
                json_record,
                disable_external_push=disable_external_push,
                disable_relations_update=disable_relations_update,
            )
            cache_afs_file_locations(record)
    except ValidationError as exc:
        path = ".".join(exc.schema_path)
        logger.warn(
            "Migrator validator error",
            path=path,
            value=exc.instance,
            recid=prod_record.recid,
        )
        prod_record.error = exc
        db.session.merge(prod_record)
    except DownloadFileError as exc:
        removed_cached_files = remove_cached_afs_file_locations(original_urls)
        if not removed_cached_files:
            logger.exception("DownloadFileError while migrate from mirror")
            prod_record.error = exc
            db.session.merge(prod_record)
        else:
            return migrate_record_from_mirror(
                prod_record=prod_record,
                disable_external_push=disable_external_push,
                disable_relations_update=disable_relations_update,
            )
    except PIDValueError as exc:
        message = f"pid_type:'{exc.pid_type}', pid_value:'{exc.pid_value}'"
        logger.exception("PIDValueError while migrate from mirror",
                         msg=message)
        exc.args = (message, )
        prod_record.error = exc
        db.session.merge(prod_record)
    except ThreadsTimeoutError:
        raise
    except Exception as exc:
        logger.exception("Error while migrating record into mirror")
        prod_record.error = exc
        db.session.merge(prod_record)
    else:
        prod_record.valid = True
        db.session.merge(prod_record)
        return record
Example #27
def reporterrors(output):
    """Reports in a friendly way all failed records and corresponding motivation."""
    click.echo("Reporting broken records into {0}".format(output))
    errors = {}
    results = LegacyRecordsMirror.query.filter(LegacyRecordsMirror.valid == False) # noqa: ignore=F712
    results_length = results.count()
    with click.progressbar(results.yield_per(100), length=results_length) as bar:
        for obj in bar:
            marc_record = create_record(obj.marcxml, keep_singletons=False)
            collection = get_collection(marc_record)
            if 'DELETED' in collection:
                continue
            recid = int(marc_record['001'])
            try:
                json_record = marcxml2record(obj.marcxml)
            except Exception as err:
                tb = u''.join(traceback.format_tb(sys.exc_info()[2]))
                errors.setdefault((collection, 'dojson', tb), []).append(recid)
                continue

            ensure_valid_schema(json_record)

            try:
                validate(json_record)
            except jsonschema.exceptions.ValidationError as err:
                exc = [
                    row
                    for row in str(err).splitlines()
                    if row.startswith('Failed validating')
                ][0]
                details = u'\n'.join(
                    dropwhile(
                        lambda x: not x.startswith('On instance'),
                        str(err).splitlines()
                    )
                )
                errors.setdefault(
                    (collection, 'validation', exc), []
                ).append((recid, details))
                continue

    with open(output, "w") as out:
        csv_writer = csv.writer(out)
        for (collection, stage, error), elements in errors.iteritems():
            if stage == 'dojson':
                csv_writer.writerow((
                    collection,
                    stage,
                    error,
                    '\n'.join(
                        'http://inspirehep.net/record/{}'.format(recid)
                        for recid in elements
                    )
                ))
            else:
                for recid, details in elements:
                    csv_writer.writerow((
                        collection,
                        stage,
                        error,
                        'http://inspirehep.net/record/{}'.format(recid),
                        details
                    ))
    click.echo("Dumped errors into {}".format(output))