Exemple #1
0
def already_harvested_on_legacy_record():
    """Return the HEP JSON built from the 'already on legacy' arXiv fixture.

    The OAI arXiv XML fixture is read from the package resources, run
    through the ``oaiarXiv2marcxml.xsl`` stylesheet, parsed with
    ``create_record`` and finally handed to ``hep.do``.

    Returns:
        Whatever ``hep.do`` produces for the parsed record.
    """
    fixture_path = os.path.join(
        '../fixtures', 'oai_arxiv_record_already_on_legacy.xml')
    oai_xml = pkg_resources.resource_string(__name__, fixture_path)

    # OAI arXiv XML -> MARCXML -> record -> HEP JSON
    marcxml = convert(oai_xml, "oaiarXiv2marcxml.xsl")
    return hep.do(create_record(marcxml))
Exemple #2
0
    def _author_list(obj, eng):
        """Update ``obj.data`` with authors found in the record's tarball.

        The tarball is looked up in ``obj.files`` under the secured
        ``<arxiv_id>.tar.gz`` name. It is unpacked next to its own URI and
        every extracted ``.xml`` file is tested against ``REGEXP_AUTHLIST``;
        the first file that matches is converted with ``stylesheet``
        (not defined in this block -- presumably supplied by the enclosing
        scope) and merged into ``obj.data``.

        Args:
            obj: workflow object exposing ``data``, ``files`` and ``log``.
            eng: not used by this function.

        Returns:
            None. Returns early when the tarball entry is falsy or when
            ``untar`` raises ``InvalidTarball``.
        """
        arxiv_id = get_arxiv_id(obj.data)
        tarball_name = secure_filename('{0}.tar.gz'.format(arxiv_id))
        tarball = obj.files[tarball_name]
        if not tarball:
            return

        tarball_uri = tarball.file.uri
        extract_dir = os.path.abspath('{0}_files'.format(tarball_uri))
        try:
            extracted_paths = untar(tarball_uri, extract_dir)
        except InvalidTarball:
            obj.log.error('Invalid tarball %s for arxiv_id %s', tarball_uri, arxiv_id)
            return
        obj.log.info('Extracted tarball to: {0}'.format(extract_dir))

        xml_paths = [
            candidate for candidate in extracted_paths
            if candidate.endswith('.xml')
        ]
        obj.log.info('Found xmlfiles: {0}'.format(xml_paths))

        for xml_path in xml_paths:
            with open(xml_path, 'r') as handle:
                xml_content = handle.read()

            if not REGEXP_AUTHLIST.findall(xml_content):
                continue

            obj.log.info('Found a match for author extraction')
            try:
                authors_xml = convert(xml_content, stylesheet)
            except XMLSyntaxError:
                # The file probably starts with an %auto-ignore comment, so
                # retry without the first line.
                # See: inspirehep/inspire-next/issues/2195
                without_first_line = xml_content.split('\n', 1)[1]
                authors_xml = convert(without_first_line, stylesheet)

            obj.data.update(hep.do(create_record(authors_xml)))
            # Only the first matching file is used.
            break
Exemple #3
0
def generate_record():
    """Return the HEP JSON built from the 'with plots' arXiv fixture.

    The OAI arXiv XML fixture is read from the package resources, run
    through the ``oaiarXiv2marcxml.xsl`` stylesheet, parsed with
    ``create_record`` and handed to ``hep.do``. When the result carries a
    ``preprint_date`` key, its value is replaced with today's date in ISO
    format.

    Returns:
        The (possibly date-adjusted) value produced by ``hep.do``.
    """
    fixture_path = os.path.join(
        '../fixtures', 'oai_arxiv_record_with_plots.xml')
    oai_xml = pkg_resources.resource_string(__name__, fixture_path)

    # OAI arXiv XML -> MARCXML -> record -> HEP JSON
    marcxml = convert(oai_xml, "oaiarXiv2marcxml.xsl")
    hep_json = hep.do(create_record(marcxml))

    # Only overwrite the date when the converted record has one.
    if 'preprint_date' in hep_json:
        today = datetime.date.today()
        hep_json['preprint_date'] = today.isoformat()

    return hep_json
Exemple #4
0
    def _author_list(obj, eng):
        """Update ``obj.data`` with authors found in the record's tarball.

        The tarball is taken from ``obj.files`` under the secured
        ``<arxiv_id>.tar.gz`` name, or downloaded to the record from
        ``ARXIV_TARBALL_URL`` when it is not attached yet. It is unpacked
        next to its own URI and every extracted ``.xml`` file is tested
        against ``REGEXP_AUTHLIST``; the first file that matches is
        converted with ``stylesheet`` (not defined in this block --
        presumably supplied by the enclosing scope) and merged into
        ``obj.data``.

        Args:
            obj: workflow object exposing ``data``, ``files`` and ``log``.
            eng: not used by this function.

        Returns:
            None. Returns early when ``untar`` raises ``InvalidTarball``.
        """
        from inspirehep.modules.converter import convert

        arxiv_id = get_clean_arXiv_id(obj.data)
        filename = secure_filename("{0}.tar.gz".format(arxiv_id))
        if filename not in obj.files:
            tarball = download_file_to_record(
                record=obj,
                name=filename,
                url=current_app.config['ARXIV_TARBALL_URL'].format(
                    arxiv_id=arxiv_id
                )
            )
        else:
            tarball = obj.files[filename]

        sub_dir = os.path.abspath("{0}_files".format(tarball.file.uri))
        try:
            file_list = untar(tarball.file.uri, sub_dir)
        except InvalidTarball:
            obj.log.error("Invalid tarball {0}".format(tarball.file.uri))
            return
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))

        xml_files_list = [path for path in file_list
                          if path.endswith(".xml")]
        obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

        for xml_file in xml_files_list:
            # Context manager guarantees the handle is closed even when
            # read() raises, which the manual open()/close() pair did not.
            with open(xml_file, "r") as xml_file_fd:
                xml_content = xml_file_fd.read()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info("Found a match for author extraction")
                authors_xml = convert(xml_content, stylesheet)
                authors_rec = create_record(authors_xml)
                authorlist_record = hep.do(authors_rec)
                obj.data.update(authorlist_record)
                # Only the first matching file is used.
                break
Exemple #5
0
    def _author_list(obj, eng):
        """Update ``obj.data`` with authors found in the workflow's tarball.

        The tarball is taken from ``obj.files`` under the secured
        ``<arxiv_id>.tar.gz`` name, or downloaded to the workflow from
        ``ARXIV_TARBALL_URL`` when it is not attached yet. It is unpacked
        next to its own URI and every extracted ``.xml`` file is tested
        against ``REGEXP_AUTHLIST``; the first file that matches is
        converted with ``stylesheet`` (not defined in this block --
        presumably supplied by the enclosing scope) and merged into
        ``obj.data``.

        Args:
            obj: workflow object exposing ``data``, ``files`` and ``log``.
            eng: not used by this function.

        Returns:
            None. Returns early when ``untar`` raises ``InvalidTarball``.
        """
        from inspirehep.modules.converter import convert

        arxiv_id = get_clean_arXiv_id(obj.data)
        filename = secure_filename("{0}.tar.gz".format(arxiv_id))
        if filename not in obj.files:
            tarball = download_file_to_workflow(
                workflow=obj,
                name=filename,
                url=current_app.config['ARXIV_TARBALL_URL'].format(
                    arxiv_id=arxiv_id
                )
            )
        else:
            tarball = obj.files[filename]

        sub_dir = os.path.abspath("{0}_files".format(tarball.file.uri))
        try:
            file_list = untar(tarball.file.uri, sub_dir)
        except InvalidTarball:
            obj.log.error("Invalid tarball {0}".format(tarball.file.uri))
            return
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))

        xml_files_list = [path for path in file_list
                          if path.endswith(".xml")]
        obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

        for xml_file in xml_files_list:
            # Context manager guarantees the handle is closed even when
            # read() raises, which the manual open()/close() pair did not.
            with open(xml_file, "r") as xml_file_fd:
                xml_content = xml_file_fd.read()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info("Found a match for author extraction")
                authors_xml = convert(xml_content, stylesheet)
                authors_rec = create_record(authors_xml)
                authorlist_record = hep.do(authors_rec)
                obj.data.update(authorlist_record)
                # Only the first matching file is used.
                break
Exemple #6
0
def test_xslt(oai_xml, oai_xml_result):
    """Check that the XSLT conversion of ``oai_xml`` gives ``oai_xml_result``."""
    converted = convert(xml=oai_xml, xslt_filename="oaiarXiv2marcxml.xsl")
    # Reject an empty/falsy conversion before comparing with the expectation.
    assert converted
    assert converted == oai_xml_result
# NOTE(review): another ``test_xslt`` with the same signature appears directly
# above in this listing; if both live in one module, this later definition is
# the one the name ends up bound to -- confirm whether the repeat is intended.
def test_xslt(oai_xml, oai_xml_result):
    """Check that the XSLT conversion of ``oai_xml`` gives ``oai_xml_result``."""
    converted = convert(xml=oai_xml, xslt_filename="oaiarXiv2marcxml.xsl")
    # Reject an empty/falsy conversion before comparing with the expectation.
    assert converted
    assert converted == oai_xml_result