def _author_list(obj, eng):
    """Extract an author list from XML files inside the arXiv tarball.

    Downloads the tarball if it is not already attached to the workflow
    model, untars it, scans the contained ``.xml`` files for the
    author-list marker and, on the first match, converts the XML via XSLT,
    merges the resulting authors into the record, and publishes the
    ``authors`` / ``number_of_authors`` task results.

    :param obj: workflow object; provides ``log``, ``update_task_results``.
    :param eng: workflow engine; provides ``workflow_definition.model``.
    """
    from inspire.modules.converter.xslt import convert

    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)

    existing_file = get_file_by_name(model, "{0}.tar.gz".format(arxiv_id))
    if not existing_file:
        # We download it
        tarball = get_tarball_for_model(eng, arxiv_id)
        if tarball is None:
            obj.log.error("No tarball found")
            return
        add_file_by_name(model, tarball)
    else:
        tarball = existing_file.get_syspath()

    sub_dir = os.path.abspath("{0}_files".format(tarball))
    try:
        file_list = untar(tarball, sub_dir)
    except InvalidTarball:
        obj.log.error("Invalid tarball {0}".format(tarball))
        return
    obj.log.info("Extracted tarball to: {0}".format(sub_dir))

    xml_files_list = [filename for filename in file_list
                      if filename.endswith(".xml")]
    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    for xml_file in xml_files_list:
        # Context manager guarantees the descriptor is closed even if
        # reading or the conversion below raises (the original used
        # open/read/close and leaked the handle on error).
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")
            authors_xml = convert(xml_content, stylesheet)
            # MARCXML -> JSON; the converter yields a list, we need the
            # single record in it.
            authorlist_record = get_json_from_marcxml(authors_xml)[0]
            record.update(authorlist_record)
            obj.update_task_results(
                "authors",
                [{
                    "name": "authors",
                    "results": authorlist_record["authors"]
                }]
            )
            obj.update_task_results(
                "number_of_authors",
                [{
                    "name": "number_of_authors",
                    "results": authorlist_record["number_of_authors"]
                }]
            )
            # Only the first matching XML file is used.
            break
    model.update()
def _author_list(obj, eng):
    """Populate ``obj.data['authors']`` from author-list XML in the tarball.

    Locates the workflow's attached arXiv tarball, unpacks it into a
    scratch directory, converts every XML member that carries the
    author-list marker into MARCXML, collects the authors from all
    matches, LaTeX-decodes their names, and stores them on the record.
    Skips silently (with a log line) when no tarball is attached.
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    if filename not in obj.files:
        obj.log.info(
            'Skipping author list extraction, no tarball with name "%s" found'
            % filename)
        return
    tarball = obj.files[filename]

    with TemporaryDirectory(prefix='author_list') as workdir, \
            retrieve_uri(tarball.file.uri, outdir=workdir) as local_tarball:
        try:
            members = untar(local_tarball, workdir)
        except InvalidTarball:
            obj.log.info(
                'Invalid tarball %s for arxiv_id %s',
                tarball.file.uri,
                arxiv_id,
            )
            return
        obj.log.info('Extracted tarball to: {0}'.format(workdir))

        xml_members = [path for path in members if path.endswith('.xml')]
        obj.log.info('Found xmlfiles: {0}'.format(xml_members))

        collected = []
        for member in xml_members:
            with open(member, 'r') as handle:
                payload = handle.read()

            if REGEXP_AUTHLIST.findall(payload):
                obj.log.info('Found a match for author extraction')
                try:
                    marcxml = convert(payload, stylesheet)
                except XMLSyntaxError:
                    # Probably the %auto-ignore comment exists, so we skip the
                    # first line. See: inspirehep/inspire-next/issues/2195
                    marcxml = convert(payload.split('\n', 1)[1], stylesheet)
                collected.extend(marcxml2record(marcxml).get('authors', []))

    if collected:
        for author in collected:
            author['full_name'] = decode_latex(author['full_name'])
        obj.data['authors'] = collected
def _author_list(obj, eng):
    """Extract authors from author-list XML files in the arXiv tarball.

    Reads the arXiv id off the record, fetches the matching attached
    tarball, unpacks it into a temporary directory, and for each XML
    member matching the author-list pattern converts it to MARCXML.
    All authors found across matching files are accumulated, their names
    LaTeX-decoded, and written to ``obj.data['authors']``. When no
    tarball is attached the task logs and returns without changes.
    """
    arxiv_id = LiteratureReader(obj.data).arxiv_id
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    try:
        tarball = obj.files[filename]
    except KeyError:
        obj.log.info(
            'Skipping author list extraction, no tarball with name "%s" found'
            % filename
        )
        return

    with TemporaryDirectory(prefix='author_list') as tmp_dir, \
            retrieve_uri(tarball.file.uri, outdir=tmp_dir) as local_copy:
        try:
            extracted_paths = untar(local_copy, tmp_dir)
        except InvalidTarball:
            obj.log.info(
                'Invalid tarball %s for arxiv_id %s',
                tarball.file.uri,
                arxiv_id,
            )
            return
        obj.log.info('Extracted tarball to: {0}'.format(tmp_dir))

        candidates = [p for p in extracted_paths if p.endswith('.xml')]
        obj.log.info('Found xmlfiles: {0}'.format(candidates))

        found_authors = []
        for candidate in candidates:
            with open(candidate, 'r') as fd:
                contents = fd.read()

            if REGEXP_AUTHLIST.findall(contents):
                obj.log.info('Found a match for author extraction')
                try:
                    authors_xml = convert(contents, stylesheet)
                except XMLSyntaxError:
                    # Probably the %auto-ignore comment exists, so we skip the
                    # first line. See: inspirehep/inspire-next/issues/2195
                    authors_xml = convert(
                        contents.split('\n', 1)[1],
                        stylesheet,
                    )
                found_authors.extend(
                    marcxml2record(authors_xml).get('authors', []))

        if found_authors:
            for author in found_authors:
                author['full_name'] = decode_latex(author['full_name'])
            obj.data['authors'] = found_authors
def test_detect_images_and_tex_ignores_hidden_metadata_files():
    """Check that only real image files survive ``detect_images_and_tex``.

    Untars a fixture tarball and asserts every path in the returned
    image list is identified by libmagic as an image, EPS, or PostScript
    file (i.e. no hidden or metadata files slipped through).
    """
    tarball_filename = pkg_resources.resource_filename(
        __name__, os.path.join('data', '1704.02281.tar.gz'))

    # Create the scratch dir *before* the try block: if mkdtemp() itself
    # failed inside the try, the finally clause would hit a NameError on
    # `temporary_dir` instead of the real exception.
    temporary_dir = mkdtemp()
    try:
        file_list = untar(tarball_filename, temporary_dir)
        image_files, _ = detect_images_and_tex(file_list)

        # Ensure image_list doesn't contain a hidden or metadata file.
        for f in image_files:
            # One libmagic call per file (was three), lower-cased once so
            # the PostScript check is case-insensitive — libmagic reports
            # "PostScript document text", which the old case-sensitive
            # 'Postscript' substring test never matched.
            description = magic.from_file(f).lower()
            assert ('image' in description or
                    'eps' in description or
                    'postscript' in description)
    finally:
        rmtree(temporary_dir)
def _author_list(obj, eng):
    """Extract authors from author-list XML files in the arXiv tarball.

    Fetches the attached tarball for the record's arXiv id, unpacks it
    into a temporary directory, and for the first XML member that
    matches the author-list pattern converts it to MARCXML and merges
    the resulting record into ``obj.data``.

    :param obj: workflow object; provides ``data``, ``files`` and ``log``.
    :param eng: workflow engine (unused, required by the task signature).
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    try:
        tarball = obj.files[filename]
    except KeyError:
        # Bug fix: indexing obj.files with a missing name raises KeyError,
        # so the old `if tarball:` guard could never run — the task crashed
        # instead of skipping when no tarball was attached.
        obj.log.info(
            'Skipping author list extraction, no tarball with name "%s" found'
            % filename
        )
        return

    with TemporaryDirectory(prefix='author_list') as scratch_space:
        tarball_file = retrieve_uri(
            tarball.file.uri,
            outdir=scratch_space,
        )
        try:
            file_list = untar(tarball_file, scratch_space)
        except InvalidTarball:
            obj.log.info(
                'Invalid tarball %s for arxiv_id %s',
                tarball.file.uri,
                arxiv_id,
            )
            return
        obj.log.info('Extracted tarball to: {0}'.format(scratch_space))

        xml_files_list = [
            path for path in file_list if path.endswith('.xml')
        ]
        obj.log.info('Found xmlfiles: {0}'.format(xml_files_list))

        for xml_file in xml_files_list:
            with open(xml_file, 'r') as xml_file_fd:
                xml_content = xml_file_fd.read()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info('Found a match for author extraction')
                try:
                    authors_xml = convert(xml_content, stylesheet)
                except XMLSyntaxError:
                    # Probably the %auto-ignore comment exists, so we skip the
                    # first line. See: inspirehep/inspire-next/issues/2195
                    authors_xml = convert(
                        xml_content.split('\n', 1)[1],
                        stylesheet,
                    )
                authorlist_record = marcxml2record(authors_xml)
                obj.data.update(authorlist_record)
                # Only the first matching XML file is used.
                break
def _author_list(obj, eng):
    """Extract authors from author-list XML files in the arXiv tarball.

    Downloads the tarball (from ``ARXIV_TARBALL_URL``) if it is not
    already attached to the record, untars it next to the stored file,
    and for the first XML member matching the author-list pattern
    converts it to MARCXML, maps it through the ``hep`` rules, and
    merges the result into ``obj.data``.

    :param obj: workflow object; provides ``data``, ``files`` and ``log``.
    :param eng: workflow engine (unused, required by the task signature).
    """
    from inspirehep.modules.converter import convert

    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.tar.gz".format(arxiv_id))
    if filename not in obj.files:
        tarball = download_file_to_record(
            record=obj,
            name=filename,
            url=current_app.config['ARXIV_TARBALL_URL'].format(
                arxiv_id=arxiv_id
            )
        )
    else:
        tarball = obj.files[filename]

    sub_dir = os.path.abspath("{0}_files".format(tarball.file.uri))
    try:
        file_list = untar(tarball.file.uri, sub_dir)
    except InvalidTarball:
        obj.log.error("Invalid tarball {0}".format(tarball.file.uri))
        return
    obj.log.info("Extracted tarball to: {0}".format(sub_dir))

    xml_files_list = [path for path in file_list if path.endswith(".xml")]
    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    for xml_file in xml_files_list:
        # Context manager closes the descriptor even if reading or the
        # conversion below raises (the original open/read/close leaked
        # the handle on error).
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")
            authors_xml = convert(xml_content, stylesheet)
            authors_rec = create_record(authors_xml)
            authorlist_record = hep.do(authors_rec)
            obj.data.update(authorlist_record)
            # Only the first matching XML file is used.
            break
def _author_list(obj, eng):
    """Extract authors from author-list XML files in the arXiv tarball.

    Downloads the tarball (from ``ARXIV_TARBALL_URL``) if it is not
    already attached to the workflow, untars it next to the stored file,
    and for the first XML member matching the author-list pattern
    converts it to MARCXML, maps it through the ``hep`` rules, and
    merges the result into ``obj.data``.

    :param obj: workflow object; provides ``data``, ``files`` and ``log``.
    :param eng: workflow engine (unused, required by the task signature).
    """
    from inspirehep.modules.converter import convert

    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.tar.gz".format(arxiv_id))
    if filename not in obj.files:
        tarball = download_file_to_workflow(
            workflow=obj,
            name=filename,
            url=current_app.config['ARXIV_TARBALL_URL'].format(
                arxiv_id=arxiv_id
            )
        )
    else:
        tarball = obj.files[filename]

    sub_dir = os.path.abspath("{0}_files".format(tarball.file.uri))
    try:
        file_list = untar(tarball.file.uri, sub_dir)
    except InvalidTarball:
        obj.log.error("Invalid tarball {0}".format(tarball.file.uri))
        return
    obj.log.info("Extracted tarball to: {0}".format(sub_dir))

    xml_files_list = [path for path in file_list if path.endswith(".xml")]
    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    for xml_file in xml_files_list:
        # Context manager closes the descriptor even if reading or the
        # conversion below raises (the original open/read/close leaked
        # the handle on error).
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")
            authors_xml = convert(xml_content, stylesheet)
            authors_rec = create_record(authors_xml)
            authorlist_record = hep.do(authors_rec)
            obj.data.update(authorlist_record)
            # Only the first matching XML file is used.
            break
def _author_list(obj, eng):
    """Extract authors from author-list XML files in the arXiv tarball.

    Looks up the attached tarball for the record's arXiv id, untars it
    next to the stored file, and for the first XML member matching the
    author-list pattern converts it to MARCXML, maps it through the
    ``hep`` rules, and merges the result into ``obj.data``.

    :param obj: workflow object; provides ``data``, ``files`` and ``log``.
    :param eng: workflow engine (unused, required by the task signature).
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    try:
        tarball = obj.files[filename]
    except KeyError:
        # Bug fix: indexing obj.files with a missing name raises KeyError,
        # so the old `if tarball:` guard never ran — the task crashed
        # instead of skipping when no tarball was attached.
        obj.log.info(
            'Skipping author list extraction, no tarball with name "%s" found',
            filename,
        )
        return

    sub_dir = os.path.abspath('{0}_files'.format(tarball.file.uri))
    try:
        file_list = untar(tarball.file.uri, sub_dir)
    except InvalidTarball:
        obj.log.error('Invalid tarball %s for arxiv_id %s',
                      tarball.file.uri, arxiv_id)
        return
    obj.log.info('Extracted tarball to: {0}'.format(sub_dir))

    xml_files_list = [path for path in file_list if path.endswith('.xml')]
    obj.log.info('Found xmlfiles: {0}'.format(xml_files_list))

    for xml_file in xml_files_list:
        with open(xml_file, 'r') as xml_file_fd:
            xml_content = xml_file_fd.read()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info('Found a match for author extraction')
            try:
                authors_xml = convert(xml_content, stylesheet)
            except XMLSyntaxError:
                # Probably the %auto-ignore comment exists, so we skip the
                # first line. See: inspirehep/inspire-next/issues/2195
                authors_xml = convert(
                    xml_content.split('\n', 1)[1],
                    stylesheet,
                )
            authors_rec = create_record(authors_xml)
            authorlist_record = hep.do(authors_rec)
            obj.data.update(authorlist_record)
            # Only the first matching XML file is used.
            break