def test_decode_latex():
    """Check that decode_latex replaces the LaTeX A-ring macro and keeps Greek text."""
    cases = (
        (u'{\\AA}άλφα', u'Åάλφα'),
        # No ``u`` prefix on this input, exactly as in the original test;
        # presumably it exercises a byte-string input on Python 2 -- confirm.
        ('{\\AA}βήτα', u'Åβήτα'),
    )

    for latex_name, decoded_name in cases:
        assert decode_latex(latex_name) == decoded_name
def _author_list(obj, eng):
    """Fill ``obj.data['authors']`` from author-list XML found in the arXiv tarball.

    The tarball is looked up in ``obj.files`` under the sanitized name
    ``<arxiv_id>.tar.gz``, fetched and unpacked into a temporary directory.
    Every extracted ``.xml`` file whose content matches ``REGEXP_AUTHLIST``
    is converted with ``stylesheet`` and its ``authors`` are collected. If
    at least one author was collected, each ``full_name`` is passed through
    ``decode_latex`` and the list is stored in ``obj.data['authors']``.

    The function logs and returns early when the tarball is missing from
    ``obj.files`` or when ``untar`` raises ``InvalidTarball``.

    ``stylesheet`` is not defined inside this function; it is expected to
    come from an enclosing or module scope.

    :param obj: workflow object; its ``data``, ``files`` and ``log``
        attributes are used.
    :param eng: not used by this function.
    """
    arxiv_id = get_arxiv_id(obj.data)
    tarball_name = secure_filename('{0}.tar.gz'.format(arxiv_id))

    try:
        tarball = obj.files[tarball_name]
    except KeyError:
        skip_message = (
            'Skipping author list extraction, no tarball with name "%s" found'
            % tarball_name
        )
        obj.log.info(skip_message)
        return

    with TemporaryDirectory(prefix='author_list') as workdir:
        with retrieve_uri(tarball.file.uri, outdir=workdir) as local_tarball:
            try:
                members = untar(local_tarball, workdir)
            except InvalidTarball:
                obj.log.info(
                    'Invalid tarball %s for arxiv_id %s',
                    tarball.file.uri,
                    arxiv_id,
                )
                return

            obj.log.info('Extracted tarball to: {0}'.format(workdir))
            xml_paths = [member for member in members if member.endswith('.xml')]
            obj.log.info('Found xmlfiles: {0}'.format(xml_paths))

            collected = []
            for xml_path in xml_paths:
                with open(xml_path, 'r') as handle:
                    xml_text = handle.read()

                # Only files that look like an author list are converted.
                if not REGEXP_AUTHLIST.findall(xml_text):
                    continue

                obj.log.info('Found a match for author extraction')
                try:
                    marcxml = convert(xml_text, stylesheet)
                except XMLSyntaxError:
                    # Probably the %auto-ignore comment exists, so retry
                    # without the first line.
                    # See: inspirehep/inspire-next/issues/2195
                    # NOTE(review): indexing [1] raises IndexError when the
                    # content has no newline at all.
                    without_first_line = xml_text.split('\n', 1)[1]
                    marcxml = convert(without_first_line, stylesheet)

                record = marcxml2record(marcxml)
                collected.extend(record.get('authors', []))

            # Leave obj.data untouched when nothing was extracted.
            if not collected:
                return

            for author in collected:
                author['full_name'] = decode_latex(author['full_name'])

            obj.data['authors'] = collected
def _author_list(obj, eng):
    """Fill ``obj.data['authors']`` from author-list XML found in the arXiv tarball.

    The arXiv identifier is read through ``LiteratureReader(obj.data)``.
    The tarball is looked up in ``obj.files`` under the sanitized name
    ``<arxiv_id>.tar.gz``, fetched and unpacked into a temporary directory.
    Every extracted ``.xml`` file whose content matches ``REGEXP_AUTHLIST``
    is converted with ``stylesheet`` and its ``authors`` are collected. If
    at least one author was collected, each ``full_name`` is passed through
    ``decode_latex`` and the list is stored in ``obj.data['authors']``.

    The function logs and returns early when the tarball is missing from
    ``obj.files`` or when ``untar`` raises ``InvalidTarball``.

    ``stylesheet`` is not defined inside this function; it is expected to
    come from an enclosing or module scope.

    :param obj: workflow object; its ``data``, ``files`` and ``log``
        attributes are used.
    :param eng: not used by this function.
    """
    reader = LiteratureReader(obj.data)
    arxiv_id = reader.arxiv_id
    tarball_name = secure_filename('{0}.tar.gz'.format(arxiv_id))

    try:
        tarball = obj.files[tarball_name]
    except KeyError:
        skip_message = (
            'Skipping author list extraction, no tarball with name "%s" found'
            % tarball_name
        )
        obj.log.info(skip_message)
        return

    with TemporaryDirectory(prefix='author_list') as workdir:
        with retrieve_uri(tarball.file.uri, outdir=workdir) as local_tarball:
            try:
                members = untar(local_tarball, workdir)
            except InvalidTarball:
                obj.log.info(
                    'Invalid tarball %s for arxiv_id %s',
                    tarball.file.uri,
                    arxiv_id,
                )
                return

            obj.log.info('Extracted tarball to: {0}'.format(workdir))
            xml_paths = [member for member in members if member.endswith('.xml')]
            obj.log.info('Found xmlfiles: {0}'.format(xml_paths))

            collected = []
            for xml_path in xml_paths:
                with open(xml_path, 'r') as handle:
                    xml_text = handle.read()

                # Only files that look like an author list are converted.
                if not REGEXP_AUTHLIST.findall(xml_text):
                    continue

                obj.log.info('Found a match for author extraction')
                try:
                    marcxml = convert(xml_text, stylesheet)
                except XMLSyntaxError:
                    # Probably the %auto-ignore comment exists, so retry
                    # without the first line.
                    # See: inspirehep/inspire-next/issues/2195
                    # NOTE(review): indexing [1] raises IndexError when the
                    # content has no newline at all.
                    without_first_line = xml_text.split('\n', 1)[1]
                    marcxml = convert(without_first_line, stylesheet)

                record = marcxml2record(marcxml)
                collected.extend(record.get('authors', []))

            # Leave obj.data untouched when nothing was extracted.
            if not collected:
                return

            for author in collected:
                author['full_name'] = decode_latex(author['full_name'])

            obj.data['authors'] = collected
def test_decode_latex_with_empty_string():
    """Check that decode_latex returns an empty string for empty input."""
    assert decode_latex('') == ''