def test_decode_latex():
    """Check decode_latex on names mixing a LaTeX macro with Greek letters.

    Two inputs are exercised: one written as a ``u``-prefixed literal and
    one written without the prefix. Both must come back with the macro
    replaced and the Greek text unchanged.
    """
    cases = (
        # (raw name containing LaTeX, expected decoded name)
        (u'{\\AA}άλφα', u'Åάλφα'),
        ('{\\AA}βήτα', u'Åβήτα'),
    )
    for raw_name, decoded_name in cases:
        assert decode_latex(raw_name) == decoded_name
Example #2
0
def test_decode_latex():
    """Verify decode_latex replaces the LaTeX macro and keeps Greek text."""
    # Input given as a u-prefixed literal.
    assert decode_latex(u'{\\AA}άλφα') == u'Åάλφα'

    # Input given as an un-prefixed literal; the result is still compared
    # against a u-prefixed expectation.
    assert decode_latex('{\\AA}βήτα') == u'Åβήτα'
Example #3
0
    def _author_list(obj, eng):
        """Populate ``obj.data['authors']`` from XML files in the arXiv tarball.

        The tarball is looked up in ``obj.files`` under the secured name
        ``<arxiv_id>.tar.gz``. It is unpacked into a temporary directory and
        every extracted ``.xml`` file whose content matches
        ``REGEXP_AUTHLIST`` is converted with ``stylesheet`` (a name resolved
        outside this function) and parsed with ``marcxml2record``.

        ``obj.data['authors']`` is overwritten only when at least one author
        was extracted; each ``full_name`` is first passed through
        ``decode_latex``. A missing or invalid tarball is logged and the
        function returns without touching ``obj.data``.

        Args:
            obj: workflow object providing ``data``, ``files`` and ``log``.
            eng: workflow engine; not used by this function.

        Returns:
            None.
        """
        arxiv_id = get_arxiv_id(obj.data)
        tarball_name = secure_filename('{0}.tar.gz'.format(arxiv_id))
        try:
            tarball = obj.files[tarball_name]
        except KeyError:
            skip_message = (
                'Skipping author list extraction, no tarball with name "%s" found'
                % tarball_name
            )
            obj.log.info(skip_message)
            return

        with TemporaryDirectory(prefix='author_list') as workdir:
            with retrieve_uri(tarball.file.uri, outdir=workdir) as local_tarball:
                try:
                    extracted_paths = untar(local_tarball, workdir)
                except InvalidTarball:
                    obj.log.info(
                        'Invalid tarball %s for arxiv_id %s',
                        tarball.file.uri,
                        arxiv_id,
                    )
                    return

                obj.log.info('Extracted tarball to: {0}'.format(workdir))
                xml_paths = [p for p in extracted_paths if p.endswith('.xml')]
                obj.log.info('Found xmlfiles: {0}'.format(xml_paths))

                authors = []
                for xml_path in xml_paths:
                    with open(xml_path, 'r') as handle:
                        xml_content = handle.read()

                    # Only files containing an author-list section are converted.
                    if not REGEXP_AUTHLIST.findall(xml_content):
                        continue

                    obj.log.info('Found a match for author extraction')
                    try:
                        authors_xml = convert(xml_content, stylesheet)
                    except XMLSyntaxError:
                        # Probably the %auto-ignore comment exists, so retry
                        # without the first line.
                        # See: inspirehep/inspire-next/issues/2195
                        without_first_line = xml_content.split('\n', 1)[1]
                        authors_xml = convert(without_first_line, stylesheet)

                    record = marcxml2record(authors_xml)
                    authors.extend(record.get('authors', []))

                # Leave the existing authors alone when nothing was extracted.
                if not authors:
                    return

                for author in authors:
                    author['full_name'] = decode_latex(author['full_name'])

                obj.data['authors'] = authors
Example #4
0
    def _author_list(obj, eng):
        """Extract authors from author-list XML shipped in the arXiv tarball.

        Steps:

        1. Read the arXiv id via ``LiteratureReader(obj.data).arxiv_id`` and
           look up ``<arxiv_id>.tar.gz`` (after ``secure_filename``) in
           ``obj.files``; log and return if it is absent.
        2. Fetch and untar it into a temporary directory; log and return if
           ``untar`` raises ``InvalidTarball``.
        3. For each extracted ``.xml`` file matching ``REGEXP_AUTHLIST``,
           convert it with ``stylesheet`` (a name resolved outside this
           function) and collect the ``authors`` of the resulting record.
        4. If any authors were collected, pass each ``full_name`` through
           ``decode_latex`` and store the list in ``obj.data['authors']``.

        Args:
            obj: workflow object providing ``data``, ``files`` and ``log``.
            eng: workflow engine; not used by this function.

        Returns:
            None.
        """
        reader = LiteratureReader(obj.data)
        arxiv_id = reader.arxiv_id
        tarball_name = secure_filename('{0}.tar.gz'.format(arxiv_id))
        try:
            tarball = obj.files[tarball_name]
        except KeyError:
            obj.log.info(
                'Skipping author list extraction, no tarball with name "%s" found' % tarball_name
            )
            return

        with TemporaryDirectory(prefix='author_list') as tmp_dir, \
                retrieve_uri(tarball.file.uri, outdir=tmp_dir) as fetched_tarball:
            try:
                members = untar(fetched_tarball, tmp_dir)
            except InvalidTarball:
                obj.log.info(
                    'Invalid tarball %s for arxiv_id %s',
                    tarball.file.uri,
                    arxiv_id,
                )
                return

            obj.log.info('Extracted tarball to: {0}'.format(tmp_dir))
            xml_members = [
                member for member in members if member.endswith('.xml')
            ]
            obj.log.info('Found xmlfiles: {0}'.format(xml_members))

            collected = []
            for xml_member in xml_members:
                with open(xml_member, 'r') as source:
                    content = source.read()

                has_author_list = REGEXP_AUTHLIST.findall(content)
                if not has_author_list:
                    continue

                obj.log.info('Found a match for author extraction')
                try:
                    marcxml = convert(content, stylesheet)
                except XMLSyntaxError:
                    # Probably the %auto-ignore comment exists, so the first
                    # line is dropped before retrying.
                    # See: inspirehep/inspire-next/issues/2195
                    remainder = content.split('\n', 1)[1]
                    marcxml = convert(remainder, stylesheet)

                parsed_authors = marcxml2record(marcxml).get('authors', [])
                collected.extend(parsed_authors)

            if collected:
                for entry in collected:
                    entry['full_name'] = decode_latex(entry['full_name'])

                obj.data['authors'] = collected
def test_decode_latex_with_empty_string():
    """decode_latex must hand back an empty string for empty input."""
    assert decode_latex('') == ''
Example #6
0
def test_decode_latex_with_empty_string():
    """Check the empty-input edge case of decode_latex."""
    empty_name = ''
    decoded = decode_latex(empty_name)
    # Nothing to decode, so the result must also be empty.
    assert decoded == ''