Example #1
def test_categories():
    """All categories of eprint 1811.00370 are parsed from the mocked arXiv API answer."""
    expected = ['hep-th', 'gr-qc', 'math-ph', 'math.MP']
    api_answer = read_response('arxiv', '1811.00370.xml')

    with requests_mock.Mocker() as mocker:
        # Serve the stored fixture instead of contacting export.arxiv.org.
        mocker.get('http://export.arxiv.org/api/query?search_query=id:1811.00370', text=api_answer)
        assert get_arxiv_categories('1811.00370') == expected
Example #2
def test_empty_response():
    """The 'empty.xml' arXiv API fixture must produce an empty category list."""
    api_answer = read_response('arxiv', 'empty.xml')

    with requests_mock.Mocker() as mocker:
        # Serve the stored fixture instead of contacting export.arxiv.org.
        mocker.get('http://export.arxiv.org/api/query?search_query=id:not_found', text=api_answer)
        assert get_arxiv_categories('not_found') == []
Example #3
def test_ambiguous_title():
    """No categories are returned when a partial title matches more than one result."""
    title = 'hep'
    api_answer = read_response('arxiv', 'search_title_hep.xml')
    query_url = 'http://export.arxiv.org/api/query?search_query=ti:"%s"' % title

    with requests_mock.Mocker() as mocker:
        # Serve the stored fixture instead of contacting export.arxiv.org.
        mocker.get(query_url, text=api_answer)
        assert get_arxiv_categories(title=title) == []
Example #4
def test_categories():
    """All categories of eprint 1811.00370 are parsed from the mocked arXiv API answer."""
    expected = ['hep-th', 'gr-qc', 'math-ph', 'math.MP']
    fixture_path = path.join(get_response_dir(), 'arxiv', '1811.00370.xml')

    with open(fixture_path, 'rb') as fixture:
        api_answer = fixture.read()

        with requests_mock.Mocker() as mocker:
            # Serve the stored fixture instead of contacting export.arxiv.org.
            mocker.get('http://export.arxiv.org/api/query?search_query=id:1811.00370', text=api_answer)
            assert get_arxiv_categories('1811.00370') == expected
Example #5
def test_empty_response():
    """The 'empty.xml' arXiv API fixture must produce an empty category list."""
    fixture_path = path.join(get_response_dir(), 'arxiv', 'empty.xml')

    with open(fixture_path, 'rb') as fixture:
        api_answer = fixture.read()

        with requests_mock.Mocker() as mocker:
            # Serve the stored fixture instead of contacting export.arxiv.org.
            mocker.get('http://export.arxiv.org/api/query?search_query=id:not_found', text=api_answer)
            assert get_arxiv_categories('not_found') == []
Example #6
def add_arxiv_category(obj, eng):
    """Add arXiv categories fetched from arXiv.org.

    Every entry of ``obj.data['arxiv_eprints']`` that has no ``categories``
    key gets one, filled with the result of ``get_arxiv_categories`` for the
    entry's ``value``. Entries are updated in place and nothing is returned.

    Entries without a ``value`` are reported with a warning and skipped;
    previously the function went on to read ``element['value']`` and raised a
    ``KeyError`` right after logging.

    :param obj: object whose ``data`` mapping holds the record.
    :param eng: not used by this function.
    """
    # ``or ()`` also covers a key that is present but set to None.
    for element in obj.data.get("arxiv_eprints") or ():
        if 'value' not in element:
            logger.warning('arxiv_eprints value missing for article with doi: %s', get_first_doi(obj))
            # Without an id there is nothing to look up for this entry.
            continue

        if 'categories' not in element:
            element['categories'] = get_arxiv_categories(element['value'])
Example #7
def _get_arxiv_category_from_arxiv(item):
    """Try querying arXiv for the category.

    Since the arXiv id is not present at this point, filter by DOI first and
    fall back to the title.

    :param item: structure read through ``get_value`` with the keys ``'DOI'``
        and ``'title[0]'``.
    :return: the first category of the first query that yields any
        categories, or None if neither query does.
    """
    # (keyword argument of get_arxiv_categories, key to read from the item)
    field_list = (('doi', 'DOI'), ('title', 'title[0]'))
    for param, item_key in field_list:
        categories = get_arxiv_categories(**{param: get_value(item, item_key)})
        if categories:
            return categories[0]

    return None
Example #8
def add_arxiv_category(obj, eng):
    """Add arXiv categories fetched from arXiv.org.

    Every entry of ``obj.data['arxiv_eprints']`` that has no ``categories``
    key gets one, filled with the result of ``get_arxiv_categories`` for the
    entry's ``value``. Entries are updated in place and nothing is returned.

    Entries without a ``value`` are reported and skipped.

    :param obj: object whose ``data`` mapping holds the record.
    :param eng: not used by this function.
    """
    # ``or ()`` also covers a key that is present but set to None.
    for element in obj.data.get("arxiv_eprints") or ():
        if 'value' not in element:
            # NOTE(review): only the message text of this statement survived;
            # logging a warning with the article's first DOI and skipping the
            # entry is an assumption - confirm against the project history.
            logger.warning(
                'arxiv_eprints value missing for article with doi: %s' %
                get_first_doi(obj))
            continue

        if 'categories' not in element:
            element['categories'] = get_arxiv_categories(element['value'])
Example #9
def add_arxiv_category(obj, eng):
    """Add arXiv categories fetched from arXiv.org.

    For every entry of ``obj.data['report_numbers']`` the arXiv id is derived
    from the entry's ``value`` (an optional ``arXiv:`` prefix and a trailing
    version such as ``v2`` are removed). Missing ``categories`` and
    ``primary_category`` keys are then filled in place from
    ``get_arxiv_categories`` and ``get_arxiv_primary_category``.

    :param obj: object whose ``data`` mapping holds the record.
    :param eng: not used by this function.
    """
    prefix = "arxiv:"
    for element in obj.data.get("report_numbers", ()):
        arxiv_id = element['value']
        if arxiv_id.lower().startswith(prefix):
            arxiv_id = arxiv_id[len(prefix):]

        # Strip only a trailing "v<digits>" version marker. Splitting on the
        # first "v" would also cut ids whose archive name contains that
        # letter (e.g. "solv-int/9901001").
        base, marker, version = arxiv_id.rpartition('v')
        if marker and base and version.isdigit():
            arxiv_id = base

        if 'categories' not in element:
            element['categories'] = get_arxiv_categories(arxiv_id)

        if 'primary_category' not in element:
            element['primary_category'] = get_arxiv_primary_category(arxiv_id)
Example #10
def add_arxiv_category(obj, eng):
    """Add arXiv categories fetched from arXiv.org.

    Every entry of ``obj.data['arxiv_eprints']`` that has no ``categories``
    key gets one, filled in place with the result of
    ``get_arxiv_categories`` for the entry's ``value``.
    """
    if "arxiv_eprints" in obj.data:
        for element in obj.data.get("arxiv_eprints", ()):
            if 'value' not in element:
                    # NOTE(review): the call that should wrap this message, and
                    # the right-hand operand of ``%``, are missing from this
                    # excerpt - restore them before use.
                    'arxiv_eprints value missing for article with doi: %s' %

            arxiv_id = element['value']

            if 'categories' not in element:
                categories = get_arxiv_categories(arxiv_id)
                if not categories:
                        # NOTE(review): the opening of the call receiving
                        # (message, eng) is missing here; presumably it reports
                        # the failed lookup through the engine - TODO confirm.
                        'Could not determine arXiv category based on id.', eng)

                # Stored even when the lookup returned nothing.
                element['categories'] = categories
Example #11
    def proc(article_impact):
        # Store an arXiv category under details['arxiv_primary_category'] of
        # the given article_impact, looked up through the record with the same
        # control number.
        #
        # NOTE(review): this excerpt has lost several lines - the ``try:``
        # matching the ``except`` clauses below, the bodies of two ``if``
        # statements, an ``else:`` and the body of the first ``except``.
        # Restore them before use.
            # NOTE(review): body missing; presumably skips entries that already
            # carry a category - TODO confirm.
            if 'arxiv_primary_category' in article_impact.details:

            pid = PersistentIdentifier.get('recid', article_impact.control_number)
            record = Record.get_record(pid.object_uuid)

            # NOTE(review): body missing - TODO confirm what happens without a record.
            if not record:

            if 'arxiv_eprints' in record:
                info('%d: eprints found' % article_impact.control_number)
                # Takes the part after the first ':' of the first eprint value
                # and cuts it at the first 'v'. A value without ':' raises
                # IndexError here, which the handlers below do not catch.
                arxiv = (record['arxiv_eprints'][0]['value'].split(':')[1]).split('v')[0]
                # Raises IndexError when no category is returned.
                cat = get_arxiv_categories(arxiv)[0]
                info('category: %s' % cat)
                if cat:
                    article_impact.details['arxiv_primary_category'] = cat
                    # Tell the ORM that the 'details' attribute changed in place.
                    flag_modified(article_impact, 'details')

            elif 'report_numbers' in record:
                info('%d: report_numbers found' % article_impact.control_number)
                cat = get_arxiv_primary_category(record)
                info('category: %s' % cat)
                if cat:
                    article_impact.details['arxiv_primary_category'] = cat
                    flag_modified(article_impact, 'details')

                # NOTE(review): as written this runs inside the ``elif`` branch;
                # presumably an ``else:`` preceded it - TODO confirm.
                error('%d: no arxiv' % article_impact.control_number)

        except PIDDoesNotExistError:
            # records imported from Inspire won't be found
        except AttributeError as e:
            error('%d: %s' % (article_impact.control_number, e))
Example #12
def map_old_record(record, dry_run):
    """Map the given record, if needed, to comply with the new schema.

    The following fields will be mapped:
     - page_nr will be a list of integers instead of a list of strings
     - arxiv id will be put into the arxiv_eprints field
     - arxiv categories will be added if not yet present
     - "arxiv:" prefix will be removed from arxiv id
     - record_creation_date will be converted to iso format

    The following fields will be deleted at the end of the process:
     - _collections
     - report_numbers
     - files
     - local_files
     - free_keywords
     - additional_files
     - file_urls
     - earliest_date

    The result won't be saved and None will be returned in the following cases:
     - the record doesn't contain a json
     - a record fails the validation after mapping (or has no $schema)
     - both report_numbers and arxiv_eprints fields are present (shouldn't happen in the existing records)
     - there is more than one value in report_numbers field (shouldn't happen in the existing records)
     - report_numbers field is present, but there is no source subfield
     - report_numbers field is present, but holds no arXiv id
     - no record_creation_date is present

    ``record.json`` is modified in place as the mapping proceeds, also when
    ``dry_run`` is set or when None is returned part-way through.

    NOTE(review): the early ``return None`` exits, the ``else``/``try``
    branches and the list of unwanted fields were restored from the cases
    listed above - confirm against the project history.

    :param record: object exposing the record content as ``record.json``.
    :param dry_run: when true, ``json`` is not flagged as modified.
    :return: the mapped record, or None if the record has to be skipped.
    """
    # if there is no json, the record is considered deleted
    if not record.json:
        rerror('no json', record)
        return None

    # page_nr to list of integers
    if 'page_nr' in record.json:
        record.json['page_nr'] = [int(x) for x in record.json['page_nr']]

    # extract arxiv from report_numbers if present
    if "report_numbers" in record.json and "arxiv_eprints" in record.json:
        rerror('both report_numbers and arxiv_eprints are present. Skip record.', record)
        return None

    if "report_numbers" in record.json:
        if len(record.json["report_numbers"]) > 1:
            rerror('report_numbers has more then one element. Skip record.', record)
            return None

        arxiv_id = None
        for element in record.json.get("report_numbers", ()):
            source = element.get('source')
            if not source:
                rerror('report_numbers present, but no source. Skip record.', record)
                return None

            if source.lower() == 'arxiv':
                arxiv_id = element.get('value')

        if arxiv_id:
            arxiv_id = arxiv_id.lower().replace('arxiv:', '')
            record.json['arxiv_eprints'] = [{'value': arxiv_id}]
            rinfo('report_numbers -> arxiv_eprints', record)
        else:
            rerror('report_numbers present, but no arxiv id? Skip record.', record)
            return None

    # add arxiv category if not yet present
    for element in record.json.get("arxiv_eprints", ()):
        if 'value' not in element:
            rerror('arxiv_eprints value missing', record)
            # Nothing to look up for this entry; validation below decides
            # whether the record is still acceptable.
            continue

        arxiv_id = element['value']

        # remove arxiv prefix if present
        if arxiv_id.lower().startswith('arxiv:'):
            rinfo('removing "arxiv:" prefix', record)
            arxiv_id = arxiv_id[len('arxiv:'):]
            # Write the stripped id back so the stored value actually loses
            # the prefix, as announced in the log message.
            element['value'] = arxiv_id

        if 'categories' not in element:
            element['categories'] = get_arxiv_categories(arxiv_id)

    # record_creation_date to isoformat
    record_creation_date = record.json.get('record_creation_date')
    if record_creation_date is None:
        rerror('no record creation date. Skip record.', record)
        return None

    new_date = parse_date(record_creation_date).isoformat()
    if new_date != record_creation_date:
        rinfo('update record_creation_date: %s -> %s' % (record_creation_date, new_date), record)
        record.json['record_creation_date'] = new_date

    # delete unwanted fields
    unwanted_fields = (
        '_collections',
        'report_numbers',
        'files',
        'local_files',
        'free_keywords',
        'additional_files',
        'file_urls',
        'earliest_date',
    )
    for key in unwanted_fields:
        if record.json.pop(key, None) is not None:
            rinfo('deleted %s field' % key, record)

    # validate record
    valid = False
    schema = record.json.get('$schema')
    if schema is not None:
        # The schema is downloaded from the URL stored in the record itself.
        schema_data = requests_retry_session().get(schema).content
        schema_data = json.loads(schema_data)

        try:
            validate(record.json, schema_data)
            valid = True
        except ValidationError as err:
            rerror('Invalid record: %s' % err, record)
        except SchemaError as err:
            rerror('SchemaError during record validation! %s' % err, record)
    else:
        rerror('No schema found!', record)

    if not valid:
        return None

    # mark changes if not dry_run
    if not dry_run:
        flag_modified(record, 'json')

    return record
Example #13
def japanise():
    size = 100

    def get_query(start_index, size):
        return {
            '_source': ['authors', 'control_number', 'dois', 'publication_info', 'report_numbers', 'arxiv_eprints'],
            'from': start_index,
            'size': size,
            'query': {
                'term': {
                    'country': 'Japan'

    def get_arxiv(data):
        if 'report_numbers' in data:
            for r in data['report_numbers']:
                if r['source'] == 'arXiv':
                    return r['value'].split(':')[1]
            error('no arxiv? %s' % data['control_number'])
        if 'arxiv_eprints' in data:
            return data['arxiv_eprints'][0]['value'].split(':')[1]

        return ''

    index = 0
    total = None

    header = ['year', 'journal', 'doi', 'arxiv number', 'primary arxiv category', 'affiliaton',
              'authors with affiliation', 'total number of authors']
    si = StringIO()
    cw = csv.writer(si, delimiter=";")

    while total is None or index < total:
        search_results = es.search(index='records-record',
                                   body=get_query(index, size))
        total = search_results['hits']['total']
        info("%s/%s" % (index, total))
        index += size

        for hit in search_results['hits']['hits']:
            data = hit['_source']

            year = data['publication_info'][0]['year']
            journal = data['publication_info'][0]['journal_title']
            doi = data['dois'][0]['value']
            arxiv = get_arxiv(data)
            arxiv_category = get_arxiv_categories(arxiv)[0] if arxiv else ''

            total_authors = len(data['authors'])

            extracted_affiliations = {}
            for author in data['authors']:
                if 'affiliations' not in author:
                    error('no affiliations for author. %s' % doi)

                for aff in author['affiliations']:
                    if aff['country'] == 'Japan':
                        value = aff['value']
                        if value not in extracted_affiliations:
                            extracted_affiliations[value] = 0
                        extracted_affiliations[value] += 1

            if not extracted_affiliations:
                error('no extracted affs')

            for aff, count in extracted_affiliations.items():
                cw.writerow([year, journal, doi, arxiv, arxiv_category, aff.encode('utf8'), count, total_authors])

    with open('/tmp/japanise.csv', 'wt') as f: