Example #1
def attach_files(obj, eng):
    if 'files' in obj.extra_data:
        recid = obj.data['control_number']
        pid = PersistentIdentifier.get('recid', recid)
        existing_record = Record.get_record(pid.object_uuid)

        if '_files' not in existing_record or not existing_record['_files']:
            bucket = Bucket.create()
            RecordsBuckets.create(record=existing_record.model, bucket=bucket)

        for file_ in obj.extra_data['files']:
            if file_['url'].startswith('http'):
                data = requests_retry_session().get(file_['url'], headers=file_.get('headers', {}))
                f = StringIO(data.content)
            else:
                f = open(file_['url'])

            existing_record.files[file_['name']] = f
            existing_record.files[file_['name']]['filetype'] = file_['filetype']

        obj.save()
        existing_record.commit()
        db.session.commit()
    else:
        __halt_and_notify('No files found.', eng)
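Every example here goes through requests_retry_session() instead of calling requests directly. The helper itself is defined elsewhere in the codebase; a minimal sketch of such a session factory, built on urllib3's Retry and a mounted HTTPAdapter (the retry counts, backoff and status codes below are assumptions), looks like this:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def requests_retry_session(retries=3, backoff_factor=0.3,
                           status_forcelist=(500, 502, 504), session=None):
    # Build (or reuse) a Session whose HTTP and HTTPS adapters retry
    # failed connections and the listed status codes with a backoff.
    session = session or requests.Session()
    retry = Retry(total=retries, connect=retries, read=retries,
                  backoff_factor=backoff_factor,
                  status_forcelist=status_forcelist)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session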
Example #2
def attach_files(obj, eng):
    if 'files' in obj.extra_data:
        recid = obj.data['control_number']
        pid = PersistentIdentifier.get('recid', recid)
        existing_record = Record.get_record(pid.object_uuid)

        if '_files' not in existing_record or not existing_record['_files']:
            bucket = Bucket.create()
            RecordsBuckets.create(record=existing_record.model, bucket=bucket)

        for file_ in obj.extra_data['files']:
            if file_['url'].startswith('http'):
                headers = file_.get('headers', {})
                data = requests_retry_session().get(file_['url'],
                                                    headers=headers)

                if data.status_code != 200:
                    __halt_and_notify(
                        "Error during acquiring files.\nHTTP status: %d\nUrl: %s\nHeaders:%s"
                        % (data.status_code, file_['url'], headers), eng)

                f = StringIO(data.content)
            else:
                f = open(file_['url'])

            existing_record.files[file_['name']] = f
            existing_record.files[
                file_['name']]['filetype'] = file_['filetype']

        obj.save()
        existing_record.commit()
        db.session.commit()
    else:
        __halt_and_notify('No files found.', eng)
Example #3
def repos_diff():
    OLD_REPO_FILE = '/tmp/old_repo_dump4'
    OLD_REPO_URL = 'https://repo.scoap3.org/search?p=&of=recjson&ot=recid,doi,creation_date&rg=100000000'
    COOKIES = {
        'INVENIOSESSION': 'd3c673cf6be468dc6c6fd25703ff90c3',
        'INVENIOSESSIONstub': 'HTTPS',
        '_pk_id.10.1cdf': 'ff8bdd9962372712.1536586766.49.1546956598.1546955767.'
    }
    RESULT_FILE = '/tmp/repo_diff_result9'

    if not isfile(OLD_REPO_FILE):
        info('No old repo file (%s), downloading...' % OLD_REPO_FILE)
        data = requests_retry_session().get(OLD_REPO_URL, cookies=COOKIES).json()
        info('download complete (%d records), mapping...' % len(data))

        if len(data) < 1000:
            error('Aborting, not all records were queried.')
            return

        mapped_data = {}
        for r in data:
            doi = r.pop('doi')
            if doi in mapped_data:
                error('Multiple records with the same doi: %s' % r)
            mapped_data[doi] = r

        info('mapping complete, saving file...')
        with open(OLD_REPO_FILE, 'wt') as f:
            f.write(json.dumps(mapped_data))

        info('File saved.')

    info('reading old repo data from: %s' % OLD_REPO_FILE)
    with open(OLD_REPO_FILE, 'rt') as f:
        old_data = json.loads(f.read())

    result = dict(only_in_old=[],
                  only_in_new=[],
                  in_both=[])

    def proc(record):
        if not record.json:
            return

        doi = get_first_doi(record.json)
        if doi in old_data:
            result['in_both'].append(doi)
            old_data.pop(doi)
        else:
            result['only_in_new'].append(doi)

    process_all_records(proc)

    result['only_in_old'] = list(old_data)
    with open(RESULT_FILE, 'wt') as f:
        f.write(json.dumps(result, indent=2))

    info('only_in_old: %s\nonly_in_new: %s\nin_both: %s\nALL DONE.' % (
        len(result['only_in_old']), len(result['only_in_new']), len(result['in_both'])))
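proc() above relies on a get_first_doi() helper that is not part of the excerpt. A plausible sketch, assuming records keep their DOIs in a dois list of {'value': ...} objects (an assumption about the record layout):

def get_first_doi(record_json):
    # Hypothetical helper: return the first DOI attached to the record.
    return record_json['dois'][0]['value']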
Example #4
    def proc(ai):
        try:
            PersistentIdentifier.get('recid', ai.control_number)
        except PIDDoesNotExistError:
            api_response = requests_retry_session().get(crossref_url % ai.doi)
            if api_response.status_code != 200:
                error('Failed to query crossref for doi: %s. Error code: %s' % (ai.doi, api_response.status_code))
                result['not200'].append(ai.control_number)
                return None

            title = api_response.json()['message']['title'][0].lower()

            if 'addendum' in title or 'corrigendum' in title or 'erratum' in title:
                result['hit'].append((ai.control_number, title))
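The crossref_url template is defined outside this excerpt. Given the `crossref_url % ai.doi` call, it presumably wraps the public Crossref works endpoint, something like:

# Assumed shape of the module-level template; the endpoint itself is
# the documented Crossref REST API.
crossref_url = 'https://api.crossref.org/works/%s'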
Example #5
def get_record_date(doi):
    crossref_url = current_app.config.get('CROSSREF_API_URL')

    api_response = requests_retry_session().get(crossref_url + doi)
    if api_response.status_code != 200:
        current_app.logger.error('Failed to query crossref for doi: %s. Error code: %s' % (doi, api_response.status_code))
        return None

    message = api_response.json()['message']
    if 'published-online' in message:
        parts = message['published-online']['date-parts'][0]
        # if we don't have month or day substitute it with 1
        if len(parts) < 3:
            parts.extend([1] * (3 - len(parts)))
        return datetime(*parts)

    return datetime.fromtimestamp(message['created']['timestamp'] // 1000)
Example #6
def get_record_date(doi):
    crossref_url = current_app.config.get('CROSSREF_API_URL')

    api_response = requests_retry_session().get(crossref_url % doi)
    if api_response.status_code != 200:
        current_app.logger.error('Failed to query crossref for doi: %s. Error code: %s' % (doi, api_response.status_code))
        return None

    message = api_response.json()['message']
    if 'published-online' in message:
        parts = message['published-online']['date-parts'][0]
        # if we don't have month or day substitute it with 1
        if len(parts) < 3:
            parts.extend([1] * (3 - len(parts)))
        return datetime(*parts)

    return datetime.fromtimestamp(message['created']['timestamp'] // 1000)
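The date-parts padding above is easy to misread, so here it is in isolation: Crossref may return [year], [year, month] or [year, month, day], and any missing part is substituted with 1 before the datetime is built.

from datetime import datetime

parts = [2019, 5]                     # year and month only, day missing
parts.extend([1] * (3 - len(parts)))  # pad to [2019, 5, 1]
assert datetime(*parts) == datetime(2019, 5, 1)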
Example #7
def __get_country(search_text):
    """Return the country of the search text based on Google Maps."""

    GOOGLE_MAPS_API_URL = 'https://maps.googleapis.com/maps/api/geocode/json'

    params = {
        'address': search_text,
        'language': 'en',
        'key': current_app.config.get('GOOGLE_API_KEY', '')
    }

    req = requests_retry_session().get(GOOGLE_MAPS_API_URL, params=params, timeout=1).json()

    if 'status' in req:
        if req['status'].lower() == 'ok':
            country = __get_country_from_results(req)
            return COUNTRIES_DEFAULT_MAPPING.get(country, country)

    return None
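__get_country_from_results() is defined elsewhere. A plausible sketch, assuming the standard Geocoding API response layout (results[].address_components[] with long_name and types):

def __get_country_from_results(json_response):
    # Hypothetical helper: return the long name of the first address
    # component typed as 'country' anywhere in the results.
    for result in json_response.get('results', []):
        for component in result.get('address_components', []):
            if 'country' in component.get('types', []):
                return component.get('long_name')
    return None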
Example #8
def validate_record(obj, eng):
    """
    Validate record based on its schema.

    If there is no schema or the record is invalid, the workflow will be halted.
    """

    if '$schema' not in obj.data:
        __halt_and_notify('No schema found!', eng)
        return

    schema_data = requests_retry_session().get(obj.data['$schema']).content
    schema_data = json.loads(schema_data)

    try:
        validate(obj.data, schema_data)
    except ValidationError as err:
        __halt_and_notify('Invalid record: %s' % err, eng)
    except SchemaError as err:
        __halt_and_notify('SchemaError during record validation! %s' % err, eng)
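The validate() call comes from the jsonschema package. A minimal self-contained illustration of the success and failure paths this workflow step relies on:

from jsonschema import validate, ValidationError

schema = {
    'type': 'object',
    'properties': {'control_number': {'type': 'string'}},
    'required': ['control_number'],
}

validate({'control_number': '12345'}, schema)  # passes silently

try:
    validate({}, schema)                       # missing required field
except ValidationError as err:
    print('Invalid record: %s' % err.message)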
Example #9
def attach_file(control_number, file_path, file_type, filename):
    """
    Attach a file to an already existing record.

    The file path can point to a local file; the http and https protocols are also supported. Sending
    specific headers over these protocols is not supported, so make sure the website doesn't require any.

    In case the record already has a file with the given filename, it will be overwritten.
    """

    # get existing record
    try:
        api_record = APIRecord.get_record(
            PersistentIdentifier.get('recid', control_number).object_uuid)
    except (PIDDoesNotExistError, NoResultFound):
        error('No record found for given control number!')
        return

    # read and attach file
    if file_path.startswith('http://') or file_path.startswith('https://'):
        data = requests_retry_session().get(file_path)
        if data.status_code != 200:
            error('Could not download file. Status code: %d' %
                  data.status_code)
            return

        file_data = StringIO(data.content)
        if not attach_file_object(api_record, filename, file_type, file_data):
            return
    else:
        try:
            with open(file_path) as f:
                if not attach_file_object(api_record, filename, file_type, f):
                    return
        except IOError:
            error('local file was not found or not readable: %s' % file_path)
            return

    api_record.commit()
    db.session.commit()
    info('File successfully attached.')
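attach_file_object() is not part of this excerpt. Judging from Examples 1 and 2, a hypothetical sketch of such a helper (including its assumed return convention) could look like:

def attach_file_object(api_record, filename, file_type, file_data):
    # Hypothetical helper mirroring Examples 1 and 2: store the stream
    # under the given name, tag it with its filetype, and report failure
    # so the caller can abort before committing.
    try:
        api_record.files[filename] = file_data
        api_record.files[filename]['filetype'] = file_type
        return True
    except Exception as err:
        error('Could not attach file %s: %s' % (filename, err))
        return False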
Example #10
def get_arxiv_categories(arxiv_id=None, title=None, doi=None):
    """
    Return a list of arxiv categories based on the specified arXiv identifier and/or title and/or doi.

    The identifier and title (if given) are both forwarded to the arXiv api.

    First element of the returned list is the primary category.
    In case categories cannot be found, empty list is returned.
    """

    if arxiv_id is None and title is None and doi is None:
        raise ValueError('At least one of the arxiv_id, title and doi parameters must not be None.')

    # make sure we have a clean arxiv number
    arxiv_id = clean_arxiv(arxiv_id)

    query = []
    if arxiv_id:
        query.append('id:%s' % arxiv_id)

    if title:
        title = title.replace('-', '?').encode('ascii', 'replace')
        query.append('ti:"%s"' % title)

    if doi:
        query.append('doi:"%s"' % doi)

    request_url = url.format(' '.join(query))
    data = requests_retry_session().get(request_url)

    categories = []
    if data.status_code == 200:
        xml = etree.fromstring(data.content)
        categories = get_arxiv_categories_from_response_xml(xml)
        if not categories:
            logger.warning('Could not get arxiv categories for id="%s" title="%s" doi="%s"' % (arxiv_id, title, doi))
    else:
        logger.error('Got status_code %s from arXiv when looking for categories for id="%s" title="%s" doi="%s"' % (
            data.status_code, arxiv_id, title, doi))

    return categories
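The module-level url template is outside this excerpt. Given the url.format(' '.join(query)) call and the Atom parsing that follows, it presumably wraps the public arXiv export API, roughly:

# Assumed shape of the template (the endpoint itself is the documented
# arXiv export API).
url = 'http://export.arxiv.org/api/query?search_query={0}'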
Example #11
def get_crossref_items(filter_param=None):
    crossref_url = current_app.config.get('CROSSREF_API_URL')

    params = {'filter': filter_param, 'cursor': '*'}

    while True:
        api_response = requests_retry_session().get(crossref_url,
                                                    params=params)

        if api_response.status_code != 200:
            logger.error('Failed to query crossref. params: %s' % params)
            break

        message = api_response.json()['message']

        items = message.get('items')
        if not items:
            break

        for item in items:
            yield item

        params['cursor'] = message.get('next-cursor')
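Since get_crossref_items() is a generator driving Crossref's cursor-based deep paging, it can be consumed lazily. A hypothetical usage with a regular Crossref filter expression:

# Iterate everything updated since the start of 2019; iteration stops
# as soon as a page comes back empty.
for item in get_crossref_items('from-update-date:2019-01-01'):
    print(item.get('DOI'))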
Example #12
def get_arxiv_categories(arxiv_id):
    """
    Return a list of arxiv categories for specified arXiv identifier.
    First element of the list is the primary category.
    In case categories cannot be found, empty list is returned.
    """

    # make sure we have a clean arxiv number
    arxiv_id = clean_arxiv(arxiv_id)

    data = requests_retry_session().get(url.format(arxiv_id))

    categories = []
    if data.status_code == 200:
        xml = etree.fromstring(data.content)
        primary_category = xml.xpath('//arxiv:primary_category/@term', namespaces=xml_namespaces)

        if not primary_category:
            logger.error('Arxiv did not return primary category for id: %s' % arxiv_id)
            return categories

        if len(primary_category) > 1:
            logger.error('Arxiv returned %d primary categories for id: %s' % (len(primary_category), arxiv_id))

        secondary_categories = xml.xpath('//w3:category/@term', namespaces=xml_namespaces)

        # remove primary category from secondary category list, if exists
        try:
            secondary_categories.remove(primary_category[0])
        except ValueError:
            logger.warning('Primary arxiv category not present in secondary categories for arxiv: %s' % arxiv_id)
        categories = primary_category + secondary_categories

    else:
        logger.error('Got status_code %s from arXiv when looking for categories for %s' % (data.status_code, arxiv_id))

    return categories
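The xml_namespaces mapping used by the xpath calls is defined elsewhere. Matching the prefixes above against the standard arXiv Atom feed, it is presumably:

# Assumed namespace map: 'w3' for the Atom namespace the feed entries
# live in, 'arxiv' for arXiv's own extension elements.
xml_namespaces = {
    'w3': 'http://www.w3.org/2005/Atom',
    'arxiv': 'http://arxiv.org/schemas/atom',
}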
Example #13
def get_inspire_records(query):
    url = current_app.config.get('INSPIRE_LITERATURE_API_URL')
    data = requests_retry_session().get(url, params={'q': query})

    return data.json()['hits']['hits']
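A hypothetical call, assuming INSPIRE_LITERATURE_API_URL points at the public https://inspirehep.net/api/literature endpoint, whose hits carry a metadata object:

# Fetch everything matching an arXiv identifier and print the titles.
for hit in get_inspire_records('arxiv:1811.06024'):
    print(hit['metadata'].get('titles', [{}])[0].get('title'))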
Example #14
def map_old_record(record, dry_run):
    """
    Map the given record, if needed, to comply with the new schema.

    The following fields will be mapped:
     - page_nr will be a list of integers instead of a list of strings
     - arxiv id will be put to the arxiv_eprints field
     - arxiv categories will be added if not yet present
     - "arxiv:" prefix will be removed from arxiv id
     - record_creation_date will be converted to iso format

    The following fields will be deleted at the end of the process:
     - _collections
     - report_numbers
     - files
     - local_files
     - free_keywords
     - additional_files
     - file_urls
     - earliest_date

    The result won't be saved and None will be returned in the following cases:
     - the record doesn't contain a json
     - a record fails the validation after mapping
     - both report_numbers and arxiv_eprints fields are present (shouldn't happen in the existing records)
     - there is more than one value in the report_numbers field (shouldn't happen in the existing records)
     - report_numbers field is present, but there is no source subfield
     - no record_creation_date is present
    """

    # if there is no json, the record is considered deleted
    if not record.json:
        rerror('no json', record)
        return

    # page_nr to list of integers
    if 'page_nr' in record.json:
        record.json['page_nr'] = [int(x) for x in record.json['page_nr']]

    # extract arxiv from report_numbers if present
    if "report_numbers" in record.json and "arxiv_eprints" in record.json:
        rerror('both report_numbers and arxiv_eprints are present. Skip record.', record)
        return

    if "report_numbers" in record.json:
        if len(record.json["report_numbers"]) > 1:
            rerror('report_numbers has more than one element. Skip record.', record)
            return

        arxiv_id = None
        for element in record.json.get("report_numbers", ()):
            source = element.get('source')
            if not source:
                rerror('report_numbers present, but no source. Skip record.', record)
                return

            if source.lower() == 'arxiv':
                arxiv_id = element.get('value')
                break

        if arxiv_id:
            arxiv_id = arxiv_id.lower().replace('arxiv:', '')
            record.json['arxiv_eprints'] = [{'value': arxiv_id}]
            rinfo('report_numbers -> arxiv_eprints', record)
        else:
            rerror('report_numbers present, but no arxiv id? Skip record.', record)
            return

    # add arxiv category if not yet present
    if "arxiv_eprints" in record.json:
        for element in record.json.get("arxiv_eprints", ()):
            if 'value' not in element:
                rerror('arxiv_eprints value missing', record)
                continue

            arxiv_id = element['value']

            # remove arxiv prefix if present
            if arxiv_id.lower().startswith('arxiv:'):
                rinfo('removing "arxiv:" prefix', record)
                arxiv_id = arxiv_id[len('arxiv:'):]

            if 'categories' not in element:
                categories = get_arxiv_categories(arxiv_id)
                element['categories'] = categories

    # record_creation_date to isoformat
    record_creation_date = record.json.get('record_creation_date')
    if record_creation_date is None:
        rerror('no record creation date. Skip record.', record)
        return

    new_date = parse_date(record_creation_date).isoformat()
    if new_date != record_creation_date:
        rinfo('update record_creation_date: %s -> %s' % (record_creation_date, new_date), record)
        record.json['record_creation_date'] = new_date

    # delete unwanted fields
    unwanted_fields = (
        '_collections',
        'report_numbers',
        'files',
        'local_files',
        'free_keywords',
        'additional_files',
        'file_urls',
        'earliest_date',
    )
    for key in unwanted_fields:
        if record.json.pop(key, None) is not None:
            rinfo('deleted %s field' % key, record)

    # validate record
    valid = False
    schema = record.json.get('$schema')
    if schema is not None:
        schema_data = requests_retry_session().get(schema).content
        schema_data = json.loads(schema_data)

        try:
            validate(record.json, schema_data)
            valid = True
        except ValidationError as err:
            rerror('Invalid record: %s' % err, record)
        except SchemaError as err:
            rerror('SchemaError during record validation! %s' % err, record)
    else:
        rerror('No schema found!', record)

    if not valid:
        return

    # mark changes if not dry_run
    if not dry_run:
        flag_modified(record, 'json')

    return record
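map_old_record() only flags the changed json via flag_modified(); persisting it is left to the caller. A hypothetical driver in the style of Example 3's process_all_records(), committing only outside a dry run:

def migrate_old_records(dry_run=True):
    # Hypothetical wrapper: map every record, persist the session only
    # when this is not a dry run.
    def proc(record):
        mapped = map_old_record(record, dry_run)
        if mapped is not None and not dry_run:
            db.session.commit()
    process_all_records(proc)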