Example #1
def get_country_for_aff(x_aff):
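    """Return the country for an affiliation XML node, or 'HUMAN CHECK' if it cannot be determined."""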
    # The XML may contain other representations for certain organizations.
    ORGS = (
        'CERN',
        'JINR',
    )

    organizations = [
        c.childNodes[0].nodeValue
        for c in x_aff.getElementsByTagName('sa:organization')
    ]
    common = set(organizations).intersection(ORGS)
    if common:
        return common.pop()

    country = x_aff.getElementsByTagName('sa:country')
    if country:
        return country[0].childNodes[0].nodeValue

    info('No country in XML. Falling back to google maps.')
    country = get_country(
        x_aff.getElementsByTagName('ce:textfn')[0].childNodes[0].nodeValue)
    if country:
        return country

    error('Google didn\'t help.')
    return 'HUMAN CHECK'
Example #2
def springer():
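    # find which harvested Springer archives contain the DOIs that exist only in the old repo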
    DIR = 'JHEP/'
    EXT = ('.xml.Meta', '.xml.scoap')
    BASE_DIR = '/eos/project/s/scoap3repo/BETA/harvesting/Springer/download/' + DIR
    zip_list = listdir(BASE_DIR)

    with open('/tmp/repo_diff_result2', 'r') as f:
        needed_dois = json.load(f)['only_in_old']

    extracted_dois = {}
    for file in zip_list:
        full_path = BASE_DIR + file
        if isfile(full_path) and full_path.endswith('.zip'):
            try:
                zip_file = ZipFile(full_path)
                for zip_element in zip_file.infolist():
                    fn = zip_element.filename
                    if fn.endswith(EXT):
                        xml = parseString(zip_file.read(zip_element))
                        doi = xml.getElementsByTagName('ArticleDOI')[0].firstChild.nodeValue
                        if doi in needed_dois:
                            if full_path not in extracted_dois:
                                extracted_dois[full_path] = []
                            extracted_dois[full_path].append(doi)
            except BadZipfile as e:
                error('file %s: %s' % (file, e))

    info('%s' % json.dumps(extracted_dois, indent=2))
Example #3
    def proc(record):
        try:
            if 'authors' not in record.json:
                error('no authors for record %s' % record.json['control_number'])
                return

            for author_index, author_data in enumerate(record.json['authors']):
                if 'affiliations' not in author_data:
                    error('no affiliations for record %s' % record.json['control_number'])
                    continue

                for aff_index, aff_data in enumerate(author_data['affiliations']):
                    counts['all'] += 1

                    new_country = find_country(aff_data['value'])
                    if aff_data['country'] != new_country:
                        counts['changed'] += 1

                        info('Changed country for record with id %s from %s to %s' %
                             (record.json['control_number'], aff_data['country'], new_country))
                        record.json['authors'][author_index]['affiliations'][aff_index]['country'] = new_country

            if not dry_run:
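                # mark the JSON column as modified so SQLAlchemy persists the in-place changes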
                flag_modified(record, 'json')
        except Exception as e:
            error(str(e))
Example #4
def proc(r):
    for k, v in dict(r.results).iteritems():
        new_k = COUNTRIES_DEFAULT_MAPPING.get(k, k)
        if k != new_k:
            info('%d: %s => %s' % (r.control_number, k, new_k))
            r.results[new_k] = v
            r.results.pop(k)
            flag_modified(r, 'results')
Example #5
def hotfix_country_mapping_in_article_impacts():
    def proc(r):
        for k, v in dict(r.results).iteritems():
            new_k = COUNTRIES_DEFAULT_MAPPING.get(k, k)
            if k != new_k:
                info('%d: %s => %s' % (r.control_number, k, new_k))
                r.results[new_k] = v
                r.results.pop(k)
                flag_modified(r, 'results')

    process_all_articles_impact(proc)
    info('ALL DONE')
Example #6
    def proc_delete(record):
        to_delete = []
        for i, a in enumerate(record.json['authors']):
            s = sum(map(bool, a.values()))
            if s == 0:
                to_delete.append(i)

        if to_delete:
            # delete from the end so earlier deletions don't shift the remaining indices
            for d in reversed(to_delete):
                del record.json['authors'][d]
            flag_modified(record, 'json')
        info('DELETE %d authors' % len(to_delete))
Example #7
def process_all_records(function, chunk_size=50, control_ids=(), *args):
    """
    Calls the 'function' for all records.
    If 'control_ids' is set to a non-empty list, only those records will be processed.
    :param function: Function to be called for each record. First parameter will be a RecordMetadata object.
    :param chunk_size: How many records should be queried at once from the db.
    :param control_ids: Control ids of records. If set to a non-empty list, it will be used to filter records.
    :param args: Args to be passed to 'function'.
    """
    info('gathering records...')

    # query ids from all records
    record_ids = RecordMetadata.query.with_entities(RecordMetadata.id)

    # filter records
    if control_ids:
        info('applying filter for records...')
        uuids = [
            PersistentIdentifier.get('recid', recid).object_uuid
            for recid in control_ids
        ]
        record_ids = record_ids.filter(RecordMetadata.id.in_(uuids))

    # get record ids
    record_ids = [r[0] for r in record_ids.all()]
    records_count = len(record_ids)
    processed = 0
    info('start processing %d records...' % records_count)

    # process record chunks
    for i in range((records_count // chunk_size) + 1):
        # calculate chunk start and end position
        ixn = i * chunk_size
        current_ids = record_ids[ixn:ixn + chunk_size]

        # process current chunk
        for record in RecordMetadata.query.filter(
                RecordMetadata.id.in_(current_ids)):
            try:
                function(record, *args)
            except Exception:
                raise  # TODO Should we handle anything here, or just stop the whole process?
            processed += 1

        # committing processed records
        info('partial commit...')
        db.session.commit()

        info('%s records processed.' % processed)

    # have we processed everything?
    assert (processed == records_count)
Example #8
def utf8(ids):
    """Unescape records and store data as unicode."""
    def proc(record):
        if record.json is None:
            rerror('record.json is None', record)
            return
        record.json = utf8rec(record.json)
        flag_modified(record, 'json')

    if ids:
        ids = ids.split(',')

    process_all_records(proc, control_ids=ids)
    info('all done!')
Example #9
def extract_year_from_record_creation():
    def proc(record):
        if not record.json:
            rerror('no json.', record)
            return

        if 'record_creation_year' not in record.json:
            date = parse_date(record.json['record_creation_date'])
            if not date:
                rerror("Date couldn't be parsed: %s" % record.json['record_creation_date'], record)
                return

            record.json['record_creation_year'] = date.year
            flag_modified(record, 'json')

    process_all_records(proc)
    info('ALL DONE')
Example #10
def check_authors():
    RESULT_FILE = '/tmp/check_authors'
    result = {
        'null': set(),
        'noauth': set(),
        'noaff': set(),
        'nocountry': set(),
        'empty_aff': set()
    }

    def proc(record):
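        # classify the record by the first problem found in its author data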
        key = ''
        if not record.json:
            key = 'null'
        elif 'authors' not in record.json:
            key = 'noauth'
        else:
            for a in record.json['authors']:
                if 'affiliations' not in a:
                    key = 'noaff'
                elif not a['affiliations']:
                    key = 'empty_aff'
                else:
                    for aff in a['affiliations']:
                        if 'country' not in aff:
                            key = 'nocountry'
                            break
                # stop at the first author with a problem
                if key:
                    break

        if key:
            result[key].add(record.id)

    process_all_records(proc)

    # snapshot the items, since new keys are added to 'result' inside the loop
    for k, v in list(result.items()):
        pids = PersistentIdentifier.query\
            .filter(PersistentIdentifier.pid_type == 'recid')\
            .filter(PersistentIdentifier.object_uuid.in_(v)).all()
        result[k+'_c'] = map(lambda x: x.pid_value, pids)
        result[k] = map(six.text_type, v)

    result_str = json.dumps(result, indent=2)
    with open(RESULT_FILE, 'wt') as f:
        f.write(result_str)
    info(result_str)
    info('DONE')
Example #11
def update_countries(dry_run, ids):
    """
    Updates countries for articles that are marked with the given parameter. Countries are determined with the Google Maps API.
    """

    counts = {'changed': 0, 'all': 0}

    if ids:
        ids = ids.split(',')

    def proc(record):
        try:
            if 'authors' not in record.json:
                error('no authors for record %s' % record.json['control_number'])
                return

            for author_index, author_data in enumerate(record.json['authors']):
                if 'affiliations' not in author_data:
                    error('no affiliations for record %s' % record.json['control_number'])
                    continue

                for aff_index, aff_data in enumerate(author_data['affiliations']):
                    counts['all'] += 1

                    new_country = find_country(aff_data['value'])
                    if aff_data['country'] != new_country:
                        counts['changed'] += 1

                        info('Changed country for record with id %s from %s to %s' %
                             (record.json['control_number'], aff_data['country'], new_country))
                        record.json['authors'][author_index]['affiliations'][aff_index]['country'] = new_country

            if not dry_run:
                flag_modified(record, 'json')
        except Exception as e:
            error(str(e))

    process_all_records(proc, control_ids=ids)

    if dry_run:
        error('NO CHANGES were committed to the database, because --dry-run flag was present.')

    info("%s\nDONE." % counts)
Example #12
def unescaperecords(ids):
    """HTML unescape abstract and title for all records."""

    parser = HTMLParser()

    def proc(record, parser):
        if record.json is None:
            rerror('record.json is None', record)
            return

        unescape_abstract(record, parser)
        unescape_titles(record, parser)

    if ids:
        ids = ids.split(',')

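    # chunk size 50; 'parser' is forwarded to each proc(record, parser) call via *args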
    process_all_records(proc, 50, ids, parser)

    info('all done!')
Example #13
def hotfix_country_mapping():
    ids = (29476, 44219, 44220)

    def proc(record):
        """Fix country mappings..."""

        if record.json and 'authors' in record.json:
            for i, a in enumerate(record.json['authors']):
                for i2, aff in enumerate(a.get('affiliations', ())):

                    c = aff.get('country')
                    new_c = find_country(aff['value'])
                    if c != new_c:
                        rinfo('%s -> %s (%s)' % (c, new_c, aff['value']), record)
                        record.json['authors'][i]['affiliations'][i2]['country'] = new_c
                        flag_modified(record, 'json')

    process_all_records(proc, control_ids=ids)
    info('ALL DONE')
Example #14
def add_primary_arxiv_categories():
    def proc(article_impact):
        try:
            if 'arxiv_primary_category' in article_impact.details:
                return

            pid = PersistentIdentifier.get('recid', article_impact.control_number)
            record = Record.get_record(pid.object_uuid)

            if not record:
                return

            if 'arxiv_eprints' in record:
                info('%d: eprints found' % article_impact.control_number)
                arxiv = (record['arxiv_eprints'][0]['value'].split(':')[1]).split('v')[0]
                cat = get_arxiv_categories(arxiv)[0]
                info('category: %s' % cat)
                if cat:
                    article_impact.details['arxiv_primary_category'] = cat
                    flag_modified(article_impact, 'details')

            elif 'report_numbers' in record:
                info('%d: report_numbers found' % article_impact.control_number)
                cat = get_arxiv_primary_category(record)
                info('category: %s' % cat)
                if cat:
                    article_impact.details['arxiv_primary_category'] = cat
                    flag_modified(article_impact, 'details')

            else:
                error('%d: no arxiv' % article_impact.control_number)

        except PIDDoesNotExistError:
            # records imported from Inspire won't be found
            pass
        except AttributeError as e:
            error('%d: %s' % (article_impact.control_number, e))

    process_all_articles_impact(proc)

    info('DONE.')
Example #15
def attach_file(control_number, file_path, file_type, filename):
    """
    Attach a file to an already existing record.

    The file path can point to a local file, but the http and https protocols are also supported. Sending
    specific headers with these protocols is not supported, so make sure the website doesn't require any.

    In case the record already has a file with the given filename, it will be overwritten.
    """

    # get existing record
    try:
        api_record = APIRecord.get_record(
            PersistentIdentifier.get('recid', control_number).object_uuid)
    except (PIDDoesNotExistError, NoResultFound):
        error('No record found for given control number!')
        return

    # read and attach file
    if file_path.startswith('http://') or file_path.startswith('https://'):
        data = requests_retry_session().get(file_path)
        if data.status_code != 200:
            error('Could not download file. Status code: %d' %
                  data.status_code)
            return

        file_data = StringIO(data.content)
        if not attach_file_object(api_record, filename, file_type, file_data):
            return
    else:
        try:
            with open(file_path) as f:
                if not attach_file_object(api_record, filename, file_type, f):
                    return
        except IOError:
            error('local file was not found or not readable: %s' % file_path)
            return

    api_record.commit()
    db.session.commit()
    info('File successfully attached.')
Example #16
def delete_file(control_number, key):
    """
    Deletes a file attached to a record.
    """

    # get existing record
    try:
        api_record = APIRecord.get_record(
            PersistentIdentifier.get('recid', control_number).object_uuid)
    except (PIDDoesNotExistError, NoResultFound):
        error('No record found for given control number!')
        return

    if key not in api_record.files:
        error('The given key is not present in the record files.')
        return

    del api_record.files[key]
    api_record.commit()
    db.session.commit()
    info('File successfully deleted.')
Example #17
def hotfix_country_mapping():
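    # empty tuple: no id filter, so every record will be processed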
    ids = ()

    def proc(record):
        """Fix country mappings..."""

        if record.json and 'authors' in record.json:
            for i, a in enumerate(record.json['authors']):
                for i2, aff in enumerate(a.get('affiliations', ())):

                    c = aff['country']
                    new_c = find_country(aff['value'])
                    if c != new_c:
                        rinfo('%s -> %s (%s)' % (c, new_c, aff['value']),
                              record)
                        record.json['authors'][i]['affiliations'][i2][
                            'country'] = new_c
                        flag_modified(record, 'json')

    process_all_records(proc, control_ids=ids)
    info('ALL DONE')
Example #18
def check_country_share():
    RESULT_FILE = '/tmp/cs_test'

    data = {'countries': {},
            'not_one': set()}

    def proc(article_impact):
        for country, val in article_impact.results.items():
            if country not in data['countries']:
                data['countries'][country] = 0

            data['countries'][country] += val

        try:
            record = Record.get_record(PersistentIdentifier.get('recid', article_impact.control_number).object_uuid)
            author_count = len(record['authors'])
        except PIDDoesNotExistError:
            author_count = len(article_impact.details['authors'])

        sum_values = sum(article_impact.results.values())
        if sum_values != author_count:
            data['not_one'].add((article_impact.control_number, sum_values, author_count))

    process_all_articles_impact(proc)

    data['not_one'] = list(data['not_one'])

    data['missing_gdp'] = []
    all_country = [g.name for g in Gdp.query.all()]
    for c in data['countries'].keys():
        if c not in all_country:
            data['missing_gdp'].append(c)

    data['countries'] = sorted(data['countries'].items(), key=lambda x: x[0])
    result_str = json.dumps(data, indent=2)
    with open(RESULT_FILE, 'wt') as f:
        f.write(result_str)

    info('DONE')
Example #19
def fix_doi_dates(json_file, dry_run):
    """
    Fixes the imprint/publication/copyright dates on a list of DOIs.
    """
    with open(json_file) as _file:
        dois_with_dates = json.load(_file)

    for doi, date in dois_with_dates.items():
        search_result = current_search_client.search(
            'scoap3-records-record',
            q='dois.value:"{}"'.format(doi))['hits']['hits']

        if search_result:
            uuid = search_result[0]['_id']
            rec = Record.get_record(uuid)

            year = int(date.split('-')[0])
            old_date = rec['imprints'][0]['date']

            rec['imprints'][0]['date'] = date
            rec['publication_info'][0]['year'] = year
            rec['copyright'][0]['year'] = year

            info('DOI {} with UUID {}: changed {} -> {}'.format(
                doi, uuid, old_date, date))

            if not dry_run:
                rec.commit()
                db.session.commit()
                info('{} successfully updated.'.format(doi))

        else:
            error('DOI {} not found in ES.'.format(doi))

    if dry_run:
        error(
            'NO CHANGES were committed to the database, because --dry-run flag was present.'
        )
Example #20
def empty_author():
    missing_authors = []

    def proc_find(record):
        if record.json and 'authors' in record.json:
            for a in record.json['authors']:
                s = sum(map(bool, a.values()))
                if s == 0:
                    rerror('empty author', record)
                    missing_authors.append(record.id)
                    return

    # process_all_records(proc_find)
    # missing_authors2 = list(map(lambda recid:PersistentIdentifier.query\
    # .filter(PersistentIdentifier.pid_type == 'recid')\
    # .filter(PersistentIdentifier.object_uuid == recid).one().pid_value, missing_authors))
    # info(json.dumps(missing_authors2, indent=2))

    def proc_delete(record):
        to_delete = []
        for i, a in enumerate(record.json['authors']):
            s = sum(map(bool, a.values()))
            if s == 0:
                to_delete.append(i)

        if to_delete:
            # delete from the end so earlier deletions don't shift the remaining indices
            for d in reversed(to_delete):
                del record.json['authors'][d]
            flag_modified(record, 'json')
        info('DELETE %d authors' % len(to_delete))

    control_ids = [22647, 21193, 14535, 10195, 16281, 16197, 9110, 4336, 21274, 22399, 1156, 14391, 22126, 22633,
                   22433, 22217, 10402, 22208, 20511, 3059, 2926, 4780, 1232, 2513, 22388, 10523, 22606, 12874,
                   22853, 22789, 4021, 13026, 3073, 1899, 20297, 4185, 1311, 23074]
    process_all_records(proc_delete, control_ids=control_ids)

    info('done')
Example #21
def elsevier():
    EXT = 'main.xml'
    BASE_DIR = '/eos/project/s/scoap3repo/BETA/harvesting/Elsevier/download/'
    RESULT_FILE = '/tmp/elsevier'
    tar_list = listdir(BASE_DIR)
    with open('/tmp/repo_diff_result5', 'r') as f:
        needed_dois = json.load(f)['only_in_old']

    from_date = datetime.now() - timedelta(days=365)
    to_date = datetime.now() - timedelta(days=60)
    info('found %d files in base dir.' % len(tar_list))

    extracted_dois = {}
    for file in tar_list:
        full_path = BASE_DIR + file
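        # note: on Unix getctime() returns the inode change time, which is not necessarily the creation time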
        creation_date = datetime.utcfromtimestamp(getctime(full_path))
        if isfile(full_path) and full_path.endswith('.tar') and from_date <= creation_date <= to_date:
            try:
                tar = tarfile.open(full_path, 'r')
                for element in tar.getmembers():
                    if element.name.endswith(EXT):
                        xml = parseString(tar.extractfile(element).read())
                        item_info = xml.getElementsByTagName('item-info')[0]
                        doi = item_info.getElementsByTagName('ce:doi')[0].firstChild.nodeValue
                        if doi in needed_dois:
                            if full_path not in extracted_dois:
                                extracted_dois[full_path] = []
                            extracted_dois[full_path].append(doi)
                            info('found %s in %s' % (doi, file))
                    else:
                        pass
                        # info('ignoring file: %s' % fn)
            except (tarfile.TarError, ExpatError) as e:
                error('file %s: %s' % (file, e))

    info('%s' % json.dumps(extracted_dois, indent=2))

    with open(RESULT_FILE, 'wt') as f:
        f.write(json.dumps(extracted_dois, indent=2))
Example #22
def hotfix_els_countries():
    """Hotfix for updating countries from xml"""
    ids = (
        18758, 19841, 21407, 21896, 22903, 24301, 40311, 23504, 23866, 23613,
        23661, 23861, 23725, 24005, 23867, 15590, 16071, 15938, 15943, 15867,
        15931, 16014, 15940, 15942, 16196, 15851, 15817, 15789, 15790, 15745,
        25282, 25288, 24955, 25442, 25376, 25346, 25277, 40576, 40629, 40677,
        40680, 40813, 23974, 24958, 24932, 40833, 25272, 25265, 24434, 25301,
        25303, 25299, 25261, 24811, 24810, 24809, 24860, 24848, 24815, 24825,
        24571, 40834, 40766, 40838, 40900, 40906, 23424, 23411, 23237, 23040,
        23195, 23060, 23221, 23414, 23081, 23419, 23130, 23134, 23211, 23017,
        23451, 23235, 40240, 40279, 40288, 40487, 40435, 25292, 25426, 25400,
        25399, 25522, 40392, 40583, 40575, 40665, 40245, 40242, 25309, 40633,
        25467, 25468, 25471, 40678, 40291, 40285, 40343, 25328, 25445, 40910,
        40911, 40679, 40540, 40812, 40839, 40438, 40728, 40681, 40884, 40885,
        40858, 40932, 40901, 40904, 40928, 40962, 40963, 41570, 41572, 41573,
        41585, 41588, 41594, 41595, 41598, 41599, 41601, 41602, 41605, 41612,
        41613, 41617, 41618, 41627, 41628, 41631, 41637, 41640, 41641, 41678,
        41692, 41702, 41740, 41810, 41837, 41857, 41944, 41977, 41979, 42005,
        42049, 42050, 42099, 42116, 42155, 42156, 42174, 42215, 42221, 42225,
        42259, 42286, 42300, 42307, 42308, 42341, 42344, 42351, 42385, 42422,
        42424, 42456, 42458, 42485, 42505, 43068, 43070, 43071, 43072, 43080,
        43082, 43084, 43089, 43092, 43093, 43096, 43098, 43109, 43110, 43113,
        43114, 43116, 43118, 43120, 43121, 43127, 43129, 43150, 43154, 43170,
        43171, 43173, 43174, 43176, 43200, 43213, 43224, 43226, 43227, 43230,
        43237, 43269, 43288, 43290, 43303, 43305, 43314,
    )

    def proc(record):
        rinfo('start...', record)

        if '_files' not in record.json:
            rerror('Skipping. No _files', record)
            return

        xml = filter(lambda x: x['filetype'] == 'xml', record.json['_files'])
        if not xml:
            rerror('Skipping. No xml in _files', record)
            return

        obj = ObjectVersion.get(xml[0]['bucket'], xml[0]['key'])
        uri = obj.file.uri
        xml = parse(open(uri, 'rt'))
        x_author_groups = xml.getElementsByTagName('ce:author-group')

        if not x_author_groups:
            rerror('Skipping. No author groups.', record)
            return

        if len(x_author_groups) > 1:
            rerror('Skipping. MORE THAN ONE author group. Not supported.',
                   record)
            return

        for x_author_group in x_author_groups:
            x_collaborations = x_author_group.getElementsByTagName(
                'ce:collaboration')
            x_affiliations = x_author_group.getElementsByTagName(
                'ce:affiliation')
            # needed for supporting multiple author groups with author matching, but author matching is not really possible.
            # authors_in_group = [
            #     (c.getElementsByTagName('ce:given-name')[0].childNodes[0].nodeValue.replace('-', '').title(),
            #      c.getElementsByTagName('ce:surname')[0].childNodes[0].nodeValue.replace('-', '').title())
            #     for c in x_author_group.getElementsByTagName('ce:author')
            # ]

            if 'authors' not in record.json:
                # Type 1 and 3: has no authors at all. Fix: add collaborations if there are affiliations in xml.
                rerror('No authors... SKIPPING', record)
                return

                # extract collaborations, find countries later
                # FIXME we should always extract collaborations, but that would cause a lot more problems now.
                authors = [{
                    'full_name':
                    c.getElementsByTagName('ce:text')
                    [0].childNodes[0].nodeValue
                } for c in x_collaborations]
                if authors:
                    rinfo('Collaborations found: %s' % authors, record)
                    record.json['authors'] = authors
                else:
                    rerror('No collaborations. Not fixable.', record)

            # possibly we added authors in the previous step.
            if 'authors' in record.json:
                # Type 2 and 4: has authors, but no affiliations.
                authors = record.json['authors']
                aff_count = sum(map(lambda x: 'affiliations' in x, authors))
                if aff_count == 0:
                    # Type 4: No affiliations in data.
                    new_affs = [{
                        u'country':
                        get_country_for_aff(a),
                        u'value':
                        a.getElementsByTagName('ce:textfn')
                        [0].childNodes[0].nodeValue
                    } for a in x_affiliations]
                    if new_affs:
                        rinfo('New affiliations: %s' % new_affs, record)
                        # FIXME modify this, if multiple author groups should be supported
                        # FIXME (not all authors should be updated)!!!
                        # update_authors(record, authors_in_group, new_affs)
                        for i, a in enumerate(record.json.get('authors')):
                            record.json['authors'][i][
                                'affiliations'] = new_affs
                        flag_modified(record, 'json')
                    else:
                        rerror('No affiliations at all. Not fixable.', record)

                elif aff_count == len(authors):
                    empty_aff_count = sum(
                        map(lambda x: len(x['affiliations']) == 0, authors))
                    if empty_aff_count == len(authors):
                        # Type 2: Only empty affiliations.
                        rinfo('Type 2. Not fixable.', record)
                    else:
                        rerror(
                            'Only SOME authors have EMPTY affiliations. What now?',
                            record)
                else:
                    rerror('Only SOME authors have affiliations. What now?',
                           record)

        rinfo('OK', record)

    process_all_records(proc, control_ids=ids)
    info('ALL DONE')
Example #23
def update_countries(dry_run, ids, country="HUMAN CHECK"):
    """
    Updates countries for articles that are marked with the given parameter. Countries are determined with the Google Maps API.
    """

    country_cache = {}
    cache_fails = 0
    total_hits = 0

    # Use parameter ids or, if not given, search for all records with the specified country.
    if ids:
        ids = ids.split(',')
    else:
        search_result = current_search_client.search(
            'records-record', 'record-v1.0.0', {
                'size': 10000,
                'query': {
                    'term': {
                        'country': country
                    }
                }
            })
        ids = [
            hit['_source']['control_number']
            for hit in search_result['hits']['hits']
        ]
        info('Found %d records having %s as a country of one of the authors.' %
             (len(ids), country))

    uuids = [
        PersistentIdentifier.get('recid', recid).object_uuid for recid in ids
    ]
    records = Record.get_records(uuids)

    try:
        for record in records:
            for author_index, author_data in enumerate(record['authors']):
                for aff_index, aff_data in enumerate(
                        author_data['affiliations']):
                    if aff_data['country'] == country:
                        total_hits += 1

                        # cache countries based on old affiliation value to decrease api requests
                        old_value = aff_data['value']
                        if old_value not in country_cache:
                            country_cache[old_value] = get_country(old_value)
                            cache_fails += 1

                        new_country = country_cache[old_value]

                        if new_country:
                            record['authors'][author_index]['affiliations'][
                                aff_index]['country'] = new_country
                            info(
                                'Changed country for record with id %s to %s' %
                                (record['control_number'], new_country))
                        else:
                            error(
                                'Could not find country for record with id %s (affiliation value: %s)'
                                % (record['control_number'], old_value))
            if not dry_run:
                record.commit()
                db.session.commit()
    except Exception as e:
        error(str(e))

    info(
        'In total %d countries needed to be updated and %d queries were made to determine the countries.'
        % (total_hits, cache_fails))

    if dry_run:
        error(
            'NO CHANGES were committed to the database, because --dry-run flag was present.'
        )
Example #24
def repos_diff():
    OLD_REPO_FILE = '/tmp/old_repo_dump4'
    OLD_REPO_URL = 'https://repo.scoap3.org/search?p=&of=recjson&ot=recid,doi,creation_date&rg=100000000'
    COOKIES = {
        'INVENIOSESSION': 'd3c673cf6be468dc6c6fd25703ff90c3',
        'INVENIOSESSIONstub': 'HTTPS',
        '_pk_id.10.1cdf': 'ff8bdd9962372712.1536586766.49.1546956598.1546955767.'
    }
    RESULT_FILE = '/tmp/repo_diff_result9'

    if not isfile(OLD_REPO_FILE):
        info('No old repo file (%s), downloading...' % OLD_REPO_FILE)
        data = requests_retry_session().get(OLD_REPO_URL, cookies=COOKIES).json()
        info('download complete (%d records), mapping...' % len(data))

        if len(data) < 1000:
            error('Aborting, not all records were queried.')
            return

        mapped_data = {}
        for r in data:
            doi = r.pop('doi')
            if doi in mapped_data:
                error('Multiple records with doi. %s' % r)
            mapped_data[doi] = r

        info('mapping complete, saving file...')
        with open(OLD_REPO_FILE, 'wt') as f:
            f.write(json.dumps(mapped_data))

        info('File saved.')

    info('reading old repo data from: %s' % OLD_REPO_FILE)
    with open(OLD_REPO_FILE, 'rt') as f:
        old_data = json.loads(f.read())

    result = dict(only_in_old=[],
                  only_in_new=[],
                  in_both=[])

    def proc(record):
        if not record.json:
            return

        doi = get_first_doi(record.json)
        if doi in old_data:
            result['in_both'].append(doi)
            old_data.pop(doi)
        else:
            result['only_in_new'].append(doi)

    process_all_records(proc)

    result['only_in_old'] = list(old_data.keys())
    with open(RESULT_FILE, 'wt') as f:
        f.write(json.dumps(result, indent=2))

    info('only_in_old: %s\nonly_in_new: %s\nin_both:%s\nALL DONE.' % (
        len(result['only_in_old']), len(result['only_in_new']), len(result['in_both'])))
Example #25
def hotfix_els_countries():
    """Hotfix for updating countries from xml"""
    ids = (44264, 24944, 24850, 16040, 23414, 15632, 15820, 24786, 15937, 25306, 15819, 40393, 15681, 23089, 23019)

    def get_aff_by_id(x_author_group, aff_id):
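        """Return the affiliation text for the given affiliation id within the author group."""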
        for x_affiliation in x_author_group.getElementsByTagName('ce:affiliation'):
            aff_id_value = x_affiliation.attributes.get('id').value
            if aff_id_value == aff_id:
                return x_affiliation.getElementsByTagName('ce:textfn')[0].childNodes[0].nodeValue

        error('No affiliation for id: %s' % aff_id)
        return None

    def proc(record):
        rinfo('start...', record)

        if '_files' not in record.json:
            rerror('Skipping. No _files', record)
            return

        xml = filter(lambda x: x['filetype'] == 'xml', record.json['_files'])
        if not xml:
            rerror('Skipping. No xml in _files', record)
            return

        obj = ObjectVersion.get(xml[0]['bucket'], xml[0]['key'])
        uri = obj.file.uri
        xml = parse(open(uri, 'rt'))
        x_author_groups = xml.getElementsByTagName('ce:author-group')

        if not x_author_groups:
            rerror('Skipping. No author groups.', record)
            return

        if len(x_author_groups) > 1:
            rinfo('Reparse all authors.', record)
            authors = []

            for x_author_group in x_author_groups:
                # skip if not deepest author-group
                if x_author_group.getElementsByTagName('ce:author-group'):
                    continue

                # extract affiliations
                x_affiliations = x_author_group.getElementsByTagName('ce:affiliation')
                affs = []
                for a in x_affiliations:
                    value = a.getElementsByTagName('ce:textfn')[0].childNodes[0].nodeValue
                    affs.append({
                        u'country': find_country(value),
                        u'value': value
                    })

                # extract authors, add affiliations
                x_authors = x_author_group.getElementsByTagName('ce:author')
                for x_author in x_authors:
                    given_name = x_author.getElementsByTagName('ce:given-name')[0].childNodes[0].nodeValue
                    surname = x_author.getElementsByTagName('ce:surname')[0].childNodes[0].nodeValue
                    full_name = '%s, %s' % (surname, given_name)

                    author_affs = []
                    for ref in x_author.getElementsByTagName('ce:cross-ref'):
                        affid = ref.attributes.get('refid').value
                        if 'aff' in affid:
                            aff_value = get_aff_by_id(x_author_group, affid)
                            aff_country = find_country(aff_value)
                            author_affs.append({
                                u'country': aff_country,
                                u'value': aff_value
                            })

                    if not (author_affs or affs):
                        rerror('no affs for author: %s. Skip this record.' % surname, record)
                        return

                    authors.append({
                        'full_name': full_name,
                        'given_name': given_name,
                        'surname': surname,
                        'affiliations': author_affs or affs
                    })

            if authors:
                record.json['authors'] = authors
                flag_modified(record, 'json')
                rinfo('updated', record)
            else:
                rerror('No authors found', record)

        else:
            for x_author_group in x_author_groups:
                x_collaborations = x_author_group.getElementsByTagName('ce:collaboration')
                x_affiliations = x_author_group.getElementsByTagName('ce:affiliation')
                # needed for supporting multiple author groups with author matching, but author matching is not really possible.
                # authors_in_group = [
                #     (c.getElementsByTagName('ce:given-name')[0].childNodes[0].nodeValue.replace('-', '').title(),
                #      c.getElementsByTagName('ce:surname')[0].childNodes[0].nodeValue.replace('-', '').title())
                #     for c in x_author_group.getElementsByTagName('ce:author')
                # ]

                if 'authors' not in record.json:
                    # Type 1 and 3: has no authors at all. Fix: add collaborations if there are affiliations in xml.
                    rerror('No authors... SKIPPING', record)
                    return

                    # extract collaborations, find countries later
                    # FIXME we should always extract collaborations, but that would cause a lot more problems now.
                    authors = [{'full_name': c.getElementsByTagName('ce:text')[0].childNodes[0].nodeValue} for c in
                               x_collaborations]
                    if authors:
                        rinfo('Collaborations found: %s' % authors, record)
                        record.json['authors'] = authors
                    else:
                        rerror('No collaborations. Not fixable.', record)

                # possibly we added authors in the previous step.
                if 'authors' in record.json:
                    # Type 2 and 4: has authors, but no affiliations.
                    authors = record.json['authors']
                    aff_count = sum(map(lambda x: 'affiliations' in x, authors))
                    if aff_count == 0:
                        # Type 4: No affiliations in data.
                        new_affs = [
                            {u'country': find_country(a.getElementsByTagName('ce:textfn')[0].childNodes[0].nodeValue),
                             u'value': a.getElementsByTagName('ce:textfn')[0].childNodes[0].nodeValue
                             }
                            for a in x_affiliations]
                        if new_affs:
                            rinfo('New affiliations: %s' % new_affs, record)
                            # FIXME modify this, if multiple author groups should be supported
                            # FIXME (not all authors should be updated)!!!
                            # update_authors(record, authors_in_group, new_affs)

                            for i, a in enumerate(record.json.get('authors')):
                                record.json['authors'][i]['affiliations'] = new_affs
                            flag_modified(record, 'json')
                        else:
                            rerror('No affiliations at all. Not fixable.', record)

                    elif aff_count == len(authors):
                        empty_aff_count = sum(map(lambda x: len(x['affiliations']) == 0, authors))
                        if empty_aff_count == len(authors):
                            # Type 2: Only empty affiliations.
                            rinfo('Type 2. Not fixable.', record)
                        else:
                            rerror('Only SOME authors have EMPTY affiliations. What now?', record)
                    else:
                        rerror('Only SOME authors have affiliations. What now?', record)

        rinfo('OK', record)

    process_all_records(proc, control_ids=ids)
    info('ALL DONE')
Example #26
def japanise():
    size = 100

    def get_query(start_index, size):
        return {
            '_source': ['authors', 'control_number', 'dois', 'publication_info', 'report_numbers', 'arxiv_eprints'],
            'from': start_index,
            'size': size,
            'query': {
                'term': {
                    'country': 'Japan'
                }
            }
        }

    def get_arxiv(data):
        if 'report_numbers' in data:
            for r in data['report_numbers']:
                if r['source'] == 'arXiv':
                    return r['value'].split(':')[1]
            error('no arxiv? %s' % data['control_number'])
        if 'arxiv_eprints' in data:
            return data['arxiv_eprints'][0]['value'].split(':')[1]

        return ''

    index = 0
    total = None

    header = ['year', 'journal', 'doi', 'arxiv number', 'primary arxiv category', 'affiliation',
              'authors with affiliation', 'total number of authors']
    si = StringIO()
    cw = csv.writer(si, delimiter=";")
    cw.writerow(header)

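    # page through the search results 'size' hits at a time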
    while total is None or index < total:
        search_results = es.search(index='records-record',
                                   doc_type='record-v1.0.0',
                                   body=get_query(index, size))
        total = search_results['hits']['total']
        info("%s/%s" % (index, total))
        index += size

        for hit in search_results['hits']['hits']:
            data = hit['_source']

            year = data['publication_info'][0]['year']
            journal = data['publication_info'][0]['journal_title']
            doi = data['dois'][0]['value']
            arxiv = get_arxiv(data)
            arxiv_category = get_arxiv_categories(arxiv)[0] if arxiv else ''

            total_authors = len(data['authors'])

            extracted_affiliations = {}
            for author in data['authors']:
                if 'affiliations' not in author:
                    error('no affiliations for author. %s' % doi)
                    continue

                for aff in author['affiliations']:
                    if aff['country'] == 'Japan':
                        value = aff['value']
                        if value not in extracted_affiliations:
                            extracted_affiliations[value] = 0
                        extracted_affiliations[value] += 1

            if not extracted_affiliations:
                error('no extracted affs')

            for aff, count in extracted_affiliations.items():
                cw.writerow([year, journal, doi, arxiv, arxiv_category, aff.encode('utf8'), count, total_authors])

    with open('/tmp/japanise.csv', 'wt') as f:
        f.write(si.getvalue())