def collect_samples():
    """Randomly sample 100 'entries' docs that have an abstract, a DOI
    and at least one keyword, and dump them to ../scratch/paper_samples.json.

    Docs whose 'abstract' field exists but is empty/falsy are dropped
    after sampling, so fewer than 100 samples may be written.
    """
    samples = []

    db = get_mongo_db('../config.json')
    # list_collection_names() replaces collection_names(), which was
    # deprecated in PyMongo 3.7 and removed in PyMongo 4.0.
    print(db.list_collection_names())

    query = db['entries'].aggregate(
        [
            {
                # only docs with abstract, doi, and a non-empty keywords array
                '$match': {
                    'abstract': {'$exists': True},
                    'doi': {'$exists': True},
                    'keywords.0': {'$exists': True},
                },
            },
            {
                '$sample': {'size': 100}
            },
        ],
        allowDiskUse=True)
    for doc in query:
        # '$exists' only guarantees the field is present; skip falsy abstracts
        if doc['abstract']:
            samples.append(doc)

    print('len(samples)', len(samples))

    with open('../scratch/paper_samples.json', 'w') as fw:
        # json_util.default serializes BSON types (ObjectId, datetime, ...)
        json.dump(samples, fw, indent=2, default=json_util.default)
# --- Ejemplo n.º 2 (scraped example separator; commented so the file parses) ---
    error_doi = []
    print('col_name', col_name)
    col = mongo_db[col_name]
    query = col.find({'doi': {'$exists': True}})
    for doc in query:
        valid = valid_a_doi(doi=doc['doi'], abstract=doc.get('abstract'))
        print(doc['doi'], valid)
        if valid == False:
            error_doi.append(doc['doi'])
        # break
    return error_doi


def foo():
    """Smoke-test valid_a_doi against a mix of real and fabricated DOIs."""
    sample_dois = (
        '10.7326/m20-0504',
        '10.7326/m20-050423423423',
        '10.7326/m20',
        '10.3390/v12010064',
    )
    for candidate in sample_dois:
        valid_a_doi(candidate)


if __name__ == '__main__':
    db = get_mongo_db('../config.json')
    # list_collection_names() replaces collection_names(), which was
    # deprecated in PyMongo 3.7 and removed in PyMongo 4.0.
    print(db.list_collection_names())

    # foo()

    valid_existing_doi(db, 'google_form_submissions')
def doi_match_a_batch_by_csv(task_batch):
    """Find DOIs for a batch of Mongo docs by fuzzy-matching metadata.csv.

    task_batch: iterable of dicts carrying 'col_name' and '_id' that
    identify the docs to process.  Each doc's cleaned title, then its
    abstract, then its author names are compared against rows of
    ../rsc/metadata.csv via text_similarity_by_char.  A confident match
    writes 'doi' and 'csv_raw_result' onto the doc; 'tried_csv_doi' and
    'last_updated' are always written.
    """
    mongo_db = get_mongo_db('../config.json')
    # force id-like CSV columns to str so pandas does not parse them as numbers
    csv_data = pd.read_csv('../rsc/metadata.csv',
                           dtype={
                               'pubmed_id': str,
                               'pmcid': str,
                               'publish_time': str,
                               'Microsoft Academic Paper ID': str,
                           })
    csv_data = csv_data.fillna('')
    csv_data['title'] = csv_data['title'].str.lower()
    for i, task in enumerate(task_batch):
        if i % 10 == 0:
            print('thread', threading.currentThread().getName())
            print('processing the {}th out of {}'.format(i, len(task_batch)))
        col = mongo_db[task['col_name']]

        # get doc
        doc = col.find_one({'_id': task['_id']})
        if doc is None:
            continue

        # NOTE(review): doc_updated is assigned in this function but never read
        doc_updated = False

        # get metadata
        metadata = None
        if ('metadata' in doc):
            metadata = doc['metadata']

        # get title
        title = None
        raw_title = None
        if metadata is not None:
            if ('title' in metadata and isinstance(metadata['title'], str)
                    and len(metadata['title'].strip()) > 0):
                raw_title = metadata['title']
                # print('raw_title', raw_title)
                title = clean_title(raw_title)

        # get author
        author_names = None
        if metadata is not None:
            author_names = metadata.get('authors')
            if not (isinstance(author_names, list) and len(author_names) > 0):
                author_names = None

        # get abstract
        # doc['abstract'] is a list of fragments, each with a 'text' str;
        # the fragments are joined with single spaces
        abstract = None
        if 'abstract' in doc and len(doc['abstract']) > 0:
            abstract = ''
            for fragment in doc['abstract']:
                if ('text' in fragment and isinstance(fragment['text'], str)
                        and len(fragment['text']) > 0):
                    abstract += fragment['text'].strip() + ' '

            abstract = abstract.strip()
            if len(abstract) == 0:
                abstract = None

        # query csv_data
        matched_item = None
        matched_candidates = []

        # match by title
        if title is not None and matched_item is None:
            # cheap quick_mode pass first to narrow the candidate rows
            similarity = csv_data.apply(lambda x: text_similarity_by_char(
                x['title'], title, quick_mode=True),
                                        axis=1)
            sim_csv_data = csv_data[similarity >= 0.5]
            if len(sim_csv_data) > 0:
                # full (slow) similarity only on the narrowed set
                similarity = sim_csv_data.apply(
                    lambda x: text_similarity_by_char(
                        x['title'], title, quick_mode=False),
                    axis=1)
                sorted_similarity = similarity.sort_values(ascending=False)
                sorted_data = sim_csv_data.reindex(
                    index=sorted_similarity.index)

                print('raw_title: ', raw_title)
                print('title', title)
                print("csv_title", sorted_data.iloc[0]['title'])
                print('similarity', sorted_similarity.iloc[0])
                print(sorted_similarity.head(10))
                print('len(raw_title)', len(raw_title))
                print('doi', sorted_data.iloc[0]['doi'])
                print()

                if (len(title) > LEAST_TITLE_LEN
                        and len(sorted_data.iloc[0]['title']) > LEAST_TITLE_LEN
                        and
                        sorted_similarity.iloc[0] > LEAST_TITLE_SIMILARITY):
                    matched_item = correct_pd_dict(
                        sorted_data.iloc[0].to_dict())

            # second pass: allow mismatching text at the begin/end of titles
            if matched_item is None and len(sim_csv_data) > 0:
                similarity = sim_csv_data.apply(
                    lambda x: text_similarity_by_char(
                        x['title'],
                        title,
                        quick_mode=False,
                        enable_ignore_begin_end=True,
                        ignore_begin_end_text_len=FIVE_PERCENT_TITLE_LEN,
                        ignore_begin_end_similarity=
                        IGNORE_BEGIN_END_TITLE_SIMILARITY,
                    ),
                    axis=1)
                sorted_similarity = similarity.sort_values(ascending=False)
                sorted_data = sim_csv_data.reindex(
                    index=sorted_similarity.index)
                if (len(title) > LEAST_TITLE_LEN
                        and len(sorted_data.iloc[0]['title']) > LEAST_TITLE_LEN
                        and
                        sorted_similarity.iloc[0] > LEAST_TITLE_SIMILARITY):
                    print('result after ignore_begin_end')
                    print('raw_title: ', raw_title)
                    print('title', title)
                    print("csv_title", sorted_data.iloc[0]['title'])
                    print('similarity', sorted_similarity.iloc[0])
                    print()
                    matched_item = correct_pd_dict(
                        sorted_data.iloc[0].to_dict())
                elif (len(title) > LEAST_TITLE_LEN
                      and len(sorted_data.iloc[0]['title']) > LEAST_TITLE_LEN
                      and sorted_similarity.iloc[0] > 0.5):
                    # not confident enough on title alone: keep as a
                    # candidate for the author-name check below
                    matched_candidates.insert(
                        0, correct_pd_dict(sorted_data.iloc[0].to_dict()))

        if abstract is not None and matched_item is None:
            # match by abstract
            similarity = csv_data.apply(lambda x: text_similarity_by_char(
                x['abstract'], abstract, quick_mode=True),
                                        axis=1)
            sim_csv_data = csv_data[similarity >= 0.5]
            if len(sim_csv_data) > 0:
                similarity = sim_csv_data.apply(
                    lambda x: text_similarity_by_char(
                        x['abstract'], abstract, quick_mode=False),
                    axis=1)
                sorted_similarity = similarity.sort_values(ascending=False)
                sorted_data = sim_csv_data.reindex(
                    index=sorted_similarity.index)

                print('abstract', abstract)
                print("csv_abstract", sorted_data.iloc[0]['abstract'])
                print('similarity', sorted_similarity.iloc[0])
                print()

                if (len(abstract) > LEAST_ABS_LEN and
                        len(sorted_data.iloc[0]['abstract']) > LEAST_ABS_LEN
                        and sorted_similarity.iloc[0] > LEAST_ABS_SIMILARITY):
                    matched_item = correct_pd_dict(
                        sorted_data.iloc[0].to_dict())
                elif (len(abstract) > LEAST_ABS_LEN
                      and len(sorted_data.iloc[0]['abstract']) > LEAST_ABS_LEN
                      and sorted_similarity.iloc[0] > 0.5):
                    matched_candidates.insert(
                        0, correct_pd_dict(sorted_data.iloc[0].to_dict()))

            # second pass with begin/end text ignored, as for titles
            if matched_item is None and len(sim_csv_data) > 0:
                similarity = sim_csv_data.apply(
                    lambda x: text_similarity_by_char(
                        x['abstract'],
                        abstract,
                        quick_mode=False,
                        enable_ignore_begin_end=True,
                        ignore_begin_end_text_len=FIVE_PERCENT_ABS_LEN,
                        ignore_begin_end_similarity=
                        IGNORE_BEGIN_END_ABS_SIMILARITY,
                    ),
                    axis=1)
                sorted_similarity = similarity.sort_values(ascending=False)
                sorted_data = sim_csv_data.reindex(
                    index=sorted_similarity.index)

                if (len(abstract) > LEAST_ABS_LEN and
                        len(sorted_data.iloc[0]['abstract']) > LEAST_ABS_LEN
                        and sorted_similarity.iloc[0] > LEAST_ABS_SIMILARITY):
                    print('result after ignore_begin_end')
                    print('abstract', abstract)
                    print("csv_abstract", sorted_data.iloc[0]['abstract'])
                    print('similarity', sorted_similarity.iloc[0])
                    print()
                    matched_item = correct_pd_dict(
                        sorted_data.iloc[0].to_dict())

        if (matched_item is None and len(matched_candidates) > 0
                and author_names is not None):
            # match by author
            for candidate in matched_candidates:
                if not candidate['authors']:
                    continue
                names_parsed = parse_names(candidate['authors'])
                name_cmp_result = compare_author_names(author_names,
                                                       names_parsed)
                print('raw_title: ', raw_title)
                print("candidate['title']", candidate['title'])
                print('abstract', abstract)
                print("candidate['abstract']", candidate['abstract'])
                print('author_names', [{
                    'first': x['first'],
                    'last': x['last']
                } for x in author_names])
                print("candidate['authors']", candidate['authors'])
                print('name_cmp_result', name_cmp_result)
                print()
                if name_cmp_result:
                    matched_item = candidate
                    break

        if matched_item is None and len(matched_candidates) == 0:
            print('no similar and no candidates!')
            print('raw_title: ', raw_title)
            print('abstract', abstract)
            if author_names:
                print('author_names', [{
                    'first': x['first'],
                    'last': x['last']
                } for x in author_names])
            else:
                print('author_names', author_names)
            print()

        # update db
        # always record that the CSV match was attempted
        set_params = {
            "tried_csv_doi": True,
            'last_updated': datetime.now(),
        }

        # update doi found
        if matched_item is not None:
            print("FOUND")
            print()
            set_params['csv_raw_result'] = matched_item
            if (matched_item.get('doi')
                    and isinstance(matched_item['doi'], str)
                    and len(matched_item['doi'].strip()) > 0):
                set_params['doi'] = matched_item['doi'].strip()

            doc_updated = True

        try:
            col.find_one_and_update({"_id": doc['_id']}, {
                "$set": set_params,
            })
        except Exception as e:
            print('matched_item')
            pprint(matched_item)
            print(e)
            raise e
def doi_match_a_batch_by_csv_new(task_batch):
    """Find DOIs for a batch of docs by exact sha lookup in metadata.csv.

    Unlike doi_match_a_batch_by_csv, this matches doc['paper_id'] exactly
    against the CSV 'sha' column instead of fuzzy text matching.  A
    unique match writes 'doi' and 'csv_raw_result' onto the doc;
    'tried_csv_doi' and 'last_updated' are always written.
    """
    mongo_db = get_mongo_db('../config.json')
    # force id-like CSV columns to str so pandas does not parse them as numbers
    csv_data = pd.read_csv('../rsc/metadata.csv',
                           dtype={
                               'pubmed_id': str,
                               'pmcid': str,
                               'publish_time': str,
                               'Microsoft Academic Paper ID': str,
                           })
    csv_data = csv_data.fillna('')
    csv_data['title'] = csv_data['title'].str.lower()
    # NOTE(review): 'data' is only used for the shape print below; the
    # sha lookup further down uses the unfiltered csv_data
    data = csv_data[csv_data['sha'] != '']
    print('data.shape', data.shape)
    for i, task in enumerate(task_batch):
        if i % 10 == 0:
            print('thread', threading.currentThread().getName())
            print('processing the {}th out of {}'.format(i, len(task_batch)))
        col = mongo_db[task['col_name']]

        # get doc
        doc = col.find_one({'_id': task['_id']})
        if doc is None:
            continue

        # NOTE(review): doc_updated is assigned in this function but never read
        doc_updated = False

        # get cord_id
        cord_id = None
        if ('paper_id' in doc and isinstance(doc['paper_id'], str)
                and len(doc['paper_id']) > 0):
            cord_id = doc['paper_id']

        # query csv_data
        matched_item = None

        # match by title
        if cord_id is not None and matched_item is None:
            data_w_cord_id = csv_data[csv_data['sha'] == cord_id]

            # accept only an unambiguous, single-row match
            if len(data_w_cord_id) == 1:
                # print('raw_title: ', raw_title)
                # print('title', title)
                # print("csv_title", sorted_data.iloc[0]['title'])
                # print('similarity', sorted_similarity.iloc[0])
                # print(sorted_similarity.head(10))
                # print('len(raw_title)', len(raw_title))
                # print('doi', sorted_data.iloc[0]['doi'])
                # print()
                #
                # if (len(title) > LEAST_TITLE_LEN
                #     and len(sorted_data.iloc[0]['title']) > LEAST_TITLE_LEN
                #     and sorted_similarity.iloc[0] > LEAST_TITLE_SIMILARITY):
                matched_item = correct_pd_dict(
                    data_w_cord_id.iloc[0].to_dict())

            elif len(data_w_cord_id) > 1:
                print('more than 1 entries matched!')
                print('cord_id', cord_id)
                print(', '.join(list(data_w_cord_id['sha'])))

            else:
                print('no entry matched!')
                print('cord_id', cord_id)

        if matched_item is None:
            print('no entry matched!')
            print('cord_id', cord_id)
            print()

        # update db
        # always record that the CSV match was attempted
        set_params = {
            "tried_csv_doi": True,
            'last_updated': datetime.now(),
        }

        # update doi found
        if matched_item is not None:
            print("FOUND")
            print()
            set_params['csv_raw_result'] = matched_item
            if (matched_item.get('doi')
                    and isinstance(matched_item['doi'], str)
                    and len(matched_item['doi'].strip()) > 0):
                set_params['doi'] = matched_item['doi'].strip()

            doc_updated = True

        try:
            col.find_one_and_update({"_id": doc['_id']}, {
                "$set": set_params,
            })
        except Exception as e:
            print('matched_item')
            pprint(matched_item)
            print(e)
            raise e
def doi_match_a_batch_by_crossref(task_batch):
    """Find DOIs for a batch of Mongo docs via the Crossref works API.

    For each task ({'col_name', '_id'}): query query_crossref by title
    and by author last names, keep results that carry a DOI plus a title
    or abstract, then fuzzy-match by title, by abstract, and finally by
    author names.  A confident match writes 'doi' and
    'crossref_raw_result' onto the doc; 'tried_crossref_doi' and
    'last_updated' are always written.
    """
    mongo_db = get_mongo_db('../config.json')
    for i, task in enumerate(task_batch):
        if i % 100 == 0:
            print('thread', threading.currentThread().getName())
            print('processing the {}th out of {}'.format(i, len(task_batch)))
        col = mongo_db[task['col_name']]

        # get doc
        doc = col.find_one({'_id': task['_id']})
        if doc is None:
            continue

        # NOTE(review): doc_updated is assigned in this function but never read
        doc_updated = False

        # get metadata
        metadata = None
        if ('metadata' in doc):
            metadata = doc['metadata']

        # get title
        title = None
        raw_title = None
        if metadata is not None:
            if ('title' in metadata and isinstance(metadata['title'], str)
                    and len(metadata['title'].strip()) > 0):
                raw_title = metadata['title']
                title = clean_title(raw_title)

        # get author
        author_names = None
        if metadata is not None:
            author_names = metadata.get('authors')
            if not (isinstance(author_names, list) and len(author_names) > 0):
                author_names = None

        # get abstract
        # doc['abstract'] is a list of fragments, each with a 'text' str;
        # the fragments are joined with single spaces
        abstract = None
        if 'abstract' in doc and len(doc['abstract']) > 0:
            abstract = ''
            for fragment in doc['abstract']:
                if ('text' in fragment and isinstance(fragment['text'], str)
                        and len(fragment['text']) > 0):
                    abstract += fragment['text'].strip() + ' '

            abstract = abstract.strip()
            if len(abstract) == 0:
                abstract = None

        # query crossref
        crossref_results = []
        if title:
            # after some experiments, we use pass the query value in plain str rather than html str
            # therefore, use title instead of urllib.parse.quote_plus(title)
            query_params = {
                'sort': 'relevance',
                'order': 'desc',
                'query.bibliographic': title,
            }
            try:
                query_results = query_crossref(query_params)
            except Exception as e:
                # best-effort: a failed Crossref call is logged, not fatal
                query_results = None
                print(e)
            if query_results is not None:
                crossref_results.extend(query_results)

        if author_names:
            # second query keyed on the comma-joined author last names
            query_params = {
                'sort':
                'relevance',
                'order':
                'desc',
                'query.bibliographic':
                ', '.join([x['last'] for x in author_names]),
            }
            try:
                query_results = query_crossref(query_params)
            except Exception as e:
                query_results = None
                print(e)
            if query_results is not None:
                crossref_results.extend(query_results)

        # TODO: might need to double check if exact title matching be perfect (also, author might be different?)

        # filter out query results without DOI
        crossref_results = list(
            filter(
                lambda x: ('DOI' in x and isinstance(x['DOI'], str) and len(x[
                    'DOI']) > 0), crossref_results))

        # filter out query results without title or abstract
        crossref_results = list(
            filter(
                lambda x: (
                    ('title' in x and isinstance(x['title'], list) and len(x[
                        'title']) > 0) or ('abstract' in x and isinstance(
                            x['abstract'], str) and len(x['abstract']) > 0)),
                crossref_results))

        # match by title directly
        matched_item = None
        matched_candidates = []

        if title is not None and matched_item is None:
            for item in crossref_results:
                # Crossref returns 'title' as a list of strings
                if not ('title' in item and isinstance(item['title'], list)
                        and len(item['title']) > 0):
                    continue
                if len(item['title']) != 1:
                    print("len(item['title']) != 1", len(item['title']))
                cr_title = clean_title(item['title'][0])
                similarity = text_similarity_by_char(
                    cr_title,
                    title,
                    enable_ignore_begin_end=True,
                    ignore_begin_end_text_len=FIVE_PERCENT_TITLE_LEN,
                    ignore_begin_end_similarity=
                    IGNORE_BEGIN_END_TITLE_SIMILARITY,
                )
                if (len(cr_title) > LEAST_TITLE_LEN
                        and len(title) > LEAST_TITLE_LEN
                        and similarity > LEAST_TITLE_SIMILARITY):
                    print('raw_title: ', raw_title)
                    print('title', title)
                    print("cr_title", cr_title)
                    print('similarity', similarity)
                    print()
                    matched_item = item
                    break
                elif (len(cr_title) > LEAST_TITLE_LEN
                      and len(title) > LEAST_TITLE_LEN and similarity > 0.5):
                    # not confident enough: keep for the author check below
                    matched_candidates.insert(0, item)

        # match by abstract
        if abstract is not None and matched_item is None:
            for item in crossref_results:
                if not ('abstract' in item and isinstance(
                        item['abstract'], str) and len(item['abstract']) > 0):
                    continue
                cr_abstract = item['abstract']
                similarity = text_similarity_by_char(
                    cr_abstract,
                    abstract,
                    enable_ignore_begin_end=True,
                    ignore_begin_end_text_len=FIVE_PERCENT_ABS_LEN,
                    ignore_begin_end_similarity=IGNORE_BEGIN_END_ABS_SIMILARITY,
                )
                if (len(cr_abstract) > LEAST_ABS_LEN
                        and len(abstract) > LEAST_ABS_LEN
                        and similarity > LEAST_ABS_SIMILARITY):
                    print('abstract: ', abstract)
                    print("cr_abstract", cr_abstract)
                    print('similarity', similarity)
                    print()
                    matched_item = item
                    break
                elif (len(cr_abstract) > LEAST_ABS_LEN
                      and len(abstract) > LEAST_ABS_LEN and similarity > 0.5):
                    matched_candidates.insert(0, item)

        if (matched_item is None and len(matched_candidates) > 0
                and author_names is not None):
            # match by author
            for candidate in matched_candidates:
                if not ('author' in candidate
                        and isinstance(candidate['author'], list)
                        and len(candidate['author']) > 0):
                    continue
                names_parsed = parse_names(candidate['author'])
                name_cmp_result = compare_author_names(author_names,
                                                       names_parsed)
                print('raw_title: ', raw_title)
                print("candidate['title']", candidate.get('title'))
                print('abstract', abstract)
                print("candidate['abstract']", candidate.get('abstract'))
                print('author_names', [{
                    'first': x['first'],
                    'last': x['last']
                } for x in author_names])
                print("candidate['author']", candidate.get('author'))
                print('name_cmp_result', name_cmp_result)
                print()
                if name_cmp_result:
                    matched_item = candidate
                    break

        if matched_item is None and len(matched_candidates) == 0:
            print('no similar and no candidates!')
            print('raw_title: ', raw_title)
            print('abstract', abstract)
            if author_names:
                print('author_names', [{
                    'first': x['first'],
                    'last': x['last']
                } for x in author_names])
            else:
                print('author_names', author_names)
            print()

        # update db
        # always record that the Crossref match was attempted
        set_params = {
            "tried_crossref_doi": True,
            'last_updated': datetime.now(),
        }
        if matched_item is not None:
            print("FOUND")
            print()
            set_params['crossref_raw_result'] = matched_item
            if (matched_item.get('DOI')
                    and isinstance(matched_item['DOI'], str)
                    and len(matched_item['DOI'].strip()) > 0):
                set_params['doi'] = matched_item['DOI'].strip()

            doc_updated = True

        try:
            col.find_one_and_update({"_id": doc['_id']}, {
                "$set": set_params,
            })
        except Exception as e:
            print('matched_item')
            pprint(matched_item)
            print(e)
            raise e
# --- Ejemplo n.º 6 (scraped example separator; commented so the file parses) ---
def doi_match_a_batch_by_csv(task_batch):
    """Find DOIs for a batch of Mongo docs by fuzzy-matching metadata.csv.

    NOTE(review): this redefines doi_match_a_batch_by_csv from earlier in
    the file; the later definition wins at import time.

    Fixes vs the scraped original:
    - the abstract-length check used LEAST_TITLE_LEN for the CSV abstract
      instead of LEAST_ABS_LEN (cf. the other variant of this function);
    - the found DOI is now stripped and checked non-empty before being
      written back, consistent with the other variants.
    """
    mongo_db = get_mongo_db('../config.json')
    csv_data = pd.read_csv('../rsc/metadata.csv')
    csv_data = csv_data.fillna('')
    csv_data['title'] = csv_data['title'].str.lower()
    for i, task in enumerate(task_batch):
        if i % 100 == 0:
            print('thread', threading.currentThread().getName())
            print('processing the {}th out of {}'.format(i, len(task_batch)))
        col = mongo_db[task['col_name']]

        # get doc
        doc = col.find_one({'_id': task['_id']})
        if doc is None:
            continue

        # NOTE(review): doc_updated is assigned below but never read
        doc_updated = False

        # get metadata
        metadata = None
        if ('metadata' in doc):
            metadata = doc['metadata']
        else:
            # let's suppose metadata is always used first
            # TODO: we can also use abstract when metadata is not available
            continue

        # get title
        title = None
        raw_title = None
        if metadata is not None:
            if not ('title' in metadata and isinstance(metadata['title'], str)
                    and len(metadata['title'].strip()) > 0):
                # doc w/o is minor part let's ignore them first
                # TODO: we can also use abstract when metadata is not available
                continue
            raw_title = metadata['title']
            print('raw_title', raw_title)
            title = clean_title(raw_title)

        # get author
        # NOTE(review): author_names (a comma-joined str here) is never
        # used after this point in this variant
        author_names = None
        if metadata is not None:
            try:
                author_names = ",".join(
                    [a['last'] for a in metadata['authors']])
            except KeyError:
                author_names = None

        # get abstract
        abstract = None
        if 'abstract' in doc and len(doc['abstract']) > 0:
            abstract = ''
            for fragment in doc['abstract']:
                if ('text' in fragment and isinstance(fragment['text'], str)
                        and len(fragment['text']) > 0):
                    abstract += fragment['text'].strip() + ' '

            abstract = abstract.strip()
            if len(abstract) == 0:
                abstract = None

        # query csv_data
        matched_item = None

        # match by title
        if title is not None:
            # cheap quick_mode pass first to narrow the candidate rows
            similarity = csv_data.apply(lambda x: text_similarity_by_char(
                x['title'], title, quick_mode=True),
                                        axis=1)
            sim_csv_data = csv_data[similarity >= 2 * LEAST_TITLE_SIMILARITY -
                                    1]
            if len(sim_csv_data) > 0:
                similarity = sim_csv_data.apply(
                    lambda x: text_similarity_by_char(
                        x['title'], title, quick_mode=False),
                    axis=1)
                sorted_similarity = similarity.sort_values(ascending=False)
                sorted_data = sim_csv_data.reindex(
                    index=sorted_similarity.index)
                if (len(title) > LEAST_TITLE_LEN
                        and len(sorted_data.iloc[0]['title']) > LEAST_TITLE_LEN
                        and
                        sorted_similarity.iloc[0] > LEAST_TITLE_SIMILARITY):
                    print('raw_title: ', raw_title)
                    print('title', title)
                    print("csv_title", sorted_data.iloc[0]['title'])
                    print('similarity', sorted_similarity.iloc[0])
                    print()
                    matched_item = correct_pd_dict(
                        sorted_data.iloc[0].to_dict())

            # second pass: allow mismatching text at the begin/end of titles
            if matched_item is None and len(sim_csv_data) > 0:
                similarity = sim_csv_data.apply(
                    lambda x: text_similarity_by_char(x['title'],
                                                      title,
                                                      quick_mode=False,
                                                      enable_ignore_begin_end=
                                                      True),
                    axis=1)
                sorted_similarity = similarity.sort_values(ascending=False)
                sorted_data = sim_csv_data.reindex(
                    index=sorted_similarity.index)
                if (len(title) > LEAST_TITLE_LEN
                        and len(sorted_data.iloc[0]['title']) > LEAST_TITLE_LEN
                        and
                        sorted_similarity.iloc[0] > LEAST_TITLE_SIMILARITY):
                    print('result after ignore_begin_end')
                    print('raw_title: ', raw_title)
                    print('title', title)
                    print("csv_title", sorted_data.iloc[0]['title'])
                    print('similarity', sorted_similarity.iloc[0])
                    print()
                    matched_item = correct_pd_dict(
                        sorted_data.iloc[0].to_dict())

        if abstract is not None and matched_item is None:
            # match by abstract
            similarity = csv_data.apply(lambda x: text_similarity_by_char(
                x['abstract'], abstract, quick_mode=True),
                                        axis=1)
            sim_csv_data = csv_data[similarity >= 2 * LEAST_ABS_SIMILARITY - 1]
            if len(sim_csv_data) > 0:
                similarity = sim_csv_data.apply(
                    lambda x: text_similarity_by_char(
                        x['abstract'], abstract, quick_mode=False),
                    axis=1)
                sorted_similarity = similarity.sort_values(ascending=False)
                sorted_data = sim_csv_data.reindex(
                    index=sorted_similarity.index)
                # fix: compare the CSV abstract length against LEAST_ABS_LEN
                # (the original used LEAST_TITLE_LEN here)
                if (len(abstract) > LEAST_ABS_LEN and
                        len(sorted_data.iloc[0]['abstract']) > LEAST_ABS_LEN
                        and sorted_similarity.iloc[0] > LEAST_ABS_SIMILARITY):
                    print('abstract', abstract)
                    print("csv_abstract", sorted_data.iloc[0]['abstract'])
                    print('similarity', sorted_similarity.iloc[0])
                    print()
                    matched_item = correct_pd_dict(
                        sorted_data.iloc[0].to_dict())

        # update db
        # always record that the CSV match was attempted
        set_params = {
            "tried_csv_doi": True,
            'last_updated': datetime.now(),
        }

        # update doi found
        if matched_item is not None:
            print("FOUND")
            print()
            set_params['csv_raw_result'] = matched_item
            # fix: strip and require a non-empty DOI, matching the other
            # variants of this function
            if (matched_item.get('doi')
                    and isinstance(matched_item['doi'], str)
                    and len(matched_item['doi'].strip()) > 0):
                set_params['doi'] = matched_item['doi'].strip()

            doc_updated = True

        try:
            col.find_one_and_update({"_id": doc['_id']}, {
                "$set": set_params,
            })
        except Exception as e:
            print('matched_item')
            pprint(matched_item)
            print(e)
            raise e
# --- Ejemplo n.º 7 (scraped example separator; commented so the file parses) ---
def doi_match_a_batch_by_crossref(task_batch):
    """Find DOIs for a batch of documents by querying the Crossref API.

    For each task the referenced document is loaded from MongoDB, its cleaned
    title (or, as a fallback, the authors' last names) is sent to the Crossref
    ``/works`` endpoint, and the highest-similarity title match supplies the
    document's ``doi``.  Every processed document is marked with
    ``tried_crossref_doi`` so it is not queried again.

    Args:
        task_batch: iterable of dicts with keys ``col_name`` (MongoDB
            collection name) and ``_id`` (document id).
    """
    mongo_db = get_mongo_db('../config.json')
    for i, task in enumerate(task_batch):
        if i % 100 == 0:
            # current_thread().name replaces the deprecated
            # currentThread().getName() spelling
            print('thread', threading.current_thread().name)
            print('processing the {}th out of {}'.format(i, len(task_batch)))
        col = mongo_db[task['col_name']]

        # get doc; it may have been removed since the task list was built
        doc = col.find_one({'_id': task['_id']})
        if doc is None:
            continue

        doc_updated = False

        # get metadata; we suppose metadata is always used first
        # TODO: we can also use abstract when metadata is not available
        if 'metadata' not in doc:
            continue
        metadata = doc['metadata']

        # get title: require a non-empty string
        if not (isinstance(metadata.get('title'), str)
                and len(metadata['title'].strip()) > 0):
            # doc w/o title is a minor part, ignore for now
            # TODO: we can also use abstract when metadata is not available
            continue
        raw_title = metadata['title']
        print('raw_title', raw_title)
        title = clean_title(raw_title)

        # get author last names as a fallback query string
        try:
            author_names = ",".join(a['last'] for a in metadata['authors'])
        except KeyError:
            author_names = None

        # query cross_ref
        query_url = 'https://api.crossref.org/works'
        query_params = {
            'sort': 'relevance',
            'order': 'desc',
        }
        if title:
            # after some experiments, we pass the query value in plain str
            # rather than html str; therefore, use title instead of
            # urllib.parse.quote_plus(title)
            query_params['query.bibliographic'] = title
        # TODO: might need to double check if exact title matching be perfect
        # TODO: might be wrong here need to clean db when only author info is
        #       used to retrieve data
        elif author_names:
            query_params['query.bibliographic'] = author_names
        # TODO: might also use email to search?

        try:
            # timeout keeps a stalled Crossref call from hanging the thread;
            # RequestException covers connection, timeout, and HTTP errors
            # without swallowing KeyboardInterrupt like a bare except would
            cross_ref_response = requests.get(
                query_url,
                params=query_params,
                timeout=30,
            )
        except requests.RequestException:
            print('request to cross_ref failed!')
            continue
        try:
            cross_ref_results = cross_ref_response.json()
        except ValueError:
            # requests raises a ValueError subclass on invalid JSON bodies
            print('query result cannot be jsonified!')
            print('cross_ref_results.text', cross_ref_response.text)
            print('cross_ref_results.status_code',
                  cross_ref_response.status_code)
            print('cross_ref_results.reason', cross_ref_response.reason)
            print()
            continue

        # filter out empty query results
        if not ('message' in cross_ref_results
                and 'items' in cross_ref_results['message']
                and isinstance(cross_ref_results['message']['items'], list)
                and len(cross_ref_results['message']['items']) > 0):
            print('EMPTY RESULT')
            pprint(cross_ref_results)
            print()
            continue
        cross_ref_results = cross_ref_results['message']['items']

        # keep only results that actually carry a non-empty title list
        # TODO: maybe abstract is available; use item['abstract']
        cross_ref_results = [
            item for item in cross_ref_results
            if isinstance(item.get('title'), list) and len(item['title']) > 0
        ]

        # match by title similarity: take the first result whose cleaned
        # title is long enough and similar enough to ours
        matched_item = None
        for item in cross_ref_results:
            if len(item['title']) != 1:
                print("len(item['title'])", len(item['title']))
            cr_title = clean_title(item['title'][0])
            similarity = text_similarity_by_char(cr_title, title)
            if (len(cr_title) > LEAST_TITLE_LEN
                    and len(title) > LEAST_TITLE_LEN
                    and similarity > LEAST_TITLE_SIMILARITY):
                print('raw_title: ', raw_title)
                print('title', title)
                print("cr_title", cr_title)
                print('similarity', similarity)
                matched_item = item
                break

        # update doi found
        if matched_item is not None:
            print("FOUND")
            print()
            col.find_one_and_update({"_id": doc['_id']}, {
                "$set": {
                    "doi": matched_item['DOI'],
                    'tried_crossref_doi': True,
                    'crossref_raw_result': matched_item,
                    'last_updated': datetime.now(),
                }
            })
            doc_updated = True

        # mark tried even if doi is not found but searching is completed
        if not doc_updated:
            col.find_one_and_update({"_id": doc['_id']}, {
                "$set": {
                    "tried_crossref_doi": True,
                    'last_updated': datetime.now(),
                }
            })