def collect_samples():
    """Sample 100 papers from the 'entries' collection and dump them to disk.

    Only documents that have an 'abstract', a 'doi', and at least one keyword
    ('keywords.0' exists) are eligible.  Results are written as JSON to
    ../scratch/paper_samples.json using bson's json_util for BSON types
    (ObjectId, datetime, ...).
    """
    samples = []
    db = get_mongo_db('../config.json')
    # NOTE(review): collection_names() is deprecated/removed in newer pymongo
    # (replaced by list_collection_names()) — confirm the pinned version.
    print(db.collection_names())
    query = db['entries'].aggregate(
        [
            {
                '$match': {
                    "abstract": {
                        "$exists": True
                    },
                    "doi": {
                        "$exists": True
                    },
                    # 'keywords.0' exists <=> keywords is a non-empty array
                    "keywords.0": {
                        "$exists": True
                    },
                },
            },
            # random sample of 100 matching documents
            {
                '$sample': {
                    'size': 100
                }
            },
        ],
        allowDiskUse=True)
    for doc in query:
        # keep only truthy abstracts ($exists does not rule out ''/None)
        if doc['abstract']:
            samples.append(doc)
    print('len(samples)', len(samples))
    with open('../scratch/paper_samples.json', 'w') as fw:
        json.dump(samples, fw, indent=2, default=json_util.default)
def valid_existing_doi(mongo_db, col_name):
    """Validate every stored DOI in one collection and collect the failures.

    NOTE(review): the ``def`` header of this function was lost when the source
    was mangled onto single lines; it is reconstructed here from the body
    (which reads ``mongo_db`` and ``col_name``) and from the call site below,
    ``valid_existing_doi(db, 'google_form_submissions')`` — confirm the name.

    Args:
        mongo_db: an open pymongo database handle.
        col_name: name of the collection to scan.

    Returns:
        list of DOI strings for which ``valid_a_doi`` reported failure.
    """
    error_doi = []
    print('col_name', col_name)
    col = mongo_db[col_name]
    query = col.find({'doi': {'$exists': True}})
    for doc in query:
        valid = valid_a_doi(doi=doc['doi'], abstract=doc.get('abstract'))
        print(doc['doi'], valid)
        # fix: idiomatic truthiness test instead of `valid == False`
        if not valid:
            error_doi.append(doc['doi'])
        # break
    return error_doi


def foo():
    """Smoke-test valid_a_doi() against a mix of real and fabricated DOIs."""
    fake_dois = [
        '10.7326/m20-0504',
        '10.7326/m20-050423423423',
        '10.7326/m20',
        '10.3390/v12010064',
    ]
    for doi in fake_dois:
        valid_a_doi(doi)


if __name__ == '__main__':
    db = get_mongo_db('../config.json')
    print(db.collection_names())
    # foo()
    valid_existing_doi(db, 'google_form_submissions')
def doi_match_a_batch_by_csv(task_batch):
    """Find DOIs for a batch of Mongo documents by fuzzy-matching them
    against the CORD-19 metadata CSV (../rsc/metadata.csv).

    For each task {'col_name', '_id'} the document's title, abstract and
    author list are extracted, then matched against the CSV in stages:
      1. title similarity (quick pre-filter at >= 0.5, then full similarity),
      2. title similarity again with begin/end-ignoring tolerance,
      3. abstract similarity (same two-stage scheme),
      4. author-name comparison over near-miss candidates (similarity > 0.5).
    The document is always updated with tried_csv_doi/last_updated; on a
    match, csv_raw_result and (if present) the stripped 'doi' are also set.
    Designed to be run from worker threads (see the currentThread() logging).
    """
    mongo_db = get_mongo_db('../config.json')
    # force ID-like columns to str so they are not parsed as floats
    csv_data = pd.read_csv('../rsc/metadata.csv',
                           dtype={
                               'pubmed_id': str,
                               'pmcid': str,
                               'publish_time': str,
                               'Microsoft Academic Paper ID': str,
                           })
    csv_data = csv_data.fillna('')
    csv_data['title'] = csv_data['title'].str.lower()
    for i, task in enumerate(task_batch):
        if i % 10 == 0:
            print('thread', threading.currentThread().getName())
            print('processing the {}th out of {}'.format(i, len(task_batch)))
        col = mongo_db[task['col_name']]
        # get doc
        doc = col.find_one({'_id': task['_id']})
        if doc is None:
            continue
        # NOTE(review): doc_updated is assigned but never read in this
        # function — likely a leftover from the crossref variant.
        doc_updated = False
        # get metadata
        metadata = None
        if ('metadata' in doc):
            metadata = doc['metadata']
        # get title (cleaned/normalized for comparison)
        title = None
        raw_title = None
        if metadata is not None:
            if ('title' in metadata and isinstance(metadata['title'], str)
                    and len(metadata['title'].strip()) > 0):
                raw_title = metadata['title']
                # print('raw_title', raw_title)
                title = clean_title(raw_title)
        # get author (list of name dicts, or None when absent/empty)
        author_names = None
        if metadata is not None:
            author_names = metadata.get('authors')
            if not (isinstance(author_names, list) and len(author_names) > 0):
                author_names = None
        # get abstract by concatenating the text of all fragments
        abstract = None
        if 'abstract' in doc and len(doc['abstract']) > 0:
            abstract = ''
            for fragment in doc['abstract']:
                if ('text' in fragment and isinstance(fragment['text'], str)
                        and len(fragment['text']) > 0):
                    abstract += fragment['text'].strip() + ' '
            abstract = abstract.strip()
            if len(abstract) == 0:
                abstract = None
        # query csv_data
        matched_item = None
        matched_candidates = []
        # match by title
        if title is not None and matched_item is None:
            # cheap quick_mode pass over the whole CSV to pre-filter rows
            similarity = csv_data.apply(lambda x: text_similarity_by_char(
                x['title'], title, quick_mode=True),
                                        axis=1)
            sim_csv_data = csv_data[similarity >= 0.5]
            if len(sim_csv_data) > 0:
                # full (slow) similarity only on the pre-filtered rows
                similarity = sim_csv_data.apply(
                    lambda x: text_similarity_by_char(
                        x['title'], title, quick_mode=False),
                    axis=1)
                sorted_similarity = similarity.sort_values(ascending=False)
                sorted_data = sim_csv_data.reindex(
                    index=sorted_similarity.index)
                print('raw_title: ', raw_title)
                print('title', title)
                print("csv_title", sorted_data.iloc[0]['title'])
                print('similarity', sorted_similarity.iloc[0])
                print(sorted_similarity.head(10))
                print('len(raw_title)', len(raw_title))
                print('doi', sorted_data.iloc[0]['doi'])
                print()
                # accept best hit only when both titles are long enough and
                # similarity clears the strict threshold
                if (len(title) > LEAST_TITLE_LEN
                        and len(sorted_data.iloc[0]['title']) > LEAST_TITLE_LEN
                        and
                        sorted_similarity.iloc[0] > LEAST_TITLE_SIMILARITY):
                    matched_item = correct_pd_dict(
                        sorted_data.iloc[0].to_dict())
            # second chance: tolerate noisy title beginnings/endings
            if matched_item is None and len(sim_csv_data) > 0:
                similarity = sim_csv_data.apply(
                    lambda x: text_similarity_by_char(
                        x['title'],
                        title,
                        quick_mode=False,
                        enable_ignore_begin_end=True,
                        ignore_begin_end_text_len=FIVE_PERCENT_TITLE_LEN,
                        ignore_begin_end_similarity=
                        IGNORE_BEGIN_END_TITLE_SIMILARITY,
                    ),
                    axis=1)
                sorted_similarity = similarity.sort_values(ascending=False)
                sorted_data = sim_csv_data.reindex(
                    index=sorted_similarity.index)
                if (len(title) > LEAST_TITLE_LEN
                        and len(sorted_data.iloc[0]['title']) > LEAST_TITLE_LEN
                        and
                        sorted_similarity.iloc[0] > LEAST_TITLE_SIMILARITY):
                    print('result after ignore_begin_end')
                    print('raw_title: ', raw_title)
                    print('title', title)
                    print("csv_title", sorted_data.iloc[0]['title'])
                    print('similarity', sorted_similarity.iloc[0])
                    print()
                    matched_item = correct_pd_dict(
                        sorted_data.iloc[0].to_dict())
                elif (len(title) > LEAST_TITLE_LEN
                      and len(sorted_data.iloc[0]['title']) > LEAST_TITLE_LEN
                      and sorted_similarity.iloc[0] > 0.5):
                    # near miss: remember for author-based disambiguation
                    matched_candidates.insert(
                        0, correct_pd_dict(sorted_data.iloc[0].to_dict()))
        if abstract is not None and matched_item is None:
            # match by abstract (same two-stage scheme as the title match)
            similarity = csv_data.apply(lambda x: text_similarity_by_char(
                x['abstract'], abstract, quick_mode=True),
                                        axis=1)
            sim_csv_data = csv_data[similarity >= 0.5]
            if len(sim_csv_data) > 0:
                similarity = sim_csv_data.apply(
                    lambda x: text_similarity_by_char(
                        x['abstract'], abstract, quick_mode=False),
                    axis=1)
                sorted_similarity = similarity.sort_values(ascending=False)
                sorted_data = sim_csv_data.reindex(
                    index=sorted_similarity.index)
                print('abstract', abstract)
                print("csv_abstract", sorted_data.iloc[0]['abstract'])
                print('similarity', sorted_similarity.iloc[0])
                print()
                if (len(abstract) > LEAST_ABS_LEN
                        and
                        len(sorted_data.iloc[0]['abstract']) > LEAST_ABS_LEN
                        and sorted_similarity.iloc[0] > LEAST_ABS_SIMILARITY):
                    matched_item = correct_pd_dict(
                        sorted_data.iloc[0].to_dict())
                elif (len(abstract) > LEAST_ABS_LEN
                      and len(sorted_data.iloc[0]['abstract']) > LEAST_ABS_LEN
                      and sorted_similarity.iloc[0] > 0.5):
                    matched_candidates.insert(
                        0, correct_pd_dict(sorted_data.iloc[0].to_dict()))
            # second chance with begin/end-ignoring tolerance
            if matched_item is None and len(sim_csv_data) > 0:
                similarity = sim_csv_data.apply(
                    lambda x: text_similarity_by_char(
                        x['abstract'],
                        abstract,
                        quick_mode=False,
                        enable_ignore_begin_end=True,
                        ignore_begin_end_text_len=FIVE_PERCENT_ABS_LEN,
                        ignore_begin_end_similarity=
                        IGNORE_BEGIN_END_ABS_SIMILARITY,
                    ),
                    axis=1)
                sorted_similarity = similarity.sort_values(ascending=False)
                sorted_data = sim_csv_data.reindex(
                    index=sorted_similarity.index)
                if (len(abstract) > LEAST_ABS_LEN
                        and
                        len(sorted_data.iloc[0]['abstract']) > LEAST_ABS_LEN
                        and sorted_similarity.iloc[0] > LEAST_ABS_SIMILARITY):
                    print('result after ignore_begin_end')
                    print('abstract', abstract)
                    print("csv_abstract", sorted_data.iloc[0]['abstract'])
                    print('similarity', sorted_similarity.iloc[0])
                    print()
                    matched_item = correct_pd_dict(
                        sorted_data.iloc[0].to_dict())
        if (matched_item is None and len(matched_candidates) > 0
                and author_names is not None):
            # match by author: accept a near-miss candidate when its author
            # list agrees with the document's authors
            for candidate in matched_candidates:
                if not candidate['authors']:
                    continue
                names_parsed = parse_names(candidate['authors'])
                name_cmp_result = compare_author_names(author_names,
                                                       names_parsed)
                print('raw_title: ', raw_title)
                print("candidate['title']", candidate['title'])
                print('abstract', abstract)
                print("candidate['abstract']", candidate['abstract'])
                print('author_names', [{
                    'first': x['first'],
                    'last': x['last']
                } for x in author_names])
                print("candidate['authors']", candidate['authors'])
                print('name_cmp_result', name_cmp_result)
                print()
                if name_cmp_result:
                    matched_item = candidate
                    break
        if matched_item is None and len(matched_candidates) == 0:
            # log total misses to ease later inspection
            print('no similar and no candidates!')
            print('raw_title: ', raw_title)
            print('abstract', abstract)
            if author_names:
                print('author_names', [{
                    'first': x['first'],
                    'last': x['last']
                } for x in author_names])
            else:
                print('author_names', author_names)
            print()
        # update db: always mark the doc as tried
        set_params = {
            "tried_csv_doi": True,
            'last_updated': datetime.now(),
        }
        # update doi found
        if matched_item is not None:
            print("FOUND")
            print()
            set_params['csv_raw_result'] = matched_item
            if (matched_item.get('doi')
                    and isinstance(matched_item['doi'], str)
                    and len(matched_item['doi'].strip()) > 0):
                set_params['doi'] = matched_item['doi'].strip()
                doc_updated = True
        try:
            col.find_one_and_update({"_id": doc['_id']}, {
                "$set": set_params,
            })
        except Exception as e:
            # dump the offending item before propagating (e.g. unserializable
            # values in csv_raw_result)
            print('matched_item')
            pprint(matched_item)
            print(e)
            raise e
def doi_match_a_batch_by_csv_new(task_batch):
    """Find DOIs for a batch of Mongo documents by exact cord-id lookup.

    Unlike the fuzzy-matching variant, this matches doc['paper_id'] against
    the 'sha' column of ../rsc/metadata.csv.  Each document is always marked
    tried_csv_doi/last_updated; on a unique sha match, csv_raw_result and
    (when present) the stripped 'doi' are set as well.
    """
    mongo_db = get_mongo_db('../config.json')
    # force ID-like columns to str so they are not parsed as floats
    csv_data = pd.read_csv('../rsc/metadata.csv',
                           dtype={
                               'pubmed_id': str,
                               'pmcid': str,
                               'publish_time': str,
                               'Microsoft Academic Paper ID': str,
                           })
    csv_data = csv_data.fillna('')
    csv_data['title'] = csv_data['title'].str.lower()
    # NOTE(review): `data` (rows with a non-empty sha) is only used for the
    # shape printout; the lookups below go against the full csv_data.
    data = csv_data[csv_data['sha'] != '']
    print('data.shape', data.shape)
    for i, task in enumerate(task_batch):
        if i % 10 == 0:
            print('thread', threading.currentThread().getName())
            print('processing the {}th out of {}'.format(i, len(task_batch)))
        col = mongo_db[task['col_name']]
        # get doc
        doc = col.find_one({'_id': task['_id']})
        if doc is None:
            continue
        # NOTE(review): doc_updated is assigned but never read here.
        doc_updated = False
        # get cord_id (the document's paper_id, matched against csv 'sha')
        cord_id = None
        if ('paper_id' in doc and isinstance(doc['paper_id'], str)
                and len(doc['paper_id']) > 0):
            cord_id = doc['paper_id']
        # query csv_data
        matched_item = None
        # match by title
        if cord_id is not None and matched_item is None:
            data_w_cord_id = csv_data[csv_data['sha'] == cord_id]
            if len(data_w_cord_id) == 1:
                # print('raw_title: ', raw_title)
                # print('title', title)
                # print("csv_title", sorted_data.iloc[0]['title'])
                # print('similarity', sorted_similarity.iloc[0])
                # print(sorted_similarity.head(10))
                # print('len(raw_title)', len(raw_title))
                # print('doi', sorted_data.iloc[0]['doi'])
                # print()
                #
                # if (len(title) > LEAST_TITLE_LEN
                # and len(sorted_data.iloc[0]['title']) > LEAST_TITLE_LEN
                # and sorted_similarity.iloc[0] > LEAST_TITLE_SIMILARITY):
                matched_item = correct_pd_dict(data_w_cord_id.iloc[0].to_dict())
            elif len(data_w_cord_id) > 1:
                # ambiguous sha: refuse to pick one
                print('more than 1 entries matched!')
                print('cord_id', cord_id)
                print(', '.join(list(data_w_cord_id['sha'])))
            else:
                print('no entry matched!')
                print('cord_id', cord_id)
        if matched_item is None:
            print('no entry matched!')
            print('cord_id', cord_id)
            print()
        # update db: always mark the doc as tried
        set_params = {
            "tried_csv_doi": True,
            'last_updated': datetime.now(),
        }
        # update doi found
        if matched_item is not None:
            print("FOUND")
            print()
            set_params['csv_raw_result'] = matched_item
            if (matched_item.get('doi')
                    and isinstance(matched_item['doi'], str)
                    and len(matched_item['doi'].strip()) > 0):
                set_params['doi'] = matched_item['doi'].strip()
                doc_updated = True
        try:
            col.find_one_and_update({"_id": doc['_id']}, {
                "$set": set_params,
            })
        except Exception as e:
            # dump the offending item before propagating
            print('matched_item')
            pprint(matched_item)
            print(e)
            raise e
def doi_match_a_batch_by_crossref(task_batch):
    """Find DOIs for a batch of Mongo documents via the Crossref API.

    For each task {'col_name', '_id'}, queries Crossref by title and by
    author last names (query_crossref), filters results that lack DOI or
    both title and abstract, then matches by title similarity, abstract
    similarity, and finally author-name comparison over near-miss
    candidates.  Always marks tried_crossref_doi/last_updated; on a match,
    also stores crossref_raw_result and the stripped 'DOI'.
    """
    mongo_db = get_mongo_db('../config.json')
    for i, task in enumerate(task_batch):
        if i % 100 == 0:
            print('thread', threading.currentThread().getName())
            print('processing the {}th out of {}'.format(i, len(task_batch)))
        col = mongo_db[task['col_name']]
        # get doc
        doc = col.find_one({'_id': task['_id']})
        if doc is None:
            continue
        # NOTE(review): doc_updated is assigned but never read here.
        doc_updated = False
        # get metadata
        metadata = None
        if ('metadata' in doc):
            metadata = doc['metadata']
        # get title (cleaned/normalized for comparison)
        title = None
        raw_title = None
        if metadata is not None:
            if ('title' in metadata and isinstance(metadata['title'], str)
                    and len(metadata['title'].strip()) > 0):
                raw_title = metadata['title']
                title = clean_title(raw_title)
        # get author (list of name dicts, or None when absent/empty)
        author_names = None
        if metadata is not None:
            author_names = metadata.get('authors')
            if not (isinstance(author_names, list) and len(author_names) > 0):
                author_names = None
        # get abstract by concatenating the text of all fragments
        abstract = None
        if 'abstract' in doc and len(doc['abstract']) > 0:
            abstract = ''
            for fragment in doc['abstract']:
                if ('text' in fragment and isinstance(fragment['text'], str)
                        and len(fragment['text']) > 0):
                    abstract += fragment['text'].strip() + ' '
            abstract = abstract.strip()
            if len(abstract) == 0:
                abstract = None
        # query crossref: one query by title, one by author last names;
        # both result lists are concatenated
        crossref_results = []
        if title:
            # after some experiments, we use pass the query value in plain str rather than html str
            # therefore, use title instead of urllib.parse.quote_plus(title)
            query_params = {
                'sort': 'relevance',
                'order': 'desc',
                'query.bibliographic': title,
            }
            try:
                query_results = query_crossref(query_params)
            except Exception as e:
                query_results = None
                print(e)
            if query_results is not None:
                crossref_results.extend(query_results)
        if author_names:
            query_params = {
                'sort': 'relevance',
                'order': 'desc',
                'query.bibliographic':
                ', '.join([x['last'] for x in author_names]),
            }
            try:
                query_results = query_crossref(query_params)
            except Exception as e:
                query_results = None
                print(e)
            if query_results is not None:
                crossref_results.extend(query_results)
        # TODO: might need to double check if exact title matching be perfect (also, author might be different?)
        # filter out query results without DOI
        crossref_results = list(
            filter(
                lambda x: ('DOI' in x and isinstance(x['DOI'], str) and len(x[
                    'DOI']) > 0), crossref_results))
        # filter out query results without title or abstract
        crossref_results = list(
            filter(
                lambda x: (('title' in x and isinstance(x['title'], list) and
                            len(x['title']) > 0) or
                           ('abstract' in x and isinstance(x['abstract'], str)
                            and len(x['abstract']) > 0)), crossref_results))
        # match by title directly
        matched_item = None
        matched_candidates = []
        if title is not None and matched_item is None:
            for item in crossref_results:
                if not ('title' in item and isinstance(item['title'], list)
                        and len(item['title']) > 0):
                    continue
                if len(item['title']) != 1:
                    # crossref titles are lists; flag the unusual multi-title case
                    print("len(item['title']) != 1", len(item['title']))
                cr_title = clean_title(item['title'][0])
                similarity = text_similarity_by_char(
                    cr_title,
                    title,
                    enable_ignore_begin_end=True,
                    ignore_begin_end_text_len=FIVE_PERCENT_TITLE_LEN,
                    ignore_begin_end_similarity=
                    IGNORE_BEGIN_END_TITLE_SIMILARITY,
                )
                if (len(cr_title) > LEAST_TITLE_LEN
                        and len(title) > LEAST_TITLE_LEN
                        and similarity > LEAST_TITLE_SIMILARITY):
                    print('raw_title: ', raw_title)
                    print('title', title)
                    print("cr_title", cr_title)
                    print('similarity', similarity)
                    print()
                    matched_item = item
                    break
                elif (len(cr_title) > LEAST_TITLE_LEN
                      and len(title) > LEAST_TITLE_LEN and similarity > 0.5):
                    # near miss: remember for author-based disambiguation
                    matched_candidates.insert(0, item)
        # match by abstract
        if abstract is not None and matched_item is None:
            for item in crossref_results:
                if not ('abstract' in item and isinstance(
                        item['abstract'], str) and len(item['abstract']) > 0):
                    continue
                cr_abstract = item['abstract']
                similarity = text_similarity_by_char(
                    cr_abstract,
                    abstract,
                    enable_ignore_begin_end=True,
                    ignore_begin_end_text_len=FIVE_PERCENT_ABS_LEN,
                    ignore_begin_end_similarity=IGNORE_BEGIN_END_ABS_SIMILARITY,
                )
                if (len(cr_abstract) > LEAST_ABS_LEN and len(abstract) >
                        LEAST_ABS_LEN and similarity > LEAST_ABS_SIMILARITY):
                    print('abstract: ', abstract)
                    print("cr_abstract", cr_abstract)
                    print('similarity', similarity)
                    print()
                    matched_item = item
                    break
                elif (len(cr_abstract) > LEAST_ABS_LEN
                      and len(abstract) > LEAST_ABS_LEN and similarity > 0.5):
                    matched_candidates.insert(0, item)
        if (matched_item is None and len(matched_candidates) > 0
                and author_names is not None):
            # match by author: accept a near-miss candidate when its author
            # list agrees with the document's authors
            for candidate in matched_candidates:
                if not ('author' in candidate
                        and isinstance(candidate['author'], list)
                        and len(candidate['author']) > 0):
                    continue
                names_parsed = parse_names(candidate['author'])
                name_cmp_result = compare_author_names(author_names,
                                                       names_parsed)
                print('raw_title: ', raw_title)
                print("candidate['title']", candidate.get('title'))
                print('abstract', abstract)
                print("candidate['abstract']", candidate.get('abstract'))
                print('author_names', [{
                    'first': x['first'],
                    'last': x['last']
                } for x in author_names])
                print("candidate['author']", candidate.get('author'))
                print('name_cmp_result', name_cmp_result)
                print()
                if name_cmp_result:
                    matched_item = candidate
                    break
        if matched_item is None and len(matched_candidates) == 0:
            # log total misses to ease later inspection
            print('no similar and no candidates!')
            print('raw_title: ', raw_title)
            print('abstract', abstract)
            if author_names:
                print('author_names', [{
                    'first': x['first'],
                    'last': x['last']
                } for x in author_names])
            else:
                print('author_names', author_names)
            print()
        # update db: always mark the doc as tried
        set_params = {
            "tried_crossref_doi": True,
            'last_updated': datetime.now(),
        }
        if matched_item is not None:
            print("FOUND")
            print()
            set_params['crossref_raw_result'] = matched_item
            if (matched_item.get('DOI')
                    and isinstance(matched_item['DOI'], str)
                    and len(matched_item['DOI'].strip()) > 0):
                set_params['doi'] = matched_item['DOI'].strip()
                doc_updated = True
        try:
            col.find_one_and_update({"_id": doc['_id']}, {
                "$set": set_params,
            })
        except Exception as e:
            # dump the offending item before propagating
            print('matched_item')
            pprint(matched_item)
            print(e)
            raise e
def doi_match_a_batch_by_csv(task_batch):
    """Find DOIs for a batch of Mongo documents by fuzzy-matching title and
    abstract against the CORD-19 metadata CSV (../rsc/metadata.csv).

    NOTE(review): this redefines doi_match_a_batch_by_csv and therefore
    shadows the earlier, more elaborate definition in this file — confirm
    which version is intended to win.

    For each task {'col_name', '_id'}: documents without metadata or without
    a usable metadata title are skipped entirely.  Matching stages:
      1. title similarity (quick pre-filter, then full similarity),
      2. title similarity with enable_ignore_begin_end tolerance,
      3. abstract similarity.
    The document is always updated with tried_csv_doi/last_updated; on a
    match, csv_raw_result and (if a string) 'doi' are also set.

    Fix: the abstract-match guard compared the CSV abstract length against
    LEAST_TITLE_LEN (copy-paste from the title branch); it now uses
    LEAST_ABS_LEN like every other abstract-length check in this file.
    """
    mongo_db = get_mongo_db('../config.json')
    csv_data = pd.read_csv('../rsc/metadata.csv')
    csv_data = csv_data.fillna('')
    csv_data['title'] = csv_data['title'].str.lower()
    for i, task in enumerate(task_batch):
        if i % 100 == 0:
            print('thread', threading.currentThread().getName())
            print('processing the {}th out of {}'.format(i, len(task_batch)))
        col = mongo_db[task['col_name']]
        # get doc
        doc = col.find_one({'_id': task['_id']})
        if doc is None:
            continue
        doc_updated = False
        # get metadata
        metadata = None
        if ('metadata' in doc):
            metadata = doc['metadata']
        else:
            # let's suppose metadata is always used first
            # TODO: we can also use abstract when metadata is not available
            continue
        # get title
        title = None
        raw_title = None
        if metadata is not None:
            if not ('title' in metadata and isinstance(metadata['title'], str)
                    and len(metadata['title'].strip()) > 0):
                # docs w/o title are a minor part; ignore them for now
                # TODO: we can also use abstract when metadata is not available
                continue
            raw_title = metadata['title']
            print('raw_title', raw_title)
            title = clean_title(raw_title)
        # get author (comma-joined last names, or None when unavailable)
        author_names = None
        if metadata is not None:
            try:
                author_names = ",".join(
                    [a['last'] for a in metadata['authors']])
            except KeyError:
                author_names = None
        # get abstract by concatenating the text of all fragments
        abstract = None
        if 'abstract' in doc and len(doc['abstract']) > 0:
            abstract = ''
            for fragment in doc['abstract']:
                if ('text' in fragment and isinstance(fragment['text'], str)
                        and len(fragment['text']) > 0):
                    abstract += fragment['text'].strip() + ' '
            abstract = abstract.strip()
            if len(abstract) == 0:
                abstract = None
        # query csv_data
        matched_item = None
        # match by title
        if title is not None:
            # cheap quick_mode pass over the whole CSV to pre-filter rows
            similarity = csv_data.apply(lambda x: text_similarity_by_char(
                x['title'], title, quick_mode=True),
                                        axis=1)
            sim_csv_data = csv_data[similarity >= 2 * LEAST_TITLE_SIMILARITY -
                                    1]
            if len(sim_csv_data) > 0:
                similarity = sim_csv_data.apply(
                    lambda x: text_similarity_by_char(
                        x['title'], title, quick_mode=False),
                    axis=1)
                sorted_similarity = similarity.sort_values(ascending=False)
                sorted_data = sim_csv_data.reindex(
                    index=sorted_similarity.index)
                if (len(title) > LEAST_TITLE_LEN
                        and len(sorted_data.iloc[0]['title']) > LEAST_TITLE_LEN
                        and
                        sorted_similarity.iloc[0] > LEAST_TITLE_SIMILARITY):
                    print('raw_title: ', raw_title)
                    print('title', title)
                    print("csv_title", sorted_data.iloc[0]['title'])
                    print('similarity', sorted_similarity.iloc[0])
                    print()
                    matched_item = correct_pd_dict(
                        sorted_data.iloc[0].to_dict())
            # second chance: tolerate noisy title beginnings/endings
            if matched_item is None and len(sim_csv_data) > 0:
                similarity = sim_csv_data.apply(
                    lambda x: text_similarity_by_char(
                        x['title'],
                        title,
                        quick_mode=False,
                        enable_ignore_begin_end=True),
                    axis=1)
                sorted_similarity = similarity.sort_values(ascending=False)
                sorted_data = sim_csv_data.reindex(
                    index=sorted_similarity.index)
                if (len(title) > LEAST_TITLE_LEN
                        and len(sorted_data.iloc[0]['title']) > LEAST_TITLE_LEN
                        and
                        sorted_similarity.iloc[0] > LEAST_TITLE_SIMILARITY):
                    print('result after ignore_begin_end')
                    print('raw_title: ', raw_title)
                    print('title', title)
                    print("csv_title", sorted_data.iloc[0]['title'])
                    print('similarity', sorted_similarity.iloc[0])
                    print()
                    matched_item = correct_pd_dict(
                        sorted_data.iloc[0].to_dict())
        if abstract is not None and matched_item is None:
            # match by abstract
            similarity = csv_data.apply(lambda x: text_similarity_by_char(
                x['abstract'], abstract, quick_mode=True),
                                        axis=1)
            sim_csv_data = csv_data[similarity >= 2 * LEAST_ABS_SIMILARITY - 1]
            if len(sim_csv_data) > 0:
                similarity = sim_csv_data.apply(
                    lambda x: text_similarity_by_char(
                        x['abstract'], abstract, quick_mode=False),
                    axis=1)
                sorted_similarity = similarity.sort_values(ascending=False)
                sorted_data = sim_csv_data.reindex(
                    index=sorted_similarity.index)
                # fix: abstract length guard used LEAST_TITLE_LEN before
                if (len(abstract) > LEAST_ABS_LEN
                        and
                        len(sorted_data.iloc[0]['abstract']) > LEAST_ABS_LEN
                        and sorted_similarity.iloc[0] > LEAST_ABS_SIMILARITY):
                    print('abstract', abstract)
                    print("csv_abstract", sorted_data.iloc[0]['abstract'])
                    print('similarity', sorted_similarity.iloc[0])
                    print()
                    matched_item = correct_pd_dict(
                        sorted_data.iloc[0].to_dict())
        # update db: always mark the doc as tried
        set_params = {
            "tried_csv_doi": True,
            'last_updated': datetime.now(),
        }
        # update doi found
        if matched_item is not None:
            print("FOUND")
            print()
            set_params['csv_raw_result'] = matched_item
            if matched_item.get('doi') and (isinstance(matched_item['doi'],
                                                       str)):
                set_params['doi'] = matched_item['doi']
                doc_updated = True
        try:
            col.find_one_and_update({"_id": doc['_id']}, {
                "$set": set_params,
            })
        except Exception as e:
            # dump the offending item before propagating
            print('matched_item')
            pprint(matched_item)
            print(e)
            raise e
def doi_match_a_batch_by_crossref(task_batch):
    """Find DOIs for a batch of Mongo documents via the Crossref works API.

    NOTE(review): this redefines doi_match_a_batch_by_crossref and shadows
    the earlier definition in this file — confirm which version should win.

    For each task {'col_name', '_id'}: skips documents without metadata or
    without a usable metadata title, queries
    https://api.crossref.org/works with query.bibliographic set to the title
    (or, failing that, the comma-joined author last names), and accepts the
    first result whose cleaned title is similar enough.  On a match the doc
    gets doi/crossref_raw_result; either way tried_crossref_doi and
    last_updated are written.

    Fix: the bare ``except:`` around requests.get also swallowed
    KeyboardInterrupt/SystemExit; it now catches
    requests.exceptions.RequestException, the documented base class for all
    requests transport errors.
    """
    mongo_db = get_mongo_db('../config.json')
    for i, task in enumerate(task_batch):
        if i % 100 == 0:
            print('thread', threading.currentThread().getName())
            print('processing the {}th out of {}'.format(i, len(task_batch)))
        col = mongo_db[task['col_name']]
        # get doc
        doc = col.find_one({'_id': task['_id']})
        if doc is None:
            continue
        doc_updated = False
        # get metadata
        metadata = None
        if ('metadata' in doc):
            metadata = doc['metadata']
        else:
            # let's suppose metadata is always used first
            # TODO: we can also use abstract when metadata is not available
            continue
        # get title
        title = None
        raw_title = None
        if metadata is not None:
            if not ('title' in metadata and isinstance(metadata['title'], str)
                    and len(metadata['title'].strip()) > 0):
                # docs w/o title are a minor part; ignore them for now
                # TODO: we can also use abstract when metadata is not available
                continue
            raw_title = metadata['title']
            print('raw_title', raw_title)
            title = clean_title(raw_title)
        # get author (comma-joined last names, or None when unavailable)
        author_names = None
        if metadata is not None:
            try:
                author_names = ",".join(
                    [a['last'] for a in metadata['authors']])
            except KeyError:
                author_names = None
        # query cross_ref
        query_url = 'https://api.crossref.org/works'
        query_params = {
            'sort': 'relevance',
            'order': 'desc',
        }
        if title:
            # after some experiments, we pass the query value as a plain str
            # rather than an html-escaped str; therefore, use title instead
            # of urllib.parse.quote_plus(title)
            query_params['query.bibliographic'] = title
            # TODO: might need to double check if exact title matching be perfect (author might be different?)
            # TODO: might be wrong here need to clean db when only author info is used to retrieve data
        elif author_names:
            query_params['query.bibliographic'] = author_names
        # TODO: might also use email to search?
        try:
            cross_ref_results = requests.get(
                query_url,
                params=query_params,
            )
        # fix: was a bare except, which also caught KeyboardInterrupt etc.
        except requests.exceptions.RequestException:
            print('request to cross_ref failed!')
            continue
        try:
            cross_ref_results = cross_ref_results.json()
        except Exception:
            print('query result cannot be jsonified!')
            print('cross_ref_results.text', cross_ref_results.text)
            print('cross_ref_results.status_code',
                  cross_ref_results.status_code)
            print('cross_ref_results.reason', cross_ref_results.reason)
            print()
            continue
        # filter out empty query results
        if not ('message' in cross_ref_results
                and 'items' in cross_ref_results['message']
                and isinstance(cross_ref_results['message']['items'], list)
                and len(cross_ref_results['message']['items']) > 0):
            print('EMPTY RESULT')
            pprint(cross_ref_results)
            print()
            continue
        else:
            cross_ref_results = cross_ref_results['message']['items']
        # filter out query results without title
        # TODO: maybe abstract is available — use item['abstract']
        cross_ref_results = list(
            filter(
                lambda x: ('title' in x and isinstance(x['title'], list) and
                           len(x['title']) > 0), cross_ref_results))
        # match by title directly
        matched_item = None
        if matched_item is None:
            for item in cross_ref_results:
                if len(item['title']) != 1:
                    # crossref titles are lists; flag the multi-title case
                    print("len(item['title'])", len(item['title']))
                cr_title = clean_title(item['title'][0])
                similarity = text_similarity_by_char(cr_title, title)
                if (len(cr_title) > LEAST_TITLE_LEN
                        and len(title) > LEAST_TITLE_LEN
                        and similarity > LEAST_TITLE_SIMILARITY):
                    print('raw_title: ', raw_title)
                    print('title', title)
                    print("cr_title", cr_title)
                    print('similarity', similarity)
                    matched_item = item
                    break
        # update doi found
        if matched_item is not None:
            print("FOUND")
            print()
            col.find_one_and_update({"_id": doc['_id']}, {
                "$set": {
                    "doi": matched_item['DOI'],
                    'tried_crossref_doi': True,
                    'crossref_raw_result': matched_item,
                    'last_updated': datetime.now(),
                }
            })
            doc_updated = True
        # mark tried even if doi is not found but searching is completed
        if not doc_updated:
            col.find_one_and_update({"_id": doc['_id']}, {
                "$set": {
                    "tried_crossref_doi": True,
                    'last_updated': datetime.now(),
                }
            })