def run(self):
    doc_ids = get_doc_ids(server=self.load_config.server,
                          src_index=self.load_config.index,
                          src_type=self.load_config.type,
                          dest_dir=self.load_config.other_files_directory(),
                          dest_file_name="INITIAL_GRANT_ALL_IRDB_IDS.json")
    doc_ids = doc_ids.keys()
    self.total_doc_count = len(doc_ids)

    data_utils = DataUtils()
    data_utils.batch_fetch_docs_for_ids(base_url=self.load_config.server,
                                        ids=doc_ids,
                                        index=self.load_config.index,
                                        type=self.load_config.type,
                                        docs_fetched=self.docs_fetched)

    self.process_grant_num_groups()

class DocProcessor(object):

    def __init__(self, server, src_index, src_type):
        self.server = server
        self.src_index = src_index
        self.src_type = src_type
        self.docs_with_issues = {}
        self.processed_docs = 0
        self.data_utils = DataUtils()

    def run(self):
        self.processed_docs = 0
        query = {"match_all": {}}
        self.data_utils.batch_fetch_ids_for_query(base_url=self.server,
                                                  index=self.src_index,
                                                  type=self.src_type,
                                                  query=query,
                                                  ids_fetched=self.ids_fetched)

    def docs_fetched(self, docs, index, type):
        for doc in docs:
            self.process_doc(doc)

    def process_doc(self, doc):
        # Override in subclasses to inspect or transform each fetched doc
        pass

    def ids_fetched(self, ids, index, type):
        self.data_utils.batch_fetch_docs_for_ids(base_url=self.server,
                                                 ids=ids,
                                                 index=self.src_index,
                                                 type=self.src_type,
                                                 docs_fetched=self.docs_fetched)


# def start():
#     doc_processor = DocProcessor(LOCAL_SERVER, 'pubmed2018_v5', 'article')
#     doc_processor.run()

def batch_fetch_docs(self, ids, index_id):
    data_utils = DataUtils()
    if index_id == ID_IRDB:
        data_utils.batch_fetch_docs_for_ids(LOCAL_SERVER, ids, INDEX, TYPE,
                                            self.docs_fetched_irdb, 1000)
    elif index_id == ID_PUBMED:
        data_utils.batch_fetch_docs_for_ids(SERVER, ids,
                                            INDEX_MAPPING[index_id]['index'],
                                            INDEX_MAPPING[index_id]['type'],
                                            self.docs_fetched_pubmed, 1000)

class ProcessIndex(object):

    def __init__(self, server, src_index, src_type, process_doc_method):
        self.server = server
        self.index = src_index
        self.type = src_type
        self.process_doc_method = process_doc_method

        self.batch_size = 5000
        self.process_count = 2
        self.process_spawn_delay = 0.15
        self.bulk_data_size = 300000

        self.data_loader_utils = DataLoaderUtils(self.server, self.index, self.type)
        self.data_utils = DataUtils()

    def run(self):
        # doc_ids = export_doc_ids(self.server, self.index,
        #                          self.type, self.index + '_' + self.type, 'doc_ids.json')
        doc_ids = file_utils.load_file(self.index, self.index + '_ids.json')
        if len(doc_ids) == 0:
            doc_ids = export_doc_ids.export_doc_ids(self.server, self.index, self.type)
            doc_ids = doc_ids.keys()

        batch_doc_processor = BatchDocProcessor(doc_ids, self.process_batch,
                                                self.batch_size,
                                                self.process_count,
                                                self.process_spawn_delay)
        batch_doc_processor.run()

    def docs_fetched(self, docs, index, type):
        docs_to_process = {}
        print 'Docs fetched', len(docs)
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                existing_doc = doc['_source']
                docs_to_process[_id] = existing_doc
        self.process_docs(docs_to_process)

    def process_docs(self, docs):
        bulk_data = ''
        for _id in docs:
            doc = docs[_id]
            processed_doc = self.process_doc_method(_id, doc)
            if processed_doc is not None:
                bulk_data += self.data_loader_utils.bulk_update_header(_id)
                bulk_data += '\n'
                updated_doc = {'doc': processed_doc}
                bulk_data += json.dumps(updated_doc)
                bulk_data += '\n'

            if len(bulk_data) >= self.bulk_data_size:
                # print 'loading bulk data...'
                self.load_bulk_data(bulk_data)
                bulk_data = ''

        if len(bulk_data) > 0:
            # print 'loading bulk data...'
            self.load_bulk_data(bulk_data)

    def load_bulk_data(self, bulk_data):
        self.data_loader_utils.load_bulk_data(bulk_data)

    def process_batch(self, doc_ids):
        self.data_utils.batch_fetch_docs_for_ids(base_url=self.server,
                                                 ids=doc_ids,
                                                 index=self.index,
                                                 type=self.type,
                                                 docs_fetched=self.docs_fetched)

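# A minimal usage sketch (LOCAL_SERVER and the index/type names below are placeholders,
# not defined in this snippet): ProcessIndex fetches docs in batches and hands each one
# to the supplied process_doc_method; any non-None return value is bulk-updated in place.
#
# def add_processed_flag(_id, doc):
#     doc['processed'] = True
#     return doc
#
# process_index = ProcessIndex(LOCAL_SERVER, 'pubmed2018_v5', 'article', add_processed_flag)
# process_index.run()
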
class CopyGrants(object):

    def __init__(self, src_server, dest_server, src_index, src_type, dst_index, dst_type, username, password):
        self.src_data_loader_utils = DataLoaderUtils(src_server, src_index, src_type)
        self.dest_data_loader_utils = DataLoaderUtils(dest_server, dst_index, dst_type)
        self.data_utils = DataUtils()
        self.username = username
        self.password = password
        file_utils.make_directory(TEMP_DIR)

    def run(self):
        self.process_batches()

    def process_batches(self):
        batch_file_names = []
        for batch_file_name in os.listdir(TEMP_DIR):
            file_path = os.path.join(TEMP_DIR, batch_file_name)
            if os.path.isfile(file_path) and batch_file_name.startswith('batch_'):
                batch_file_names.append(batch_file_name)

        print "Generated ", len(batch_file_names), 'batch file names'
        batch_file_names.sort()

        if len(batch_file_names) == 0:
            batch_file_names = self.split_to_batches()

        print len(batch_file_names)
        raw_input('Continue?')

        processed_batches = file_utils.load_file(TEMP_DIR, 'processed_pubmed2018_docs_with_grants_batches.json')

        for batch_file_name in batch_file_names:
            if batch_file_name not in processed_batches:
                print 'Loading batch', batch_file_name
                batch = file_utils.load_file(TEMP_DIR, batch_file_name)
                self.copy_docs_batch(batch)

                processed_batches[batch_file_name] = 0
                file_utils.save_file(TEMP_DIR, 'processed_pubmed2018_docs_with_grants_batches.json', processed_batches)

    def split_to_batches(self):
        server = self.src_data_loader_utils.server
        src_index = self.src_data_loader_utils.index
        src_type = self.src_data_loader_utils.type

        print 'Fetching doc ids for', src_index, src_type

        query = {
            "nested": {
                "path": "grants",
                "query": {
                    "bool": {
                        "must": [
                            {
                                "exists": {
                                    "field": "grants"
                                }
                            }
                        ]
                    }
                }
            }
        }

        all_pubmed_ids = export_doc_ids.get_doc_ids(server, src_index, src_type,
                                                    TEMP_DIR,
                                                    'pubmed2018_docs_with_grants.json',
                                                    query=query)
        # all_pubmed_ids = all_pubmed_ids.keys()
        # all_pubmed_ids.sort()

        self.total_doc_count = len(all_pubmed_ids)

        max_batch_count = 5000

        batch_file_names = []
        batch_index = 0
        batch_ids = []

        # Splitting into batches
        for _id in all_pubmed_ids:
            batch_ids.append(_id)
            if len(batch_ids) >= max_batch_count:
                print 'Writing batch:', batch_index
                batch_file_name = 'batch_' + str(batch_index) + '.json'
                batch_file_names.append(batch_file_name)
                file_utils.save_file(TEMP_DIR, batch_file_name, batch_ids)
                batch_ids = []
                batch_index += 1

        if len(batch_ids) > 0:
            print 'Writing batch:', batch_index
            batch_file_name = 'batch_' + str(batch_index) + '.json'
            batch_file_names.append(batch_file_name)
            file_utils.save_file(TEMP_DIR, batch_file_name, batch_ids)
            batch_index += 1

        return batch_file_names

    def copy_docs_batch(self, doc_ids):
        print 'Fetching docs'
        self.data_utils.batch_fetch_docs_for_ids(base_url=self.src_data_loader_utils.server,
                                                 ids=doc_ids,
                                                 index=self.src_data_loader_utils.index,
                                                 type=self.src_data_loader_utils.type,
                                                 docs_fetched=self.docs_fetched,
                                                 batch_size=500)

    def docs_fetched(self, docs, index, type):
        print 'Docs fetched', len(docs)
        docs_to_copy = {}
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                existing_doc = doc['_source']
                docs_to_copy[_id] = existing_doc
        self.copy_relations(docs_to_copy)

    def load_bulk_data(self, bulk_data):
        print 'Bulk data size', len(bulk_data), 'loading...'
        response = self.dest_data_loader_utils.load_bulk_data(bulk_data)
        if response:
            pass
            # print 'Done loading bulk data, saving response'
        else:
            print 'Bulk data load failed'

    def copy_relations(self, src_docs):
        bulk_data = ''
        count = 0

        # Copy relations
        for _id in src_docs:
            src_doc = src_docs[_id]
            doc = {}
            if 'grants' in src_doc:
                doc['grants'] = src_doc['grants']
                count += 1

            if len(doc) > 0:
                bulk_data += self.dest_data_loader_utils.bulk_update_header(_id)
                bulk_data += '\n'
                doc = {
                    'doc': doc
                }
                bulk_data += json.dumps(doc)
                bulk_data += '\n'

            # if count % 1000 == 0:
            #     print 'Processed', 1000, 'docs'

            if len(bulk_data) >= 150000:
                print _id
                self.load_bulk_data(bulk_data)
                # print 'Copied', count, 'docs'
                bulk_data = ''

        if len(bulk_data) > 0:
            self.load_bulk_data(bulk_data)

class ProcessBatch(object):

    def __init__(self, batch_docs_directory, load_config, batch_name):
        self.load_config = load_config
        # self.batch = batch
        self.batch_docs_directory = batch_docs_directory
        self.batch_name = batch_name
        self.data_utils = DataUtils()

    def run(self, batch):
        print 'Fetching docs', len(batch)
        self.data_utils.batch_fetch_docs_for_ids(base_url=self.load_config.server,
                                                 ids=batch,
                                                 index=self.load_config.index,
                                                 type=self.load_config.type,
                                                 docs_fetched=self.docs_fetched,
                                                 batch_size=500)

    def docs_fetched(self, docs, index, type):
        print 'Docs fetched', len(docs)
        docs_to_process = {}
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                existing_doc = doc['_source']
                docs_to_process[_id] = existing_doc
        self.process_docs(docs_to_process)

    def process_docs(self, docs):
        print 'Processing docs', len(docs)
        citation_errors = {}
        for _id in docs:
            # print 'Processing doc', _id
            doc = docs[_id]
            citations_from_update_history = self.get_citations_from_data(doc)
            current_citations = self.get_citations(doc)
            if len(current_citations) != len(citations_from_update_history):
                citation_errors[_id] = citations_from_update_history
                print _id, 'current citations:', len(current_citations), \
                    'citations from update history:', len(citations_from_update_history)

        file_utils.save_file(self.batch_docs_directory,
                             'citation_errors_' + self.batch_name + '.json',
                             citation_errors)

    def get_citations_from_data(self, doc):
        citations = []
        if 'PubmedData' in doc:
            if 'ReferenceList' in doc['PubmedData']:
                if 'Reference' in doc['PubmedData']['ReferenceList']:
                    reference_list = doc['PubmedData']['ReferenceList']['Reference']
                    if not isinstance(reference_list, list):
                        reference_list = [reference_list]

                    for reference in reference_list:
                        if 'ArticleIdList' in reference:
                            article_id_list = reference['ArticleIdList']
                            if 'ArticleId' in article_id_list:
                                article_ids = article_id_list['ArticleId']
                                if not isinstance(article_ids, list):
                                    article_ids = [article_ids]

                                for article_id in article_ids:
                                    if 'IdType' in article_id:
                                        article_id_type = article_id['IdType']
                                        if article_id_type == 'pubmed':
                                            pmid = article_id['content']
                                            citations.append(pmid)
        return citations

    def get_citations(self, doc):
        citations = []
        if doc is not None and 'citations' in doc:
            citations_array = doc['citations']
            for citation_item in citations_array:
                source = citation_item['source']
                index_id = citation_item['index_id']
                if source == self.load_config.source and index_id == ID_PUBMED:
                    citations = citation_item['ids']
                    break
        return citations

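# A minimal usage sketch (the directory, load_config and ids below are placeholders):
# ProcessBatch.run fetches the given ids and writes any citation-count mismatches to
# citation_errors_<batch_name>.json in batch_docs_directory.
#
# process_batch = ProcessBatch('/tmp/citation_batches', load_config, 'batch_0')
# process_batch.run(['12620793', '12620794'])
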
class CTPublicationsRelationshipProcessor(object):

    def __init__(self, ct_load_config):
        self.ct_load_config = ct_load_config
        self.pubmed_load_config = self.get_pubmed_load_config()
        self.pubmed_relations = {}
        self.ct_relations = {}
        self.processed_docs = 0
        self.data_utils = DataUtils()

    def run(self):
        # doc_ids = export_doc_ids(self.server, self.index,
        #                          self.type, self.index + '_' + self.type, 'doc_ids.json')
        doc_ids = file_utils.load_file(self.ct_load_config.index,
                                       self.ct_load_config.index + '_ids.json')
        if len(doc_ids) == 0:
            doc_ids = export_doc_ids.export_doc_ids(self.ct_load_config.server,
                                                    self.ct_load_config.index,
                                                    self.ct_load_config.type)
            doc_ids = doc_ids.keys()

        self.data_utils.batch_fetch_docs_for_ids(base_url=self.ct_load_config.server,
                                                 ids=doc_ids,
                                                 index=self.ct_load_config.index,
                                                 type=self.ct_load_config.type,
                                                 docs_fetched=self.docs_fetched)

        print 'Total pubmed relations', len(self.pubmed_relations)
        print 'Total ct relations', len(self.ct_relations)

        # Load Pubmed relations
        pubmed_ids = {}
        pubmed_ids = data_mapper.reformat(reformatted_array=pubmed_ids,
                                          relations_array=self.pubmed_relations,
                                          dest_index_id=ID_CLINICAL_TRIALS,
                                          relationship_type=RELATIONSHIP_TYPE_CITATIONS)
        print 'Reformatted pubmed ids', len(pubmed_ids)

        self.pubmed_load_config.append_relations = True
        self.pubmed_load_config.source = 'ct_publications'
        self.pubmed_load_config.data_source_name = 'ct_publications_relations'

        data_load_batcher = DataLoadBatcher(self.pubmed_load_config,
                                            self.pubmed_load_config.index,
                                            self.pubmed_load_config.type)
        data_load_batcher.load_relationships = True
        data_load_batcher.process_data_rows('pubmed_ct_citations', pubmed_ids)

        # Load Clinical trials relations
        ct_ids = {}
        ct_ids = data_mapper.reformat(reformatted_array=ct_ids,
                                      relations_array=self.ct_relations,
                                      dest_index_id=ID_PUBMED,
                                      relationship_type=RELATIONSHIP_TYPE_CITED_BYS)
        print 'Reformatted ct ids', len(ct_ids)

        self.ct_load_config.append_relations = True
        self.ct_load_config.source = 'ct_publications'
        self.ct_load_config.data_source_name = 'ct_publications_relations'

        data_load_batcher = DataLoadBatcher(self.ct_load_config,
                                            self.ct_load_config.index,
                                            self.ct_load_config.type)
        data_load_batcher.load_relationships = True
        data_load_batcher.process_data_rows('ct_pubmed_cited_bys', ct_ids)

    def get_pubmed_load_config(self):
        index_item = es_utils.get_info_for_index_id(ID_PUBMED)
        pubmed_index = index_item['index']
        pubmed_type = index_item['index_type']

        load_config = LoadConfig()
        load_config.root_directory = self.ct_load_config.root_directory
        load_config.server = self.ct_load_config.server
        load_config.index = pubmed_index
        load_config.type = pubmed_type
        load_config.data_extractor = PubmedDataExtractor()
        load_config.data_mapper = PubmedDataMapper()
        return load_config

    def docs_fetched(self, docs, index, type):
        docs_to_process = {}
        print 'Docs fetched', len(docs)
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                existing_doc = doc['_source']
                docs_to_process[_id] = existing_doc
        self.process_docs(docs_to_process)

        self.processed_docs += len(docs)
        print 'Processed docs', self.processed_docs, 'Pubmed relations', len(self.pubmed_relations)

    def process_docs(self, docs):
        for _id in docs:
            doc = docs[_id]
            self.process_doc(_id, doc)

    def process_doc(self, _id, doc):
        if 'ct_publications' in doc:
            cited_bys = []
            ct_publications = doc['ct_publications']
            for ct_publication in ct_publications:
                if 'pmid' in ct_publication:
                    pmid = ct_publication['pmid']
                    pmid = str(pmid)
                    if len(pmid) > 0:
                        cited_bys.append(pmid)

                        if pmid not in self.pubmed_relations:
                            self.pubmed_relations[pmid] = []
                        self.pubmed_relations[pmid].append(_id)

                        if _id not in self.ct_relations:
                            self.ct_relations[_id] = []
                        self.ct_relations[_id].append(pmid)
        return None

class CopyDocs(object):

    def __init__(self, src_server, dest_server, src_index, src_type, dst_index, dst_type):
        self.src_data_loader_utils = DataLoaderUtils(src_server, src_index, src_type)
        self.dest_data_loader_utils = DataLoaderUtils(dest_server, dst_index, dst_type)
        self.processed_doc_count = 0
        self.total_doc_count = 0
        self.data_utils = DataUtils()

    def get_total_doc_count(self):
        return self.data_utils.get_total_doc_count(base_url=self.src_data_loader_utils.server,
                                                   index=self.src_data_loader_utils.index,
                                                   type=self.src_data_loader_utils.type)

    def docs_fetched(self, docs, index, type):
        docs_to_copy = {}
        # print 'Docs fetched', len(docs)
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                existing_doc = doc['_source']
                docs_to_copy[_id] = existing_doc
        self.index_docs(docs_to_copy)

        self.processed_doc_count += len(docs)
        progress = ((self.processed_doc_count / float(self.total_doc_count)) * 100)
        print '---------------------------------------------------------------------------------------------'
        print 'Progress', self.processed_doc_count, '/', self.total_doc_count, progress, '%'
        print '---------------------------------------------------------------------------------------------'

    def export_doc_ids(self, server, src_index, src_type):
        print 'Fetching doc ids for', src_index, src_type
        query = {"match_all": {}}
        self.data_utils.batch_fetch_ids_for_query(base_url=server,
                                                  index=src_index,
                                                  type=src_type,
                                                  query=query,
                                                  ids_fetched=self.ids_fetched)
        # print 'Done, fetched', len(documents_ids), 'doc ids'

    def ids_fetched(self, ids, index, type):
        self.copy_docs_batch(ids)

    def create_destination_index(self, mapping=None):
        if mapping is None:
            # Get mapping from src index
            mapping = self.src_data_loader_utils.get_mapping_from_server()

        if not self.dest_data_loader_utils.index_exists():
            print 'Creating index'
            self.dest_data_loader_utils.put_mapping(mapping)
            # migrate_index(self.dest_data_loader_utils.index)
        else:
            print self.dest_data_loader_utils.index, 'exists'

    def copy_docs(self):
        self.processed_doc_count = 0
        self.total_doc_count = self.get_total_doc_count()
        print 'Total doc count', self.total_doc_count

        self.create_destination_index(mapping=None)
        self.export_doc_ids(server=self.src_data_loader_utils.server,
                            src_index=self.src_data_loader_utils.index,
                            src_type=self.src_data_loader_utils.type)

    def copy_docs_for_ids(self, doc_ids, mapping=None):
        self.processed_doc_count = 0
        self.total_doc_count = len(doc_ids)
        print 'Total doc count', self.total_doc_count

        self.create_destination_index(mapping)

        print 'Fetching docs from source index'
        batch_doc_processor = BatchDocProcessor(doc_ids, self.copy_docs_batch, 3000, 16, 0.33)
        batch_doc_processor.run()

    def copy_docs_batch(self, doc_ids):
        self.data_utils.batch_fetch_docs_for_ids(base_url=self.src_data_loader_utils.server,
                                                 ids=doc_ids,
                                                 index=self.src_data_loader_utils.index,
                                                 type=self.src_data_loader_utils.type,
                                                 docs_fetched=self.docs_fetched)

    def index_docs(self, docs_to_copy):
        bulk_data = ''
        count = 0
        for es_id in docs_to_copy:
            count += 1
            doc = docs_to_copy[es_id]

            bulk_data += self.dest_data_loader_utils.bulk_index_header(es_id)
            bulk_data += '\n'
            bulk_data += json.dumps(doc)
            bulk_data += '\n'

            # if count % 1000 == 0:
            #     print 'Processed', 1000, 'docs'

            if len(bulk_data) >= 150000:
                self.load_bulk_data(bulk_data)
                # print 'Copied', count, 'docs'
                bulk_data = ''

        if len(bulk_data) > 0:
            self.load_bulk_data(bulk_data)
            # print 'Copied', count, 'docs'

    def load_bulk_data(self, bulk_data):
        # print 'Bulk data size', len(bulk_data), 'loading...'
        response = self.dest_data_loader_utils.load_bulk_data(bulk_data)
        if response:
            pass
            # print 'Done loading bulk data, saving response'
        else:
            print 'Bulk data load failed'


# src_server = 'http://localhost:9200'
# src_index = 'irdb_v3'
# src_type = 'grant'

# dest_server = 'http://localhost:9200'
# dest_index = 'irdb_v4'
# dest_type = 'grant'

# copy_docs = CopyDocs(src_server=src_server,
#                      dest_server=dest_server,
#                      src_index=src_index,
#                      src_type=src_type,
#                      dst_index=dest_index,
#                      dst_type=dest_type)
# copy_docs.copy_docs()

# copy_relations.relations_to_exclude.append({
#     "source": "",
#     "index_id": ID_PUBMED
# })
# copy_relations.run()

class FindMissingIds(object):

    def __init__(self):
        self.missing_ids = {}
        self.new_ids = {}
        self.data_utils = DataUtils()
        self.data_loader_utils = DataLoaderUtils(SERVER, OLD_INDEX, OLD_TYPE, '', '')
        self.docs_for_dolan = {}

    def run(self):
        old_ids = export_doc_ids(server=SERVER, src_index=OLD_INDEX, src_type=OLD_TYPE)
        new_ids = export_doc_ids(server=SERVER, src_index=NEW_INDEX, src_type=NEW_TYPE)

        for _id in old_ids:
            if _id not in new_ids:
                self.missing_ids[_id] = 0
                if len(self.missing_ids) % 1000 == 0:
                    print 'Missing ids', len(self.missing_ids)

        for _id in new_ids:
            if _id not in old_ids:
                self.new_ids[_id] = 0
                if len(self.new_ids) % 1000 == 0:
                    print 'New ids', len(self.new_ids)

        print 'Missing ids', len(self.missing_ids)
        print 'New ids', len(self.new_ids)

        file_utils.make_directory(missing_ids_directory)
        file_utils.save_file(missing_ids_directory, 'missing_ids.json', self.missing_ids.keys())
        file_utils.save_file(missing_ids_directory, 'new_ids.json', self.new_ids)

    def check_tags_and_annotations(self):
        missing_ids = file_utils.load_file(missing_ids_directory, 'missing_ids.json')
        new_ids = file_utils.load_file(missing_ids_directory, 'new_ids.json')

        print 'Missing ids', len(missing_ids)
        print 'New ids', len(new_ids)

        docs_with_tags = self.fetch_ids()

        missing_docs_with_tags = []
        for _id in missing_ids:
            if _id in docs_with_tags:
                missing_docs_with_tags.append(_id)
                print 'Missing docs with tags', _id

        print 'Missing docs with tags', len(missing_docs_with_tags)
        print 'Missing docs with tags', json.dumps(missing_docs_with_tags)

        for _id in missing_docs_with_tags:
            existing_doc = self.get_existing_doc(_id)
            if 'userTags' in existing_doc:
                user_tags = existing_doc['userTags']
                for user_tag in user_tags:
                    added_by = user_tag['added_by']
                    if added_by == '*****@*****.**':
                        self.docs_for_dolan[_id] = existing_doc
                        print _id
                        print user_tags
                        break

        print 'Docs for Dolan', len(self.docs_for_dolan)
        print 'Docs for Dolan', self.docs_for_dolan.keys()

    def get_existing_doc(self, _id):
        existing_doc = self.data_loader_utils.fetch_doc(_id)
        if existing_doc is not None and '_source' in existing_doc:
            existing_doc = existing_doc['_source']
        return existing_doc

    def fetch_ids(self):
        combined_docs = {}

        tags_query = self.tags_query()
        annotations_query = self.annotations_query()

        print 'Fetching docs with tags', SERVER, OLD_INDEX, OLD_TYPE
        docs_with_tags = self.data_utils.batch_fetch_ids_for_query(base_url=SERVER,
                                                                   query=tags_query,
                                                                   index=OLD_INDEX,
                                                                   type=OLD_TYPE,
                                                                   ids_fetched=self.ids_fetched,
                                                                   batch_size=1000)
        print len(docs_with_tags), 'docs_with_tags'
        for _id in docs_with_tags:
            combined_docs[_id] = ''

        print 'Fetching docs with annotations', SERVER, OLD_INDEX, OLD_TYPE
        docs_with_annotations = self.data_utils.batch_fetch_ids_for_query(base_url=SERVER,
                                                                          query=annotations_query,
                                                                          index=OLD_INDEX,
                                                                          type=OLD_TYPE,
                                                                          ids_fetched=self.ids_fetched,
                                                                          batch_size=1000)
        print len(docs_with_annotations), 'docs_with_annotations'
        for _id in docs_with_annotations:
            combined_docs[_id] = ''

        print len(combined_docs), 'combined_docs'
        return combined_docs

    def ids_fetched(self, ids, index, type):
        print len(ids), 'ids fetched'

    def tags_query(self):
        tags_query = {
            "nested": {
                "path": "userTags",
                "query": {
                    "bool": {
                        "must": [{
                            "exists": {
                                "field": "userTags"
                            }
                        }]
                    }
                }
            }
        }
        return tags_query

    def annotations_query(self):
        annotations_query = {
            "nested": {
                "path": "annotations",
                "query": {
                    "bool": {
                        "must": [{
                            "exists": {
                                "field": "annotations"
                            }
                        }]
                    }
                }
            }
        }
        return annotations_query

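# A minimal usage sketch, assuming the module-level SERVER / OLD_INDEX / OLD_TYPE /
# NEW_INDEX / NEW_TYPE / missing_ids_directory constants are configured: run() diffs
# the two indices and writes missing_ids.json / new_ids.json, then
# check_tags_and_annotations() flags missing docs that carry user tags or annotations.
#
# find_missing_ids = FindMissingIds()
# find_missing_ids.run()
# find_missing_ids.check_tags_and_annotations()
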
def process_id(self, _id):
    grant_numbers = []
    derwent_ids = []

    if _id in self.irdb_docs:
        doc = self.irdb_docs[_id]
        if doc is not None:
            admin_phs_org_code = None
            if 'admin_phs_org_code' in doc:
                admin_phs_org_code = doc['admin_phs_org_code']

            serial_num = None
            if 'serial_num' in doc:
                serial_num = doc['serial_num']

            if admin_phs_org_code is not None and serial_num is not None:
                # Generate the grant number variants used in derwent government_support fields
                grant_number = admin_phs_org_code + '' + serial_num
                grant_numbers.append(grant_number)

                grant_number = admin_phs_org_code + '-' + serial_num
                grant_numbers.append(grant_number)

                grant_number = admin_phs_org_code + '0' + serial_num
                grant_numbers.append(grant_number)

                grant_number = admin_phs_org_code + '-0' + serial_num
                grant_numbers.append(grant_number)

                grant_number = admin_phs_org_code + ' ' + serial_num
                grant_numbers.append(grant_number)

                grant_number = admin_phs_org_code + ' 0' + serial_num
                grant_numbers.append(grant_number)

    if len(grant_numbers) > 0:
        should_query = []
        for grant_number in grant_numbers:
            match_phrase_query = {
                "match_phrase": {
                    "government_support": grant_number
                }
            }
            should_query.append(match_phrase_query)

        query = {"bool": {"should": should_query}}

        data_utils = DataUtils(self.session)
        derwent_ids = data_utils.batch_fetch_ids_for_query(base_url=SERVER,
                                                           query=query,
                                                           index=INDEX_MAPPING[ID_DERWENT_PATENTS]['index'],
                                                           type=INDEX_MAPPING[ID_DERWENT_PATENTS]['type'])

        # if len(derwent_ids) > 0:
        #     print _id, len(derwent_ids)
        #     if len(derwent_ids) < 5:
        #         print derwent_ids
        #     time.sleep(5)

    return derwent_ids

class FixCitedBys(BatchProcessor):

    def __init__(self, load_config):
        super(FixCitedBys, self).__init__(load_config, batch_doc_count=5000, multiprocess=True)
        self.load_config = load_config
        self.data_utils = DataUtils()

    def create_processed_files(self):
        processed_batches = file_utils.load_file(self.batch_docs_directory(), PROCESSED_BATCHES_FILE)
        for batch_file_name in processed_batches:
            file_utils.save_file(self.batch_docs_directory(), RESULTS_FILE_PREFIX + batch_file_name, {})

    def process_completed(self):
        pass

    def get_batch_docs_directory(self):
        return DIR

    # def get_query(self):
    #     return {
    #         "exists": {
    #             "field": "update_history"
    #         }
    #     }

    def process_docs_batch(self, batch, batch_name):
        print 'Processing docs', len(batch)
        pubmed_cited_bys_pubmed = {}
        for _id in batch:
            cited_bys = self.get_cited_bys(_id)
            if len(cited_bys) > 0:
                pubmed_cited_bys_pubmed[_id] = cited_bys
                # print _id, len(cited_bys)

        pubmed_ids = {}
        pubmed_ids = self.load_config.data_mapper.reformat(reformatted_array=pubmed_ids,
                                                           relations_array=pubmed_cited_bys_pubmed,
                                                           dest_index_id=ID_PUBMED,
                                                           relationship_type=RELATIONSHIP_TYPE_CITED_BYS,
                                                           removed_ids=[])

        print batch_name, len(pubmed_ids), 'ids to update'

        relationship_loader = RelationshipLoader(self.load_config, pubmed_ids,
                                                 self.load_config.index,
                                                 self.load_config.type,
                                                 'ds_batch_fix_cited_bys')
        relationship_loader.run()

        return {}

    def get_cited_bys(self, _id):
        """ Search elasticsearch for any docs citing the given id """
        query = {
            "bool": {
                "must": [
                    {
                        "match": {
                            "citations.ids": _id
                        }
                    },
                    {
                        "match": {
                            "citations.source": ""
                        }
                    },
                    {
                        "match": {
                            "citations.index_id": ID_PUBMED
                        }
                    }
                ]
            }
        }

        ids = self.data_utils.batch_fetch_ids_for_query(base_url=self.load_config.server,
                                                        query=query,
                                                        index=self.load_config.index,
                                                        type=self.load_config.type)
        return ids

class PubmedRelationshipProcessor(DataSourceProcessor):

    def __init__(self, load_config, data_source, data_source_summary):
        super(PubmedRelationshipProcessor, self).__init__(load_config, data_source)
        self.data_source_summary = data_source_summary
        self.data_loader_utils = DataLoaderUtils(self.load_config.server,
                                                 self.load_config.index,
                                                 self.load_config.type,
                                                 self.load_config.server_username,
                                                 self.load_config.server_password)
        self.load_relationships = True
        self.docs_with_new_citations = {}
        self.docs_citations_history = {}
        self.existing_docs = {}
        self.data_utils = DataUtils()

    def docs_fetched(self, docs, index, type):
        self.load_config.log(LOG_LEVEL_TRACE, 'Docs fetched', len(docs))
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                existing_doc = doc['_source']
                self.existing_docs[_id] = existing_doc

    def get_docs_with_new_citations(self):
        return self.docs_with_new_citations

    def get_citations_history(self):
        return self.docs_citations_history

    def update_citations_history(self, new_doc, _id, new_citations, existing_citations):
        # Update citation history
        if _id not in self.docs_citations_history:
            self.docs_citations_history[_id] = {}

        # Set the new doc flag
        self.docs_citations_history[_id]['new'] = new_doc

        # Update new citations
        if 'new_citations' not in self.docs_citations_history[_id]:
            self.docs_citations_history[_id]['new_citations'] = []
        self.docs_citations_history[_id]['new_citations'].extend(new_citations)

        # Update existing citations
        if 'existing_citations' not in self.docs_citations_history[_id]:
            self.docs_citations_history[_id]['existing_citations'] = []
        self.docs_citations_history[_id]['existing_citations'].extend(existing_citations)

    def process_relationships(self, extracted_ids):
        # all_indexed_ids = {}
        # if 'indexed_ids' in self.data_source_summary:
        #     all_indexed_ids = self.data_source_summary['indexed_ids']

        all_updated_ids = {}
        if 'updated_ids' in self.data_source_summary:
            all_updated_ids = self.data_source_summary['updated_ids']

        print 'all_updated_ids', len(all_updated_ids)
        print 'extracted_ids', len(extracted_ids)

        # Fetch existing (updated) docs
        self.load_config.log(LOG_LEVEL_DEBUG, 'Fetching docs', self.load_config.server,
                             self.load_config.index, self.load_config.type)
        ids_to_fetch = all_updated_ids.keys()
        self.data_utils.batch_fetch_docs_for_ids(self.load_config.server,
                                                 ids_to_fetch,
                                                 self.load_config.index,
                                                 self.load_config.type,
                                                 self.docs_fetched,
                                                 self.load_config.doc_fetch_batch_size,
                                                 self.load_config.server_username,
                                                 self.load_config.server_password)

        print 'existing_docs', len(self.existing_docs)

        pubmed_citations_pubmed = {}
        pubmed_cited_bys_pubmed = {}
        citations_to_remove = {}
        cited_bys_to_remove = {}

        count = 0
        for _id in extracted_ids:
            count += 1
            data = extracted_ids[_id]
            if len(data) == 0:
                print 'No data for', _id

            new_doc = False
            existing_citations = []
            new_citations = self.load_config.data_mapper.get_citations(data)

            if _id in all_updated_ids:
                # Existing doc
                existing_doc = self.get_existing_doc(_id)
                existing_citations = self.get_citations(existing_doc)
                new_doc = False
            else:
                new_doc = True

            self.update_citations_history(new_doc, _id, new_citations, existing_citations)

            added_citations = []
            removed_citations = []

            # Get removed citations
            for existing_citation in existing_citations:
                if existing_citation not in new_citations:
                    removed_citations.append(existing_citation)

            # Get added citations
            for new_citation in new_citations:
                if new_citation not in existing_citations:
                    added_citations.append(new_citation)

            # Added citations and cited bys
            for citation in added_citations:
                # Citations
                if _id not in pubmed_citations_pubmed:
                    pubmed_citations_pubmed[_id] = []
                if citation not in pubmed_citations_pubmed[_id]:
                    pubmed_citations_pubmed[_id].append(citation)

                # Cited by
                if citation not in pubmed_cited_bys_pubmed:
                    pubmed_cited_bys_pubmed[citation] = []
                if _id not in pubmed_cited_bys_pubmed[citation]:
                    pubmed_cited_bys_pubmed[citation].append(_id)

            # Get existing cited bys (citations from other existing docs) for the new doc
            # if new_doc:
            #     existing_cited_bys = self.get_existing_cited_bys(_id)
            #     for cited_by in existing_cited_bys:
            #         if _id not in pubmed_cited_bys_pubmed:
            #             pubmed_cited_bys_pubmed[_id] = []
            #         if cited_by not in pubmed_cited_bys_pubmed[_id]:
            #             pubmed_cited_bys_pubmed[_id].append(cited_by)

            # Removed citations and cited bys
            for removed_citation in removed_citations:
                # Removed citations
                if _id not in citations_to_remove:
                    citations_to_remove[_id] = []
                if removed_citation not in citations_to_remove[_id]:
                    citations_to_remove[_id].append(removed_citation)

                # Removed cited_bys
                if removed_citation not in cited_bys_to_remove:
                    cited_bys_to_remove[removed_citation] = []
                if _id not in cited_bys_to_remove[removed_citation]:
                    cited_bys_to_remove[removed_citation].append(_id)

            # Docs with new citations
            if len(added_citations) > 0:
                if _id not in self.docs_with_new_citations:
                    self.docs_with_new_citations[_id] = []
                self.docs_with_new_citations[_id].extend(added_citations)

            if count % 1000 == 0:
                print 'Processed', count, 'docs'

        pubmed_ids = {}
        pubmed_ids = self.load_config.data_mapper.reformat(reformatted_array=pubmed_ids,
                                                           relations_array=pubmed_citations_pubmed,
                                                           dest_index_id=ID_PUBMED,
                                                           relationship_type=RELATIONSHIP_TYPE_CITATIONS,
                                                           removed_ids=citations_to_remove)

        pubmed_ids = self.load_config.data_mapper.reformat(reformatted_array=pubmed_ids,
                                                           relations_array=pubmed_cited_bys_pubmed,
                                                           dest_index_id=ID_PUBMED,
                                                           relationship_type=RELATIONSHIP_TYPE_CITED_BYS,
                                                           removed_ids=cited_bys_to_remove)

        print 'pubmed_citations_pubmed', len(pubmed_citations_pubmed)
        print 'pubmed_cited_bys_pubmed', len(pubmed_cited_bys_pubmed)
        print 'citations_to_remove', len(citations_to_remove)
        print 'cited_bys_to_remove', len(cited_bys_to_remove)
        print 'reformatted pubmed_ids', len(pubmed_ids)

        relationships = dict()
        relationships[ID_PUBMED] = pubmed_ids
        return relationships

    # def get_cited_bys_for_doc(self, _id):
    #     doc = self.fetch_existing_doc(_id)
    #     return self.get_cited_bys(doc)

    # Fetch existing doc from elasticsearch
    def fetch_existing_doc(self, _id):
        existing_doc = self.data_loader_utils.fetch_doc(_id)
        if existing_doc is not None and '_source' in existing_doc:
            existing_doc = existing_doc['_source']
        return existing_doc

    def get_existing_doc(self, _id):
        existing_doc = None
        if _id in self.existing_docs:
            existing_doc = self.existing_docs[_id]

        # Retry two times if not obtained in mget
        if existing_doc is None or len(existing_doc) == 0:
            existing_doc = self.fetch_existing_doc(_id)
        if existing_doc is None or len(existing_doc) == 0:
            existing_doc = self.fetch_existing_doc(_id)

        return existing_doc

    def get_cited_bys(self, doc):
        cited_bys = []
        if doc is not None and 'cited_bys' in doc:
            cited_bys_array = doc['cited_bys']
            for cited_by_item in cited_bys_array:
                source = cited_by_item['source']
                index_id = cited_by_item['index_id']
                if source == self.load_config.source and index_id == ID_PUBMED:
                    cited_bys = cited_by_item['ids']
                    break
        return cited_bys

    # Get citations from doc
    def get_citations(self, doc):
        citations = []
        if doc is not None and 'citations' in doc:
            citations_array = doc['citations']
            for citation_item in citations_array:
                source = citation_item['source']
                index_id = citation_item['index_id']
                if source == self.load_config.source and index_id == ID_PUBMED:
                    citations = citation_item['ids']
                    break
        return citations

    def has_multiple_citations(self, doc):
        if 'citations' in doc:
            citations_array = doc['citations']
            if len(citations_array) > 1:
                return True
        return False

    def get_existing_cited_bys(self, _id):
        """ Search elasticsearch for any docs citing the given id """
        query = {
            "bool": {
                "must": [{
                    "match": {
                        "citations.ids": _id
                    }
                }, {
                    "match": {
                        "citations.source": ""
                    }
                }, {
                    "match": {
                        "citations.index_id": ID_PUBMED
                    }
                }]
            }
        }

        ids = self.data_utils.batch_fetch_ids_for_query(base_url=self.load_config.server,
                                                        query=query,
                                                        index=self.load_config.index,
                                                        type=self.load_config.type)
        return ids

    def update_doc(self, _id, existing_doc, original_citations, removed_citations, added_citations):
        if len(removed_citations) > 0 or len(added_citations) > 0:
            print 'Updating doc:', _id, 'original_citations', len(original_citations), \
                'removed_citations', len(removed_citations), \
                'added_citations', len(added_citations)

            now = datetime.datetime.now()
            updated_date = now.isoformat()
            update_file = os.path.basename(self.data_source.data_source_file_path)

            # Create the update history item
            update_history_item = {
                "updated_date": updated_date,
                "update_file": update_file,
                "removed_citations": removed_citations,
                "added_citations": added_citations
            }

            # Get the existing update history
            update_history = []
            if 'update_history' in existing_doc:
                update_history = existing_doc['update_history']

            # Add the original citations list if not present
            if len(update_history) == 0:
                update_history.append({"original_citations": original_citations})

            # Add the new update history item
            update_history.append(update_history_item)

            doc = {"update_history": update_history}
            doc = {'doc': doc}

            self.data_loader_utils.update_doc(_id, doc)

class CopyRelationships(object):

    def __init__(self, src_server, dest_server, src_index, src_type, dst_index, dst_type, username, password):
        self.src_data_loader_utils = DataLoaderUtils(src_server, src_index, src_type)
        self.dest_data_loader_utils = DataLoaderUtils(dest_server, dst_index, dst_type)
        self.processed_doc_count = 0
        self.total_doc_count = 0
        self.data_utils = DataUtils()
        self.relations_to_exclude = []
        self.missing_destination_ids = []
        self.username = username
        self.password = password
        self.last_time_stamp = 0
        self.diff_average = 0

    def run(self):
        self.processed_doc_count = 0
        self.total_doc_count = self.get_total_doc_count()
        print 'Total doc count', self.total_doc_count

        # self.create_destination_index(mapping=None)
        self.export_doc_ids(server=self.src_data_loader_utils.server,
                            src_index=self.src_data_loader_utils.index,
                            src_type=self.src_data_loader_utils.type)

        print 'saving missing docs'
        file_utils.save_file('/data/data_loading/pubmed_2019',
                             'missing_docs_pubmed2019.json',
                             self.missing_destination_ids)

    def run_for_ids(self, doc_ids, mapping=None):
        self.processed_doc_count = 0
        self.total_doc_count = len(doc_ids)
        print 'Total doc count', self.total_doc_count

        print 'Fetching docs from source index'
        batch_doc_processor = BatchDocProcessor(doc_ids, self.copy_docs_batch, 1000, 1, 0)
        batch_doc_processor.run()

        file_utils.save_file('/data/data_loading/pubmed_2019',
                             'missing_docs_pubmed2019.json',
                             self.missing_destination_ids)

    def export_doc_ids(self, server, src_index, src_type):
        print 'Fetching doc ids for', src_index, src_type
        query = {"match_all": {}}
        self.data_utils.batch_fetch_ids_for_query(base_url=server,
                                                  index=src_index,
                                                  type=src_type,
                                                  query=query,
                                                  ids_fetched=self.ids_fetched,
                                                  batch_size=10000)
        # print 'Done, fetched', len(documents_ids), 'doc ids'

    def ids_fetched(self, ids, index, type):
        print 'Ids fetched', len(ids)
        self.copy_docs_batch(ids)

    def copy_docs_batch(self, doc_ids):
        print 'Fetching docs'
        self.data_utils.batch_fetch_docs_for_ids(base_url=self.src_data_loader_utils.server,
                                                 ids=doc_ids,
                                                 index=self.src_data_loader_utils.index,
                                                 type=self.src_data_loader_utils.type,
                                                 docs_fetched=self.docs_fetched,
                                                 batch_size=500)

    def docs_fetched(self, docs, index, type):
        print 'Docs fetched', len(docs)
        docs_to_copy = {}
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                existing_doc = doc['_source']
                docs_to_copy[_id] = existing_doc
        self.copy_relations(docs_to_copy)

        # Update progress
        self.processed_doc_count += len(docs)
        progress = ((self.processed_doc_count / float(self.total_doc_count)) * 100)

        current_time_stamp = time.time()
        diff = current_time_stamp - self.last_time_stamp
        self.diff_average = float(diff + self.diff_average) / 2
        time_remaining = diff * (float(self.total_doc_count) / len(docs))
        self.last_time_stamp = current_time_stamp

        print '---------------------------------------------------------------------------------------------'
        print 'Progress', self.processed_doc_count, '/', self.total_doc_count, progress, '%', time_remaining, 'secs'
        print '---------------------------------------------------------------------------------------------'

    def get_src_relations(self, src_doc, relationship_type):
        src_relations = []
        if relationship_type in src_doc:
            relations = src_doc[relationship_type]
            for relation_item in relations:
                exclude_relation_item = False
                for relation_to_exclude in self.relations_to_exclude:
                    if relation_to_exclude['source'] == relation_item['source'] \
                            and relation_to_exclude['index_id'] == relation_item['index_id']:
                        exclude_relation_item = True
                        break

                if not exclude_relation_item:
                    src_relations.append(relation_item)

        return src_relations

    def get_dest_relations(self, dest_doc, relationship_type):
        dest_relations = []
        if relationship_type in dest_doc:
            dest_relations = dest_doc[relationship_type]
        return dest_relations

    def add_relations(self, append_ids, relation, relations_list):
        relation_found = False
        for existing_relation in relations_list:
            # print existing_relation['source'], relation['source'], existing_relation['index_id'], relation['index_id']
            if existing_relation['source'] == relation['source'] \
                    and existing_relation['index_id'] == relation['index_id']:
                existing_relation_ids = existing_relation['ids']
                if append_ids:
                    relation_ids = relation['ids']
                    for _id in relation_ids:
                        if _id not in existing_relation_ids:
                            existing_relation_ids.append(_id)
                    existing_relation['ids'] = existing_relation_ids
                relation_found = True
                break

        if not relation_found:
            relations_list.append(relation)

        return relations_list

    def merge_relations(self, src_doc, dest_doc, relationship_type):
        dest_relations = self.get_dest_relations(dest_doc, relationship_type)
        src_relations = self.get_src_relations(src_doc, relationship_type)

        # print 'src_relations', len(src_relations)
        # print 'dest_relations', len(dest_relations)

        combined_relations = []
        for relation in dest_relations:
            combined_relations = self.add_relations(True, relation, combined_relations)
        for relation in src_relations:
            combined_relations = self.add_relations(True, relation, combined_relations)

        return combined_relations

    def copy_relations(self, src_docs):
        bulk_data = ''
        count = 0

        # Fetch destination docs
        destination_ids = src_docs.keys()
        destination_docs_array = self.data_utils.fetch_docs_for_ids(base_url=self.dest_data_loader_utils.server,
                                                                    ids=destination_ids,
                                                                    index=self.dest_data_loader_utils.index,
                                                                    type=self.dest_data_loader_utils.type,
                                                                    username=self.username,
                                                                    password=self.password)

        # Create destination doc dict
        destination_docs = {}
        for doc in destination_docs_array:
            _id = doc['_id']
            if '_source' in doc:
                destination_docs[_id] = doc['_source']

        # Find missing destination docs
        for _id in destination_ids:
            if _id not in destination_docs:
                self.missing_destination_ids.append(_id)

        print 'Missing ids', len(self.missing_destination_ids)

        # Copy relations
        for _id in destination_docs:
            dest_doc = destination_docs[_id]
            src_doc = src_docs[_id]

            dest_relations = {}
            dest_relations[RELATIONSHIP_TYPE_CITATIONS] = self.merge_relations(src_doc, dest_doc, RELATIONSHIP_TYPE_CITATIONS)
            dest_relations[RELATIONSHIP_TYPE_CITED_BYS] = self.merge_relations(src_doc, dest_doc, RELATIONSHIP_TYPE_CITED_BYS)
            dest_relations[RELATIONSHIP_TYPE_RELATIONS] = self.merge_relations(src_doc, dest_doc, RELATIONSHIP_TYPE_RELATIONS)

            doc = {}
            if len(dest_relations[RELATIONSHIP_TYPE_CITATIONS]) > 0:
                doc[RELATIONSHIP_TYPE_CITATIONS] = dest_relations[RELATIONSHIP_TYPE_CITATIONS]
            if len(dest_relations[RELATIONSHIP_TYPE_CITED_BYS]) > 0:
                doc[RELATIONSHIP_TYPE_CITED_BYS] = dest_relations[RELATIONSHIP_TYPE_CITED_BYS]
            if len(dest_relations[RELATIONSHIP_TYPE_RELATIONS]) > 0:
                doc[RELATIONSHIP_TYPE_RELATIONS] = dest_relations[RELATIONSHIP_TYPE_RELATIONS]

            # if len(dest_relations[RELATIONSHIP_TYPE_CITATIONS]) >= 2:
            #     print _id

            count += 1

            bulk_data += self.dest_data_loader_utils.bulk_update_header(_id)
            bulk_data += '\n'
            doc = {'doc': doc}
            bulk_data += json.dumps(doc)
            bulk_data += '\n'

            # if count % 1000 == 0:
            #     print 'Processed', 1000, 'docs'

            if len(bulk_data) >= 150000:
                print _id
                self.load_bulk_data(bulk_data)
                # print 'Copied', count, 'docs'
                bulk_data = ''

        if len(bulk_data) > 0:
            self.load_bulk_data(bulk_data)
            # print 'Copied', count, 'docs'

    # def create_destination_index(self, mapping=None):
    #     if mapping is None:
    #         # Get mapping from src index
    #         mapping = self.src_data_loader_utils.get_mapping_from_server()
    #
    #     if not self.dest_data_loader_utils.index_exists():
    #         print 'Creating index'
    #         self.dest_data_loader_utils.put_mapping(mapping)
    #         # migrate_index(self.dest_data_loader_utils.index)
    #     else:
    #         print self.dest_data_loader_utils.index, 'exists'

    def load_bulk_data(self, bulk_data):
        print 'Bulk data size', len(bulk_data), 'loading...'
        response = self.dest_data_loader_utils.load_bulk_data(bulk_data)
        if response:
            pass
            # print 'Done loading bulk data, saving response'
        else:
            print 'Bulk data load failed'

    def get_total_doc_count(self):
        return self.data_utils.get_total_doc_count(base_url=self.src_data_loader_utils.server,
                                                   index=self.src_data_loader_utils.index,
                                                   type=self.src_data_loader_utils.type)


# src_server = 'http://localhost:9200'
# src_index = 'pubmed2018_v5'
# src_type = 'article'

# dest_server = 'http://localhost:9200'
# dest_index = 'pubmed2019'
# dest_type = 'article'

# copy_relations = CopyRelationships(src_server=src_server,
#                                    dest_server=dest_server,
#                                    src_index=src_index,
#                                    src_type=src_type,
#                                    dst_index=dest_index,
#                                    dst_type=dest_type,
#                                    username='',
#                                    password='')

# copy_relations.relations_to_exclude.append({
#     "source": "",
#     "index_id": ID_PUBMED
# })

# copy_relations.run()
# copy_relations.run_for_ids([12620793])