def duplicate_index(server, src_index, src_type, dst_index, dst_type, mapping=None):
    src_data_loader_utils = DataLoaderUtils(server, src_index, src_type)
    dest_data_loader_utils = DataLoaderUtils(server, dst_index, dst_type)

    if mapping is None:
        # Get mapping from src index
        mapping = src_data_loader_utils.get_mapping_from_server()

    if not dest_data_loader_utils.index_exists():
        print 'Creating index'
        dest_data_loader_utils.put_mapping(mapping)
    else:
        print dst_index, 'exists'

    data = {
        "source": {
            "index": src_index
        },
        "dest": {
            "index": dst_index
        }
    }

    url = server + '/_reindex?wait_for_completion=false'
    print url
    print data

    response = requests.post(url, json=data)
    print response
    print json.loads(response.text)
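# Usage sketch (assumption: a reachable Elasticsearch server and placeholder
# index/type names; the reindex runs asynchronously since wait_for_completion=false):
# duplicate_index('http://localhost:9200', 'pubmed2018', 'article',
#                 'pubmed2018_copy', 'article')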
def check_and_create_index(self):
    data_loader_utils = DataLoaderUtils(self.server, self.index, self.type,
                                        self.server_username, self.server_password)
    mapping_file_path = self.mapping_file_path()
    print 'Checking index...', self.index, self.type
    if not data_loader_utils.index_exists() and mapping_file_path is not None:
        mapping = data_loader_utils.load_mapping_from_file(mapping_file_path)
        data_loader_utils.create_index_from_mapping(mapping)
class AddInitialGrantFlag(object):

    def __init__(self, load_config):
        self.load_config = load_config
        self.grant_num_groups = {}
        self.data_loader_utils = DataLoaderUtils(SERVER, INDEX, TYPE)
        self.bulk_data_size = 300000
        self.docs_processed = 0
        self.total_doc_count = 0

    def process_doc(self, _id, doc):
        if 'grant_num' in doc:
            grant_num = doc['grant_num']
            grant_num_comps = grant_num.split('-')
            if grant_num_comps[0] not in self.grant_num_groups:
                self.grant_num_groups[grant_num_comps[0]] = {}

            fy = None
            if 'fy' in doc:
                fy = doc['fy']
                if len(fy) > 0:
                    fy = int(fy)
                else:
                    fy = None

            if fy is not None:
                self.grant_num_groups[grant_num_comps[0]][_id] = {
                    'id': _id,
                    'fy': fy,
                    'grant_num': grant_num
                }

        updated_doc = {}
        updated_doc['initial_grant'] = False
        return updated_doc

    def process_grant_num_groups(self):
        print 'Processing', len(self.grant_num_groups), 'grant_num groups'

        bulk_data = ''
        total_grant_num_groups = len(self.grant_num_groups)
        count = 0
        for grant_num in self.grant_num_groups:
            count += 1
            progress = ((count / float(total_grant_num_groups)) * 100)
            print 'Pass 2: progress', count, '/', total_grant_num_groups, progress, '%'

            grant_num_group = self.grant_num_groups[grant_num]

            # Find doc with lowest fy
            lowest_item = None
            for _id in grant_num_group:
                fy_data = grant_num_group[_id]
                if lowest_item is None:
                    lowest_item = fy_data
                else:
                    fy = fy_data['fy']
                    lowest_item_fy = lowest_item['fy']
                    if fy < lowest_item_fy:
                        lowest_item = fy_data

            if lowest_item is not None:
                _id = lowest_item['id']
                doc = {}
                doc['initial_grant'] = True

                bulk_data += self.data_loader_utils.bulk_update_header(_id)
                bulk_data += '\n'
                doc = {'doc': doc}
                bulk_data += json.dumps(doc)
                bulk_data += '\n'

            if len(bulk_data) >= self.bulk_data_size:
                self.load_bulk_data(bulk_data)
                bulk_data = ''

        if len(bulk_data) > 0:
            self.load_bulk_data(bulk_data)

    def load_bulk_data(self, bulk_data):
        self.data_loader_utils.load_bulk_data(bulk_data)
        # pass

    def run(self):
        doc_ids = get_doc_ids(
            server=self.load_config.server,
            src_index=self.load_config.index,
            src_type=self.load_config.type,
            dest_dir=self.load_config.other_files_directory(),
            dest_file_name="INITIAL_GRANT_ALL_IRDB_IDS.json")
        doc_ids = doc_ids.keys()

        self.total_doc_count = len(doc_ids)

        data_utils = DataUtils()
        data_utils.batch_fetch_docs_for_ids(base_url=self.load_config.server,
                                            ids=doc_ids,
                                            index=self.load_config.index,
                                            type=self.load_config.type,
                                            docs_fetched=self.docs_fetched)

        self.process_grant_num_groups()

    def docs_fetched(self, docs, index, type):
        docs_to_process = {}
        self.docs_processed += len(docs)

        progress = ((self.docs_processed / float(self.total_doc_count)) * 100)
        print 'Pass 1: progress', self.docs_processed, '/', self.total_doc_count, progress, '%'
        # print 'Docs fetched', len(docs)
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                existing_doc = doc['_source']
                docs_to_process[_id] = existing_doc

        self.process_docs(docs_to_process)
        # print 'Processed docs', self.processed_docs, 'Pubmed relations', len(self.pubmed_relations)

    def process_docs(self, docs):
        bulk_data = ''
        for _id in docs:
            doc = docs[_id]
            processed_doc = self.process_doc(_id, doc)
            if processed_doc is not None:
                bulk_data += self.data_loader_utils.bulk_update_header(_id)
                bulk_data += '\n'
                updated_doc = {'doc': processed_doc}
                bulk_data += json.dumps(updated_doc)
                bulk_data += '\n'

            if len(bulk_data) >= self.bulk_data_size:
                # print 'loading bulk data...'
                self.load_bulk_data(bulk_data)
                bulk_data = ''

        if len(bulk_data) > 0:
            # print 'loading bulk data...'
            self.load_bulk_data(bulk_data)
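# Usage sketch (assumption: load_config is the IRDB load config built elsewhere,
# and the module-level SERVER / INDEX / TYPE constants point at the grants index):
# add_initial_grant_flag = AddInitialGrantFlag(load_config)
# add_initial_grant_flag.run()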
class CopyGrants(object):

    def __init__(self, src_server, dest_server, src_index, src_type, dst_index, dst_type, username, password):
        self.src_data_loader_utils = DataLoaderUtils(src_server, src_index, src_type)
        self.dest_data_loader_utils = DataLoaderUtils(dest_server, dst_index, dst_type)
        self.data_utils = DataUtils()
        self.username = username
        self.password = password
        file_utils.make_directory(TEMP_DIR)

    def run(self):
        self.process_batches()

    def process_batches(self):
        batch_file_names = []
        for batch_file_name in os.listdir(TEMP_DIR):
            file_path = os.path.join(TEMP_DIR, batch_file_name)
            if os.path.isfile(file_path) and batch_file_name.startswith('batch_'):
                batch_file_names.append(batch_file_name)

        print "Generated ", len(batch_file_names), 'batch file names'
        batch_file_names.sort()

        if len(batch_file_names) == 0:
            batch_file_names = self.split_to_batches()

        print len(batch_file_names)
        raw_input('Continue?')

        processed_batches = file_utils.load_file(TEMP_DIR, 'processed_pubmed2018_docs_with_grants_batches.json')

        for batch_file_name in batch_file_names:
            if batch_file_name not in processed_batches:
                print 'Loading batch', batch_file_name
                batch = file_utils.load_file(TEMP_DIR, batch_file_name)
                self.copy_docs_batch(batch)

                processed_batches[batch_file_name] = 0
                file_utils.save_file(TEMP_DIR, 'processed_pubmed2018_docs_with_grants_batches.json', processed_batches)

    def split_to_batches(self):
        server = self.src_data_loader_utils.server
        src_index = self.src_data_loader_utils.index
        src_type = self.src_data_loader_utils.type

        print 'Fetching doc ids for', src_index, src_type

        query = {
            "nested": {
                "path": "grants",
                "query": {
                    "bool": {
                        "must": [
                            {
                                "exists": {
                                    "field": "grants"
                                }
                            }
                        ]
                    }
                }
            }
        }

        all_pubmed_ids = export_doc_ids.get_doc_ids(server, src_index, src_type,
                                                    TEMP_DIR, 'pubmed2018_docs_with_grants.json',
                                                    query=query)
        # all_pubmed_ids = all_pubmed_ids.keys()
        # all_pubmed_ids.sort()

        self.total_doc_count = len(all_pubmed_ids)

        max_batch_count = 5000

        batch_file_names = []
        batch_index = 0
        batch_ids = []

        # Splitting into batches
        for _id in all_pubmed_ids:
            batch_ids.append(_id)

            if len(batch_ids) >= max_batch_count:
                print 'Writing batch:', batch_index
                batch_file_name = 'batch_' + str(batch_index) + '.json'
                batch_file_names.append(batch_file_name)
                file_utils.save_file(TEMP_DIR, batch_file_name, batch_ids)
                batch_ids = []
                batch_index += 1

        if len(batch_ids) > 0:
            print 'Writing batch:', batch_index
            batch_file_name = 'batch_' + str(batch_index) + '.json'
            batch_file_names.append(batch_file_name)
            file_utils.save_file(TEMP_DIR, batch_file_name, batch_ids)
            batch_index += 1

        return batch_file_names

    def copy_docs_batch(self, doc_ids):
        print 'Fetching docs'
        self.data_utils.batch_fetch_docs_for_ids(base_url=self.src_data_loader_utils.server,
                                                 ids=doc_ids,
                                                 index=self.src_data_loader_utils.index,
                                                 type=self.src_data_loader_utils.type,
                                                 docs_fetched=self.docs_fetched,
                                                 batch_size=500)

    def docs_fetched(self, docs, index, type):
        print 'Docs fetched', len(docs)
        docs_to_copy = {}
        # print 'Docs fetched', len(docs)
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                existing_doc = doc['_source']
                docs_to_copy[_id] = existing_doc

        self.copy_relations(docs_to_copy)

    def load_bulk_data(self, bulk_data):
        print 'Bulk data size', len(bulk_data), 'loading...'
        response = self.dest_data_loader_utils.load_bulk_data(bulk_data)
        if response:
            pass
            # print 'Done loading bulk data, saving response'
        else:
            print 'Bulk data load failed'

    def copy_relations(self, src_docs):
        bulk_data = ''
        count = 0

        # Copy relations
        for _id in src_docs:
            src_doc = src_docs[_id]

            doc = {}
            if 'grants' in src_doc:
                doc['grants'] = src_doc['grants']

            count += 1

            if len(doc) > 0:
                bulk_data += self.dest_data_loader_utils.bulk_update_header(_id)
                bulk_data += '\n'
                doc = {
                    'doc': doc
                }
                bulk_data += json.dumps(doc)
                bulk_data += '\n'

            # if count % 1000 == 0:
            #     print 'Processed', 1000, 'docs'

            if len(bulk_data) >= 150000:
                print _id
                self.load_bulk_data(bulk_data)
                # print 'Copied', count, 'docs'
                bulk_data = ''

        if len(bulk_data) > 0:
            self.load_bulk_data(bulk_data)
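# Usage sketch (assumption: placeholder server and index names; TEMP_DIR and the
# batch files it holds are created on the first run by split_to_batches):
# copy_grants = CopyGrants(src_server='http://localhost:9200',
#                          dest_server='http://localhost:9200',
#                          src_index='pubmed2018', src_type='article',
#                          dst_index='pubmed2019', dst_type='article',
#                          username='', password='')
# copy_grants.run()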
def create_index():
    data_loader_utils = DataLoaderUtils(SERVER, INDEX, TYPE)
    data_loader_utils.check_and_create_index('data_load/clinical_trials/mapping.json')
class CopyDocs(object):

    def __init__(self, src_server, dest_server, src_index, src_type, dst_index, dst_type):
        self.src_data_loader_utils = DataLoaderUtils(src_server, src_index, src_type)
        self.dest_data_loader_utils = DataLoaderUtils(dest_server, dst_index, dst_type)

        self.processed_doc_count = 0
        self.total_doc_count = 0

        self.data_utils = DataUtils()

    def get_total_doc_count(self):
        return self.data_utils.get_total_doc_count(
            base_url=self.src_data_loader_utils.server,
            index=self.src_data_loader_utils.index,
            type=self.src_data_loader_utils.type)

    def docs_fetched(self, docs, index, type):
        docs_to_copy = {}
        # print 'Docs fetched', len(docs)
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                existing_doc = doc['_source']
                docs_to_copy[_id] = existing_doc

        self.index_docs(docs_to_copy)

        self.processed_doc_count += len(docs)
        progress = ((self.processed_doc_count / float(self.total_doc_count)) * 100)
        print '---------------------------------------------------------------------------------------------'
        print 'Progress', self.processed_doc_count, '/', self.total_doc_count, progress, '%'
        print '---------------------------------------------------------------------------------------------'

    def export_doc_ids(self, server, src_index, src_type):
        print 'Fetching doc ids for', src_index, src_type
        query = {"match_all": {}}
        self.data_utils.batch_fetch_ids_for_query(base_url=server,
                                                  index=src_index,
                                                  type=src_type,
                                                  query=query,
                                                  ids_fetched=self.ids_fetched)
        # print 'Done, fetched', len(documents_ids), 'doc ids'

    def ids_fetched(self, ids, index, type):
        self.copy_docs_batch(ids)

    def create_destination_index(self, mapping=None):
        if mapping is None:
            # Get mapping from src index
            mapping = self.src_data_loader_utils.get_mapping_from_server()

        if not self.dest_data_loader_utils.index_exists():
            print 'Creating index'
            self.dest_data_loader_utils.put_mapping(mapping)
            # migrate_index(self.dest_data_loader_utils.index)
        else:
            print self.dest_data_loader_utils.index, 'exists'

    def copy_docs(self):
        self.processed_doc_count = 0
        self.total_doc_count = self.get_total_doc_count()
        print 'Total doc count', self.total_doc_count

        self.create_destination_index(mapping=None)

        self.export_doc_ids(server=self.src_data_loader_utils.server,
                            src_index=self.src_data_loader_utils.index,
                            src_type=self.src_data_loader_utils.type)

    def copy_docs_for_ids(self, doc_ids, mapping=None):
        self.processed_doc_count = 0
        self.total_doc_count = len(doc_ids)
        print 'Total doc count', self.total_doc_count

        self.create_destination_index(mapping)

        print 'Fetching docs from source index'
        batch_doc_processor = BatchDocProcessor(doc_ids, self.copy_docs_batch, 3000, 16, 0.33)
        batch_doc_processor.run()

    def copy_docs_batch(self, doc_ids):
        self.data_utils.batch_fetch_docs_for_ids(
            base_url=self.src_data_loader_utils.server,
            ids=doc_ids,
            index=self.src_data_loader_utils.index,
            type=self.src_data_loader_utils.type,
            docs_fetched=self.docs_fetched)

    def index_docs(self, docs_to_copy):
        bulk_data = ''
        count = 0
        for es_id in docs_to_copy:
            count += 1
            doc = docs_to_copy[es_id]

            bulk_data += self.dest_data_loader_utils.bulk_index_header(es_id)
            bulk_data += '\n'
            bulk_data += json.dumps(doc)
            bulk_data += '\n'

            # if count % 1000 == 0:
            #     print 'Processed', 1000, 'docs'

            if len(bulk_data) >= 150000:
                self.load_bulk_data(bulk_data)
                # print 'Copied', count, 'docs'
                bulk_data = ''

        if len(bulk_data) > 0:
            self.load_bulk_data(bulk_data)
            # print 'Copied', count, 'docs'

    def load_bulk_data(self, bulk_data):
        # print 'Bulk data size', len(bulk_data), 'loading...'
        response = self.dest_data_loader_utils.load_bulk_data(bulk_data)
        if response:
            pass
            # print 'Done loading bulk data, saving response'
        else:
            print 'Bulk data load failed'


# src_server = 'http://localhost:9200'
# src_index = 'irdb_v3'
# src_type = 'grant'

# dest_server = 'http://localhost:9200'
# dest_index = 'irdb_v4'
# dest_type = 'grant'

# copy_docs = CopyDocs(src_server=src_server,
#                      dest_server=dest_server,
#                      src_index=src_index,
#                      src_type=src_type,
#                      dst_index=dest_index,
#                      dst_type=dest_type)
# copy_docs.copy_docs()
class FindMissingIds(object):

    def __init__(self):
        self.missing_ids = {}
        self.new_ids = {}
        self.data_utils = DataUtils()
        self.data_loader_utils = DataLoaderUtils(SERVER, OLD_INDEX, OLD_TYPE, '', '')
        self.docs_for_dolan = {}

    def run(self):
        old_ids = export_doc_ids(server=SERVER, src_index=OLD_INDEX, src_type=OLD_TYPE)
        new_ids = export_doc_ids(server=SERVER, src_index=NEW_INDEX, src_type=NEW_TYPE)

        for _id in old_ids:
            if _id not in new_ids:
                self.missing_ids[_id] = 0
                if len(self.missing_ids) % 1000 == 0:
                    print 'Missing ids', len(self.missing_ids)

        for _id in new_ids:
            if _id not in old_ids:
                self.new_ids[_id] = 0
                if len(self.new_ids) % 1000 == 0:
                    print 'New ids', len(self.new_ids)

        print 'Missing ids', len(self.missing_ids)
        print 'New ids', len(self.new_ids)

        file_utils.make_directory(missing_ids_directory)
        file_utils.save_file(missing_ids_directory, 'missing_ids.json', self.missing_ids.keys())
        file_utils.save_file(missing_ids_directory, 'new_ids.json', self.new_ids)

    def check_tags_and_annotations(self):
        missing_ids = file_utils.load_file(missing_ids_directory, 'missing_ids.json')
        new_ids = file_utils.load_file(missing_ids_directory, 'new_ids.json')

        print 'Missing ids', len(missing_ids)
        print 'New ids', len(new_ids)

        docs_with_tags = self.fetch_ids()

        missing_docs_with_tags = []
        for _id in missing_ids:
            if _id in docs_with_tags:
                missing_docs_with_tags.append(_id)
                print 'Missing docs with tags', _id

        print 'Missing docs with tags', len(missing_docs_with_tags)
        print 'Missing docs with tags', json.dumps(missing_docs_with_tags)

        for _id in missing_docs_with_tags:
            existing_doc = self.get_existing_doc(_id)
            if 'userTags' in existing_doc:
                user_tags = existing_doc['userTags']
                for user_tag in user_tags:
                    added_by = user_tag['added_by']
                    if added_by == '*****@*****.**':
                        self.docs_for_dolan[_id] = existing_doc
                        print _id
                        print user_tags
                        break

        print 'Docs for Dolan', len(self.docs_for_dolan)
        print 'Docs for Dolan', self.docs_for_dolan.keys()

    def get_existing_doc(self, _id):
        existing_doc = self.data_loader_utils.fetch_doc(_id)
        if existing_doc is not None and '_source' in existing_doc:
            existing_doc = existing_doc['_source']
        return existing_doc

    def fetch_ids(self):
        combined_docs = {}

        tags_query = self.tags_query()
        annotations_query = self.annotations_query()

        print 'Fetching docs with tags', SERVER, OLD_INDEX, OLD_TYPE
        docs_with_tags = self.data_utils.batch_fetch_ids_for_query(
            base_url=SERVER,
            query=tags_query,
            index=OLD_INDEX,
            type=OLD_TYPE,
            ids_fetched=self.ids_fetched,
            batch_size=1000)
        print len(docs_with_tags), 'docs_with_tags'

        for _id in docs_with_tags:
            combined_docs[_id] = ''

        print 'Fetching docs with annotations', SERVER, OLD_INDEX, OLD_TYPE
        docs_with_annotations = self.data_utils.batch_fetch_ids_for_query(
            base_url=SERVER,
            query=annotations_query,
            index=OLD_INDEX,
            type=OLD_TYPE,
            ids_fetched=self.ids_fetched,
            batch_size=1000)
        print len(docs_with_annotations), 'docs_with_annotations'

        for _id in docs_with_annotations:
            combined_docs[_id] = ''

        print len(combined_docs), 'combined_docs'

        return combined_docs

    def ids_fetched(self, ids, index, type):
        print len(ids), 'ids fetched'

    def tags_query(self):
        tags_query = {
            "nested": {
                "path": "userTags",
                "query": {
                    "bool": {
                        "must": [{
                            "exists": {
                                "field": "userTags"
                            }
                        }]
                    }
                }
            }
        }
        return tags_query

    def annotations_query(self):
        annotations_query = {
            "nested": {
                "path": "annotations",
                "query": {
                    "bool": {
                        "must": [{
                            "exists": {
                                "field": "annotations"
                            }
                        }]
                    }
                }
            }
        }
        return annotations_query
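# Usage sketch (assumption: SERVER, OLD_INDEX / OLD_TYPE, NEW_INDEX / NEW_TYPE and
# missing_ids_directory are configured at module level):
# find_missing_ids = FindMissingIds()
# find_missing_ids.run()
# find_missing_ids.check_tags_and_annotations()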
class ProcessIndex(object):

    def __init__(self, server, src_index, src_type, process_doc_method):
        self.server = server
        self.index = src_index
        self.type = src_type
        self.process_doc_method = process_doc_method

        self.batch_size = 5000
        self.process_count = 2
        self.process_spawn_delay = 0.15
        self.bulk_data_size = 300000

        self.data_loader_utils = DataLoaderUtils(self.server, self.index, self.type)
        self.data_utils = DataUtils()

    def run(self):
        # doc_ids = export_doc_ids(self.server, self.index,
        #                          self.type, self.index + '_' + self.type, 'doc_ids.json')
        doc_ids = file_utils.load_file(self.index, self.index + '_ids.json')
        if len(doc_ids) == 0:
            doc_ids = export_doc_ids.export_doc_ids(self.server, self.index, self.type)
            doc_ids = doc_ids.keys()

        batch_doc_processor = BatchDocProcessor(doc_ids, self.process_batch,
                                                self.batch_size, self.process_count,
                                                self.process_spawn_delay)
        batch_doc_processor.run()

    def docs_fetched(self, docs, index, type):
        docs_to_process = {}
        print 'Docs fetched', len(docs)
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                existing_doc = doc['_source']
                docs_to_process[_id] = existing_doc

        self.process_docs(docs_to_process)

    def process_docs(self, docs):
        bulk_data = ''
        for _id in docs:
            doc = docs[_id]
            processed_doc = self.process_doc_method(_id, doc)
            if processed_doc is not None:
                bulk_data += self.data_loader_utils.bulk_update_header(_id)
                bulk_data += '\n'
                updated_doc = {'doc': processed_doc}
                bulk_data += json.dumps(updated_doc)
                bulk_data += '\n'

            if len(bulk_data) >= self.bulk_data_size:
                # print 'loading bulk data...'
                self.load_bulk_data(bulk_data)
                bulk_data = ''

        if len(bulk_data) > 0:
            # print 'loading bulk data...'
            self.load_bulk_data(bulk_data)

    def load_bulk_data(self, bulk_data):
        self.data_loader_utils.load_bulk_data(bulk_data)
        # pass

    def process_batch(self, doc_ids):
        self.data_utils.batch_fetch_docs_for_ids(
            base_url=self.server,
            ids=doc_ids,
            index=self.index,
            type=self.type,
            docs_fetched=self.docs_fetched)
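# Usage sketch (assumption: a reachable Elasticsearch server and placeholder index
# names; the callback receives (_id, doc) and returns a partial doc to merge via
# a bulk update, or None to skip the doc):
# def mark_processed(_id, doc):
#     return {'processed': True}
#
# process_index = ProcessIndex('http://localhost:9200', 'pubmed2019', 'article', mark_processed)
# process_index.run()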
class CleanCitations(object):

    def __init__(self):
        self.updated_docs = {}
        self.original_docs = {}

        self.server = SERVER
        self.index = INDEX
        self.type = TYPE
        self.server_username = ''
        self.server_password = ''

        self.load_config = self.get_load_config(clean_citations_directory)
        self.data_loader_utils = DataLoaderUtils(
            self.load_config.server, self.load_config.index,
            self.load_config.type, self.load_config.server_username,
            self.load_config.server_password)

        self.docs_with_updates = {}
        self.inverted_index = {}
        self.current_baseline_file = None
        self.current_update_file = None
        self.processes = []
        self.missing_docs = {}
        self.inverted_index_for_updated_docs = {}

    def run(self):
        # self.get_updated_docs()
        self.updated_docs = file_utils.load_file(
            self.load_config.other_files_directory(), 'updated_docs.json')

        print 'Updated docs:', len(self.updated_docs)
        print 'Original docs:', len(self.original_docs)

        # self.get_original_docs()
        # sys.exit(1)

        self.original_docs = file_utils.load_file(
            self.load_config.other_files_directory(), 'original_docs.json')
        self.inverted_index = file_utils.load_file(
            self.load_config.other_files_directory(), 'inverted_index.json')
        self.inverted_index_for_updated_docs = file_utils.load_file(
            self.load_config.other_files_directory(),
            'inverted_index_for_updated_docs.json')

        print 'Updated docs:', len(self.updated_docs)
        print 'Original docs:', len(self.original_docs)
        print 'Inverted index:', len(self.inverted_index)
        print 'inverted_index_for_updated_docs:', len(self.inverted_index_for_updated_docs)

        # print json.dumps(self.inverted_index_for_updated_docs)

        # input = raw_input('Continue?')
        # if input.lower() in ['n', 'no', '0']:
        #     sys.exit(1)

        self.update_docs()

        print 'Docs with updates', len(self.docs_with_updates)
        # print json.dumps(self.docs_with_updates)

        print 'Missing docs'
        print json.dumps(self.missing_docs.keys())

        file_utils.save_file(self.load_config.other_files_directory(),
                             'docs_with_updates.json', self.docs_with_updates)

    def update_docs(self):
        for _id in self.updated_docs:
            if _id in self.original_docs:
                original_doc = self.original_docs[_id]
                updated_doc = self.updated_docs[_id]

                # print original_doc
                # print updated_doc

                original_citations = self.load_config.data_mapper.get_citations([original_doc])
                updated_citations = self.load_config.data_mapper.get_citations([updated_doc])

                # print _id, 'original', len(original_citations), 'updated', len(updated_citations)

                if not self.compare_citations(original_citations, updated_citations):
                    print 'Doc with update', _id
                    self.docs_with_updates[_id] = {
                        'original_citations': len(original_citations),
                        'updated_citations': len(updated_citations),
                        'original_doc': original_doc,
                        'updated_doc': updated_doc
                    }

                    added_citations = []
                    removed_citations = []

                    for citation in updated_citations:
                        if citation not in original_citations:
                            added_citations.append(citation)

                    for citation in original_citations:
                        if citation not in updated_citations:
                            removed_citations.append(citation)

                    if _id in self.inverted_index_for_updated_docs:
                        update_file = self.inverted_index_for_updated_docs[_id]
                        # print update_file
                        # self.update_doc_with_history(_id, update_file, original_citations, removed_citations, added_citations)
                    else:
                        print _id, 'missing from inverted index'

                    # self.update_doc(_id, original_citations)
            else:
                updated_doc = self.updated_docs[_id]
                self.missing_docs[_id] = updated_doc
                updated_citations = self.load_config.data_mapper.get_citations([updated_doc])
                print 'Missing doc', _id, len(updated_citations)

    def compare_citations(self, original_citations, updated_citations):
        for _id in original_citations:
            if _id not in updated_citations:
                return False

        for _id in updated_citations:
            if _id not in original_citations:
                return False

        return True

    def get_existing_doc(self, _id):
        existing_doc = self.data_loader_utils.fetch_doc(_id)
        if existing_doc is not None and '_source' in existing_doc:
            existing_doc = existing_doc['_source']
        return existing_doc

    def update_doc_with_history(self, _id, update_file, original_citations, removed_citations, added_citations):
        print _id, update_file, 'original_citations', len(original_citations), 'removed_citations', len(removed_citations), 'added_citations', len(added_citations)

        now = datetime.datetime.now()
        # updated_date = now.isoformat()
        updated_date = "2019-01-14T11:16:01.000000"  # 2019-01-17T18:03:43.605774

        existing_doc = self.get_existing_doc(_id)

        # update_file = os.path.basename(self.data_source.data_source_file_path)

        # Create the update history item
        update_history_item = {
            "updated_date": updated_date,
            "update_file": update_file,
            "removed_citations": removed_citations,
            "added_citations": added_citations
        }

        # Get the existing update history
        update_history = []
        if 'update_history' in existing_doc:
            update_history = existing_doc['update_history']

        # Add the original citations list if not present
        if len(update_history) == 0:
            update_history.append({"original_citations": original_citations})

        # Add the new update history item
        update_history.append(update_history_item)

        doc = {"update_history": update_history}
        doc = {'doc': doc}

        self.data_loader_utils.update_doc(_id, doc)

    def update_doc(self, _id, original_citations):
        print 'Updating doc', _id, len(original_citations), 'citations'

        # input = raw_input('Continue?')
        # if input.lower() in ['n', 'no', '0']:
        #     sys.exit(1)

        # Get the existing update history
        update_history = []

        # Add the original citations list if not present
        if len(update_history) == 0:
            update_history.append({"original_citations": original_citations})

        doc = {"update_history": update_history}
        doc = {'doc': doc}

        self.data_loader_utils.update_doc(_id, doc)

    def get_original_docs(self):
        load_config = self.get_load_config(baseline_directory)
        ftp_manager = FTPManager(load_config)

        baseline_file_urls = ftp_manager.get_baseline_file_urls()
        # ftp_manager.download_missing_files(file_urls=baseline_file_urls, no_of_files=10)

        baseline_files = file_manager.get_baseline_files(load_config, baseline_file_urls)

        # Filter
        filtered_baseline_files = []
        for baseline_file in baseline_files:
            if 'pubmed19n0511' in baseline_file:
                filtered_baseline_files.append(baseline_file)
            elif 'pubmed19n0560' in baseline_file:
                filtered_baseline_files.append(baseline_file)

        baseline_files = filtered_baseline_files

        print 'Baseline files:', len(baseline_files)

        for baseline_file in baseline_files:
            # self.process_baseline_file(baseline_file)
            process = Process(target=self.process_baseline_file, args=(baseline_file,))
            process.start()

            self.processes.append(process)

            if len(self.processes) >= 16:
                old_process = self.processes.pop(0)
                old_process.join()

            time.sleep(0.5)

        while len(self.processes) > 0:
            old_process = self.processes.pop(0)
            old_process.join()

        self.combine_inverted_index()
        self.combine_original_docs()

        # file_utils.save_file(self.load_config.other_files_directory(), 'original_docs.json', self.original_docs)
        # file_utils.save_file(self.load_config.other_files_directory(), 'inverted_index.json', self.inverted_index)

    def combine_inverted_index(self):
        files = []
        generated_files_directory = self.load_config.generated_files_directory()
        for name in os.listdir(generated_files_directory):
            file_path = os.path.join(generated_files_directory, name)
            if os.path.isfile(file_path) and name.startswith('inverted_index_'):
                files.append(name)

        combined = {}
        for name in files:
            data = file_utils.load_file(generated_files_directory, name)
            combined.update(data)

        print 'Inverted index', len(combined)
        file_utils.save_file(self.load_config.other_files_directory(), 'inverted_index.json', combined)

    def combine_original_docs(self):
        files = []
        generated_files_directory = self.load_config.generated_files_directory()
        for name in os.listdir(generated_files_directory):
            file_path = os.path.join(generated_files_directory, name)
            if os.path.isfile(file_path) and name.startswith('original_docs_'):
                files.append(name)

        combined = {}
        for name in files:
            data = file_utils.load_file(generated_files_directory, name)
            combined.update(data)

        print 'Original docs', len(combined)
        file_utils.save_file(self.load_config.other_files_directory(), 'original_docs.json', combined)

    def process_baseline_file(self, baseline_file):
        print "Processing file:", baseline_file
        process_file = ProcessBaselineFile(
            self.load_config,
            dict.fromkeys(self.updated_docs.keys()),
            baseline_file)
        process_file.run()

    # def process_baseline_file(self, baseline_file):
    #     print "Processing file:", baseline_file
    #     file_name = os.path.basename(baseline_file)
    #     self.current_baseline_file = file_name.split('.')[0]

    #     last_time_stamp = time.time()

    #     xml_data_source = XMLDataSource(baseline_file, 2)
    #     xml_data_source.process_rows(self.process_baseline_row)

    #     current_time_stamp = time.time()
    #     diff = current_time_stamp - last_time_stamp
    #     print 'Time for file', baseline_file, diff

    def process_baseline_row(self, row, current_index):
        if current_index % 100 == 0:
            print current_index

        _id = self.extract_id(self.load_config.data_source_name, row, current_index)
        if _id is not None:
            self.inverted_index[_id] = self.current_baseline_file

            if _id in self.updated_docs:
                doc = self.extract_data(_id, self.load_config.data_source_name, row)
                if doc is not None and len(doc) > 0:
                    self.original_docs[_id] = doc
                    # if len(self.original_docs) % 100 == 0:
                    print 'Original docs', len(self.original_docs)

        return True

    def get_updated_docs(self):
        load_config = self.get_load_config(updates_directory)
        ftp_manager = FTPManager(load_config)

        update_file_urls = ftp_manager.get_update_file_urls()
        update_file_urls = update_file_urls[:2]

        ftp_manager.download_missing_files(file_urls=update_file_urls, no_of_files=2)

        all_files = file_manager.get_all_files(load_config)
        files_to_process = all_files[:2]
        # files_to_process = file_manager.get_new_update_files(load_config, update_file_urls, 2)

        print files_to_process

        for update_file in files_to_process:
            file_name = os.path.basename(update_file)
            self.current_update_file = file_name  # file_name.split('.')[0]

            xml_data_source = XMLDataSource(update_file, 2)
            xml_data_source.process_rows(self.process_row)

        print 'Total updated ids:', len(self.updated_docs)

        file_utils.save_file(self.load_config.other_files_directory(),
                             'updated_docs.json', self.updated_docs)
        file_utils.save_file(self.load_config.other_files_directory(),
                             'inverted_index_for_updated_docs.json',
                             self.inverted_index_for_updated_docs)

    def process_row(self, row, current_index):
        _id = self.extract_id(self.load_config.data_source_name, row, current_index)
        if _id is not None and _id not in self.updated_docs:
            doc = self.extract_data(_id, self.load_config.data_source_name, row)
            if doc is not None and len(doc) > 0:
                self.updated_docs[_id] = doc
                self.inverted_index_for_updated_docs[_id] = self.current_update_file

                if len(self.updated_docs) % 1000 == 0:
                    print 'Updated docs', len(self.updated_docs)

        return True

    def get_load_config(self, root_directory):
        load_config = LoadConfig()
        load_config.root_directory = root_directory
        load_config.process_count = psutil.cpu_count()

        load_config.server = self.server
        load_config.server_username = self.server_username
        load_config.server_password = self.server_password
        load_config.index = self.index
        load_config.type = self.type

        load_config.data_mapper = self.get_data_mapper()
        load_config.data_extractor = self.get_data_extractor()
        load_config.max_memory_percent = self.get_max_memory_percent()

        return load_config

    def get_data_mapper(self):
        return PubmedDataMapper()

    def get_data_extractor(self):
        return PubmedDataExtractor()

    def get_max_memory_percent(self):
        return 75

    def extract_id(self, name, row, current_index):
        if self.load_config.data_extractor is not None:
            if self.load_config.data_extractor.should_generate_id(name):
                return self.load_config.data_extractor.generate_id(current_index)
            else:
                return self.load_config.data_extractor.extract_id(name, row)

        self.load_config.log(LOG_LEVEL_WARNING, 'Error: no data extractor configured')
        return None

    def extract_data(self, _id, name, row):
        if self.load_config.data_extractor is not None:
            return self.load_config.data_extractor.extract_data(_id, name, row)
        return row
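# Usage sketch (assumption: SERVER / INDEX / TYPE and clean_citations_directory are
# configured at module level; updated_docs.json, original_docs.json and the
# inverted index files must already exist, e.g. from earlier get_updated_docs()
# and get_original_docs() passes):
# clean_citations = CleanCitations()
# clean_citations.run()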
def create_index():
    data_loader_utils = DataLoaderUtils(SERVER, INDEX, TYPE)
    data_loader_utils.check_and_create_index('mapping.json')
class PubmedRelationshipProcessor(DataSourceProcessor):

    def __init__(self, load_config, data_source, data_source_summary):
        super(PubmedRelationshipProcessor, self).__init__(load_config, data_source)
        self.data_source_summary = data_source_summary
        self.data_loader_utils = DataLoaderUtils(
            self.load_config.server, self.load_config.index,
            self.load_config.type, self.load_config.server_username,
            self.load_config.server_password)
        self.load_relationships = True
        self.docs_with_new_citations = {}
        self.docs_citations_history = {}
        self.existing_docs = {}
        self.data_utils = DataUtils()

    def docs_fetched(self, docs, index, type):
        self.load_config.log(LOG_LEVEL_TRACE, 'Docs fetched', len(docs))
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                existing_doc = doc['_source']
                self.existing_docs[_id] = existing_doc

    def get_docs_with_new_citations(self):
        return self.docs_with_new_citations

    def get_citations_history(self):
        return self.docs_citations_history

    def update_citations_history(self, new_doc, _id, new_citations, existing_citations):
        # Update citation history
        if _id not in self.docs_citations_history:
            self.docs_citations_history[_id] = {}

        # Set the new doc flag
        self.docs_citations_history[_id]['new'] = new_doc

        # Update new citations
        if 'new_citations' not in self.docs_citations_history[_id]:
            self.docs_citations_history[_id]['new_citations'] = []
        self.docs_citations_history[_id]['new_citations'].extend(new_citations)

        # Update existing citations
        if 'existing_citations' not in self.docs_citations_history[_id]:
            self.docs_citations_history[_id]['existing_citations'] = []
        self.docs_citations_history[_id]['existing_citations'].extend(existing_citations)

    def process_relationships(self, extracted_ids):
        # all_indexed_ids = {}
        # if 'indexed_ids' in self.data_source_summary:
        #     all_indexed_ids = self.data_source_summary['indexed_ids']

        all_updated_ids = {}
        if 'updated_ids' in self.data_source_summary:
            all_updated_ids = self.data_source_summary['updated_ids']

        print 'all_updated_ids', len(all_updated_ids)
        print 'extracted_ids', len(extracted_ids)

        # Fetch existing (updated) docs
        self.load_config.log(LOG_LEVEL_DEBUG, 'Fetching docs', self.load_config.server,
                             self.load_config.index, self.load_config.type)

        ids_to_fetch = all_updated_ids.keys()
        self.data_utils.batch_fetch_docs_for_ids(
            self.load_config.server, ids_to_fetch, self.load_config.index,
            self.load_config.type, self.docs_fetched,
            self.load_config.doc_fetch_batch_size,
            self.load_config.server_username, self.load_config.server_password)

        print 'existing_docs', len(self.existing_docs)

        pubmed_citations_pubmed = {}
        pubmed_cited_bys_pubmed = {}

        citations_to_remove = {}
        cited_bys_to_remove = {}

        count = 0
        for _id in extracted_ids:
            count += 1
            data = extracted_ids[_id]

            if len(data) == 0:
                print 'No data for', _id

            new_doc = False
            existing_citations = []
            new_citations = self.load_config.data_mapper.get_citations(data)

            if _id in all_updated_ids:
                # Existing doc
                existing_doc = self.get_existing_doc(_id)
                existing_citations = self.get_citations(existing_doc)
                new_doc = False
            else:
                new_doc = True

            self.update_citations_history(new_doc, _id, new_citations, existing_citations)

            added_citations = []
            removed_citations = []

            # Get removed citations
            for existing_citation in existing_citations:
                if existing_citation not in new_citations:
                    removed_citations.append(existing_citation)

            # Get added citations
            for new_citation in new_citations:
                if new_citation not in existing_citations:
                    added_citations.append(new_citation)

            # Added citations and cited bys
            for citation in added_citations:
                # Citations
                if _id not in pubmed_citations_pubmed:
                    pubmed_citations_pubmed[_id] = []
                if citation not in pubmed_citations_pubmed[_id]:
                    pubmed_citations_pubmed[_id].append(citation)

                # Cited by
                if citation not in pubmed_cited_bys_pubmed:
                    pubmed_cited_bys_pubmed[citation] = []
                if _id not in pubmed_cited_bys_pubmed[citation]:
                    pubmed_cited_bys_pubmed[citation].append(_id)

            # Get existing cited bys (citations from other existing docs) for the new doc
            # if new_doc:
            #     existing_cited_bys = self.get_existing_cited_bys(_id)
            #     for cited_by in existing_cited_bys:
            #         if _id not in pubmed_cited_bys_pubmed:
            #             pubmed_cited_bys_pubmed[_id] = []
            #         if cited_by not in pubmed_cited_bys_pubmed[_id]:
            #             pubmed_cited_bys_pubmed[_id].append(cited_by)

            # Removed citations and cited bys
            for removed_citation in removed_citations:
                # Removed citations
                if _id not in citations_to_remove:
                    citations_to_remove[_id] = []
                if removed_citation not in citations_to_remove[_id]:
                    citations_to_remove[_id].append(removed_citation)

                # Removed cited_bys
                if removed_citation not in cited_bys_to_remove:
                    cited_bys_to_remove[removed_citation] = []
                if _id not in cited_bys_to_remove[removed_citation]:
                    cited_bys_to_remove[removed_citation].append(_id)

            # Docs with new citations
            if len(added_citations) > 0:
                if _id not in self.docs_with_new_citations:
                    self.docs_with_new_citations[_id] = []
                self.docs_with_new_citations[_id].extend(added_citations)

            if count % 1000 == 0:
                print 'Processed', count, 'docs'

        pubmed_ids = {}
        pubmed_ids = self.load_config.data_mapper.reformat(
            reformatted_array=pubmed_ids,
            relations_array=pubmed_citations_pubmed,
            dest_index_id=ID_PUBMED,
            relationship_type=RELATIONSHIP_TYPE_CITATIONS,
            removed_ids=citations_to_remove)
        pubmed_ids = self.load_config.data_mapper.reformat(
            reformatted_array=pubmed_ids,
            relations_array=pubmed_cited_bys_pubmed,
            dest_index_id=ID_PUBMED,
            relationship_type=RELATIONSHIP_TYPE_CITED_BYS,
            removed_ids=cited_bys_to_remove)

        print 'pubmed_citations_pubmed', len(pubmed_citations_pubmed)
        print 'pubmed_cited_bys_pubmed', len(pubmed_cited_bys_pubmed)
        print 'citations_to_remove', len(citations_to_remove)
        print 'cited_bys_to_remove', len(cited_bys_to_remove)
        print 'reformatted pubmed_ids', len(pubmed_ids)

        relationships = dict()
        relationships[ID_PUBMED] = pubmed_ids
        return relationships

    # def get_cited_bys_for_doc(self, _id):
    #     doc = self.fetch_existing_doc(_id)
    #     return self.get_cited_bys(doc)

    # Fetch existing doc from elasticsearch
    def fetch_existing_doc(self, _id):
        existing_doc = self.data_loader_utils.fetch_doc(_id)
        if existing_doc is not None and '_source' in existing_doc:
            existing_doc = existing_doc['_source']
        return existing_doc

    def get_existing_doc(self, _id):
        existing_doc = None
        if _id in self.existing_docs:
            existing_doc = self.existing_docs[_id]

        # Retry two times if not obtained in mget
        if existing_doc is None or len(existing_doc) == 0:
            existing_doc = self.fetch_existing_doc(_id)
        if existing_doc is None or len(existing_doc) == 0:
            existing_doc = self.fetch_existing_doc(_id)

        return existing_doc

    def get_cited_bys(self, doc):
        cited_bys = []
        if doc is not None and 'cited_bys' in doc:
            cited_bys_array = doc['cited_bys']
            for cited_by_item in cited_bys_array:
                source = cited_by_item['source']
                index_id = cited_by_item['index_id']
                if source == self.load_config.source and index_id == ID_PUBMED:
                    cited_bys = cited_by_item['ids']
                    break
        return cited_bys

    # Get citations from doc
    def get_citations(self, doc):
        citations = []
        if doc is not None and 'citations' in doc:
            citations_array = doc['citations']
            for citation_item in citations_array:
                source = citation_item['source']
                index_id = citation_item['index_id']
                if source == self.load_config.source and index_id == ID_PUBMED:
                    citations = citation_item['ids']
                    break
        return citations

    def has_multiple_citations(self, doc):
        if 'citations' in doc:
            citations_array = doc['citations']
            if len(citations_array) > 1:
                return True
        return False

    def get_existing_cited_bys(self, _id):
        """
        Search elasticsearch for any docs citing the given id
        """
        query = {
            "bool": {
                "must": [{
                    "match": {
                        "citations.ids": _id
                    }
                }, {
                    "match": {
                        "citations.source": ""
                    }
                }, {
                    "match": {
                        "citations.index_id": ID_PUBMED
                    }
                }]
            }
        }

        ids = self.data_utils.batch_fetch_ids_for_query(
            base_url=self.load_config.server,
            query=query,
            index=self.load_config.index,
            type=self.load_config.type)
        return ids

    def update_doc(self, _id, existing_doc, original_citations, removed_citations, added_citations):
        if len(removed_citations) > 0 or len(added_citations) > 0:
            print 'Updating doc:', _id, 'original_citations', len(original_citations), 'removed_citations', len(removed_citations), 'added_citations', len(added_citations)

            now = datetime.datetime.now()
            updated_date = now.isoformat()
            update_file = os.path.basename(self.data_source.data_source_file_path)

            # Create the update history item
            update_history_item = {
                "updated_date": updated_date,
                "update_file": update_file,
                "removed_citations": removed_citations,
                "added_citations": added_citations
            }

            # Get the existing update history
            update_history = []
            if 'update_history' in existing_doc:
                update_history = existing_doc['update_history']

            # Add the original citations list if not present
            if len(update_history) == 0:
                update_history.append({"original_citations": original_citations})

            # Add the new update history item
            update_history.append(update_history_item)

            doc = {"update_history": update_history}
            doc = {'doc': doc}

            self.data_loader_utils.update_doc(_id, doc)
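# Usage sketch (assumption: load_config, data_source and data_source_summary are
# produced by the pubmed load pipeline, and extracted_ids maps each pubmed id to
# the rows extracted from the update file):
# processor = PubmedRelationshipProcessor(load_config, data_source, data_source_summary)
# relationships = processor.process_relationships(extracted_ids)
# docs_with_new_citations = processor.get_docs_with_new_citations()
# citations_history = processor.get_citations_history()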
class CopyRelationships(object):

    def __init__(self, src_server, dest_server, src_index, src_type, dst_index, dst_type, username, password):
        self.src_data_loader_utils = DataLoaderUtils(src_server, src_index, src_type)
        self.dest_data_loader_utils = DataLoaderUtils(dest_server, dst_index, dst_type)

        self.processed_doc_count = 0
        self.total_doc_count = 0

        self.data_utils = DataUtils()
        self.relations_to_exclude = []
        self.missing_destination_ids = []

        self.username = username
        self.password = password

        self.last_time_stamp = 0
        self.diff_average = 0

    def run(self):
        self.processed_doc_count = 0
        self.total_doc_count = self.get_total_doc_count()
        print 'Total doc count', self.total_doc_count

        # self.create_destination_index(mapping=None)

        self.export_doc_ids(server=self.src_data_loader_utils.server,
                            src_index=self.src_data_loader_utils.index,
                            src_type=self.src_data_loader_utils.type)

        print 'saving missing docs'
        file_utils.save_file('/data/data_loading/pubmed_2019',
                             'missing_docs_pubmed2019.json',
                             self.missing_destination_ids)

    def run_for_ids(self, doc_ids, mapping=None):
        self.processed_doc_count = 0
        self.total_doc_count = len(doc_ids)
        print 'Total doc count', self.total_doc_count

        print 'Fetching docs from source index'
        batch_doc_processor = BatchDocProcessor(doc_ids, self.copy_docs_batch, 1000, 1, 0)
        batch_doc_processor.run()

        file_utils.save_file('/data/data_loading/pubmed_2019',
                             'missing_docs_pubmed2019.json',
                             self.missing_destination_ids)

    def export_doc_ids(self, server, src_index, src_type):
        print 'Fetching doc ids for', src_index, src_type
        query = {"match_all": {}}
        self.data_utils.batch_fetch_ids_for_query(base_url=server,
                                                  index=src_index,
                                                  type=src_type,
                                                  query=query,
                                                  ids_fetched=self.ids_fetched,
                                                  batch_size=10000)
        # print 'Done, fetched', len(documents_ids), 'doc ids'

    def ids_fetched(self, ids, index, type):
        print 'Ids fetched', len(ids)
        self.copy_docs_batch(ids)

    def copy_docs_batch(self, doc_ids):
        print 'Fetching docs'
        self.data_utils.batch_fetch_docs_for_ids(
            base_url=self.src_data_loader_utils.server,
            ids=doc_ids,
            index=self.src_data_loader_utils.index,
            type=self.src_data_loader_utils.type,
            docs_fetched=self.docs_fetched,
            batch_size=500)

    def docs_fetched(self, docs, index, type):
        print 'Docs fetched', len(docs)
        docs_to_copy = {}
        # print 'Docs fetched', len(docs)
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                existing_doc = doc['_source']
                docs_to_copy[_id] = existing_doc

        self.copy_relations(docs_to_copy)

        # Update progress
        self.processed_doc_count += len(docs)
        progress = ((self.processed_doc_count / float(self.total_doc_count)) * 100)

        current_time_stamp = time.time()
        diff = current_time_stamp - self.last_time_stamp
        self.diff_average = float(diff + self.diff_average) / 2
        time_remaining = diff * (float(self.total_doc_count) / len(docs))
        self.last_time_stamp = current_time_stamp

        print '---------------------------------------------------------------------------------------------'
        print 'Progress', self.processed_doc_count, '/', self.total_doc_count, progress, '%', time_remaining, 'secs'
        print '---------------------------------------------------------------------------------------------'

    def get_src_relations(self, src_doc, relationship_type):
        src_relations = []
        if relationship_type in src_doc:
            relations = src_doc[relationship_type]
            for relation_item in relations:
                exclude_relation_item = False
                for relation_to_exclude in self.relations_to_exclude:
                    if relation_to_exclude['source'] == relation_item['source'] and \
                            relation_to_exclude['index_id'] == relation_item['index_id']:
                        exclude_relation_item = True
                        break

                if not exclude_relation_item:
                    src_relations.append(relation_item)

        return src_relations

    def get_dest_relations(self, dest_doc, relationship_type):
        dest_relations = []
        if relationship_type in dest_doc:
            dest_relations = dest_doc[relationship_type]
        return dest_relations

    def add_relations(self, append_ids, relation, relations_list):
        relation_found = False
        for existing_relation in relations_list:
            # print existing_relation['source'], relation['source'], existing_relation['index_id'], relation['index_id']
            if existing_relation['source'] == relation['source'] and \
                    existing_relation['index_id'] == relation['index_id']:
                existing_relation_ids = existing_relation['ids']

                if append_ids:
                    relation_ids = relation['ids']
                    for _id in relation_ids:
                        if _id not in existing_relation_ids:
                            existing_relation_ids.append(_id)

                existing_relation['ids'] = existing_relation_ids
                relation_found = True
                break

        if not relation_found:
            relations_list.append(relation)

        return relations_list

    def merge_relations(self, src_doc, dest_doc, relationship_type):
        dest_relations = self.get_dest_relations(dest_doc, relationship_type)
        src_relations = self.get_src_relations(src_doc, relationship_type)

        # print 'src_relations', len(src_relations)
        # print 'dest_relations', len(dest_relations)

        combined_relations = []
        for relation in dest_relations:
            combined_relations = self.add_relations(True, relation, combined_relations)

        for relation in src_relations:
            combined_relations = self.add_relations(True, relation, combined_relations)

        return combined_relations

    def copy_relations(self, src_docs):
        bulk_data = ''
        count = 0

        # Fetch destination docs
        destination_ids = src_docs.keys()
        destination_docs_array = self.data_utils.fetch_docs_for_ids(
            base_url=self.dest_data_loader_utils.server,
            ids=destination_ids,
            index=self.dest_data_loader_utils.index,
            type=self.dest_data_loader_utils.type,
            username=self.username,
            password=self.password)

        # Create destination doc dict
        destination_docs = {}
        for doc in destination_docs_array:
            _id = doc['_id']
            if '_source' in doc:
                destination_docs[_id] = doc['_source']

        # Find missing destination docs
        for _id in destination_ids:
            if _id not in destination_docs:
                self.missing_destination_ids.append(_id)

        print 'Missing ids', len(self.missing_destination_ids)
        # print 'dest ids', len()

        # Copy relations
        for _id in destination_docs:
            dest_doc = destination_docs[_id]
            src_doc = src_docs[_id]

            dest_relations = {}
            dest_relations[RELATIONSHIP_TYPE_CITATIONS] = self.merge_relations(
                src_doc, dest_doc, RELATIONSHIP_TYPE_CITATIONS)
            dest_relations[RELATIONSHIP_TYPE_CITED_BYS] = self.merge_relations(
                src_doc, dest_doc, RELATIONSHIP_TYPE_CITED_BYS)
            dest_relations[RELATIONSHIP_TYPE_RELATIONS] = self.merge_relations(
                src_doc, dest_doc, RELATIONSHIP_TYPE_RELATIONS)

            doc = {}
            if len(dest_relations[RELATIONSHIP_TYPE_CITATIONS]) > 0:
                doc[RELATIONSHIP_TYPE_CITATIONS] = dest_relations[RELATIONSHIP_TYPE_CITATIONS]
            if len(dest_relations[RELATIONSHIP_TYPE_CITED_BYS]) > 0:
                doc[RELATIONSHIP_TYPE_CITED_BYS] = dest_relations[RELATIONSHIP_TYPE_CITED_BYS]
            if len(dest_relations[RELATIONSHIP_TYPE_RELATIONS]) > 0:
                doc[RELATIONSHIP_TYPE_RELATIONS] = dest_relations[RELATIONSHIP_TYPE_RELATIONS]

            # if len(dest_relations[RELATIONSHIP_TYPE_CITATIONS]) >= 2:
            #     print _id

            count += 1
            # doc = docs_to_copy[es_id]

            bulk_data += self.dest_data_loader_utils.bulk_update_header(_id)
            bulk_data += '\n'
            doc = {'doc': doc}
            bulk_data += json.dumps(doc)
            bulk_data += '\n'

            # if count % 1000 == 0:
            #     print 'Processed', 1000, 'docs'

            if len(bulk_data) >= 150000:
                print _id
                self.load_bulk_data(bulk_data)
                # print 'Copied', count, 'docs'
                bulk_data = ''

        if len(bulk_data) > 0:
            self.load_bulk_data(bulk_data)
            # print 'Copied', count, 'docs'

    # def create_destination_index(self, mapping=None):
    #     if mapping is None:
    #         # Get mapping from src index
    #         mapping = self.src_data_loader_utils.get_mapping_from_server()

    #     if not self.dest_data_loader_utils.index_exists():
    #         print 'Creating index'
    #         self.dest_data_loader_utils.put_mapping(mapping)
    #         # migrate_index(self.dest_data_loader_utils.index)
    #     else:
    #         print self.dest_data_loader_utils.index, 'exists'

    def load_bulk_data(self, bulk_data):
        print 'Bulk data size', len(bulk_data), 'loading...'
        response = self.dest_data_loader_utils.load_bulk_data(bulk_data)
        if response:
            pass
            # print 'Done loading bulk data, saving response'
        else:
            print 'Bulk data load failed'

    def get_total_doc_count(self):
        return self.data_utils.get_total_doc_count(
            base_url=self.src_data_loader_utils.server,
            index=self.src_data_loader_utils.index,
            type=self.src_data_loader_utils.type)


# src_server = 'http://localhost:9200'
# src_index = 'pubmed2018_v5'
# src_type = 'article'

# dest_server = 'http://localhost:9200'
# dest_index = 'pubmed2019'
# dest_type = 'article'

# copy_relations = CopyRelationships(src_server=src_server,
#                                    dest_server=dest_server,
#                                    src_index=src_index,
#                                    src_type=src_type,
#                                    dst_index=dest_index,
#                                    dst_type=dest_type,
#                                    username='',
#                                    password='')

# copy_relations.relations_to_exclude.append({
#     "source": "",
#     "index_id": ID_PUBMED
# })

# copy_relations.run()
# copy_relations.run_for_ids([12620793])