def docs_fetched(self, docs, index, type):
    print len(docs), 'docs fetched'

    # Keep only the fetched docs that actually carry a _source, keyed by id
    existing_docs = {}
    for doc in docs:
        _id = doc['_id']
        if '_source' in doc:
            existing_doc = doc['_source']
            existing_docs[_id] = existing_doc

    # Build a bulk-update payload that clears tags/annotations where requested
    bulk_data = ''
    for _id in existing_docs:
        existing_doc = existing_docs[_id]
        doc = {}
        if self.delete_tags and 'userTags' in existing_doc:
            doc['userTags'] = []
        if self.delete_annotations and 'annotations' in existing_doc:
            doc['annotations'] = []

        if len(doc) > 0:
            bulk_data += self.data_loader_utils_dest.bulk_update_header(_id)
            bulk_data += '\n'
            doc = {
                'doc': doc
            }
            bulk_data += json.dumps(doc)
            bulk_data += '\n'

    if len(bulk_data) > 0:
        self.load_bulk_data(bulk_data)

    # Record which ids were processed in this batch
    batch_file_name = file_utils.batch_file_name_with_prefix('loaded_ids_') + '.json'
    file_utils.save_file(self.reports_directory, batch_file_name, existing_docs.keys())
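
# A minimal sketch of the payload docs_fetched assembles, assuming
# data_loader_utils_dest.bulk_update_header emits a standard Elasticsearch
# bulk 'update' action line; build_clear_payload and the sample ids are
# illustrative, not part of the original code:
import json

def bulk_update_header(_id):
    # One action line per document, followed by the partial-doc body
    return json.dumps({'update': {'_id': _id}})

def build_clear_payload(ids):
    lines = []
    for _id in ids:
        lines.append(bulk_update_header(_id))
        lines.append(json.dumps({'doc': {'userTags': [], 'annotations': []}}))
    # The Elasticsearch bulk endpoint requires a trailing newline
    return '\n'.join(lines) + '\n'

print build_clear_payload(['doc1', 'doc2'])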
def get_doc_ids(server, src_index, src_type, dest_dir, dest_file_name, query=None):
    # Reuse the cached id list if one was saved by an earlier run
    documents_ids = file_utils.load_file(dest_dir, dest_file_name)
    if len(documents_ids) == 0:
        documents_ids = export_doc_ids(server, src_index, src_type, query)

        print __name__, 'Saving to', dest_dir, dest_file_name
        file_utils.make_directory(dest_dir)
        file_utils.save_file(dest_dir, dest_file_name, documents_ids)

    return documents_ids
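
# A hypothetical call, assuming file_utils.load_file returns an empty list
# when the cache file does not exist yet; server, index, and file names are
# made up:
doc_ids = get_doc_ids('http://localhost:9200', 'publications', 'article',
                      'cache', 'publication_ids.json')
print len(doc_ids), 'doc ids (served from the cache file on later calls)'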
def get_one(self, report_id):
    """
    Fetch the XML content of a single report by its report id.
    :param report_id:
    """
    # If a local copy exists, do not fetch it again
    try:
        content = get_file_content(report_id, report_id + '.xml')
    except FileNotExist, ex:
        r = self._get_service.get_report(report_id)
        content = r.text
        # Save a local copy
        save_file(content, report_id, report_id + '.xml')
    return content
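
# get_file_content and save_file are assumed to implement a file-per-report
# cache roughly like the sketch below; FileNotExist stands in for whatever
# exception the real helpers raise, and the paths are illustrative:
import os

class FileNotExist(Exception):
    pass

def get_file_content(directory, file_name):
    path = os.path.join(directory, file_name)
    if not os.path.exists(path):
        raise FileNotExist(path)
    with open(path) as f:
        return f.read()

def save_file(content, directory, file_name):
    if not os.path.exists(directory):
        os.makedirs(directory)
    with open(os.path.join(directory, file_name), 'w') as f:
        f.write(content)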
def get_items_by_report_id(self, report_id, start_day=None):
    # File name for the local cache
    if start_day:
        name = 'week_%s_%s.txt' % (start_day, report_id)
    else:
        name = 'week_%s.txt' % report_id

    content = get_file_content('advertising', name)
    if not content:
        r = self.get_by_report_id(report_id)
        # Save a local copy
        save_file(r.text, 'advertising', name)
        content = r.text

    parser = AdvertisingParser(content)
    return parser.get_items()
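
# Hypothetical usage; the owning client class, report id, and start day are
# made up, and AdvertisingParser is assumed to expose get_items() as called
# above:
client = AdvertisingReportClient()
items = client.get_items_by_report_id('12345', start_day='2016-01-04')
for item in items:
    print item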
def verify_tags(self):
    tags_query = self.tags_query()

    src_docs_with_tags = self.data_utils.batch_fetch_ids_for_query(base_url=self.src_server,
                                                                   query=tags_query,
                                                                   index=self.src_index,
                                                                   type=self.src_type,
                                                                   ids_fetched=self.ids_fetched,
                                                                   batch_size=1000)
    dest_docs_with_tags = self.data_utils.batch_fetch_ids_for_query(base_url=self.dest_server,
                                                                    query=tags_query,
                                                                    index=self.dest_index,
                                                                    type=self.dest_type,
                                                                    ids_fetched=self.ids_fetched,
                                                                    batch_size=1000)

    print len(src_docs_with_tags), 'src_docs_with_tags'
    print len(dest_docs_with_tags), 'dest_docs_with_tags'

    # Find ids present in the source but missing from the destination
    dest_dict = {}
    for _id in dest_docs_with_tags:
        dest_dict[_id] = 0

    missing_ids = []
    for _id in src_docs_with_tags:
        if _id not in dest_dict:
            missing_ids.append(_id)

    print len(missing_ids), 'tags missing_ids'

    # Print a sample id every 10,000 missing ids as progress output
    count = 0
    for _id in missing_ids:
        count += 1
        if count % 10000 == 0:
            print _id

    file_utils.save_file(self.reports_directory, 'tags_missing_ids.json', missing_ids)
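
# The membership test above is a plain set difference; an equivalent sketch
# (the input lists are illustrative):
src_ids = ['a', 'b', 'c', 'd']
dest_ids = ['a', 'c']
dest_set = set(dest_ids)
missing_ids = [_id for _id in src_ids if _id not in dest_set]
print missing_ids  # ['b', 'd']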