    def docs_fetched(self, docs, index, type):
        print len(docs), 'docs fetched'

        # Keep only the docs that actually returned a _source body
        existing_docs = {}
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                existing_doc = doc['_source']
                existing_docs[_id] = existing_doc

        # Build a newline-delimited bulk update payload: a header line followed
        # by a partial {'doc': ...} line for every doc that needs changes
        bulk_data = ''
        for _id in existing_docs:
            existing_doc = existing_docs[_id]

            doc = {}
            if self.delete_tags and 'userTags' in existing_doc:
                doc['userTags'] = []

            if self.delete_annotations and 'annotations' in existing_doc:
                doc['annotations'] = []

            if len(doc) > 0:
                bulk_data += self.data_loader_utils_dest.bulk_update_header(_id)
                bulk_data += '\n'
                doc = {
                    'doc': doc
                }
                bulk_data += json.dumps(doc)
                bulk_data += '\n'

        if len(bulk_data) > 0:
            self.load_bulk_data(bulk_data)

        # Record the ids processed in this batch
        batch_file_name = file_utils.batch_file_name_with_prefix('loaded_ids_') + '.json'
        file_utils.save_file(self.reports_directory, batch_file_name, existing_docs.keys())
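A minimal standalone sketch of the two-line entry the loop above appends for each doc, assuming `bulk_update_header` emits an Elasticsearch-style update action line; the header shape, helper name, and sample id below are illustrative, not taken from the snippet:

import json

# Hypothetical re-creation of one bulk entry: an update header line followed by
# a partial-document line. The header shape and 'doc-123' are assumptions.
def build_update_entry(_id, partial_doc):
    header = json.dumps({'update': {'_id': _id}})
    body = json.dumps({'doc': partial_doc})
    return header + '\n' + body + '\n'

print build_update_entry('doc-123', {'userTags': []})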
Example No. 2
def get_doc_ids(server, src_index, src_type, dest_dir, dest_file_name, query=None):
    # Return cached document ids from dest_dir/dest_file_name; if the cache is
    # empty, export the ids from the source index and save them for next time.
    documents_ids = file_utils.load_file(dest_dir, dest_file_name)

    if len(documents_ids) == 0:
        documents_ids = export_doc_ids(server, src_index, src_type, query)

        print __name__, 'Saving to', dest_dir, dest_file_name
        file_utils.make_directory(dest_dir)
        file_utils.save_file(dest_dir, dest_file_name, documents_ids)

    return documents_ids
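A hedged usage sketch for the helper above; the server URL, index, type, and cache paths are placeholders, and it assumes file_utils and export_doc_ids are importable in the calling module:

# Placeholder server/index/type/paths; only get_doc_ids itself comes from the snippet above.
doc_ids = get_doc_ids('http://localhost:9200', 'some_index', 'some_type',
                      'cache_dir', 'doc_ids.json', query=None)
print len(doc_ids), 'document ids available'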
Example No. 3
    def get_one(self, report_id):
        """
        Fetch the XML content of a single report by its report_id.
        :param report_id:
        """
        # If a local copy already exists, do not fetch it again
        try:
            content = get_file_content(report_id, report_id + '.xml')
        except FileNotExist:
            r = self._get_service.get_report(report_id)
            content = r.text
            # Save a local copy
            save_file(content, report_id, report_id + '.xml')
        return content
Example No. 4
    def get_items_by_report_id(self, report_id, start_day=None):
        # File name of the locally saved copy
        if start_day:
            name = 'week_%s_%s.txt' % (start_day, report_id)
        else:
            name = 'week_%s.txt' % report_id
        content = get_file_content('advertising', name)
        if not content:
            r = self.get_by_report_id(report_id)
            # Save a local copy
            save_file(r.text, 'advertising', name)
            content = r.text
        parser = AdvertisingParser(content)
        return parser.get_items()
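A hedged usage sketch for the two report methods above; `client`, the report id, and the start day are placeholders, since the owning class and its constructor are not shown in the snippets:

# 'client' is assumed to be an instance of the (unshown) class that defines
# get_one and get_items_by_report_id; the id and date are illustrative.
xml_content = client.get_one('12345')
items = client.get_items_by_report_id('12345', start_day='2015-01-05')
for item in items:
    print item
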
    def verify_tags(self):
        tags_query = self.tags_query()

        src_docs_with_tags = self.data_utils.batch_fetch_ids_for_query(
            base_url=self.src_server,
            query=tags_query,
            index=self.src_index,
            type=self.src_type,
            ids_fetched=self.ids_fetched,
            batch_size=1000)

        dest_docs_with_tags = self.data_utils.batch_fetch_ids_for_query(
            base_url=self.dest_server,
            query=tags_query,
            index=self.dest_index,
            type=self.dest_type,
            ids_fetched=self.ids_fetched,
            batch_size=1000)

        print len(src_docs_with_tags), 'src_docs_with_tags'
        print len(dest_docs_with_tags), 'dest_docs_with_tags'

        # Index destination ids for O(1) membership checks
        dest_dict = {}
        for _id in dest_docs_with_tags:
            dest_dict[_id] = 0

        # Collect source ids that never made it to the destination
        missing_ids = []
        for _id in src_docs_with_tags:
            if _id not in dest_dict:
                missing_ids.append(_id)

        # print missing_ids

        print len(missing_ids), 'tags missing_ids'
        # Print a progress sample: every 10000th missing id
        count = 0
        for _id in missing_ids:
            count += 1
            if count % 10000 == 0:
                print _id

        file_utils.save_file(self.reports_directory, 'tags_missing_ids.json', missing_ids)
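The missing-id comparison above can also be written as a set difference; a small equivalent sketch with illustrative ids rather than real data:

# Equivalent to the dict-lookup loop in verify_tags: ids present in src but not in dest.
src_docs_with_tags = ['a1', 'b2', 'c3']
dest_docs_with_tags = ['a1', 'c3']
dest_set = set(dest_docs_with_tags)
missing_ids = [_id for _id in src_docs_with_tags if _id not in dest_set]
print missing_ids  # prints ['b2']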