import json

# Project helper modules; these import paths are assumptions and may need
# adjusting to match the actual package layout.
import file_utils

from data_loader_utils import DataLoaderUtils
from data_utils import DataUtils


class CopyTagsAndAnnotations(object):
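    """Copy user tags and annotations between Elasticsearch indices.

    Fetches the ids of all source docs that carry userTags or annotations,
    writes those fields onto the docs with the same ids in the destination
    index via partial bulk updates, and then verifies that every tagged or
    annotated source doc has a counterpart on the destination.
    """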
    def __init__(self, reports_directory, src_server, src_index, src_type, dest_server, dest_index, dest_type):
        self.data_loader_utils_dest = DataLoaderUtils(dest_server, dest_index, dest_type)
        self.reports_directory = reports_directory

        self.src_server = src_server
        self.src_index = src_index
        self.src_type = src_type

        self.dest_server = dest_server
        self.dest_index = dest_index
        self.dest_type = dest_type

        self.copy_tags = True
        self.copy_annotations = True

        self.combine_tags = False  # Combining with existing destination tags is not implemented; keep False.
        self.combine_annotations = False  # Combining with existing destination annotations is not implemented; keep False.

        self.data_utils = DataUtils()

    def run(self):
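        """Copy tags/annotations for every qualifying doc, then verify the
        destination for each field that was copied."""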
        docs_to_copy = self.fetch_ids()
        self.copy(docs_to_copy)

        if self.copy_tags:
            self.verify_tags()

        if self.copy_annotations:
            self.verify_annotations()

    def ids_fetched(self, ids, index, type):
        print len(ids), 'ids fetched'

    def tags_query(self):
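        """Query matching docs with at least one entry in userTags.

        Assumes userTags is mapped as a nested type, so the exists clause
        has to run inside a nested query to match.
        """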
        tags_query = {
            "nested": {
                "path": "userTags",
                "query": {
                    "bool": {
                        "must": [
                            {
                                "exists": {
                                    "field": "userTags"
                                }
                            }
                        ]
                    }
                }
            }
        }

        return tags_query

    def annotations_query(self):
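        """Query matching docs with at least one entry in annotations
        (same nested-mapping assumption as tags_query)."""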
        annotations_query = {
            "nested": {
                "path": "annotations",
                "query": {
                    "bool": {
                        "must": [
                            {
                                "exists": {
                                    "field": "annotations"
                                }
                            }
                        ]
                    }
                }
            }
        }

        return annotations_query

    def fetch_ids(self):
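        """Return a dict keyed by _id for every source doc that has tags
        and/or annotations; the dict doubles as a de-duplicating set."""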
        combined_docs = {}

        tags_query = self.tags_query()
        annotations_query = self.annotations_query()

        if self.copy_tags:
            print 'Fetching docs with tags', self.src_server, self.src_index, self.src_type
            docs_with_tags = self.data_utils.batch_fetch_ids_for_query(base_url=self.src_server,
                                                                        query=tags_query,
                                                                        index=self.src_index,
                                                                        type=self.src_type,
                                                                        ids_fetched=self.ids_fetched,
                                                                        batch_size=1000)
            print len(docs_with_tags), 'docs_with_tags'
            for _id in docs_with_tags:
                combined_docs[_id] = ''

        if self.copy_annotations:
            print 'Fetching docs with annotations', self.src_server, self.src_index, self.src_type
            docs_with_annotations = self.data_utils.batch_fetch_ids_for_query(base_url=self.src_server,
                                                                                query=annotations_query,
                                                                                index=self.src_index,
                                                                                type=self.src_type,
                                                                                ids_fetched=self.ids_fetched,
                                                                                batch_size=1000)

            print len(docs_with_annotations), 'docs_with_annotations'
            for _id in docs_with_annotations:
                combined_docs[_id] = ''

        print len(combined_docs), 'combined_docs'
        return combined_docs

    def docs_fetched(self, docs, index, type):
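        """Batch callback for copy(): build a bulk partial-update request
        that writes each fetched doc's userTags/annotations to the same _id
        on the destination, then record the processed ids in a report file."""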
        print len(docs), 'docs fetched'
        existing_docs = {}
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                existing_doc = doc['_source']
                existing_docs[_id] = existing_doc

        bulk_data = ''
        for _id in existing_docs:
            existing_doc = existing_docs[_id]

            doc = {}
            if self.copy_tags and 'userTags' in existing_doc:
                if self.combine_tags:
                    pass  # Not implemented: would merge with tags already on the destination doc.
                else:
                    doc['userTags'] = existing_doc['userTags']

            if self.copy_annotations and 'annotations' in existing_doc:
                if self.combine_annotations:
                    pass  # Not implemented: would merge with annotations already on the destination doc.
                else:
                    doc['annotations'] = existing_doc['annotations']

            if len(doc) > 0:
                bulk_data += self.data_loader_utils_dest.bulk_update_header(_id)
                bulk_data += '\n'
                doc = {
                    'doc': doc
                }
                bulk_data += json.dumps(doc)
                bulk_data += '\n'

        if len(bulk_data) > 0:
            self.load_bulk_data(bulk_data)

        batch_file_name = file_utils.batch_file_name_with_prefix('loaded_ids_') + '.json'
        file_utils.save_file(self.reports_directory, batch_file_name, existing_docs.keys())

    def copy(self, ids):
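        """Stream the source docs for the given ids in batches; the writes
        to the destination happen in the docs_fetched callback."""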
        file_utils.make_directory(self.reports_directory)
        ids_array = ids.keys()
        # Uncomment to test with a single document:
        # ids_array = [ids_array[0]]
        self.data_utils.batch_fetch_docs_for_ids(base_url=self.src_server,
                                                ids=ids_array,
                                                index=self.src_index,
                                                type=self.src_type,
                                                docs_fetched=self.docs_fetched, batch_size=500)

    def load_bulk_data(self, bulk_data):
        print 'Bulk data size', len(bulk_data), 'loading...'
        response = self.data_loader_utils_dest.load_bulk_data(bulk_data)

        if response:
            print 'Done loading bulk data'
        else:
            print 'Bulk data load failed'

    def verify_tags(self):
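        """Compare the ids of tagged docs on source and destination and
        report any ids present on the source but missing on the destination."""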
        tags_query = self.tags_query()

        src_docs_with_tags = self.data_utils.batch_fetch_ids_for_query(base_url=self.src_server,
                                                                  query=tags_query,
                                                                  index=self.src_index,
                                                                  type=self.src_type,
                                                                  ids_fetched=self.ids_fetched,
                                                                  batch_size=1000)

        dest_docs_with_tags = self.data_utils.batch_fetch_ids_for_query(base_url=self.dest_server,
                                                                   query=tags_query,
                                                                   index=self.dest_index,
                                                                   type=self.dest_type,
                                                                   ids_fetched=self.ids_fetched,
                                                                   batch_size=1000)

        print len(src_docs_with_tags), 'src_docs_with_tags'
        print len(dest_docs_with_tags), 'dest_docs_with_tags'

        dest_dict = {}
        for _id in dest_docs_with_tags:
            dest_dict[_id] = 0

        missing_ids = []
        for _id in src_docs_with_tags:
            if _id not in dest_dict:
                missing_ids.append(_id)

        print len(missing_ids), 'tags missing_ids'
        # Print a sparse sample of the missing ids as a sanity check.
        count = 0
        for _id in missing_ids:
            count += 1
            if count % 10000 == 0:
                print _id

        file_utils.save_file(self.reports_directory, 'tags_missing_ids.json', missing_ids)

    def verify_annotations(self):
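        """Same check as verify_tags, but for the annotations field."""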
        annotations_query = self.annotations_query()

        src_docs_with_annotations = self.data_utils.batch_fetch_ids_for_query(base_url=self.src_server,
                                                                              query=annotations_query,
                                                                              index=self.src_index,
                                                                              type=self.src_type,
                                                                              ids_fetched=self.ids_fetched,
                                                                              batch_size=1000)

        dest_docs_with_annotations = self.data_utils.batch_fetch_ids_for_query(base_url=self.dest_server,
                                                                               query=annotations_query,
                                                                               index=self.dest_index,
                                                                               type=self.dest_type,
                                                                               ids_fetched=self.ids_fetched,
                                                                               batch_size=1000)

        print len(src_docs_with_annotations), 'src_docs_with_annotations'
        print len(dest_docs_with_annotations), 'dest_docs_with_annotations'

        dest_dict = {}
        for _id in dest_docs_with_annotations:
            dest_dict[_id] = 0

        missing_ids = []
        for _id in src_docs_with_annotations:
            if _id not in dest_dict:
                missing_ids.append(_id)

        print len(missing_ids), 'annotations missing_ids'

        # Print a sparse sample of the missing ids as a sanity check.
        count = 0
        for _id in missing_ids:
            count += 1
            if count % 10 == 0:
                print _id

        file_utils.save_file(self.reports_directory, 'annotations_missing_ids.json', missing_ids)


class DeleteUserData(object):
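    """Clear user tags and annotations on a source Elasticsearch index.

    Finds every doc that carries userTags or annotations and resets those
    fields to empty lists via partial bulk updates.
    """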
    def __init__(self, reports_directory, src_server, src_index, src_type):
        self.data_loader_utils = DataLoaderUtils(src_server, src_index, src_type)
        self.reports_directory = reports_directory

        self.src_server = src_server
        self.src_index = src_index
        self.src_type = src_type

        self.delete_tags = True
        self.delete_annotations = True

        self.data_utils = DataUtils()

    def run(self):
        docs_to_delete = self.fetch_ids()
        self.delete(docs_to_delete)

    def ids_fetched(self, ids, index, type):
        print len(ids), 'ids fetched'

    def tags_query(self):
        tags_query = {
            "nested": {
                "path": "userTags",
                "query": {
                    "bool": {
                        "must": [
                            {
                                "exists": {
                                    "field": "userTags"
                                }
                            }
                        ]
                    }
                }
            }
        }

        return tags_query

    def annotations_query(self):
        annotations_query = {
            "nested": {
                "path": "annotations",
                "query": {
                    "bool": {
                        "must": [
                            {
                                "exists": {
                                    "field": "annotations"
                                }
                            }
                        ]
                    }
                }
            }
        }

        return annotations_query

    def fetch_ids(self):
        combined_docs = {}

        tags_query = self.tags_query()
        annotations_query = self.annotations_query()

        if self.delete_tags:
            print 'Fetching docs with tags', self.src_server, self.src_index, self.src_type
            docs_with_tags = self.data_utils.batch_fetch_ids_for_query(base_url=self.src_server,
                                                                  query=tags_query,
                                                                  index=self.src_index,
                                                                  type=self.src_type,
                                                                  ids_fetched=self.ids_fetched,
                                                                  batch_size=1000)
            print len(docs_with_tags), 'docs_with_tags'
            for _id in docs_with_tags:
                combined_docs[_id] = ''

        if self.delete_annotations:
            print 'Fetching docs with annotations', self.src_server, self.src_index, self.src_type
            docs_with_annotations = self.data_utils.batch_fetch_ids_for_query(base_url=self.src_server,
                                                                         query=annotations_query,
                                                                         index=self.src_index,
                                                                         type=self.src_type,
                                                                         ids_fetched=self.ids_fetched,
                                                                         batch_size=1000)

            print len(docs_with_annotations), 'docs_with_annotations'
            for _id in docs_with_annotations:
                combined_docs[_id] = ''

        print len(combined_docs), 'combined_docs'
        return combined_docs

    def delete_data(self, _ids):
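        """Blank userTags/annotations for the given ids directly, without
        fetching the docs first. Not used by run(), which goes through
        delete() so that only docs actually carrying the fields are touched."""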
        bulk_data = ''
        for _id in _ids:
            doc = {}
            if self.delete_tags:
                doc['userTags'] = []

            if self.delete_annotations:
                doc['annotations'] = []

            if len(doc) > 0:
                bulk_data += self.data_loader_utils.bulk_update_header(_id)
                bulk_data += '\n'
                doc = {
                    'doc': doc
                }
                bulk_data += json.dumps(doc)
                bulk_data += '\n'

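            # Flush periodically so no single bulk request grows too large.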
            if len(bulk_data) >= 300000:
                self.load_bulk_data(bulk_data)
                bulk_data = ''

        if len(bulk_data) > 0:
            self.load_bulk_data(bulk_data)

    def docs_fetched(self, docs, index, type):
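        """Batch callback for delete(): for each fetched doc that has tags
        or annotations, build a bulk partial-update resetting those fields
        to empty lists, then record the processed ids in a report file."""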
        print len(docs), 'docs fetched'
        existing_docs = {}
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                existing_doc = doc['_source']
                existing_docs[_id] = existing_doc

        bulk_data = ''
        for _id in existing_docs:
            existing_doc = existing_docs[_id]

            doc = {}
            if self.delete_tags and 'userTags' in existing_doc:
                doc['userTags'] = []

            if self.delete_annotations and 'annotations' in existing_doc:
                doc['annotations'] = []

            if len(doc) > 0:
                bulk_data += self.data_loader_utils.bulk_update_header(_id)
                bulk_data += '\n'
                doc = {
                    'doc': doc
                }
                bulk_data += json.dumps(doc)
                bulk_data += '\n'

        if len(bulk_data) > 0:
            self.load_bulk_data(bulk_data)

        batch_file_name = file_utils.batch_file_name_with_prefix('loaded_ids_') + '.json'
        file_utils.save_file(self.reports_directory, batch_file_name, existing_docs.keys())

    def delete(self, ids):
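        """Stream the docs for the given ids in batches; the field resets
        happen in the docs_fetched callback."""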
        file_utils.make_directory(self.reports_directory)
        ids_array = ids.keys()
        # Uncomment to test with a single document:
        # ids_array = [ids_array[0]]
        self.data_utils.batch_fetch_docs_for_ids(base_url=self.src_server,
                                            ids=ids_array,
                                            index=self.src_index,
                                            type=self.src_type,
                                            docs_fetched=self.docs_fetched, batch_size=500)

    def load_bulk_data(self, bulk_data):
        print 'Bulk data size', len(bulk_data), 'loading...'
        response = self.data_loader_utils.load_bulk_data(bulk_data)

        if response:
            print 'Done loading bulk data'
        else:
            print 'Bulk data load failed'
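

# Usage sketch (illustrative only): the server URLs and index/type names
# below are placeholders, not values taken from this project.
if __name__ == '__main__':
    copy_job = CopyTagsAndAnnotations(reports_directory='copy_reports',
                                      src_server='http://localhost:9200',
                                      src_index='source_index',
                                      src_type='article',
                                      dest_server='http://localhost:9201',
                                      dest_index='dest_index',
                                      dest_type='article')
    copy_job.run()

    # Clear the copied user data from the source only after the copy verifies.
    delete_job = DeleteUserData(reports_directory='delete_reports',
                                src_server='http://localhost:9200',
                                src_index='source_index',
                                src_type='article')
    delete_job.run()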