Code Example #1
    def __init__(self):
        self.updated_docs = {}
        self.original_docs = {}

        self.server = SERVER
        self.index = INDEX
        self.type = TYPE

        self.server_username = ''
        self.server_password = ''

        self.load_config = self.get_load_config(clean_citations_directory)
        self.data_loader_utils = DataLoaderUtils(
            self.load_config.server, self.load_config.index,
            self.load_config.type, self.load_config.server_username,
            self.load_config.server_password)

        self.docs_with_updates = {}

        self.inverted_index = {}
        self.current_baseline_file = None
        self.current_update_file = None

        self.processes = []
        self.missing_docs = {}

        self.inverted_index_for_updated_docs = {}
Code Example #2
File: missing_ids.py  Project: rrosiek/pc_data_load
    def __init__(self):
        self.missing_ids = {}
        self.new_ids = {}
        self.data_utils = DataUtils()
        self.data_loader_utils = DataLoaderUtils(SERVER, OLD_INDEX, OLD_TYPE,
                                                 '', '')

        self.docs_for_dolan = {}
Code Example #3
    def __init__(self, load_config):
        self.load_config = load_config
        self.grant_num_groups = {}

        self.data_loader_utils = DataLoaderUtils(SERVER, INDEX, TYPE)

        self.bulk_data_size = 300000
        self.docs_processed = 0
        self.total_doc_count = 0
Code Example #4
    def __init__(self, src_server, dest_server, src_index, src_type, dst_index, dst_type, username, password):
        self.src_data_loader_utils = DataLoaderUtils(src_server, src_index, src_type)
        self.dest_data_loader_utils = DataLoaderUtils(dest_server, dst_index, dst_type)

        self.data_utils = DataUtils()

        self.username = username
        self.password = password

        file_utils.make_directory(TEMP_DIR)
Code Example #5
File: copy_docs.py  Project: rrosiek/pc_data_load
    def __init__(self, src_server, dest_server, src_index, src_type, dst_index,
                 dst_type):
        self.src_data_loader_utils = DataLoaderUtils(src_server, src_index,
                                                     src_type)
        self.dest_data_loader_utils = DataLoaderUtils(dest_server, dst_index,
                                                      dst_type)

        self.processed_doc_count = 0
        self.total_doc_count = 0

        self.data_utils = DataUtils()
Code Example #6
def duplicate_index(server, src_index, src_type, dst_index, dst_type, mapping=None):
    src_data_loader_utils = DataLoaderUtils(server, src_index, src_type)
    dest_data_loader_utils = DataLoaderUtils(server, dst_index, dst_type)

    if mapping is None:
        # Get mapping from src index
        mapping = src_data_loader_utils.get_mapping_from_server()

    if not dest_data_loader_utils.index_exists():
        print 'Creating index'
        dest_data_loader_utils.put_mapping(mapping)
    else:
        print dst_index, 'exists'

    data = {
        "source": {
            "index": src_index
        },
        "dest": {
            "index": dst_index
        }
    }

    url = server + '/_reindex?wait_for_completion=false'

    print url
    print data

    response = requests.post(url, json=data)

    print response
    print json.loads(response.text)
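
A possible invocation of the duplicate_index helper above; the server URL and index/type names are placeholders, not values taken from the original project.

# Illustrative only -- placeholder connection details.
if __name__ == '__main__':
    duplicate_index(server='http://localhost:9200',
                    src_index='pubmed2018',
                    src_type='article',
                    dst_index='pubmed2018_copy',
                    dst_type='article')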
Code Example #7
    def __init__(self, server, src_index, src_type, process_doc_method):
        self.server = server
        self.index = src_index
        self.type = src_type
        self.process_doc_method = process_doc_method

        self.batch_size = 5000
        self.process_count = 2
        self.process_spawn_delay = 0.15
        self.bulk_data_size = 300000

        self.data_loader_utils = DataLoaderUtils(self.server, self.index,
                                                 self.type)

        self.data_utils = DataUtils()
Code Example #8
    def __init__(self, load_config, data_source, data_source_summary):
        super(PubmedRelationshipProcessor,
              self).__init__(load_config, data_source)
        self.data_source_summary = data_source_summary
        self.data_loader_utils = DataLoaderUtils(
            self.load_config.server, self.load_config.index,
            self.load_config.type, self.load_config.server_username,
            self.load_config.server_password)
        self.load_relationships = True

        self.docs_with_new_citations = {}
        self.docs_citations_history = {}

        self.existing_docs = {}

        self.data_utils = DataUtils()
Code Example #9
    def __init__(self, src_server, dest_server, src_index, src_type, dst_index,
                 dst_type, username, password):
        self.src_data_loader_utils = DataLoaderUtils(src_server, src_index,
                                                     src_type)
        self.dest_data_loader_utils = DataLoaderUtils(dest_server, dst_index,
                                                      dst_type)

        self.processed_doc_count = 0
        self.total_doc_count = 0

        self.data_utils = DataUtils()

        self.relations_to_exclude = []
        self.missing_destination_ids = []

        self.username = username
        self.password = password

        self.last_time_stamp = 0
        self.diff_average = 0
Code Example #10
    def check_and_create_index(self):

        data_loader_utils = DataLoaderUtils(self.server, self.index, self.type, self.server_username, self.server_password)
        mapping_file_path = self.mapping_file_path()
        print 'Checking index...', self.index, self.type
        if not data_loader_utils.index_exists() and mapping_file_path is not None:
            mapping = data_loader_utils.load_mapping_from_file(mapping_file_path)
            data_loader_utils.create_index_from_mapping(mapping)
Code Example #11
class AddInitialGrantFlag(object):
    def __init__(self, load_config):
        self.load_config = load_config
        self.grant_num_groups = {}

        self.data_loader_utils = DataLoaderUtils(SERVER, INDEX, TYPE)

        self.bulk_data_size = 300000
        self.docs_processed = 0
        self.total_doc_count = 0

    def process_doc(self, _id, doc):
        if 'grant_num' in doc:
            grant_num = doc['grant_num']

            grant_num_comps = grant_num.split('-')
            if grant_num_comps[0] not in self.grant_num_groups:
                self.grant_num_groups[grant_num_comps[0]] = {}

            fy = None

            if 'fy' in doc:
                fy = doc['fy']
                if len(fy) > 0:
                    fy = int(fy)
                else:
                    fy = None

            if fy is not None:
                self.grant_num_groups[grant_num_comps[0]][_id] = {
                    'id': _id,
                    'fy': fy,
                    'grant_num': grant_num
                }

        updated_doc = {}
        updated_doc['initial_grant'] = False

        return updated_doc

    def process_grant_num_groups(self):

        print 'Processing', len(self.grant_num_groups), 'grant_num groups'
        bulk_data = ''

        total_grant_num_groups = len(self.grant_num_groups)
        count = 0

        for grant_num in self.grant_num_groups:
            count += 1
            progress = ((count / float(total_grant_num_groups)) * 100)
            print 'Pass 2: progress', count, '/', total_grant_num_groups, progress, '%'

            grant_num_group = self.grant_num_groups[grant_num]

            # Find doc with lowest fy
            lowest_item = None
            for _id in grant_num_group:
                fy_data = grant_num_group[_id]

                if lowest_item is None:
                    lowest_item = fy_data
                else:
                    fy = fy_data['fy']
                    lowest_item_fy = lowest_item['fy']
                    if fy < lowest_item_fy:
                        lowest_item = fy_data

            if lowest_item is not None:
                _id = lowest_item['id']

                doc = {}
                doc['initial_grant'] = True

                bulk_data += self.data_loader_utils.bulk_update_header(_id)
                bulk_data += '\n'
                doc = {'doc': doc}
                bulk_data += json.dumps(doc)
                bulk_data += '\n'

            if len(bulk_data) >= self.bulk_data_size:
                self.load_bulk_data(bulk_data)
                bulk_data = ''

        if len(bulk_data) > 0:
            self.load_bulk_data(bulk_data)

    def load_bulk_data(self, bulk_data):
        self.data_loader_utils.load_bulk_data(bulk_data)
        # pass

    def run(self):
        doc_ids = get_doc_ids(
            server=self.load_config.server,
            src_index=self.load_config.index,
            src_type=self.load_config.type,
            dest_dir=self.load_config.other_files_directory(),
            dest_file_name="INITIAL_GRANT_ALL_IRDB_IDS.json")

        doc_ids = doc_ids.keys()

        self.total_doc_count = len(doc_ids)

        data_utils = DataUtils()
        data_utils.batch_fetch_docs_for_ids(base_url=self.load_config.server,
                                            ids=doc_ids,
                                            index=self.load_config.index,
                                            type=self.load_config.type,
                                            docs_fetched=self.docs_fetched)

        self.process_grant_num_groups()

    def docs_fetched(self, docs, index, type):
        docs_to_process = {}

        self.docs_processed += len(docs)
        progress = ((self.docs_processed / float(self.total_doc_count)) * 100)
        print 'Pass 1: progress', self.docs_processed, '/', self.total_doc_count, progress, '%'

        # print 'Docs fetched', len(docs)
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                existing_doc = doc['_source']
                docs_to_process[_id] = existing_doc

        self.process_docs(docs_to_process)

        # print 'Processed docs', self.processed_docs, 'Pubmed relations', len(self.pubmed_relations)

    def process_docs(self, docs):
        bulk_data = ''

        for _id in docs:
            doc = docs[_id]

            processed_doc = self.process_doc(_id, doc)

            if processed_doc is not None:
                bulk_data += self.data_loader_utils.bulk_update_header(_id)
                bulk_data += '\n'
                updated_doc = {'doc': processed_doc}
                bulk_data += json.dumps(updated_doc)
                bulk_data += '\n'

            if len(bulk_data) >= self.bulk_data_size:
                # print 'loading bulk data...'
                self.load_bulk_data(bulk_data)
                bulk_data = ''

        if len(bulk_data) > 0:
            # print 'loading bulk data...'
            self.load_bulk_data(bulk_data)
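
A minimal driver sketch for AddInitialGrantFlag; the LoadConfig setup mirrors get_load_config() in Code Example #17, SERVER/INDEX/TYPE are the module constants the class already references, and the root directory is a placeholder.

# Illustrative only -- not part of the original example.
load_config = LoadConfig()
load_config.root_directory = '/data/data_loading/irdb'  # placeholder path
load_config.server = SERVER
load_config.index = INDEX
load_config.type = TYPE

AddInitialGrantFlag(load_config).run()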
Code Example #12
class CopyGrants(object):

    def __init__(self, src_server, dest_server, src_index, src_type, dst_index, dst_type, username, password):
        self.src_data_loader_utils = DataLoaderUtils(src_server, src_index, src_type)
        self.dest_data_loader_utils = DataLoaderUtils(dest_server, dst_index, dst_type)

        self.data_utils = DataUtils()

        self.username = username
        self.password = password

        file_utils.make_directory(TEMP_DIR)

    def run(self):
        self.process_batches()

    def process_batches(self):
        batch_file_names = []
        for batch_file_name in os.listdir(TEMP_DIR):
            file_path = os.path.join(TEMP_DIR, batch_file_name)
            if os.path.isfile(file_path) and batch_file_name.startswith('batch_'):
                batch_file_names.append(batch_file_name)

        print "Generated ", len(batch_file_names), 'batch file names'

        batch_file_names.sort()

        if len(batch_file_names) == 0:
            batch_file_names = self.split_to_batches()

        print len(batch_file_names)
        raw_input('Continue?')

        processed_batches = file_utils.load_file(TEMP_DIR, 'processed_pubmed2018_docs_with_grants_batches.json')
        for batch_file_name in batch_file_names:
            if batch_file_name not in processed_batches:
                print 'Loading batch', batch_file_name
                batch = file_utils.load_file(TEMP_DIR, batch_file_name)
                self.copy_docs_batch(batch)
                processed_batches[batch_file_name] = 0
                file_utils.save_file(TEMP_DIR, 'processed_pubmed2018_docs_with_grants_batches.json', processed_batches)

    def split_to_batches(self):
        server = self.src_data_loader_utils.server
        src_index = self.src_data_loader_utils.index
        src_type = self.src_data_loader_utils.type

        print 'Fetching doc ids for', src_index, src_type
        query = {
            "nested": {
                "path": "grants",
                "query": {
                    "bool": {
                        "must": [
                            {
                                "exists": {
                                    "field": "grants"
                                }
                            }
                        ]
                    }
                }
            }
        }

        all_pubmed_ids = export_doc_ids.get_doc_ids(server,
                                                    src_index,
                                                    src_type,
                                                    TEMP_DIR,
                                                    'pubmed2018_docs_with_grants.json', query=query)
        # all_pubmed_ids = all_pubmed_ids.keys()
        # all_pubmed_ids.sort()
        self.total_doc_count = len(all_pubmed_ids)  

        max_batch_count = 5000
        
        batch_file_names = []
        batch_index = 0
        batch_ids = []
        # Splitting into batches
        for _id in all_pubmed_ids:
            batch_ids.append(_id)

            if len(batch_ids) >= max_batch_count:
                print 'Writing batch:', batch_index
                batch_file_name = 'batch_' + str(batch_index) + '.json'
                batch_file_names.append(batch_file_name)
                file_utils.save_file(TEMP_DIR, batch_file_name, batch_ids)

                batch_ids = []
                batch_index += 1

        if len(batch_ids) > 0:
            print 'Writing batch:', batch_index
            batch_file_name = 'batch_' + str(batch_index) + '.json'
            batch_file_names.append(batch_file_name)
            file_utils.save_file(TEMP_DIR, batch_file_name, batch_ids)

            batch_index += 1

        return batch_file_names

    def copy_docs_batch(self, doc_ids):
        print 'Fetching docs'
        self.data_utils.batch_fetch_docs_for_ids(base_url=self.src_data_loader_utils.server,
                                                ids=doc_ids,
                                                index=self.src_data_loader_utils.index,
                                                type=self.src_data_loader_utils.type,
                                                docs_fetched=self.docs_fetched,
                                                batch_size=500)
   
    def docs_fetched(self, docs, index, type):
        print 'Docs fetched', len(docs)
        docs_to_copy = {}

        # print 'Docs fetched', len(docs)
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                existing_doc = doc['_source']
                docs_to_copy[_id] = existing_doc

        self.copy_relations(docs_to_copy)
    
    def load_bulk_data(self, bulk_data):
        print 'Bulk data size', len(bulk_data), 'loading...'
        response = self.dest_data_loader_utils.load_bulk_data(bulk_data)

        if response:
            pass
            # print 'Done loading bulk data, saving response'
        else:
            print 'Bulk data load failed'

    def copy_relations(self, src_docs):
        bulk_data = ''
        count = 0

        # Copy relations 
        for _id in src_docs:
            src_doc = src_docs[_id]
            
            doc = {}
            if 'grants' in src_doc:
                doc['grants'] = src_doc['grants']

            count += 1

            if len(doc) > 0: 
                bulk_data += self.dest_data_loader_utils.bulk_update_header(_id)
                bulk_data += '\n'
                doc = {
                    'doc': doc
                }
                bulk_data += json.dumps(doc)
                bulk_data += '\n'

                # if count % 1000 == 0:
                #     print 'Processed', 1000, 'docs'
                if len(bulk_data) >= 150000:
                    print _id
                    self.load_bulk_data(bulk_data)
                    # print 'Copied', count, 'docs'
                    bulk_data = ''

        if len(bulk_data) > 0:
            self.load_bulk_data(bulk_data)
            pass
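
For context, a hedged invocation of CopyGrants, patterned after the commented-out CopyDocs call later on this page; all connection details are placeholders.

# Illustrative only -- placeholder servers, indices and credentials.
copy_grants = CopyGrants(src_server='http://localhost:9200',
                         dest_server='http://localhost:9200',
                         src_index='pubmed2018',
                         src_type='article',
                         dst_index='pubmed2019',
                         dst_type='article',
                         username='',
                         password='')
copy_grants.run()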
Code Example #13
File: load_manager.py  Project: rrosiek/pc_data_load
def create_index():
    data_loader_utils = DataLoaderUtils(SERVER, INDEX, TYPE)
    data_loader_utils.check_and_create_index('data_load/clinical_trials/mapping.json')
Code Example #14
File: copy_docs.py  Project: rrosiek/pc_data_load
class CopyDocs(object):
    def __init__(self, src_server, dest_server, src_index, src_type, dst_index,
                 dst_type):
        self.src_data_loader_utils = DataLoaderUtils(src_server, src_index,
                                                     src_type)
        self.dest_data_loader_utils = DataLoaderUtils(dest_server, dst_index,
                                                      dst_type)

        self.processed_doc_count = 0
        self.total_doc_count = 0

        self.data_utils = DataUtils()

    def get_total_doc_count(self):
        return self.data_utils.get_total_doc_count(
            base_url=self.src_data_loader_utils.server,
            index=self.src_data_loader_utils.index,
            type=self.src_data_loader_utils.type)

    def docs_fetched(self, docs, index, type):
        docs_to_copy = {}

        # print 'Docs fetched', len(docs)
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                existing_doc = doc['_source']
                docs_to_copy[_id] = existing_doc

        self.index_docs(docs_to_copy)

        self.processed_doc_count += len(docs)

        progress = ((self.processed_doc_count / float(self.total_doc_count)) *
                    100)
        print '---------------------------------------------------------------------------------------------'
        print 'Progress', self.processed_doc_count, '/', self.total_doc_count, progress, '%'
        print '---------------------------------------------------------------------------------------------'

    def export_doc_ids(self, server, src_index, src_type):
        print 'Fetching doc ids for', src_index, src_type
        query = {"match_all": {}}
        self.data_utils.batch_fetch_ids_for_query(base_url=server,
                                                  index=src_index,
                                                  type=src_type,
                                                  query=query,
                                                  ids_fetched=self.ids_fetched)

        # print 'Done, fetched', len(documents_ids), 'doc ids'

    def ids_fetched(self, ids, index, type):
        self.copy_docs_batch(ids)

    def create_destination_index(self, mapping=None):
        if mapping is None:
            # Get mapping from src index
            mapping = self.src_data_loader_utils.get_mapping_from_server()

        if not self.dest_data_loader_utils.index_exists():
            print 'Creating index'
            self.dest_data_loader_utils.put_mapping(mapping)
            # migrate_index(self.dest_data_loader_utils.index)
        else:
            print self.dest_data_loader_utils.index, 'exists'

    def copy_docs(self):
        self.processed_doc_count = 0
        self.total_doc_count = self.get_total_doc_count()

        print 'Total doc count', self.total_doc_count

        self.create_destination_index(mapping=None)

        self.export_doc_ids(server=self.src_data_loader_utils.server,
                            src_index=self.src_data_loader_utils.index,
                            src_type=self.src_data_loader_utils.type)

    def copy_docs_for_ids(self, doc_ids, mapping=None):
        self.processed_doc_count = 0
        self.total_doc_count = len(doc_ids)

        print 'Total doc count', self.total_doc_count

        self.create_destination_index(mapping)

        print 'Fetching docs from source index'
        batch_doc_processor = BatchDocProcessor(doc_ids, self.copy_docs_batch,
                                                3000, 16, 0.33)
        batch_doc_processor.run()

    def copy_docs_batch(self, doc_ids):
        self.data_utils.batch_fetch_docs_for_ids(
            base_url=self.src_data_loader_utils.server,
            ids=doc_ids,
            index=self.src_data_loader_utils.index,
            type=self.src_data_loader_utils.type,
            docs_fetched=self.docs_fetched)

    def index_docs(self, docs_to_copy):
        bulk_data = ''
        count = 0

        for es_id in docs_to_copy:
            count += 1
            doc = docs_to_copy[es_id]
            bulk_data += self.dest_data_loader_utils.bulk_index_header(es_id)
            bulk_data += '\n'
            bulk_data += json.dumps(doc)
            bulk_data += '\n'

            # if count % 1000 == 0:
            #     print 'Processed', 1000, 'docs'

            if len(bulk_data) >= 150000:
                self.load_bulk_data(bulk_data)
                # print 'Copied', count, 'docs'
                bulk_data = ''

        if len(bulk_data) > 0:
            self.load_bulk_data(bulk_data)

        # print 'Copied', count, 'docs'

    def load_bulk_data(self, bulk_data):
        # print 'Bulk data size', len(bulk_data), 'loading...'
        response = self.dest_data_loader_utils.load_bulk_data(bulk_data)

        if response:
            pass
            # print 'Done loading bulk data, saving response'
        else:
            print 'Bulk data load failed'


# src_server = 'http://localhost:9200'
# src_index = 'irdb_v3'
# src_type = 'grant'

# dest_server = 'http://localhost:9200'
# dest_index = 'irdb_v4'
# dest_type = 'grant'

# copy_docs = CopyDocs(src_server=src_server,
#                             dest_server=dest_server,
#                             src_index=src_index,
#                             src_type=src_type,
#                             dst_index=dest_index,
#                             dst_type=dest_type)

# copy_docs.copy_docs()
# copy_relations.relations_to_exclude.append({
#     "source": "",
#     "index_id": ID_PUBMED
# })
# copy_relations.run()
Code Example #15
File: missing_ids.py  Project: rrosiek/pc_data_load
class FindMissingIds(object):
    def __init__(self):
        self.missing_ids = {}
        self.new_ids = {}
        self.data_utils = DataUtils()
        self.data_loader_utils = DataLoaderUtils(SERVER, OLD_INDEX, OLD_TYPE,
                                                 '', '')

        self.docs_for_dolan = {}

    def run(self):
        old_ids = export_doc_ids(server=SERVER,
                                 src_index=OLD_INDEX,
                                 src_type=OLD_TYPE)

        new_ids = export_doc_ids(server=SERVER,
                                 src_index=NEW_INDEX,
                                 src_type=NEW_TYPE)

        for _id in old_ids:
            if _id not in new_ids:
                self.missing_ids[_id] = 0
                if len(self.missing_ids) % 1000 == 0:
                    print 'Missing ids', len(self.missing_ids)

        for _id in new_ids:
            if _id not in old_ids:
                self.new_ids[_id] = 0
                if len(self.new_ids) % 1000 == 0:
                    print 'New ids', len(self.new_ids)

        print 'Missing ids', len(self.missing_ids)
        print 'New ids', len(self.new_ids)

        file_utils.make_directory(missing_ids_directory)

        file_utils.save_file(missing_ids_directory, 'missing_ids.json',
                             self.missing_ids.keys())
        file_utils.save_file(missing_ids_directory, 'new_ids.json',
                             self.new_ids)

    def check_tags_and_annotations(self):
        missing_ids = file_utils.load_file(missing_ids_directory,
                                           'missing_ids.json')
        new_ids = file_utils.load_file(missing_ids_directory, 'new_ids.json')

        print 'Missing ids', len(missing_ids)
        print 'New ids', len(new_ids)

        docs_with_tags = self.fetch_ids()

        missing_docs_with_tags = []
        for _id in missing_ids:
            if _id in docs_with_tags:
                missing_docs_with_tags.append(_id)
                print 'Missing docs with tags', _id

        print 'Missing docs with tags', len(missing_docs_with_tags)
        print 'Missing docs with tags', json.dumps(missing_docs_with_tags)

        for _id in missing_docs_with_tags:
            existing_doc = self.get_existing_doc(_id)
            if 'userTags' in existing_doc:
                user_tags = existing_doc['userTags']
                for user_tag in user_tags:
                    added_by = user_tag['added_by']

                    if added_by == '*****@*****.**':
                        self.docs_for_dolan[_id] = existing_doc
                        print _id
                        print user_tags

                    break

        print 'Docs for Dolan', len(self.docs_for_dolan)

        print 'Docs for Dolan', self.docs_for_dolan.keys()

    def get_existing_doc(self, _id):
        existing_doc = self.data_loader_utils.fetch_doc(_id)
        if existing_doc is not None and '_source' in existing_doc:
            existing_doc = existing_doc['_source']
        return existing_doc

    def fetch_ids(self):
        combined_docs = {}

        tags_query = self.tags_query()
        annotations_query = self.annotations_query()

        print 'Fetching docs with tags', SERVER, OLD_INDEX, OLD_TYPE
        docs_with_tags = self.data_utils.batch_fetch_ids_for_query(
            base_url=SERVER,
            query=tags_query,
            index=OLD_INDEX,
            type=OLD_TYPE,
            ids_fetched=self.ids_fetched,
            batch_size=1000)
        print len(docs_with_tags), 'docs_with_tags'
        for _id in docs_with_tags:
            combined_docs[_id] = ''

        print 'Fetching docs with annotations', SERVER, OLD_INDEX, OLD_TYPE
        docs_with_annotations = self.data_utils.batch_fetch_ids_for_query(
            base_url=SERVER,
            query=annotations_query,
            index=OLD_INDEX,
            type=OLD_TYPE,
            ids_fetched=self.ids_fetched,
            batch_size=1000)

        print len(docs_with_annotations), 'docs_with_annotations'
        for _id in docs_with_annotations:
            combined_docs[_id] = ''

        print len(combined_docs), 'combined_docs'
        return combined_docs

    def ids_fetched(self, ids, index, type):
        print len(ids), 'ids fetched'

    def tags_query(self):
        tags_query = {
            "nested": {
                "path": "userTags",
                "query": {
                    "bool": {
                        "must": [{
                            "exists": {
                                "field": "userTags"
                            }
                        }]
                    }
                }
            }
        }

        return tags_query

    def annotations_query(self):
        annotations_query = {
            "nested": {
                "path": "annotations",
                "query": {
                    "bool": {
                        "must": [{
                            "exists": {
                                "field": "annotations"
                            }
                        }]
                    }
                }
            }
        }

        return annotations_query
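
A possible driver for FindMissingIds; it assumes SERVER, OLD_INDEX, OLD_TYPE, NEW_INDEX, NEW_TYPE and missing_ids_directory are defined at module level, as the class itself implies.

# Illustrative only.
finder = FindMissingIds()
finder.run()                         # diff old vs. new index, save missing/new id files
finder.check_tags_and_annotations()  # flag missing docs that carry user tags or annotations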
Code Example #16
class ProcessIndex(object):
    def __init__(self, server, src_index, src_type, process_doc_method):
        self.server = server
        self.index = src_index
        self.type = src_type
        self.process_doc_method = process_doc_method

        self.batch_size = 5000
        self.process_count = 2
        self.process_spawn_delay = 0.15
        self.bulk_data_size = 300000

        self.data_loader_utils = DataLoaderUtils(self.server, self.index,
                                                 self.type)

        self.data_utils = DataUtils()

    def run(self):
        # doc_ids = export_doc_ids( self.server, self.index,
        #                             self.type, self.index + '_' + self.type , 'doc_ids.json')

        doc_ids = file_utils.load_file(self.index, self.index + '_ids.json')

        if len(doc_ids) == 0:
            doc_ids = export_doc_ids.export_doc_ids(self.server, self.index,
                                                    self.type)

        doc_ids = doc_ids.keys()

        batch_doc_processor = BatchDocProcessor(doc_ids, self.process_batch,
                                                self.batch_size,
                                                self.process_count,
                                                self.process_spawn_delay)
        batch_doc_processor.run()

    def docs_fetched(self, docs, index, type):
        docs_to_process = {}

        print 'Docs fetched', len(docs)
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                existing_doc = doc['_source']
                docs_to_process[_id] = existing_doc

        self.process_docs(docs_to_process)

    def process_docs(self, docs):
        bulk_data = ''

        for _id in docs:
            doc = docs[_id]

            processed_doc = self.process_doc_method(_id, doc)

            if processed_doc is not None:
                bulk_data += self.data_loader_utils.bulk_update_header(_id)
                bulk_data += '\n'
                updated_doc = {'doc': processed_doc}
                bulk_data += json.dumps(updated_doc)
                bulk_data += '\n'

            if len(bulk_data) >= self.bulk_data_size:
                # print 'loading bulk data...'
                self.load_bulk_data(bulk_data)
                bulk_data = ''

        if len(bulk_data) > 0:
            # print 'loading bulk data...'
            self.load_bulk_data(bulk_data)

    def load_bulk_data(self, bulk_data):
        self.data_loader_utils.load_bulk_data(bulk_data)
        # pass

    def process_batch(self, doc_ids):
        self.data_utils.batch_fetch_docs_for_ids(
            base_url=self.server,
            ids=doc_ids,
            index=self.index,
            type=self.type,
            docs_fetched=self.docs_fetched)
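
A sketch of how ProcessIndex might be driven. The callback below is hypothetical; it follows the contract implied by process_docs(): return a dict of fields to merge into the document, or None to leave it untouched.

# Illustrative only -- placeholder server/index and a hypothetical callback.
def add_has_grants_flag(_id, doc):
    if 'grants' not in doc:
        return None
    return {'has_grants': True}  # hypothetical field, for illustration

process_index = ProcessIndex('http://localhost:9200', 'pubmed2019', 'article',
                             add_has_grants_flag)
process_index.run()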
Code Example #17
class CleanCitations(object):
    def __init__(self):
        self.updated_docs = {}
        self.original_docs = {}

        self.server = SERVER
        self.index = INDEX
        self.type = TYPE

        self.server_username = ''
        self.server_password = ''

        self.load_config = self.get_load_config(clean_citations_directory)
        self.data_loader_utils = DataLoaderUtils(
            self.load_config.server, self.load_config.index,
            self.load_config.type, self.load_config.server_username,
            self.load_config.server_password)

        self.docs_with_updates = {}

        self.inverted_index = {}
        self.current_baseline_file = None
        self.current_update_file = None

        self.processes = []
        self.missing_docs = {}

        self.inverted_index_for_updated_docs = {}

    def run(self):
        # self.get_updated_docs()
        self.updated_docs = file_utils.load_file(
            self.load_config.other_files_directory(), 'updated_docs.json')
        print 'Updated docs:', len(self.updated_docs)
        print 'Original docs:', len(self.original_docs)

        # self.get_original_docs()
        # sys.exit(1)

        self.original_docs = file_utils.load_file(
            self.load_config.other_files_directory(), 'original_docs.json')
        self.inverted_index = file_utils.load_file(
            self.load_config.other_files_directory(), 'inverted_index.json')
        self.inverted_index_for_updated_docs = file_utils.load_file(
            self.load_config.other_files_directory(),
            'inverted_index_for_updated_docs.json')

        print 'Updated docs:', len(self.updated_docs)
        print 'Original docs:', len(self.original_docs)
        print 'Inverted index:', len(self.inverted_index)
        print 'inverted_index_for_updated_docs:', len(
            self.inverted_index_for_updated_docs)
        # print json.dumps(self.inverted_index_for_updated_docs)
        # input = raw_input('Continue?')
        # if input.lower() in ['n', 'no', '0']:
        #     sys.exit(1)

        self.update_docs()

        print 'Docs with updates', len(self.docs_with_updates)
        # print json.dumps(self.docs_with_updates)

        print 'Missing docs'
        print json.dumps(self.missing_docs.keys())

        file_utils.save_file(self.load_config.other_files_directory(),
                             'docs_with_updates.json', self.docs_with_updates)

    def update_docs(self):
        for _id in self.updated_docs:
            if _id in self.original_docs:
                original_doc = self.original_docs[_id]
                updated_doc = self.updated_docs[_id]
                # print original_doc
                # print updated_doc
                original_citations = self.load_config.data_mapper.get_citations(
                    [original_doc])
                updated_citations = self.load_config.data_mapper.get_citations(
                    [updated_doc])

                # print _id, 'original', len(original_citations), 'updated', len(updated_citations)
                if not self.compare_citations(original_citations,
                                              updated_citations):
                    print 'Doc with update', _id
                    self.docs_with_updates[_id] = {
                        'original_citations': len(original_citations),
                        'updated_citations': len(updated_citations),
                        'original_doc': original_doc,
                        'updated_doc': updated_doc
                    }

                added_citations = []
                removed_citations = []
                for citation in updated_citations:
                    if citation not in original_citations:
                        added_citations.append(citation)

                for citation in original_citations:
                    if citation not in updated_citations:
                        removed_citations.append(citation)

                if _id in self.inverted_index_for_updated_docs:
                    update_file = self.inverted_index_for_updated_docs[_id]
                    # print update_file
                    # self.update_doc_with_history(_id, update_file, original_citations, removed_citations, added_citations)
                else:
                    print _id, 'missing from inverted index'
                # self.update_doc(_id, original_citations)

            else:
                updated_doc = self.updated_docs[_id]
                self.missing_docs[_id] = updated_doc
                updated_citations = self.load_config.data_mapper.get_citations(
                    [updated_doc])

                print 'Missing doc', _id, len(updated_citations)

    def compare_citations(self, original_citations, updated_citations):
        for _id in original_citations:
            if _id not in updated_citations:
                return False

        for _id in updated_citations:
            if _id not in original_citations:
                return False

        return True

    def get_existing_doc(self, _id):
        existing_doc = self.data_loader_utils.fetch_doc(_id)
        if existing_doc is not None and '_source' in existing_doc:
            existing_doc = existing_doc['_source']
        return existing_doc

    def update_doc_with_history(self, _id, update_file, original_citations,
                                removed_citations, added_citations):
        print _id, update_file, 'original_citations', len(
            original_citations), 'removed_citations', len(
                removed_citations), 'added_citations', len(added_citations)
        now = datetime.datetime.now()

        # updated_date = now.isoformat()
        updated_date = "2019-01-14T11:16:01.000000"
        # 2019-01-17T18:03:43.605774

        existing_doc = self.get_existing_doc(_id)

        # update_file = os.path.basename(self.data_source.data_source_file_path)

        # Create the update history item
        update_history_item = {
            "updated_date": updated_date,
            "update_file": update_file,
            "removed_citations": removed_citations,
            "added_citations": added_citations
        }

        # Get the existing update history
        update_history = []
        if 'update_history' in existing_doc:
            update_history = existing_doc['update_history']

        # Add the original citations list if not present
        if len(update_history) == 0:
            update_history.append({"original_citations": original_citations})

        # Add the new update history item
        update_history.append(update_history_item)

        doc = {"update_history": update_history}

        doc = {'doc': doc}

        self.data_loader_utils.update_doc(_id, doc)

    def update_doc(self, _id, original_citations):
        print 'Updating doc', _id, len(original_citations), 'citations'
        # input = raw_input('Continue?')
        # if input.lower() in ['n', 'no', '0']:
        #     sys.exit(1)

        # Get the existing update history
        update_history = []

        # Add the original citations list if not present
        if len(update_history) == 0:
            update_history.append({"original_citations": original_citations})

        doc = {"update_history": update_history}

        doc = {'doc': doc}

        self.data_loader_utils.update_doc(_id, doc)

    def get_original_docs(self):
        load_config = self.get_load_config(baseline_directory)
        ftp_manager = FTPManager(load_config)

        baseline_file_urls = ftp_manager.get_baseline_file_urls()
        # ftp_manager.download_missing_files(file_urls=baseline_file_urls, no_of_files=10)
        baseline_files = file_manager.get_baseline_files(
            load_config, baseline_file_urls)

        # Filter
        filtered_baseline_files = []
        for baseline_file in baseline_files:
            if 'pubmed19n0511' in baseline_file:
                filtered_baseline_files.append(baseline_file)
            elif 'pubmed19n0560' in baseline_file:
                filtered_baseline_files.append(baseline_file)

        baseline_files = filtered_baseline_files

        print 'Baseline files:', len(baseline_files)

        for baseline_file in baseline_files:
            # self.process_baseline_file(baseline_file)
            process = Process(target=self.process_baseline_file,
                              args=(baseline_file, ))
            process.start()

            self.processes.append(process)
            if len(self.processes) >= 16:
                old_process = self.processes.pop(0)
                old_process.join()

            time.sleep(0.5)

        while len(self.processes) > 0:
            old_process = self.processes.pop(0)
            old_process.join()

        self.combine_inverted_index()
        self.combine_original_docs()

        # file_utils.save_file(self.load_config.other_files_directory(), 'original_docs.json', self.original_docs)
        # file_utils.save_file(self.load_config.other_files_directory(), 'inverted_index.json', self.inverted_index)

    def combine_inverted_index(self):
        files = []
        generated_files_directory = self.load_config.generated_files_directory(
        )
        for name in os.listdir(generated_files_directory):
            file_path = os.path.join(generated_files_directory, name)
            if os.path.isfile(file_path) and name.startswith(
                    'inverted_index_'):
                files.append(name)

        combined = {}
        for name in files:
            data = file_utils.load_file(generated_files_directory, name)
            combined.update(data)

        print 'Inverted index', len(combined)
        file_utils.save_file(self.load_config.other_files_directory(),
                             'inverted_index.json', combined)

    def combine_original_docs(self):
        files = []
        generated_files_directory = self.load_config.generated_files_directory(
        )
        for name in os.listdir(generated_files_directory):
            file_path = os.path.join(generated_files_directory, name)
            if os.path.isfile(file_path) and name.startswith('original_docs_'):
                files.append(name)

        combined = {}
        for name in files:
            data = file_utils.load_file(generated_files_directory, name)
            combined.update(data)

        print 'Original docs', len(combined)
        file_utils.save_file(self.load_config.other_files_directory(),
                             'original_docs.json', combined)

    def process_baseline_file(self, baseline_file):
        print "Processing file:", baseline_file

        process_file = ProcessBaselineFile(
            self.load_config, dict.fromkeys(self.updated_docs.keys()),
            baseline_file)
        process_file.run()

    # def process_baseline_file(self, baseline_file):
    #     print "Processing file:", baseline_file

    #     file_name = os.path.basename(baseline_file)
    #     self.current_baseline_file = file_name.split('.')[0]

    #     last_time_stamp = time.time()

    #     xml_data_source = XMLDataSource(baseline_file, 2)
    #     xml_data_source.process_rows(self.process_baseline_row)

    #     current_time_stamp = time.time()
    #     diff = current_time_stamp - last_time_stamp

    #     print 'Time for file', baseline_file, diff

    def process_baseline_row(self, row, current_index):
        if current_index % 100 == 0:
            print current_index
        _id = self.extract_id(self.load_config.data_source_name, row,
                              current_index)
        if _id is not None:
            self.inverted_index[_id] = self.current_baseline_file
            if _id in self.updated_docs:
                doc = self.extract_data(_id, self.load_config.data_source_name,
                                        row)
                if doc is not None and len(doc) > 0:
                    self.original_docs[_id] = doc

                # if len(self.original_docs) % 100 == 0:
                print 'Original docs', len(self.original_docs)

        return True

    def get_updated_docs(self):
        load_config = self.get_load_config(updates_directory)
        ftp_manager = FTPManager(load_config)

        update_file_urls = ftp_manager.get_update_file_urls()
        update_file_urls = update_file_urls[:2]

        ftp_manager.download_missing_files(file_urls=update_file_urls,
                                           no_of_files=2)

        all_files = file_manager.get_all_files(load_config)
        files_to_process = all_files[:2]
        # files_to_process = file_manager.get_new_update_files(load_config, update_file_urls, 2)
        print files_to_process

        for update_file in files_to_process:
            file_name = os.path.basename(update_file)
            self.current_update_file = file_name  #file_name.split('.')[0]

            xml_data_source = XMLDataSource(update_file, 2)
            xml_data_source.process_rows(self.process_row)

        print 'Total updated ids:', len(self.updated_docs)

        file_utils.save_file(self.load_config.other_files_directory(),
                             'updated_docs.json', self.updated_docs)
        file_utils.save_file(self.load_config.other_files_directory(),
                             'inverted_index_for_updated_docs.json',
                             self.inverted_index_for_updated_docs)

    def process_row(self, row, current_index):
        _id = self.extract_id(self.load_config.data_source_name, row,
                              current_index)
        if _id is not None and _id not in self.updated_docs:
            doc = self.extract_data(_id, self.load_config.data_source_name,
                                    row)
            if doc is not None and len(doc) > 0:
                self.updated_docs[_id] = doc

            self.inverted_index_for_updated_docs[
                _id] = self.current_update_file

            if len(self.updated_docs) % 1000 == 0:
                print 'Updated docs', len(self.updated_docs)

        return True

    def get_load_config(self, root_directory):
        load_config = LoadConfig()
        load_config.root_directory = root_directory
        load_config.process_count = psutil.cpu_count()

        load_config.server = self.server
        load_config.server_username = self.server_username
        load_config.server_password = self.server_password
        load_config.index = self.index
        load_config.type = self.type

        load_config.data_mapper = self.get_data_mapper()
        load_config.data_extractor = self.get_data_extractor()
        load_config.max_memory_percent = self.get_max_memory_percent()

        return load_config

    def get_data_mapper(self):
        return PubmedDataMapper()

    def get_data_extractor(self):
        return PubmedDataExtractor()

    def get_max_memory_percent(self):
        return 75

    def extract_id(self, name, row, current_index):
        if self.load_config.data_extractor is not None:
            if self.load_config.data_extractor.should_generate_id(name):
                return self.load_config.data_extractor.generate_id(
                    current_index)
            else:
                return self.load_config.data_extractor.extract_id(name, row)

        self.load_config.log(LOG_LEVEL_WARNING,
                             'Error: no data extractor configured')
        return None

    def extract_data(self, _id, name, row):
        if self.load_config.data_extractor is not None:
            return self.load_config.data_extractor.extract_data(_id, name, row)

        return row
Code Example #18
def create_index():
    data_loader_utils = DataLoaderUtils(SERVER, INDEX, TYPE)
    data_loader_utils.check_and_create_index('mapping.json')
Code Example #19
class PubmedRelationshipProcessor(DataSourceProcessor):
    def __init__(self, load_config, data_source, data_source_summary):
        super(PubmedRelationshipProcessor,
              self).__init__(load_config, data_source)
        self.data_source_summary = data_source_summary
        self.data_loader_utils = DataLoaderUtils(
            self.load_config.server, self.load_config.index,
            self.load_config.type, self.load_config.server_username,
            self.load_config.server_password)
        self.load_relationships = True

        self.docs_with_new_citations = {}
        self.docs_citations_history = {}

        self.existing_docs = {}

        self.data_utils = DataUtils()

    def docs_fetched(self, docs, index, type):
        self.load_config.log(LOG_LEVEL_TRACE, 'Docs fetched', len(docs))
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                existing_doc = doc['_source']
                self.existing_docs[_id] = existing_doc

    def get_docs_with_new_citations(self):
        return self.docs_with_new_citations

    def get_citations_history(self):
        return self.docs_citations_history

    def update_citations_history(self, new_doc, _id, new_citations,
                                 existing_citations):
        # Update citation history
        if _id not in self.docs_citations_history:
            self.docs_citations_history[_id] = {}

        # Set the new doc flag
        self.docs_citations_history[_id]['new'] = new_doc

        # Update new citations
        if 'new_citations' not in self.docs_citations_history[_id]:
            self.docs_citations_history[_id]['new_citations'] = []

        self.docs_citations_history[_id]['new_citations'].extend(new_citations)

        # Update existing citations
        if 'existing_citations' not in self.docs_citations_history[_id]:
            self.docs_citations_history[_id]['existing_citations'] = []

        self.docs_citations_history[_id]['existing_citations'].extend(
            existing_citations)

    def process_relationships(self, extracted_ids):
        # all_indexed_ids = {}
        # if 'indexed_ids' in self.data_source_summary:
        #     all_indexed_ids = self.data_source_summary['indexed_ids']

        all_updated_ids = {}
        if 'updated_ids' in self.data_source_summary:
            all_updated_ids = self.data_source_summary['updated_ids']

        print 'all_updated_ids', len(all_updated_ids)
        print 'extracted_ids', len(extracted_ids)

        # Fetch existing (updated) docs
        self.load_config.log(LOG_LEVEL_DEBUG, 'Fetching docs',
                             self.load_config.server, self.load_config.index,
                             self.load_config.type)

        ids_to_fetch = all_updated_ids.keys()
        self.data_utils.batch_fetch_docs_for_ids(
            self.load_config.server, ids_to_fetch, self.load_config.index,
            self.load_config.type, self.docs_fetched,
            self.load_config.doc_fetch_batch_size,
            self.load_config.server_username, self.load_config.server_password)

        print 'existing_docs', len(self.existing_docs)

        pubmed_citations_pubmed = {}
        pubmed_cited_bys_pubmed = {}

        citations_to_remove = {}
        cited_bys_to_remove = {}

        count = 0
        for _id in extracted_ids:
            count += 1

            data = extracted_ids[_id]

            if len(data) == 0:
                print 'No data for', _id

            new_doc = False
            existing_citations = []
            new_citations = self.load_config.data_mapper.get_citations(data)

            if _id in all_updated_ids:
                # Existing doc
                existing_doc = self.get_existing_doc(_id)
                existing_citations = self.get_citations(existing_doc)
                new_doc = False
            else:
                new_doc = True

            self.update_citations_history(new_doc, _id, new_citations,
                                          existing_citations)

            added_citations = []
            removed_citations = []

            # Get removed citations
            for existing_citation in existing_citations:
                if existing_citation not in new_citations:
                    removed_citations.append(existing_citation)

            # Get added citations
            for new_citation in new_citations:
                if new_citation not in existing_citations:
                    added_citations.append(new_citation)

            # Added citations and cited bys
            for citation in added_citations:
                # Citations
                if _id not in pubmed_citations_pubmed:
                    pubmed_citations_pubmed[_id] = []
                if citation not in pubmed_citations_pubmed[_id]:
                    pubmed_citations_pubmed[_id].append(citation)

                # Cited by
                if citation not in pubmed_cited_bys_pubmed:
                    pubmed_cited_bys_pubmed[citation] = []
                if _id not in pubmed_cited_bys_pubmed[citation]:
                    pubmed_cited_bys_pubmed[citation].append(_id)

            # Get existing cited bys (citations from other existing docs) for the new doc
            # if new_doc:
            #     existing_cited_bys = self.get_existing_cited_bys(_id)

            #     for cited_by in existing_cited_bys:
            #         if _id not in pubmed_cited_bys_pubmed:
            #             pubmed_cited_bys_pubmed[_id] = []
            #         if cited_by not in pubmed_cited_bys_pubmed[_id]:
            #             pubmed_cited_bys_pubmed[_id].append(cited_by)

            # Removed citations and cited bys
            for removed_citation in removed_citations:
                # Removed citations
                if _id not in citations_to_remove:
                    citations_to_remove[_id] = []
                if removed_citation not in citations_to_remove[_id]:
                    citations_to_remove[_id].append(removed_citation)

                # Removed cited_bys
                if removed_citation not in cited_bys_to_remove:
                    cited_bys_to_remove[removed_citation] = []
                if _id not in cited_bys_to_remove[removed_citation]:
                    cited_bys_to_remove[removed_citation].append(_id)

            # Docs with new citations
            if len(added_citations) > 0:
                if _id not in self.docs_with_new_citations:
                    self.docs_with_new_citations[_id] = []
                self.docs_with_new_citations[_id].extend(added_citations)

            if count % 1000 == 0:
                print 'Processed', count, 'docs'

        pubmed_ids = {}
        pubmed_ids = self.load_config.data_mapper.reformat(
            reformatted_array=pubmed_ids,
            relations_array=pubmed_citations_pubmed,
            dest_index_id=ID_PUBMED,
            relationship_type=RELATIONSHIP_TYPE_CITATIONS,
            removed_ids=citations_to_remove)

        pubmed_ids = self.load_config.data_mapper.reformat(
            reformatted_array=pubmed_ids,
            relations_array=pubmed_cited_bys_pubmed,
            dest_index_id=ID_PUBMED,
            relationship_type=RELATIONSHIP_TYPE_CITED_BYS,
            removed_ids=cited_bys_to_remove)

        print 'pubmed_citations_pubmed', len(pubmed_citations_pubmed)
        print 'pubmed_cited_bys_pubmed', len(pubmed_cited_bys_pubmed)

        print 'citations_to_remove', len(citations_to_remove)
        print 'cited_bys_to_remove', len(cited_bys_to_remove)

        print 'reformatted pubmed_ids', len(pubmed_ids)

        relationships = dict()
        relationships[ID_PUBMED] = pubmed_ids

        return relationships

    # def get_cited_bys_for_doc(self, _id):
    #     doc = self.fetch_existing_doc(_id)
    #     return self.get_cited_bys(doc)

    # Fetch existing doc from elasticsearch
    def fetch_existing_doc(self, _id):
        existing_doc = self.data_loader_utils.fetch_doc(_id)
        if existing_doc is not None and '_source' in existing_doc:
            existing_doc = existing_doc['_source']
        return existing_doc

    def get_existing_doc(self, _id):
        existing_doc = None
        if _id in self.existing_docs:
            existing_doc = self.existing_docs[_id]

        # Retry two times if not obtained in mget
        if existing_doc is None or len(existing_doc) == 0:
            existing_doc = self.fetch_existing_doc(_id)
            if existing_doc is None or len(existing_doc) == 0:
                existing_doc = self.fetch_existing_doc(_id)

        return existing_doc

    def get_cited_bys(self, doc):
        cited_bys = []
        if doc is not None and 'cited_bys' in doc:
            cited_bys_array = doc['cited_bys']

            for cited_by_item in cited_bys_array:
                source = cited_by_item['source']
                index_id = cited_by_item['index_id']
                if source == self.load_config.source and index_id == ID_PUBMED:
                    cited_bys = cited_by_item['ids']
                    break

        return cited_bys

    # Get citations from doc
    def get_citations(self, doc):
        citations = []
        if doc is not None and 'citations' in doc:
            citations_array = doc['citations']

            for citation_item in citations_array:
                source = citation_item['source']
                index_id = citation_item['index_id']
                if source == self.load_config.source and index_id == ID_PUBMED:
                    citations = citation_item['ids']
                    break

        return citations

    def has_multiple_citations(self, doc):
        citations = []
        if 'citations' in doc:
            citations_array = doc['citations']
            if len(citations_array) > 1:
                return True

        return False

    def get_existing_cited_bys(self, _id):
        """
        Search elasticsearch for any docs citing the given id
        """
        query = {
            "bool": {
                "must": [{
                    "match": {
                        "citations.ids": _id
                    }
                }, {
                    "match": {
                        "citations.source": ""
                    }
                }, {
                    "match": {
                        "citations.index_id": ID_PUBMED
                    }
                }]
            }
        }

        ids = self.data_utils.batch_fetch_ids_for_query(
            base_url=self.load_config.server,
            query=query,
            index=self.load_config.index,
            type=self.load_config.type)

        return ids
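
    # For reference, batch_fetch_ids_for_query presumably pages through the
    # standard _search endpoint with the bool query built above. A minimal
    # hand-rolled equivalent (illustrative sketch only, ignoring scroll /
    # pagination) might be:
    #
    #   import requests
    #   url = self.load_config.server + '/' + self.load_config.index + \
    #       '/' + self.load_config.type + '/_search'
    #   body = {"query": query, "size": 1000, "_source": False}
    #   hits = requests.post(url, json=body).json()['hits']['hits']
    #   ids = [hit['_id'] for hit in hits]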

    def update_doc(self, _id, existing_doc, original_citations,
                   removed_citations, added_citations):
        if len(removed_citations) > 0 or len(added_citations) > 0:
            print 'Updating doc:', _id, \
                'original_citations', len(original_citations), \
                'removed_citations', len(removed_citations), \
                'added_citations', len(added_citations)
        now = datetime.datetime.now()

        updated_date = now.isoformat()
        update_file = os.path.basename(self.data_source.data_source_file_path)

        # Create the update history item
        update_history_item = {
            "updated_date": updated_date,
            "update_file": update_file,
            "removed_citations": removed_citations,
            "added_citations": added_citations
        }

        # Get the existing update history
        update_history = []
        if 'update_history' in existing_doc:
            update_history = existing_doc['update_history']

        # Add the original citations list if not present
        if len(update_history) == 0:
            update_history.append({"original_citations": original_citations})

        # Add the new update history item
        update_history.append(update_history_item)

        doc = {"update_history": update_history}

        doc = {'doc': doc}

        self.data_loader_utils.update_doc(_id, doc)
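
    # The payload sent above is a standard Elasticsearch partial update, so the
    # request body ends up shaped roughly like this (all values illustrative):
    #
    #   {
    #       "doc": {
    #           "update_history": [
    #               {"original_citations": [123, 456]},
    #               {"updated_date": "2019-01-01T00:00:00",
    #                "update_file": "example_update_file.xml",
    #                "removed_citations": [123],
    #                "added_citations": [789]}
    #           ]
    #       }
    #   }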
Code example #20
0
class CopyRelationships(object):
    def __init__(self, src_server, dest_server, src_index, src_type, dst_index,
                 dst_type, username, password):
        self.src_data_loader_utils = DataLoaderUtils(src_server, src_index,
                                                     src_type)
        self.dest_data_loader_utils = DataLoaderUtils(dest_server, dst_index,
                                                      dst_type)

        self.processed_doc_count = 0
        self.total_doc_count = 0

        self.data_utils = DataUtils()

        self.relations_to_exclude = []
        self.missing_destination_ids = []

        self.username = username
        self.password = password

        self.last_time_stamp = 0
        self.diff_average = 0

    def run(self):
        self.processed_doc_count = 0
        self.total_doc_count = self.get_total_doc_count()

        print 'Total doc count', self.total_doc_count

        # self.create_destination_index(mapping=None)
        self.export_doc_ids(server=self.src_data_loader_utils.server,
                            src_index=self.src_data_loader_utils.index,
                            src_type=self.src_data_loader_utils.type)

        print 'saving missing docs'

        file_utils.save_file('/data/data_loading/pubmed_2019',
                             'missing_docs_pubmed2019.json',
                             self.missing_destination_ids)

    def run_for_ids(self, doc_ids, mapping=None):
        self.processed_doc_count = 0
        self.total_doc_count = len(doc_ids)

        print 'Total doc count', self.total_doc_count

        print 'Fetching docs from source index'
        batch_doc_processor = BatchDocProcessor(doc_ids, self.copy_docs_batch,
                                                1000, 1, 0)
        batch_doc_processor.run()

        file_utils.save_file('/data/data_loading/pubmed_2019',
                             'missing_docs_pubmed2019.json',
                             self.missing_destination_ids)

    def export_doc_ids(self, server, src_index, src_type):
        print 'Fetching doc ids for', src_index, src_type
        query = {"match_all": {}}
        self.data_utils.batch_fetch_ids_for_query(base_url=server,
                                                  index=src_index,
                                                  type=src_type,
                                                  query=query,
                                                  ids_fetched=self.ids_fetched,
                                                  batch_size=10000)

        # print 'Done, fetched', len(documents_ids), 'doc ids'

    def ids_fetched(self, ids, index, type):
        print 'Ids fetched', len(ids)
        self.copy_docs_batch(ids)

    def copy_docs_batch(self, doc_ids):
        print 'Fetching docs'
        self.data_utils.batch_fetch_docs_for_ids(
            base_url=self.src_data_loader_utils.server,
            ids=doc_ids,
            index=self.src_data_loader_utils.index,
            type=self.src_data_loader_utils.type,
            docs_fetched=self.docs_fetched,
            batch_size=500)
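
    # batch_fetch_docs_for_ids presumably chunks the ids into batches of 500 and
    # retrieves each chunk from the source index (e.g. via _mget or an ids
    # query), then invokes docs_fetched with each batch of raw hits, each hit
    # carrying '_id' and '_source'.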

    def docs_fetched(self, docs, index, type):
        print 'Docs fetched', len(docs)
        docs_to_copy = {}

        # print 'Docs fetched', len(docs)
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                existing_doc = doc['_source']
                docs_to_copy[_id] = existing_doc

        self.copy_relations(docs_to_copy)

        # Update progress
        self.processed_doc_count += len(docs)
        progress = ((self.processed_doc_count / float(self.total_doc_count)) *
                    100)

        current_time_stamp = time.time()
        if self.last_time_stamp > 0:
            diff = current_time_stamp - self.last_time_stamp
            self.diff_average = float(diff + self.diff_average) / 2
        self.last_time_stamp = current_time_stamp

        # Estimate remaining time from the smoothed per-batch duration
        remaining_docs = self.total_doc_count - self.processed_doc_count
        time_remaining = self.diff_average * (float(remaining_docs) / len(docs))

        print '---------------------------------------------------------------------------------------------'
        print 'Progress', self.processed_doc_count, '/', self.total_doc_count, progress, '%', time_remaining, 'secs'
        print '---------------------------------------------------------------------------------------------'

    def get_src_relations(self, src_doc, relationship_type):
        src_relations = []

        if relationship_type in src_doc:
            relations = src_doc[relationship_type]

            for relation_item in relations:
                exclude_relation_item = False
                for relation_to_exclude in self.relations_to_exclude:
                    if relation_to_exclude['source'] == relation_item[
                            'source'] and relation_to_exclude[
                                'index_id'] == relation_item['index_id']:
                        exclude_relation_item = True
                        break

                if not exclude_relation_item:
                    src_relations.append(relation_item)

        return src_relations
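
    # Example (illustrative): with
    #   self.relations_to_exclude = [{"source": "", "index_id": ID_PUBMED}]
    # any relation item in the source doc whose source and index_id both match
    # that entry is skipped here, mirroring the commented-out configuration at
    # the bottom of this file.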

    def get_dest_relations(self, dest_doc, relationship_type):
        dest_relations = []

        if relationship_type in dest_doc:
            dest_relations = dest_doc[relationship_type]

        return dest_relations

    def add_relations(self, append_ids, relation, relations_list):
        relation_found = False
        for existing_relation in relations_list:
            # print existing_relation['source'], relation['source'], existing_relation['index_id'], relation['index_id']
            if existing_relation['source'] == relation[
                    'source'] and existing_relation['index_id'] == relation[
                        'index_id']:
                existing_relation_ids = existing_relation['ids']

                if append_ids:
                    relation_ids = relation['ids']

                    for _id in relation_ids:
                        if _id not in existing_relation_ids:
                            existing_relation_ids.append(_id)

                existing_relation['ids'] = existing_relation_ids

                relation_found = True
                break

        if not relation_found:
            relations_list.append(relation)

        return relations_list

    def merge_relations(self, src_doc, dest_doc, relationship_type):
        dest_relations = self.get_dest_relations(dest_doc, relationship_type)
        src_relations = self.get_src_relations(src_doc, relationship_type)

        # print 'src_relations', len(src_relations)
        # print 'dest_relations', len(dest_relations)

        combined_relations = []
        for relation in dest_relations:
            combined_relations = self.add_relations(True, relation,
                                                    combined_relations)

        for relation in src_relations:
            combined_relations = self.add_relations(True, relation,
                                                    combined_relations)

        return combined_relations
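
    # Worked example (illustrative values): merging a destination relation
    #   {"source": "", "index_id": ID_PUBMED, "ids": [1, 2]}
    # with a source relation
    #   {"source": "", "index_id": ID_PUBMED, "ids": [2, 3]}
    # yields a single combined relation
    #   {"source": "", "index_id": ID_PUBMED, "ids": [1, 2, 3]}
    # because add_relations appends only the ids not already present for a
    # matching (source, index_id) pair, and appends the whole relation item
    # when no match exists.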

    def copy_relations(self, src_docs):
        bulk_data = ''
        count = 0

        # Fetch destination docs
        destination_ids = src_docs.keys()
        destination_docs_array = self.data_utils.fetch_docs_for_ids(
            base_url=self.dest_data_loader_utils.server,
            ids=destination_ids,
            index=self.dest_data_loader_utils.index,
            type=self.dest_data_loader_utils.type,
            username=self.username,
            password=self.password)

        # Create destination doc dict
        destination_docs = {}
        for doc in destination_docs_array:
            _id = doc['_id']
            if '_source' in doc:
                destination_docs[_id] = doc['_source']

        # Find missing destination docs
        for _id in destination_ids:
            if _id not in destination_docs:
                self.missing_destination_ids.append(_id)

        print 'Missing ids', len(self.missing_destination_ids)
        # print 'dest ids', len()

        # Copy relations
        for _id in destination_docs:
            dest_doc = destination_docs[_id]
            src_doc = src_docs[_id]

            dest_relations = {}

            dest_relations[RELATIONSHIP_TYPE_CITATIONS] = self.merge_relations(
                src_doc, dest_doc, RELATIONSHIP_TYPE_CITATIONS)
            dest_relations[RELATIONSHIP_TYPE_CITED_BYS] = self.merge_relations(
                src_doc, dest_doc, RELATIONSHIP_TYPE_CITED_BYS)
            dest_relations[RELATIONSHIP_TYPE_RELATIONS] = self.merge_relations(
                src_doc, dest_doc, RELATIONSHIP_TYPE_RELATIONS)

            doc = {}
            if len(dest_relations[RELATIONSHIP_TYPE_CITATIONS]) > 0:
                doc[RELATIONSHIP_TYPE_CITATIONS] = dest_relations[
                    RELATIONSHIP_TYPE_CITATIONS]

            if len(dest_relations[RELATIONSHIP_TYPE_CITED_BYS]) > 0:
                doc[RELATIONSHIP_TYPE_CITED_BYS] = dest_relations[
                    RELATIONSHIP_TYPE_CITED_BYS]

            if len(dest_relations[RELATIONSHIP_TYPE_RELATIONS]) > 0:
                doc[RELATIONSHIP_TYPE_RELATIONS] = dest_relations[
                    RELATIONSHIP_TYPE_RELATIONS]

            # if len(dest_relations[RELATIONSHIP_TYPE_CITATIONS]) >= 2:
            #     print _id

            count += 1

            # doc = docs_to_copy[es_id]
            bulk_data += self.dest_data_loader_utils.bulk_update_header(_id)
            bulk_data += '\n'
            doc = {'doc': doc}
            bulk_data += json.dumps(doc)
            bulk_data += '\n'

            # if count % 1000 == 0:
            #     print 'Processed', 1000, 'docs'
            if len(bulk_data) >= 150000:
                print _id
                self.load_bulk_data(bulk_data)
                # print 'Copied', count, 'docs'
                bulk_data = ''

        if len(bulk_data) > 0:
            self.load_bulk_data(bulk_data)

        # print 'Copied', count, 'docs'
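
    # Each entry appended to bulk_data follows the Elasticsearch bulk update
    # format: an action line followed by a partial-document line, roughly
    # (assuming bulk_update_header emits the usual action metadata):
    #
    #   {"update": {"_index": "<index>", "_type": "<type>", "_id": "<_id>"}}
    #   {"doc": {"citations": [...], "cited_bys": [...], "relations": [...]}}
    #
    # The accumulated string is flushed to the _bulk endpoint whenever it grows
    # past roughly 150 KB, and once more at the end for the remainder.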

    # def create_destination_index(self, mapping=None):
    #     if mapping is None:
    #         # Get mapping from src index
    #         mapping = self.src_data_loader_utils.get_mapping_from_server()

    #     if not self.dest_data_loader_utils.index_exists():
    #         print 'Creating index'
    #         self.dest_data_loader_utils.put_mapping(mapping)
    #         # migrate_index(self.dest_data_loader_utils.index)
    #     else:
    #         print self.dest_data_loader_utils.index, 'exists'

    def load_bulk_data(self, bulk_data):
        print 'Bulk data size', len(bulk_data), 'loading...'
        response = self.dest_data_loader_utils.load_bulk_data(bulk_data)

        if not response:
            print 'Bulk data load failed'
        # else:
        #     print 'Done loading bulk data, saving response'

    def get_total_doc_count(self):
        return self.data_utils.get_total_doc_count(
            base_url=self.src_data_loader_utils.server,
            index=self.src_data_loader_utils.index,
            type=self.src_data_loader_utils.type)
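
    # get_total_doc_count presumably queries the standard _count endpoint; a
    # minimal equivalent (illustrative sketch only) would be:
    #
    #   import requests
    #   url = '%s/%s/%s/_count' % (base_url, index, type)
    #   total = requests.get(url).json()['count']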


# src_server = 'http://localhost:9200'
# src_index = 'pubmed2018_v5'
# src_type = 'article'

# dest_server = 'http://localhost:9200'
# dest_index = 'pubmed2019'
# dest_type = 'article'

# copy_relations = CopyRelationships(src_server=src_server,
#                                     dest_server=dest_server,
#                                     src_index=src_index,
#                                     src_type=src_type,
#                                     dst_index=dest_index,
#                                     dst_type=dest_type,
#                                     username='',
#                                     password='')

# copy_relations.relations_to_exclude.append({
#     "source": "",
#     "index_id": ID_PUBMED
# })
# copy_relations.run()
# copy_relations.run_for_ids([12620793])