Example #1
class FindMissingIds(object):
    def __init__(self):
        self.missing_ids = {}
        self.new_ids = {}
        self.data_utils = DataUtils()
        self.data_loader_utils = DataLoaderUtils(SERVER, OLD_INDEX, OLD_TYPE,
                                                 '', '')

        self.docs_for_dolan = {}

    def run(self):
        old_ids = export_doc_ids(server=SERVER,
                                 src_index=OLD_INDEX,
                                 src_type=OLD_TYPE)

        new_ids = export_doc_ids(server=SERVER,
                                 src_index=NEW_INDEX,
                                 src_type=NEW_TYPE)

        for _id in old_ids:
            if _id not in new_ids:
                self.missing_ids[_id] = 0
                if len(self.missing_ids) % 1000 == 0:
                    print 'Missing ids', len(self.missing_ids)

        for _id in new_ids:
            if _id not in old_ids:
                self.new_ids[_id] = 0
                if len(self.new_ids) % 1000 == 0:
                    print 'New ids', len(self.new_ids)

        print 'Missing ids', len(self.missing_ids)
        print 'New ids', len(self.new_ids)

        file_utils.make_directory(missing_ids_directory)

        file_utils.save_file(missing_ids_directory, 'missing_ids.json',
                             self.missing_ids.keys())
        file_utils.save_file(missing_ids_directory, 'new_ids.json',
                             self.new_ids.keys())

    def check_tags_and_annotations(self):
        missing_ids = file_utils.load_file(missing_ids_directory,
                                           'missing_ids.json')
        new_ids = file_utils.load_file(missing_ids_directory, 'new_ids.json')

        print 'Missing ids', len(missing_ids)
        print 'New ids', len(new_ids)

        docs_with_tags = self.fetch_ids()

        missing_docs_with_tags = []
        for _id in missing_ids:
            if _id in docs_with_tags:
                missing_docs_with_tags.append(_id)
                print 'Missing docs with tags', _id

        print 'Missing docs with tags', len(missing_docs_with_tags)
        print 'Missing docs with tags', json.dumps(missing_docs_with_tags)

        for _id in missing_docs_with_tags:
            existing_doc = self.get_existing_doc(_id)
            if 'userTags' in existing_doc:
                user_tags = existing_doc['userTags']
                for user_tag in user_tags:
                    added_by = user_tag['added_by']

                    if added_by == '*****@*****.**':
                        self.docs_for_dolan[_id] = existing_doc
                        print _id
                        print user_tags

                    # note: only the first user tag is inspected before the loop exits
                    break

        print 'Docs for Dolan', len(self.docs_for_dolan)

        print 'Docs for Dolan', self.docs_for_dolan.keys()

    def get_existing_doc(self, _id):
        existing_doc = self.data_loader_utils.fetch_doc(_id)
        if existing_doc is not None and '_source' in existing_doc:
            existing_doc = existing_doc['_source']
        return existing_doc

    def fetch_ids(self):
        combined_docs = {}

        tags_query = self.tags_query()
        annotations_query = self.annotations_query()

        print 'Fetching docs with tags', SERVER, OLD_INDEX, OLD_TYPE
        docs_with_tags = self.data_utils.batch_fetch_ids_for_query(
            base_url=SERVER,
            query=tags_query,
            index=OLD_INDEX,
            type=OLD_TYPE,
            ids_fetched=self.ids_fetched,
            batch_size=1000)
        print len(docs_with_tags), 'docs_with_tags'
        for _id in docs_with_tags:
            combined_docs[_id] = ''

        print 'Fetching docs with annotations', SERVER, OLD_INDEX, OLD_TYPE
        docs_with_annotations = self.data_utils.batch_fetch_ids_for_query(
            base_url=SERVER,
            query=annotations_query,
            index=OLD_INDEX,
            type=OLD_TYPE,
            ids_fetched=self.ids_fetched,
            batch_size=1000)

        print len(docs_with_annotations), 'docs_with_annotations'
        for _id in docs_with_annotations:
            combined_docs[_id] = ''

        print len(combined_docs), 'combined_docs'
        return combined_docs

    def ids_fetched(self, ids, index, type):
        print len(ids), 'ids fetched'

    def tags_query(self):
        tags_query = {
            "nested": {
                "path": "userTags",
                "query": {
                    "bool": {
                        "must": [{
                            "exists": {
                                "field": "userTags"
                            }
                        }]
                    }
                }
            }
        }

        return tags_query

    def annotations_query(self):
        annotations_query = {
            "nested": {
                "path": "annotations",
                "query": {
                    "bool": {
                        "must": [{
                            "exists": {
                                "field": "annotations"
                            }
                        }]
                    }
                }
            }
        }

        return annotations_query
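
A minimal driver sketch for this example, assuming the SERVER, OLD_INDEX, OLD_TYPE, NEW_INDEX, NEW_TYPE constants and the missing_ids_directory path are defined elsewhere in the module:

if __name__ == '__main__':
    # First pass: export both id sets and write the diff files
    find_missing_ids = FindMissingIds()
    find_missing_ids.run()
    # Second pass: flag missing docs that carry user tags or annotations
    find_missing_ids.check_tags_and_annotations()
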
Example #2
class PubmedRelationshipProcessor(DataSourceProcessor):
    def __init__(self, load_config, data_source, data_source_summary):
        super(PubmedRelationshipProcessor,
              self).__init__(load_config, data_source)
        self.data_source_summary = data_source_summary
        self.data_loader_utils = DataLoaderUtils(
            self.load_config.server, self.load_config.index,
            self.load_config.type, self.load_config.server_username,
            self.load_config.server_password)
        self.load_relationships = True

        self.docs_with_new_citations = {}
        self.docs_citations_history = {}

        self.existing_docs = {}

        self.data_utils = DataUtils()

    def docs_fetched(self, docs, index, type):
        self.load_config.log(LOG_LEVEL_TRACE, 'Docs fetched', len(docs))
        for doc in docs:
            _id = doc['_id']
            if '_source' in doc:
                existing_doc = doc['_source']
                self.existing_docs[_id] = existing_doc

    def get_docs_with_new_citations(self):
        return self.docs_with_new_citations

    def get_citations_history(self):
        return self.docs_citations_history

    def update_citations_history(self, new_doc, _id, new_citations,
                                 existing_citations):
        # Update citation history
        if _id not in self.docs_citations_history:
            self.docs_citations_history[_id] = {}

        # Set the new doc flag
        self.docs_citations_history[_id]['new'] = new_doc

        # Update new citations
        if 'new_citations' not in self.docs_citations_history[_id]:
            self.docs_citations_history[_id]['new_citations'] = []

        self.docs_citations_history[_id]['new_citations'].extend(new_citations)

        # Update existing citations
        if 'existing_citations' not in self.docs_citations_history[_id]:
            self.docs_citations_history[_id]['existing_citations'] = []

        self.docs_citations_history[_id]['existing_citations'].extend(
            existing_citations)

    def process_relationships(self, extracted_ids):
        # all_indexed_ids = {}
        # if 'indexed_ids' in self.data_source_summary:
        #     all_indexed_ids = self.data_source_summary['indexed_ids']

        all_updated_ids = {}
        if 'updated_ids' in self.data_source_summary:
            all_updated_ids = self.data_source_summary['updated_ids']

        print 'all_updated_ids', len(all_updated_ids)
        print 'extracted_ids', len(extracted_ids)

        # Fetch existing (updated) docs
        self.load_config.log(LOG_LEVEL_DEBUG, 'Fetching docs',
                             self.load_config.server, self.load_config.index,
                             self.load_config.type)

        ids_to_fetch = all_updated_ids.keys()
        self.data_utils.batch_fetch_docs_for_ids(
            self.load_config.server, ids_to_fetch, self.load_config.index,
            self.load_config.type, self.docs_fetched,
            self.load_config.doc_fetch_batch_size,
            self.load_config.server_username, self.load_config.server_password)

        print 'existing_docs', len(self.existing_docs)

        pubmed_citations_pubmed = {}
        pubmed_cited_bys_pubmed = {}

        citations_to_remove = {}
        cited_bys_to_remove = {}

        count = 0
        for _id in extracted_ids:
            count += 1

            data = extracted_ids[_id]

            if len(data) == 0:
                print 'No data for', _id

            new_doc = False
            existing_citations = []
            new_citations = self.load_config.data_mapper.get_citations(data)

            if _id in all_updated_ids:
                # Existing doc
                existing_doc = self.get_existing_doc(_id)
                existing_citations = self.get_citations(existing_doc)
                new_doc = False
            else:
                new_doc = True

            self.update_citations_history(new_doc, _id, new_citations,
                                          existing_citations)

            added_citations = []
            removed_citations = []

            # Get removed citations
            for existing_citation in existing_citations:
                if existing_citation not in new_citations:
                    removed_citations.append(existing_citation)

            # Get added citations
            for new_citation in new_citations:
                if new_citation not in existing_citations:
                    added_citations.append(new_citation)

            # Added citations and cited bys
            for citation in added_citations:
                # Citations
                if _id not in pubmed_citations_pubmed:
                    pubmed_citations_pubmed[_id] = []
                if citation not in pubmed_citations_pubmed[_id]:
                    pubmed_citations_pubmed[_id].append(citation)

                # Cited by
                if citation not in pubmed_cited_bys_pubmed:
                    pubmed_cited_bys_pubmed[citation] = []
                if _id not in pubmed_cited_bys_pubmed[citation]:
                    pubmed_cited_bys_pubmed[citation].append(_id)

            # Get existing cited bys (citations from other existing docs) for the new doc
            # if new_doc:
            #     existing_cited_bys = self.get_existing_cited_bys(_id)

            #     for cited_by in existing_cited_bys:
            #         if _id not in pubmed_cited_bys_pubmed:
            #             pubmed_cited_bys_pubmed[_id] = []
            #         if cited_by not in pubmed_cited_bys_pubmed[_id]:
            #             pubmed_cited_bys_pubmed[_id].append(cited_by)

            # Removed citations and cited bys
            for removed_citation in removed_citations:
                # Removed citations
                if _id not in citations_to_remove:
                    citations_to_remove[_id] = []
                if removed_citation not in citations_to_remove[_id]:
                    citations_to_remove[_id].append(removed_citation)

                # Removed cited_bys
                if removed_citation not in cited_bys_to_remove:
                    cited_bys_to_remove[removed_citation] = []
                if _id not in cited_bys_to_remove[removed_citation]:
                    cited_bys_to_remove[removed_citation].append(_id)

            # Docs with new citations
            if len(added_citations) > 0:
                if _id not in self.docs_with_new_citations:
                    self.docs_with_new_citations[_id] = []
                self.docs_with_new_citations[_id].extend(added_citations)

            if count % 1000 == 0:
                print 'Processed', count, 'docs'

        pubmed_ids = {}
        pubmed_ids = self.load_config.data_mapper.reformat(
            reformatted_array=pubmed_ids,
            relations_array=pubmed_citations_pubmed,
            dest_index_id=ID_PUBMED,
            relationship_type=RELATIONSHIP_TYPE_CITATIONS,
            removed_ids=citations_to_remove)

        pubmed_ids = self.load_config.data_mapper.reformat(
            reformatted_array=pubmed_ids,
            relations_array=pubmed_cited_bys_pubmed,
            dest_index_id=ID_PUBMED,
            relationship_type=RELATIONSHIP_TYPE_CITED_BYS,
            removed_ids=cited_bys_to_remove)

        print 'pubmed_citations_pubmed', len(pubmed_citations_pubmed)
        print 'pubmed_cited_bys_pubmed', len(pubmed_cited_bys_pubmed)

        print 'citations_to_remove', len(citations_to_remove)
        print 'cited_bys_to_remove', len(cited_bys_to_remove)

        print 'reformatted pubmed_ids', len(pubmed_ids)

        relationships = dict()
        relationships[ID_PUBMED] = pubmed_ids

        return relationships

    # def get_cited_bys_for_doc(self, _id):
    #     doc = self.fetch_existing_doc(_id)
    #     return self.get_cited_bys(doc)

    # Fetch existing doc from elasticsearch
    def fetch_existing_doc(self, _id):
        existing_doc = self.data_loader_utils.fetch_doc(_id)
        if existing_doc is not None and '_source' in existing_doc:
            existing_doc = existing_doc['_source']
        return existing_doc

    def get_existing_doc(self, _id):
        existing_doc = None
        if _id in self.existing_docs:
            existing_doc = self.existing_docs[_id]

        # Retry two times if not obtained in mget
        if existing_doc is None or len(existing_doc) == 0:
            existing_doc = self.fetch_existing_doc(_id)
            if existing_doc is None or len(existing_doc) == 0:
                existing_doc = self.fetch_existing_doc(_id)

        return existing_doc

    def get_cited_bys(self, doc):
        cited_bys = []
        if doc is not None and 'cited_bys' in doc:
            cited_bys_array = doc['cited_bys']

            for cited_by_item in cited_bys_array:
                source = cited_by_item['source']
                index_id = cited_by_item['index_id']
                if source == self.load_config.source and index_id == ID_PUBMED:
                    cited_bys = cited_by_item['ids']
                    break

        return cited_bys

    # Get citations from doc
    def get_citations(self, doc):
        citations = []
        if doc is not None and 'citations' in doc:
            citations_array = doc['citations']

            for citation_item in citations_array:
                source = citation_item['source']
                index_id = citation_item['index_id']
                if source == self.load_config.source and index_id == ID_PUBMED:
                    citations = citation_item['ids']
                    break

        return citations

    def has_multiple_citations(self, doc):
        if 'citations' in doc:
            citations_array = doc['citations']
            if len(citations_array) > 1:
                return True

        return False

    def get_existing_cited_bys(self, _id):
        """
        Search elasticsearch for any docs citing the given id
        """
        query = {
            "bool": {
                "must": [{
                    "match": {
                        "citations.ids": _id
                    }
                }, {
                    "match": {
                        "citations.source": ""
                    }
                }, {
                    "match": {
                        "citations.index_id": ID_PUBMED
                    }
                }]
            }
        }

        ids = self.data_utils.batch_fetch_ids_for_query(
            base_url=self.load_config.server,
            query=query,
            index=self.load_config.index,
            type=self.load_config.type)

        return ids

    def update_doc(self, _id, existing_doc, original_citations,
                   removed_citations, added_citations):
        if len(removed_citations) > 0 or len(added_citations) > 0:
            print 'Updating doc:', _id, 'original_citations', len(
                original_citations), 'removed_citations', len(
                    removed_citations), 'added_citations', len(added_citations)
        now = datetime.datetime.now()

        updated_date = now.isoformat()
        update_file = os.path.basename(self.data_source.data_source_file_path)

        # Create the update history item
        update_history_item = {
            "updated_date": updated_date,
            "update_file": update_file,
            "removed_citations": removed_citations,
            "added_citations": added_citations
        }

        # Get the existing update history
        update_history = []
        if 'update_history' in existing_doc:
            update_history = existing_doc['update_history']

        # Add the original citations list if not present
        if len(update_history) == 0:
            update_history.append({"original_citations": original_citations})

        # Add the new update history item
        update_history.append(update_history_item)

        doc = {"update_history": update_history}

        doc = {'doc': doc}

        self.data_loader_utils.update_doc(_id, doc)
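
The added/removed bookkeeping in process_relationships reduces to a symmetric diff of two citation lists. A standalone sketch of that step (diff_citations is a hypothetical helper, not part of the original class):

def diff_citations(existing_citations, new_citations):
    # Citations present after the update but not before it
    added = [c for c in new_citations if c not in existing_citations]
    # Citations present before the update but dropped by it
    removed = [c for c in existing_citations if c not in new_citations]
    return added, removed

added, removed = diff_citations(['100', '101'], ['101', '102'])
print added    # ['102']
print removed  # ['100']
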
Example #3
class CleanCitations(object):
    def __init__(self):
        self.updated_docs = {}
        self.original_docs = {}

        self.server = SERVER
        self.index = INDEX
        self.type = TYPE

        self.server_username = ''
        self.server_password = ''

        self.load_config = self.get_load_config(clean_citations_directory)
        self.data_loader_utils = DataLoaderUtils(
            self.load_config.server, self.load_config.index,
            self.load_config.type, self.load_config.server_username,
            self.load_config.server_password)

        self.docs_with_updates = {}

        self.inverted_index = {}
        self.current_baseline_file = None
        self.current_update_file = None

        self.processes = []
        self.missing_docs = {}

        self.inverted_index_for_updated_docs = {}

    def run(self):
        # self.get_updated_docs()
        self.updated_docs = file_utils.load_file(
            self.load_config.other_files_directory(), 'updated_docs.json')
        print 'Updated docs:', len(self.updated_docs)
        print 'Original docs:', len(self.original_docs)

        # self.get_original_docs()
        # sys.exit(1)

        self.original_docs = file_utils.load_file(
            self.load_config.other_files_directory(), 'original_docs.json')
        self.inverted_index = file_utils.load_file(
            self.load_config.other_files_directory(), 'inverted_index.json')
        self.inverted_index_for_updated_docs = file_utils.load_file(
            self.load_config.other_files_directory(),
            'inverted_index_for_updated_docs.json')

        print 'Updated docs:', len(self.updated_docs)
        print 'Original docs:', len(self.original_docs)
        print 'Inverted index:', len(self.inverted_index)
        print 'inverted_index_for_updated_docs:', len(
            self.inverted_index_for_updated_docs)
        # print json.dumps(self.inverted_index_for_updated_docs)
        # input = raw_input('Continue?')
        # if input.lower() in ['n', 'no', '0']:
        #     sys.exit(1)

        self.update_docs()

        print 'Docs with updates', len(self.docs_with_updates)
        # print json.dumps(self.docs_with_updates)

        print 'Missing docs'
        print json.dumps(self.missing_docs.keys())

        file_utils.save_file(self.load_config.other_files_directory(),
                             'docs_with_updates.json', self.docs_with_updates)

    def update_docs(self):
        for _id in self.updated_docs:
            if _id in self.original_docs:
                original_doc = self.original_docs[_id]
                updated_doc = self.updated_docs[_id]
                # print original_doc
                # print updated_doc
                original_citations = self.load_config.data_mapper.get_citations(
                    [original_doc])
                updated_citations = self.load_config.data_mapper.get_citations(
                    [updated_doc])

                # print _id, 'original', len(original_citations), 'updated', len(updated_citations)
                if not self.compare_citations(original_citations,
                                              updated_citations):
                    print 'Doc with update', _id
                    self.docs_with_updates[_id] = {
                        'original_citations': len(original_citations),
                        'updated_citations': len(updated_citations),
                        'original_doc': original_doc,
                        'updated_doc': updated_doc
                    }

                added_citations = []
                removed_citations = []
                for citation in updated_citations:
                    if citation not in original_citations:
                        added_citations.append(citation)

                for citation in original_citations:
                    if citation not in updated_citations:
                        removed_citations.append(citation)

                if _id in self.inverted_index_for_updated_docs:
                    update_file = self.inverted_index_for_updated_docs[_id]
                    # print update_file
                    # self.update_doc_with_history(_id, update_file, original_citations, removed_citations, added_citations)
                else:
                    print _id, 'missing from inverted index'
                # self.update_doc(_id, original_citations)

            else:
                updated_doc = self.updated_docs[_id]
                self.missing_docs[_id] = updated_doc
                updated_citations = self.load_config.data_mapper.get_citations(
                    [updated_doc])

                print 'Missing doc', _id, len(updated_citations)

    def compare_citations(self, original_citations, updated_citations):
        for _id in original_citations:
            if _id not in updated_citations:
                return False

        for _id in updated_citations:
            if _id not in original_citations:
                return False

        return True

    def get_existing_doc(self, _id):
        existing_doc = self.data_loader_utils.fetch_doc(_id)
        if existing_doc is not None and '_source' in existing_doc:
            existing_doc = existing_doc['_source']
        return existing_doc

    def update_doc_with_history(self, _id, update_file, original_citations,
                                removed_citations, added_citations):
        print _id, update_file, 'original_citations', len(
            original_citations), 'removed_citations', len(
                removed_citations), 'added_citations', len(added_citations)
        now = datetime.datetime.now()

        # updated_date = now.isoformat()
        updated_date = "2019-01-14T11:16:01.000000"
        # 2019-01-17T18:03:43.605774

        existing_doc = self.get_existing_doc(_id)

        # update_file = os.path.basename(self.data_source.data_source_file_path)

        # Create the update history item
        update_history_item = {
            "updated_date": updated_date,
            "update_file": update_file,
            "removed_citations": removed_citations,
            "added_citations": added_citations
        }

        # Get the existing update history
        update_history = []
        if 'update_history' in existing_doc:
            update_history = existing_doc['update_history']

        # Add the original citations list if not present
        if len(update_history) == 0:
            update_history.append({"original_citations": original_citations})

        # Add the new update history item
        update_history.append(update_history_item)

        doc = {"update_history": update_history}

        doc = {'doc': doc}

        self.data_loader_utils.update_doc(_id, doc)

    def update_doc(self, _id, original_citations):
        print 'Updating doc', _id, len(original_citations), 'citations'
        # input = raw_input('Continue?')
        # if input.lower() in ['n', 'no', '0']:
        #     sys.exit(1)

        # Get the existing update history
        update_history = []

        # Add the original citations list if not present
        if len(update_history) == 0:
            update_history.append({"original_citations": original_citations})

        doc = {"update_history": update_history}

        doc = {'doc': doc}

        self.data_loader_utils.update_doc(_id, doc)

    def get_original_docs(self):
        load_config = self.get_load_config(baseline_directory)
        ftp_manager = FTPManager(load_config)

        baseline_file_urls = ftp_manager.get_baseline_file_urls()
        # ftp_manager.download_missing_files(file_urls=baseline_file_urls, no_of_files=10)
        baseline_files = file_manager.get_baseline_files(
            load_config, baseline_file_urls)

        # Filter
        filtered_baseline_files = []
        for baseline_file in baseline_files:
            if 'pubmed19n0511' in baseline_file:
                filtered_baseline_files.append(baseline_file)
            elif 'pubmed19n0560' in baseline_file:
                filtered_baseline_files.append(baseline_file)

        baseline_files = filtered_baseline_files

        print 'Baseline files:', len(baseline_files)

        for baseline_file in baseline_files:
            # self.process_baseline_file(baseline_file)
            process = Process(target=self.process_baseline_file,
                              args=(baseline_file, ))
            process.start()

            self.processes.append(process)
            if len(self.processes) >= 16:
                old_process = self.processes.pop(0)
                old_process.join()

            time.sleep(0.5)

        while len(self.processes) > 0:
            old_process = self.processes.pop(0)
            old_process.join()

        self.combine_inverted_index()
        self.combine_original_docs()

        # file_utils.save_file(self.load_config.other_files_directory(), 'original_docs.json', self.original_docs)
        # file_utils.save_file(self.load_config.other_files_directory(), 'inverted_index.json', self.inverted_index)

    def combine_inverted_index(self):
        files = []
        generated_files_directory = self.load_config.generated_files_directory()
        for name in os.listdir(generated_files_directory):
            file_path = os.path.join(generated_files_directory, name)
            if os.path.isfile(file_path) and name.startswith(
                    'inverted_index_'):
                files.append(name)

        combined = {}
        for name in files:
            data = file_utils.load_file(generated_files_directory, name)
            combined.update(data)

        print 'Inverted index', len(combined)
        file_utils.save_file(self.load_config.other_files_directory(),
                             'inverted_index.json', combined)

    def combine_original_docs(self):
        files = []
        generated_files_directory = self.load_config.generated_files_directory()
        for name in os.listdir(generated_files_directory):
            file_path = os.path.join(generated_files_directory, name)
            if os.path.isfile(file_path) and name.startswith('original_docs_'):
                files.append(name)

        combined = {}
        for name in files:
            data = file_utils.load_file(generated_files_directory, name)
            combined.update(data)

        print 'Original docs', len(combined)
        file_utils.save_file(self.load_config.other_files_directory(),
                             'original_docs.json', combined)

    def process_baseline_file(self, baseline_file):
        print "Processing file:", baseline_file

        process_file = ProcessBaselineFile(
            self.load_config, dict.fromkeys(self.updated_docs.keys()),
            baseline_file)
        process_file.run()

    # def process_baseline_file(self, baseline_file):
    #     print "Processing file:", baseline_file

    #     file_name = os.path.basename(baseline_file)
    #     self.current_baseline_file = file_name.split('.')[0]

    #     last_time_stamp = time.time()

    #     xml_data_source = XMLDataSource(baseline_file, 2)
    #     xml_data_source.process_rows(self.process_baseline_row)

    #     current_time_stamp = time.time()
    #     diff = current_time_stamp - last_time_stamp

    #     print 'Time for file', baseline_file, diff

    def process_baseline_row(self, row, current_index):
        if current_index % 100 == 0:
            print current_index
        _id = self.extract_id(self.load_config.data_source_name, row,
                              current_index)
        if _id is not None:
            self.inverted_index[_id] = self.current_baseline_file
            if _id in self.updated_docs:
                doc = self.extract_data(_id, self.load_config.data_source_name,
                                        row)
                if doc is not None and len(doc) > 0:
                    self.original_docs[_id] = doc

                # if len(self.original_docs) % 100 == 0:
                print 'Original docs', len(self.original_docs)

        return True

    def get_updated_docs(self):
        load_config = self.get_load_config(updates_directory)
        ftp_manager = FTPManager(load_config)

        update_file_urls = ftp_manager.get_update_file_urls()
        update_file_urls = update_file_urls[:2]

        ftp_manager.download_missing_files(file_urls=update_file_urls,
                                           no_of_files=2)

        all_files = file_manager.get_all_files(load_config)
        files_to_process = all_files[:2]
        # files_to_process = file_manager.get_new_update_files(load_config, update_file_urls, 2)
        print files_to_process

        for update_file in files_to_process:
            file_name = os.path.basename(update_file)
            self.current_update_file = file_name  #file_name.split('.')[0]

            xml_data_source = XMLDataSource(update_file, 2)
            xml_data_source.process_rows(self.process_row)

        print 'Total updated ids:', len(self.updated_docs)

        file_utils.save_file(self.load_config.other_files_directory(),
                             'updated_docs.json', self.updated_docs)
        file_utils.save_file(self.load_config.other_files_directory(),
                             'inverted_index_for_updated_docs.json',
                             self.inverted_index_for_updated_docs)

    def process_row(self, row, current_index):
        _id = self.extract_id(self.load_config.data_source_name, row,
                              current_index)
        if _id is not None and _id not in self.updated_docs:
            doc = self.extract_data(_id, self.load_config.data_source_name,
                                    row)
            if doc is not None and len(doc) > 0:
                self.updated_docs[_id] = doc

            self.inverted_index_for_updated_docs[
                _id] = self.current_update_file

            if len(self.updated_docs) % 1000 == 0:
                print 'Updated docs', len(self.updated_docs)

        return True

    def get_load_config(self, root_directory):
        load_config = LoadConfig()
        load_config.root_directory = root_directory
        load_config.process_count = psutil.cpu_count()

        load_config.server = self.server
        load_config.server_username = self.server_username
        load_config.server_password = self.server_password
        load_config.index = self.index
        load_config.type = self.type

        load_config.data_mapper = self.get_data_mapper()
        load_config.data_extractor = self.get_data_extractor()
        load_config.max_memory_percent = self.get_max_memory_percent()

        return load_config

    def get_data_mapper(self):
        return PubmedDataMapper()

    def get_data_extractor(self):
        return PubmedDataExtractor()

    def get_max_memory_percent(self):
        return 75

    def extract_id(self, name, row, current_index):
        if self.load_config.data_extractor is not None:
            if self.load_config.data_extractor.should_generate_id(name):
                return self.load_config.data_extractor.generate_id(
                    current_index)
            else:
                return self.load_config.data_extractor.extract_id(name, row)

        self.load_config.log(LOG_LEVEL_WARNING,
                             'Error: no data extractor configured')
        return None

    def extract_data(self, _id, name, row):
        if self.load_config.data_extractor is not None:
            return self.load_config.data_extractor.extract_data(_id, name, row)

        return row
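
A minimal entry point sketch, assuming SERVER, INDEX, TYPE and the clean_citations_directory, baseline_directory and updates_directory paths are configured at module level:

if __name__ == '__main__':
    clean_citations = CleanCitations()
    # Optionally rebuild the cached json files first:
    # clean_citations.get_updated_docs()
    # clean_citations.get_original_docs()
    clean_citations.run()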