def create_load_config():
    """Create and populate a LoadConfig wired with the IRDB mapper/extractor.

    The root directory, server, index and type are taken from the
    module-level constants ROOT_DIRECTORY, LOCAL_SERVER, INDEX and TYPE.
    ``data_source_name`` is not assigned by this function.

    Returns:
        The populated LoadConfig instance.
    """
    config = LoadConfig()

    # Root directory and process count (taken from psutil.cpu_count()).
    config.root_directory = ROOT_DIRECTORY
    config.process_count = psutil.cpu_count()

    # Server, index and type come from module-level constants.
    config.server = LOCAL_SERVER
    config.index = INDEX
    config.type = TYPE

    # Mapper and extractor instances used with this config.
    config.data_mapper = IRDBDataMapper()
    config.data_extractor = IRDBDataExtractor()

    config.max_memory_percent = 75

    return config
def get_load_config():
    """Return a LoadConfig that uses the PubMed extractor and mapper.

    ``root_directory``, ``server`` and ``process_count`` are copied from the
    config returned by ``irdb_load_config_getter.get_load_config()``; the
    index and type are read from the ``INDEX_MAPPING[ID_PUBMED]`` entry.

    Returns:
        The populated LoadConfig instance.
    """
    base_config = irdb_load_config_getter.get_load_config()

    config = LoadConfig()
    config.root_directory = base_config.root_directory
    config.server = base_config.server

    # Both the index and the type live under the same mapping entry.
    pubmed_entry = INDEX_MAPPING[ID_PUBMED]
    config.index = pubmed_entry['index']
    config.type = pubmed_entry['type']

    config.data_extractor = PubmedDataExtractor()
    config.data_mapper = PubmedDataMapper()

    config.process_count = base_config.process_count

    return config
    def get_pubmed_load_config(self):
        """Return a LoadConfig that uses the PubMed extractor and mapper.

        The index and type are read from the ``'index'`` and ``'index_type'``
        keys of ``es_utils.get_info_for_index_id(ID_PUBMED)``; the root
        directory and server are copied from ``self.ct_load_config``.

        Returns:
            The populated LoadConfig instance.
        """
        pubmed_info = es_utils.get_info_for_index_id(ID_PUBMED)
        index_name, index_type = pubmed_info['index'], pubmed_info['index_type']

        config = LoadConfig()

        # Root directory and server are copied from self.ct_load_config.
        config.root_directory = self.ct_load_config.root_directory
        config.server = self.ct_load_config.server

        config.index = index_name
        config.type = index_type

        config.data_extractor = PubmedDataExtractor()
        config.data_mapper = PubmedDataMapper()

        return config
def get_load_config():
    """Create a LoadConfig wired with the CT extractor and mapper.

    All scalar settings except ``max_memory_percent`` (fixed at 80) come
    from module-level constants. ``data_source_name`` and ``log_level`` are
    not assigned by this function.

    Returns:
        The populated LoadConfig instance.
    """
    config = LoadConfig()
    config.root_directory = ROOT_DIRECTORY

    # Server, index and type.
    config.server = SERVER
    config.index = INDEX
    config.type = TYPE

    # Process count and the batch-size settings.
    config.process_count = PROCESS_COUNT
    config.bulk_data_size = BULK_DATA_SIZE
    config.data_loader_batch_size = DATA_LOADER_BATCH_SIZE
    config.data_source_batch_size = DATA_SOURCE_BATCH_SIZE
    config.doc_fetch_batch_size = DOC_FETCH_BATCH_SIZE

    # Extractor and mapper instances used with this config.
    config.data_extractor = CTDataExtractor()
    config.data_mapper = CTDataMapper()

    config.max_memory_percent = 80

    return config
                    added_citations = update_history_item['added_citations']

                    citations_set = set(citations)
                    added_citations_set = set(added_citations)
                    citations = list(citations_set | added_citations_set)

                if 'removed_citations' in update_history_item:
                    removed_citations = update_history_item[
                        'removed_citations']

                    citations = list(set(citations) - set(removed_citations))

        return citations


# Module-level LoadConfig instance, populated with hard-coded settings.
load_config = LoadConfig()
load_config.root_directory = '/data/data_loading/pubmed_2019/pubmed2019/fix_citations'
# Disabled: process_count is not assigned anywhere in this block.
# load_config.process_count = psutil.cpu_count()

# Server on localhost; username and password are empty strings.
load_config.server = 'http://localhost:9200'
load_config.server_username = ''
load_config.server_password = ''
load_config.index = "pubmed2019"
load_config.type = "article"

# PubMed mapper/extractor instances and the memory-percent setting.
load_config.data_mapper = PubmedDataMapper()
load_config.data_extractor = PubmedDataExtractor()
load_config.max_memory_percent = 75

# NOTE(review): the meaning of `source` and `append_relations` is defined by
# LoadConfig, which is not visible here -- confirm against that class.
load_config.source = ""
load_config.append_relations = False
Exemple #6
0
    def get_load_config(self):
        """Assemble a LoadConfig from this object's attributes and getters.

        Root directory, server, credentials, index and type are copied from
        the attributes of the same name on ``self``. The mapper, extractor
        and memory percentage come from ``get_data_mapper()``,
        ``get_data_extractor()`` and ``get_max_memory_percent()``, and the
        process count from ``psutil.cpu_count()``.

        Returns:
            The populated LoadConfig instance.
        """
        config = LoadConfig()

        config.root_directory = self.root_directory
        config.process_count = psutil.cpu_count()

        # Server, credentials, index and type.
        config.server = self.server
        config.server_username = self.server_username
        config.server_password = self.server_password
        config.index = self.index
        config.type = self.type

        # Values supplied by this object's getter methods.
        config.data_mapper = self.get_data_mapper()
        config.data_extractor = self.get_data_extractor()
        config.max_memory_percent = self.get_max_memory_percent()

        return config
                        "match": {
                            "citations.index_id": ID_PUBMED
                        }
                    }
                ]
            }
        }

        ids = self.data_utils.batch_fetch_ids_for_query(base_url=self.load_config.server, 
                                                query=query, 
                                                index=self.load_config.index, 
                                                type=self.load_config.type)

        return ids

# Module-level LoadConfig instance; root_directory comes from DIR, the
# remaining settings are hard-coded below.
load_config = LoadConfig()
load_config.root_directory = DIR
# Disabled: process_count is set to a fixed value at the end of this block.
# load_config.process_count = psutil.cpu_count()

# Server on localhost; username and password are empty strings.
load_config.server = 'http://localhost:9200'
load_config.server_username = ''
load_config.server_password = ''
load_config.index =  "pubmed2019"
load_config.type = "article"

# PubMed mapper/extractor instances and the memory-percent setting.
load_config.data_mapper =  PubmedDataMapper()
load_config.data_extractor = PubmedDataExtractor()
load_config.max_memory_percent = 75

# Fixed process count of 4.
# NOTE(review): the unit of process_spawn_delay is not visible here --
# confirm against LoadConfig.
load_config.process_count = 4
load_config.process_spawn_delay = 1