Ejemplo n.º 1
0
def run():
    """Summarise the ids recorded in the 'loaded_ids' batch files.

    Loads every regular file in the 'loaded_ids' batch directory of the
    DS_PATENT_RELATIONS data source whose name starts with
    DATA_LOADER_BATCH_PREFIX, gathers the distinct ids stored under the
    'updated_ids', 'indexed_ids' and 'failed_ids' keys of each file, and
    prints the number of distinct ids per category.
    """
    load_config = irdb_load_config.get_load_config()
    load_config.data_source_name = DS_PATENT_RELATIONS
    data_source_batch_name = 'loaded_ids'

    data_source_batch_directory = load_config.data_source_batch_directory(
        data_source_batch_name)

    # Sets de-duplicate ids that show up in more than one batch file.
    all_updated_ids = set()
    all_indexed_ids = set()
    all_failed_ids = set()

    for name in os.listdir(data_source_batch_directory):
        file_path = os.path.join(data_source_batch_directory, name)
        if not (os.path.isfile(file_path)
                and name.startswith(DATA_LOADER_BATCH_PREFIX)):
            continue

        batch_data = file_utils.load_file(data_source_batch_directory, name)
        all_updated_ids.update(batch_data['updated_ids'])
        all_indexed_ids.update(batch_data['indexed_ids'])
        all_failed_ids.update(batch_data['failed_ids'])

    # Each line is formatted into a single string so the output is the same
    # under the Python 2 print statement and the Python 3 print function.
    print('%d all_failed_ids' % len(all_failed_ids))
    print('%d all_indexed_ids' % len(all_indexed_ids))
    print('%d all_updated_ids' % len(all_updated_ids))
Ejemplo n.º 2
0
def get_load_config():
    """Build the load config used for the DS_PATENT_RELATIONS data source.

    Starts from the shared IRDB load config and overrides the data source
    name, the relation source ('derived'), the append flag and the retry
    settings.

    Returns:
        The load-config object with the overrides applied.
    """
    config = irdb_load_config.get_load_config()

    # Applied in this order; a tuple keeps the sequence explicit.
    # Debugging aid (disabled): ('log_level', LOG_LEVEL_TRACE)
    overrides = (
        ('data_source_name', DS_PATENT_RELATIONS),
        ('source', 'derived'),
        ('append_relations', False),
        ('auto_retry_load', True),
        ('max_retries', 2),
    )
    for attribute, value in overrides:
        setattr(config, attribute, value)

    return config
Ejemplo n.º 3
0
    def get_load_config(self):
        """Return the load config for the DS_SPIRES_PUB_PROJECTS data source.

        The shared IRDB load config is fetched and then adjusted: relations
        are sourced from 'irdb', are not appended, and loading is retried
        automatically with max_retries set to 2.
        """
        config = irdb_load_config.get_load_config()

        # Identify the data source and where its relations come from.
        config.data_source_name = DS_SPIRES_PUB_PROJECTS
        config.source = 'irdb'
        # Debugging aid (disabled): config.log_level = LOG_LEVEL_TRACE

        # Relation handling and retry policy.
        config.append_relations = False
        config.auto_retry_load = True
        config.max_retries = 2

        return config
Ejemplo n.º 4
0
def get_load_config():
    """Create a LoadConfig for the PubMed index.

    root_directory, server and process_count are copied from the shared
    IRDB load config; index and type come from INDEX_MAPPING[ID_PUBMED];
    a new PubmedDataExtractor and PubmedDataMapper are attached.

    Returns:
        The populated LoadConfig instance.
    """
    base_config = irdb_load_config_getter.get_load_config()

    pubmed_config = LoadConfig()
    pubmed_config.root_directory = base_config.root_directory
    pubmed_config.server = base_config.server

    # Look the PubMed mapping up once and reuse it for both fields.
    pubmed_mapping = INDEX_MAPPING[ID_PUBMED]
    pubmed_config.index = pubmed_mapping['index']
    pubmed_config.type = pubmed_mapping['type']

    pubmed_config.data_extractor = PubmedDataExtractor()
    pubmed_config.data_mapper = PubmedDataMapper()
    pubmed_config.process_count = base_config.process_count

    # Disabled options kept for reference:
    #   pubmed_config.data_source_name = file_name.split('.')[0]
    #   pubmed_config.log_level = LOG_LEVEL_TRACE

    return pubmed_config
Ejemplo n.º 5
0
def run():
    """Print the batch files that hold fewer than 1000 entries.

    Scans the generated-files directory of the DS_EXTENDED_RELATIONS data
    source for regular files whose name starts with 'batch_', loads each
    one and prints its name and length when the length is below 1000.
    """
    load_config = irdb_load_config.get_load_config()
    load_config.data_source_name = DS_EXTENDED_RELATIONS

    generated_files_directory = load_config.generated_files_directory()

    # The directory is listed up front, before any batch file is loaded.
    batch_file_names = [
        name for name in os.listdir(generated_files_directory)
        if os.path.isfile(os.path.join(generated_files_directory, name))
        and name.startswith('batch_')
    ]

    for batch_file_name in batch_file_names:
        batch = file_utils.load_file(generated_files_directory,
                                     batch_file_name)
        batch_size = len(batch)
        # NOTE(review): 1000 is presumably the nominal batch size - confirm.
        if batch_size < 1000:
            # Single formatted string: identical output on Python 2 and 3.
            print('%s %d' % (batch_file_name, batch_size))
Ejemplo n.º 6
0
def run():
    """Clear 'derived' relations for the DS_EXTENDED_RELATIONS doc ids.

    Fetches the document ids for the DS_EXTENDED_RELATIONS load config and
    asks clear_relationships to batch-clear RELATIONSHIP_TYPE_RELATIONS
    relations with source 'derived' from the IRDB index/type on
    LOCAL_SERVER towards the listed destination indexes.
    """
    config = irdb_load_config.get_load_config()
    config.data_source_name = DS_EXTENDED_RELATIONS

    # Only the keys of the returned mapping are passed on.
    irdb_doc_ids = export_doc_ids.get_doc_ids_for_load_config(config).keys()

    irdb_mapping = INDEX_MAPPING[ID_IRDB]
    destination_index_ids = [
        ID_CLINICAL_TRIALS,
        ID_CLINICAL_GUIDELINES,
        ID_FDA_PURPLE_BOOK,
        ID_FDA_PATENTS,
        ID_FDA_PRODUCTS,
        ID_DWPI,
    ]

    clear_relationships.batch_clear_relations_for_ids(
        server=LOCAL_SERVER,
        _ids=irdb_doc_ids,
        src_index=irdb_mapping['index'],
        src_type=irdb_mapping['type'],
        source='derived',
        dest_index_ids=destination_index_ids,
        relationship_types=[RELATIONSHIP_TYPE_RELATIONS])
Ejemplo n.º 7
0
def run():
    """Clear 'derived' patent relations for the DS_PATENT_RELATIONS doc ids.

    Fetches the document ids for the DS_PATENT_RELATIONS load config,
    prints how many there are, waits 10 seconds, and then asks
    clear_relationships to batch-clear RELATIONSHIP_TYPE_RELATIONS
    relations with source 'derived' from the IRDB index/type on SERVER
    towards the ID_USPTO and ID_DERWENT_PATENTS indexes.
    """
    load_config = irdb_load_config.get_load_config()
    load_config.data_source_name = DS_PATENT_RELATIONS

    all_irdb_ids = export_doc_ids.get_doc_ids_for_load_config(load_config)
    all_irdb_ids = all_irdb_ids.keys()

    # Single formatted string: identical output under the Python 2 print
    # statement and the Python 3 print function.
    print('IRDB IDS %d' % len(all_irdb_ids))
    # NOTE(review): the pause presumably gives the operator a chance to
    # abort before relations are cleared - confirm.
    time.sleep(10)

    source = 'derived'
    clear_relationships.batch_clear_relations_for_ids(
        server=SERVER,
        _ids=all_irdb_ids,
        src_index=INDEX_MAPPING[ID_IRDB]['index'],
        src_type=INDEX_MAPPING[ID_IRDB]['type'],
        source=source,
        dest_index_ids=[ID_USPTO, ID_DERWENT_PATENTS],
        relationship_types=[RELATIONSHIP_TYPE_RELATIONS])
Ejemplo n.º 8
0
def analyse_batches():
    """Print statistics about the generated DS_PATENT_RELATIONS batch files.

    For every regular file named 'batch_*' in the generated-files directory
    the file is passed to process_file(), which yields a mapping of id to
    relation items plus the ids to update. The function then reports:

    * the number of batch files and, per file, the number of reformatted
      ids;
    * the number of distinct ids seen across all files;
    * ids whose relation items repeat an 'index_id' (the last repeated
      index id found for that id is kept);
    * files that produced no reformatted ids, mapped to the size of their
      ids-to-update collection.

    Finally run() is invoked.
    """
    load_config = irdb_load_config.get_load_config()
    load_config.data_source_name = DS_PATENT_RELATIONS
    generated_files_directory = load_config.generated_files_directory()

    batch_file_names = [
        name for name in os.listdir(generated_files_directory)
        if os.path.isfile(os.path.join(generated_files_directory, name))
        and name.startswith('batch_')
    ]

    # Every message is formatted into one string so the output is the same
    # under the Python 2 print statement and the Python 3 print function.
    # The double space after "Generated" matches the original output.
    print('Generated  %d batch file names' % len(batch_file_names))

    all_ids = {}              # id -> relation count from the first file seen
    batches_with_no_ids = {}  # file name -> len(ids_to_update)
    ids_with_more_rels = {}   # id -> a repeated index_id

    for batch_file_name in batch_file_names:
        reformatted_array, ids_to_update = process_file(
            load_config, batch_file_name)
        print('%s reformatted ids %d' % (batch_file_name,
                                         len(reformatted_array)))
        if len(reformatted_array) == 0:
            batches_with_no_ids[batch_file_name] = len(ids_to_update)

        for _id in reformatted_array:
            relations = reformatted_array[_id]
            if _id not in all_ids:
                all_ids[_id] = len(relations)

            seen_index_ids = set()
            for item in relations:
                index_id = item['index_id']
                if index_id in seen_index_ids:
                    # Duplicate index_id for this id: remember it.
                    ids_with_more_rels[_id] = index_id
                else:
                    seen_index_ids.add(index_id)

    print('%d all_ids' % len(all_ids))
    print('ids_with_more_rels %s' % (ids_with_more_rels,))

    print('batches_with_no_ids')
    print(batches_with_no_ids)

    run()
Ejemplo n.º 9
0
def get_load_config():
    """Return the shared IRDB load config aimed at DS_EXTENDED_RELATIONS."""
    config = irdb_load_config.get_load_config()
    setattr(config, 'data_source_name', DS_EXTENDED_RELATIONS)
    return config