Example #1
0
def download_data():
    """Download the CSV file for every table listed in ``CT_TABLES``.

    The downloader is constructed with the source-files directory reported
    by the load config and is then asked to download each table in turn.
    """
    downloader = CSVDownloader(
        ct_load_config.get_load_config().source_files_directory()
    )

    for table in CT_TABLES:
        downloader.download(table)
Example #2
0
def verify():
    load_config = ct_load_config.get_load_config()

    for data_source in DATA_SOURCES:
        # print 'Verifying', data_source
        stats = load_stats(load_config, data_source)
        total_ids = 0
        if 'total_ids' in stats:
            total_ids = stats['total_ids']

        url = load_config.server + '/' + load_config.index + '/' + load_config.type + '/_search?size=0'
        data = {
            "query":{
                "exists":{
                    "field":data_source
                }
            }
        }

        response = requests.post(url, json=data) 
        # print response
        if response.status_code == 200:
            response_data = json.loads(response.text)
            if 'hits' in response_data:
                hits = response_data['hits']
                if 'total' in hits:
                    total = hits['total']
                    complete = ''
                    if total != total_ids:
                        complete = 'Incomplete'
                    print data_source, 'es_count:', total, 'stats_count:', total_ids, '[', complete, ']'
        else:
            print response.text
Example #3
0
def process_relationships(data_source_name):
    print 'processing', data_source_name
    load_config = ct_load_config.get_load_config()
    load_config.data_source_name = data_source_name

    # Relationships
    load_config.append_relations = False
    load_config.source = data_source_name

    source_files_directory = load_config.source_files_directory()
    data_source_file_path = source_files_directory + '/' + data_source_name + '.csv'

    data_processor = CTRelationshipProcessor(load_config, CSVDataSource(data_source_file_path))
    # data_processor.mode = DataProcessor.MODE_RETRY_FAILED_DOCS
    data_processor.run()
Example #4
0
def process_file(data_source_name):
    print 'processing', data_source_name

    load_config = ct_load_config.get_load_config()
    load_config.data_source_name = data_source_name

    source_files_directory = load_config.source_files_directory()
    data_source_file_path = source_files_directory + '/' + data_source_name + '.csv'

    print 'Processing file', data_source_file_path
    # data_processor = DataProcessor(load_config, CSVDataSource(data_source_file_path))
    # data_processor.mode = DataProcessor.MODE_RETRY_FAILED_DOCS
    # data_processor.process_rows()

    data_processor = DataSourceProcessor(load_config, CSVDataSource(data_source_file_path))
    data_processor.run()
Example #5
0
def run():
    server = SERVER
    load_config = ct_load_config.get_load_config()
    other_files_directory = load_config.other_files_directory()
    ct_v2_ids = export_doc_ids.get_doc_ids(SERVER, 'clinical_trials_v2',
                                           'study', other_files_directory,
                                           'ct_v2_ids.json')
    ct_v1_ids = export_doc_ids.get_doc_ids(SERVER, 'clinical_trials', 'study',
                                           other_files_directory,
                                           'ct_v1_ids.json')

    common_ids = {}
    for _id in ct_v2_ids:
        if _id in ct_v1_ids:
            common_ids[_id] = 0

    print 'ct_v1_ids', len(ct_v1_ids)
    print 'ct_v2_ids', len(ct_v2_ids)
    print 'common_ids', len(common_ids)
Example #6
0
def run():
    load_config = ct_load_config.get_load_config()
    generated_files_dir = load_config.generated_files_directory()

    for data_source in data_sources:
        unique_ids = 0
        try:
            data_source_dir = generated_files_dir + '/' + data_source
            stats_file = data_source + '_stats.json'


            stats = file_utils.load_file(data_source_dir, stats_file)
            unique_ids = stats['unique_ids']
        except Exception as e:
            pass

        # print 'verifing', data_source
        url = 'http://localhost:9200/clinical_trials/study/_search?size=0'

        data = {
            "query": {
                "exists": {
                    "field": data_source
                }
            }
        }

        response = requests.post(url, json=data)
        if response.status_code == 200 or response.status_code == 201:
            response_obj = json.loads(response.text)
            total = response_obj['hits']['total']

            percent_completion = 0
            if unique_ids > 0:
                percent_completion = (total / float(unique_ids)) * 100
            print data_source, '- [', total, '/', unique_ids, '] -', percent_completion, '%'
            print '----------------------------------------------------------------------------------'

        else:
            print response.text
Example #7
0
def save_processed_data_sources(processed_data_sources):
    """Persist ``processed_data_sources`` as ``processed_data_sources.json``.

    The file is written through ``file_utils.save_file`` into the
    other-files directory reported by the load config.

    Args:
        processed_data_sources: The object handed to ``file_utils.save_file``.
    """
    target_directory = ct_load_config.get_load_config().other_files_directory()
    file_utils.save_file(
        target_directory,
        'processed_data_sources.json',
        processed_data_sources,
    )
Example #8
0
def get_processed_data_sources():
    """Load and return the contents of ``processed_data_sources.json``.

    The file is read through ``file_utils.load_file`` from the other-files
    directory reported by the load config.

    Returns:
        Whatever ``file_utils.load_file`` returns for that file.
    """
    target_directory = ct_load_config.get_load_config().other_files_directory()
    return file_utils.load_file(target_directory, 'processed_data_sources.json')