def download_data():
    load_config = ct_load_config.get_load_config()
    source_files_directory = load_config.source_files_directory()
    csv_downloader = CSVDownloader(source_files_directory)
    for ct_table in CT_TABLES:
        csv_downloader.download(ct_table)

def verify():
    load_config = ct_load_config.get_load_config()
    for data_source in DATA_SOURCES:
        stats = load_stats(load_config, data_source)
        total_ids = 0
        if 'total_ids' in stats:
            total_ids = stats['total_ids']

        # Count the docs in the index that have this data source's field set.
        url = load_config.server + '/' + load_config.index + '/' + load_config.type + '/_search?size=0'
        data = {
            "query": {
                "exists": {
                    "field": data_source
                }
            }
        }

        response = requests.post(url, json=data)
        if response.status_code == 200:
            response_data = json.loads(response.text)
            if 'hits' in response_data:
                hits = response_data['hits']
                if 'total' in hits:
                    total = hits['total']
                    complete = ''
                    if total != total_ids:
                        complete = 'Incomplete'
                    print data_source, 'es_count:', total, 'stats_count:', total_ids, '[', complete, ']'
        else:
            print response.text

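# verify() depends on a load_stats() helper that is not defined in this
# section. A minimal sketch, assuming the same <data_source>_stats.json
# layout that the second run() below reads (the directory layout and the
# missing-file behaviour of file_utils.load_file are assumptions):
def load_stats(load_config, data_source):
    data_source_dir = load_config.generated_files_directory() + '/' + data_source
    stats_file = data_source + '_stats.json'
    try:
        stats = file_utils.load_file(data_source_dir, stats_file)
    except Exception:
        stats = None
    return stats if stats is not None else {}
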
def process_relationships(data_source_name):
    print 'Processing relationships for', data_source_name
    load_config = ct_load_config.get_load_config()
    load_config.data_source_name = data_source_name

    # Relationship-specific config
    load_config.append_relations = False
    load_config.source = data_source_name

    source_files_directory = load_config.source_files_directory()
    data_source_file_path = source_files_directory + '/' + data_source_name + '.csv'

    data_processor = CTRelationshipProcessor(load_config, CSVDataSource(data_source_file_path))
    # data_processor.mode = DataProcessor.MODE_RETRY_FAILED_DOCS
    data_processor.run()

def process_file(data_source_name):
    load_config = ct_load_config.get_load_config()
    load_config.data_source_name = data_source_name

    source_files_directory = load_config.source_files_directory()
    data_source_file_path = source_files_directory + '/' + data_source_name + '.csv'
    print 'Processing file', data_source_file_path

    data_processor = DataSourceProcessor(load_config, CSVDataSource(data_source_file_path))
    # data_processor.mode = DataProcessor.MODE_RETRY_FAILED_DOCS
    data_processor.run()

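# Illustrative driver (not in the original source): load every configured
# data source and then build its relationships, assuming DATA_SOURCES is
# the same list that verify() iterates over.
def process_all_data_sources():
    for data_source in DATA_SOURCES:
        process_file(data_source)
    for data_source in DATA_SOURCES:
        process_relationships(data_source)
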
def run():
    load_config = ct_load_config.get_load_config()
    other_files_directory = load_config.other_files_directory()

    # Export the doc ids from both index versions, then intersect them.
    ct_v2_ids = export_doc_ids.get_doc_ids(SERVER, 'clinical_trials_v2', 'study', other_files_directory, 'ct_v2_ids.json')
    ct_v1_ids = export_doc_ids.get_doc_ids(SERVER, 'clinical_trials', 'study', other_files_directory, 'ct_v1_ids.json')

    common_ids = {}
    for _id in ct_v2_ids:
        if _id in ct_v1_ids:
            common_ids[_id] = 0

    print 'ct_v1_ids', len(ct_v1_ids)
    print 'ct_v2_ids', len(ct_v2_ids)
    print 'common_ids', len(common_ids)

def run():
    load_config = ct_load_config.get_load_config()
    generated_files_dir = load_config.generated_files_directory()

    for data_source in data_sources:
        # Load the expected unique id count from the generated stats file.
        unique_ids = 0
        try:
            data_source_dir = generated_files_dir + '/' + data_source
            stats_file = data_source + '_stats.json'
            stats = file_utils.load_file(data_source_dir, stats_file)
            unique_ids = stats['unique_ids']
        except Exception:
            pass

        # Verify the data source: count docs in the index with this field set.
        url = 'http://localhost:9200/clinical_trials/study/_search?size=0'
        data = {
            "query": {
                "exists": {
                    "field": data_source
                }
            }
        }

        response = requests.post(url, json=data)
        if response.status_code in (200, 201):
            response_obj = json.loads(response.text)
            total = response_obj['hits']['total']
            percent_completion = 0
            if unique_ids > 0:
                percent_completion = (total / float(unique_ids)) * 100
            print data_source, '- [', total, '/', unique_ids, '] -', percent_completion, '%'
            print '----------------------------------------------------------------------------------'
        else:
            print response.text

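# Note on the hits.total parsing above: on Elasticsearch 5.x/6.x the
# _search response carries hits.total as a plain integer, e.g.
#   {"hits": {"total": 241234, "hits": []}}
# On 7.x and later it becomes an object ({"value": 241234, "relation": "eq"}),
# so both verify() and this run() would need
# response_obj['hits']['total']['value'] instead.
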
def save_processed_data_sources(processed_data_sources):
    load_config = ct_load_config.get_load_config()
    other_files_directory = load_config.other_files_directory()
    file_utils.save_file(other_files_directory, 'processed_data_sources.json', processed_data_sources)

def get_processed_data_sources():
    load_config = ct_load_config.get_load_config()
    other_files_directory = load_config.other_files_directory()
    processed_data_sources = file_utils.load_file(other_files_directory, 'processed_data_sources.json')
    return processed_data_sources

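# Illustrative sketch (not in the original source): how the
# processed_data_sources.json checkpoint could be used to skip
# already-processed sources on a re-run. Assumes file_utils.load_file
# returns None when the file does not exist, and reuses DATA_SOURCES and
# process_file() from earlier in this file.
def process_pending_data_sources():
    processed_data_sources = get_processed_data_sources() or []
    for data_source in DATA_SOURCES:
        if data_source in processed_data_sources:
            continue
        process_file(data_source)
        processed_data_sources.append(data_source)
        save_processed_data_sources(processed_data_sources)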