def run(self):
    output_dir = self.output().path
    common.shell_cmd('mkdir -p %s', output_dir)
    for i in range(len(self.input())):
        input_dir = self.input()[i].path
        download_util.extract_and_clean(input_dir, 'ISO-8859-1//TRANSLIT', 'UTF-8', 'txt')
def run(self):
    input_dir = self.input().path
    output_dir = self.output().path
    common.shell_cmd("mkdir -p %s", dirname(output_dir))

    NEEDS_HEADERS = {"estabtypes.txt": ["establishment_type_id", "description"]}

    inputs = []
    for input_file in glob.glob(input_dir + "/*.txt"):
        if basename(input_file) in REMAPPED_FILES:
            continue
        header_key = basename(input_file)
        fieldnames = NEEDS_HEADERS.get(header_key, None)
        inputs.append(
            parallel.Collection.from_glob(
                input_file,
                parallel.CSVDictLineInput(
                    delimiter="|",
                    fieldnames=fieldnames,
                    quoting=csv.QUOTE_NONE,
                    escapechar="\\"
                ),
            )
        )

    parallel.mapreduce(
        inputs=inputs,
        mapper=TXT2JSONMapper(),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path
    )
def run(self):
    input_dir = self.input().path
    output_dir = self.output().path
    common.shell_cmd('mkdir -p %s', output_dir)
    # TODO(hansnelsen): change to the openfda.parallel version of multiprocess
    pool = multiprocessing.Pool(processes=3)

    for i in range(PARTITIONS):
        partition_dict = {}
        output_filename = join(output_dir, str(i) + '.maude.json')
        # Get all of the files for the current partition
        for filename in glob.glob(input_dir + '/' + str(i) + '.*.txt'):
            for file_type in CATEGORIES:
                if file_type in filename:
                    logging.info('Using file %s for joining', filename)
                    partition_dict[file_type] = filename

        logging.info('Starting Partition %d', i)
        master_file = partition_dict['mdrfoi']
        patient_file = partition_dict['patient']
        device_file = partition_dict['foidev']
        text_file = partition_dict['foitext']
        pool.apply_async(join_maude.join_maude,
                         (master_file, patient_file, device_file, text_file,
                          output_filename))

    pool.close()
    pool.join()
def _run(self):
    output_dir = self.output().path
    common.shell_cmd('mkdir -p %s', output_dir)
    change_log = csv.reader(open(self.change_log_file, 'r'))
    batches = collections.defaultdict(list)

    for row in change_log:
        spl_id, spl_type, spl_date = row

        # Only grab the human and cellular therapy labels for this index
        valid_types = ['cellular therapy', 'human']
        is_valid = False
        for valid_type in valid_types:
            if spl_type.lower().find(valid_type) != -1:
                is_valid = True
                break

        # Only process valid document types
        if is_valid:
            # All blank dates are treated as the week of June 1, 2009
            if not spl_date:
                spl_date = '20090601120000'
            date = arrow.get(spl_date, 'YYYYMMDDHHmmss')
            batches[date.ceil('week')].append(spl_id)

    for batch_date, ids in batches.items():
        batch_file = '%s.ids' % batch_date.format('YYYYMMDD')
        batch_out = open(join(output_dir, batch_file), 'w')
        unique_ids = list(set(ids))
        batch_out.write('\n'.join(unique_ids))
def run(self):
    output_dir = self.output().path
    common.shell_cmd('mkdir -p %s', output_dir)
    input_dir = self.local_dir
    for zip_filename in glob.glob(input_dir + '/*.zip'):
        common.shell_cmd_quiet('unzip -ou %s -d %s', zip_filename, output_dir)
def run(self):
    crawl_dir = dirname(dirname(self.output().path))
    common.shell_cmd('mkdir -p %s', dirname(self.output().path))
    manifests = walk_glob('manifest.json', crawl_dir)

    records = []
    for file_name in manifests:
        records.append(json.load(open(file_name)))

    # Default data structure that creates the appropriate structure on the
    # first put so that we can blindly use `+=` when appropriate.
    combined = collections.defaultdict(
        lambda: collections.defaultdict(
            lambda: {
                'export_date': None,
                'partitions': [],
                'total_records': 0
            }
        )
    )

    # Walk over all of the manifests and create a single dictionary
    for row in records:
        for domain, value in row.items():
            for sub, val in value.items():
                combined[domain][sub]['export_date'] = val.get('export_date', '')
                combined[domain][sub]['partitions'] += val.get('partitions', [])
                combined[domain][sub]['total_records'] += val.get('total_records', 0)

    with open(join(self.output().path), 'w') as json_out:
        json.dump(combined, json_out, indent=2)
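# A minimal, standalone sketch of the nested-defaultdict merge pattern used
# above, assuming two hypothetical manifest dicts. It shows why the first
# access auto-creates the {'export_date', 'partitions', 'total_records'}
# record, so `+=` can be used without checking for existing keys.
import collections

combined = collections.defaultdict(
    lambda: collections.defaultdict(
        lambda: {'export_date': None, 'partitions': [], 'total_records': 0}))

manifests = [
    {'drug': {'event': {'export_date': '20240101', 'partitions': ['p1'], 'total_records': 10}}},
    {'drug': {'event': {'partitions': ['p2'], 'total_records': 5}}},
]
for row in manifests:
    for domain, value in row.items():
        for sub, val in value.items():
            combined[domain][sub]['export_date'] = val.get('export_date', '')
            combined[domain][sub]['partitions'] += val.get('partitions', [])
            combined[domain][sub]['total_records'] += val.get('total_records', 0)

# combined['drug']['event'] is now
# {'export_date': '', 'partitions': ['p1', 'p2'], 'total_records': 15}
# (the second manifest has no export_date, so the default '' overwrites it).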
def run(self):
    input_dir = self.input().path
    output_dir = self.output().path
    common.shell_cmd('mkdir -p %s', dirname(output_dir))

    NEEDS_HEADERS = {
        'estabtypes.txt': ['establishment_type_id', 'description']
    }

    inputs = []
    for input_file in glob.glob(input_dir + '/*.txt'):
        if basename(input_file) in REMAPPED_FILES:
            continue
        header_key = basename(input_file)
        fieldnames = NEEDS_HEADERS.get(header_key, None)
        inputs.append(
            parallel.Collection.from_glob(
                input_file,
                parallel.CSVDictLineInput(delimiter='|',
                                          fieldnames=fieldnames,
                                          quoting=csv.QUOTE_NONE,
                                          escapechar='\\')))

    parallel.mapreduce(inputs=inputs,
                       mapper=TXT2JSONMapper(),
                       reducer=parallel.IdentityReducer(),
                       output_prefix=self.output().path)
def run(self):
    input_dir = self.input().path
    output_dir = self.output().path
    common.shell_cmd('mkdir -p %s', output_dir)
    # TODO(hansnelsen): change to the openfda.parallel version of multiprocess
    pool = multiprocessing.Pool(processes=6)

    for i in range(PARTITIONS):
        partition_dict = {}
        output_filename = join(output_dir, str(i) + '.maude.json')
        # Get all of the files for the current partition
        for filename in glob.glob(input_dir + '/' + str(i) + '.*.txt'):
            for file_type in CATEGORIES:
                if file_type in filename:
                    logging.info('Using file %s for joining', filename)
                    partition_dict[file_type] = filename

        logging.info('Starting Partition %d', i)
        master_file = partition_dict['mdrfoi']
        patient_file = partition_dict['patient']
        device_file = partition_dict['foidev']
        text_file = partition_dict['foitext']
        pool.apply_async(join_maude.join_maude,
                         (master_file, patient_file, device_file, text_file,
                          output_filename))

    pool.close()
    pool.join()
def run(self):
    common.shell_cmd('mkdir -p %s', self.local_dir)
    soup = BeautifulSoup(urlopen(CAERS_DOWNLOAD_PAGE_URL).read(), 'lxml')
    for a in soup.find_all(title=re.compile('CAERS ASCII.*')):
        if 'Download CAERS ASCII' in re.sub(r'\s', ' ', a.text):
            fileURL = urljoin('https://www.fda.gov', a['href'])
            common.download(fileURL, join(self.output().path, a.attrs['title'] + '.csv'))
def run(self):
    # Since we only iterate over dates in the umbrella process, we need to
    # skip batch files that do not exist
    output_file = self.output().path
    if not os.path.exists(self.batch):
        common.shell_cmd('touch %s', output_file)
        return

    input_file = self.input()[1].path
    es = elasticsearch.Elasticsearch(self.es_host)
    index_util.start_index_transaction(es, 'druglabel', self.epoch)
    parallel.mapreduce(
        input_collection=parallel.Collection.from_sharded(input_file),
        mapper=index_util.LoadJSONMapper(self.es_host,
                                         'druglabel',
                                         'spl',
                                         self.epoch,
                                         docid_key='set_id',
                                         version_key='version'),
        reducer=parallel.NullReducer(),
        output_prefix='/tmp/loadjson.druglabel',
        num_shards=1,
        map_workers=1)
    index_util.commit_index_transaction(es, 'druglabel')
    common.shell_cmd('touch %s', output_file)
def run(self):
    crawl_dir = dirname(dirname(self.output().path))
    common.shell_cmd('mkdir -p %s', dirname(self.output().path))
    manifests = walk_glob('manifest.json', crawl_dir)

    records = []
    for file_name in manifests:
        records.append(json.load(open(file_name)))

    # Default data structure that creates the appropriate structure on the
    # first put so that we can blindly use `+=` when appropriate.
    combined = collections.defaultdict(
        lambda: collections.defaultdict(lambda: {
            'export_date': None,
            'partitions': [],
            'total_records': 0
        }))

    # Walk over all of the manifests and create a single dictionary
    for row in records:
        for domain, value in row.items():
            for sub, val in value.items():
                combined[domain][sub]['export_date'] = val.get(
                    'export_date', '')
                combined[domain][sub]['partitions'] += val.get(
                    'partitions', [])
                combined[domain][sub]['total_records'] += val.get(
                    'total_records', 0)

    with open(join(self.output().path), 'w') as json_out:
        json.dump(combined, json_out, indent=2)
def run(self):
    output_dir = dirname(self.output().path)
    common.shell_cmd('mkdir -p %s', output_dir)
    dt = arrow.get(self.batch)
    url = DOWNLOAD_URL.replace('--Y--', str(dt.year))
    url = url.replace('--M--', str(dt.month))
    url = url.replace('--D--', str(dt.day))
    download_to_file_with_retry(url, self.output().path)
def run(self):
    common.shell_cmd('mkdir -p %s', dirname(self.output().path))
    input_files = glob.glob(self.input().path + '/*.txt')
    parallel.mapreduce(
        parallel.Collection.from_glob(
            input_files,
            parallel.CSVDictLineInput(delimiter='|', strip_str='\0')),
        mapper=PMAMapper(),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path)
def run(self):
    common.shell_cmd('mkdir -p %s', self.local_dir)
    soup = BeautifulSoup(urllib2.urlopen(DAILY_MED_DOWNLOADS_PAGE).read(), 'lxml')
    for a in soup.find_all(href=re.compile('.*.zip')):
        if '_human_' in a.text:
            try:
                common.download(a['href'],
                                join(self.local_dir, a['href'].split('/')[-1]))
            except ProcessException as e:
                logging.error("Could not download a DailyMed SPL archive: {0}: {1}".format(a['href'], e))
def run(self):
    common.shell_cmd('mkdir -p %s', dirname(self.output().path))
    input_files = glob.glob(self.input().path + '/*.txt')
    parallel.mapreduce(parallel.Collection.from_glob(
        input_files,
        parallel.CSVDictLineInput(delimiter='|', strip_str='\0')),
        mapper=PMAMapper(),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path)
def run(self):
    common.shell_cmd('mkdir -p %s', join(BASE_DIR, 'tmp'))
    files = glob.glob(self.input().path + '/*/*.json')
    parallel.mapreduce(
        parallel.Collection.from_glob(files, parallel.JSONLineInput()),
        mapper=ParallelExportMapper(output_dir=self.output().path),
        reducer=parallel.NullReducer(),
        output_prefix=join(BASE_DIR, 'tmp'),
        output_format=parallel.NullOutput(),
        map_workers=10)
def run(self):
    common.shell_cmd('mkdir -p %s', dirname(self.output().path))
    input_files = glob.glob(self.input().path + '/*.txt')
    parallel.mapreduce(parallel.Collection.from_glob(
        input_files,
        parallel.CSVDictLineInput(delimiter='|',
                                  quoting=csv.QUOTE_NONE,
                                  escapechar='\\')),
        mapper=ClassificationMapper(),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path)
def run(self):
    sync_path = join(BASE_DIR, self.date_str)
    target_bucket = 's3://%s/%s/' % (self.download_bucket, self.date_str)
    s3_cmd = [
        'aws',
        '--profile', config.aws_profile(),
        's3', 'sync',
        sync_path,
        target_bucket,
        '--exclude "*"',
        '--include "*.zip"',
        '--include "*schema.json"'
    ]
    common.shell_cmd(' '.join(s3_cmd))
    common.shell_cmd('touch %s', self.output().path)
def run(self):
    common.shell_cmd('mkdir -p %s', dirname(self.output().path))
    input_files = glob.glob(self.input().path + '/*.txt')
    parallel.mapreduce(
        parallel.Collection.from_glob(
            input_files,
            parallel.CSVDictLineInput(delimiter='|',
                                      quoting=csv.QUOTE_NONE,
                                      escapechar='\\')),
        mapper=ClassificationMapper(),
        reducer=parallel.IdentityReducer(),
        output_prefix=self.output().path)
def run(self):
    output_dir = self.output().path
    common.shell_cmd("mkdir -p %s", output_dir)
    input_dir = self.input()[0].path
    supplemental_dir = self.input()[1].path
    download_util.extract_and_clean(input_dir, "ISO-8859-1", "UTF-8", "txt")

    # One of the files needs to be remapped from one column (submission_number)
    # to two columns (pma_number and k_number) depending on the prefix.
    file_name = "registration_listing.txt"
    output_file = join(output_dir, "remapped_" + file_name)
    remap_supplemental_files(join(output_dir, file_name),
                             join(supplemental_dir, file_name),
                             output_file)
def run(self):
    output_dir = self.output().path
    common.shell_cmd('mkdir -p %s', output_dir)
    date = self.batch
    if CROSSOVER_XML_START_DATE <= date <= CROSSOVER_XML_END_DATE:
        url = CROSSOVER_XML_URL
    else:
        url = CURRENT_XML_BASE_URL
    url = url.replace('WEEK', date.strftime('%m%d%Y'))
    file_name = 'enforcementreport.xml'
    xml_file = '%(output_dir)s/%(file_name)s' % locals()
    download_to_file_with_retry(url, xml_file)
def extract_and_clean(input_dir, source_encoding, target_encoding, file_type):
    ''' A utility function that extracts all of the zip files in a directory and
        converts the files from a source encoding to a target encoding.
    '''
    for zip_filename in glob.glob(input_dir + '/*.zip'):
        txt_name = zip_filename.replace('zip', file_type)
        txt_name = txt_name.replace('raw', 'extracted')
        common.shell_cmd('mkdir -p %s', dirname(txt_name))
        cmd = 'unzip -p %s | iconv -f %s -t %s -c > %s'
        logging.info('Unzipping and converting %s', zip_filename)
        common.shell_cmd(cmd, zip_filename, source_encoding, target_encoding, txt_name)
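# A minimal usage sketch for extract_and_clean(), assuming a hypothetical layout
# where raw zip downloads live under a .../raw directory and the converted text
# files should land under the sibling .../extracted directory (the 'raw' ->
# 'extracted' substitution above). The paths and encodings here are illustrative,
# not taken from any particular pipeline configuration.
from openfda import download_util

download_util.extract_and_clean(
    '/tmp/openfda/registration/raw',  # directory containing the *.zip files
    'ISO-8859-1',                     # encoding of the files inside the zips
    'UTF-8',                          # encoding written to disk
    'txt')                            # extension used for the converted output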
def run(self):
    sync_path = join(BASE_DIR, self.date_str)
    target_bucket = S3_BASE_BUCKET + '%s/' % self.date_str
    for data_path in self.output():
        s3_cmd = [
            'aws',
            '--profile', config.aws_profile(),
            's3', 'sync',
            sync_path,
            target_bucket,
            '--exclude "*"',
            '--include "*.zip"',
            '--include "*schema.json"'
        ]
        common.shell_cmd(' '.join(s3_cmd))
        common.shell_cmd('touch %s', data_path.path)
def run(self):
    logging.info('Extracting: %s', self.input().path)
    extract_dir = dirname(self.input().path)
    gsrs_file_name = os.path.basename(self.input().path)
    gz_filename = os.path.splitext(gsrs_file_name)[0] + ".gz"
    gsrs_file = join(extract_dir, gsrs_file_name)
    gz_file = join(extract_dir, gz_filename)
    os.rename(gsrs_file, gz_file)
    common.shell_cmd('gunzip ' + gz_file)
    os.rename(os.path.splitext(gz_file)[0],
              os.path.splitext(gz_file)[0] + ".json")
def run(self):
    output_dir = self.output().path
    common.shell_cmd('mkdir -p %s', output_dir)
    input_dir = self.input()[0].path
    supplemental_dir = self.input()[1].path
    download_util.extract_and_clean(input_dir, 'ISO-8859-1', 'UTF-8', 'txt')

    # One of the files needs to be remapped from one column (submission_number)
    # to two columns (pma_number and k_number) depending on the prefix.
    file_name = 'registration_listing.txt'
    output_file = join(output_dir, 'remapped_' + file_name)
    remap_supplemental_files(join(output_dir, file_name),
                             join(supplemental_dir, file_name),
                             output_file)
def run(self):
    zip_filename = self.input().path
    output_filename = self.output().path
    output_dir = dirname(output_filename)
    common.shell_cmd('mkdir -p %s' % output_dir)
    cmd = 'unzip -o %(zip_filename)s -d %(output_dir)s' % locals()
    common.shell_cmd(cmd)

    # UNII filename varies; find and rename to a standardized name.
    # It is now a tab-delimited CSV instead of an XML as before.
    for file in glob.glob(join(output_dir, 'UNII*Names*.txt')):
        logging.info('Renaming %s', file)
        os.rename(file, output_filename)
def _run(self):
    shutil.rmtree(self.output().path, ignore_errors=True)
    os.makedirs(self.output().path)

    # Get all of the endpoints served by this index.
    # Create an `EndpointExport` object for each endpoint in order to export
    # each endpoint properly.
    #
    # Endpoint exports can be:
    #   date range based (quarterly output)
    #   filter based (index serves many endpoints)
    #   vanilla (endpoint is 1 to 1 with index and it is exported all at once)
    for endpoint, index_name in ENDPOINT_INDEX_MAP.items():
        endpoint_batches = []
        chunks = CUSTOM_CHUNKS.get(endpoint, DEFAULT_CHUNKS)
        if endpoint in RANGE_ENDPOINT_MAP:
            params = RANGE_ENDPOINT_MAP[endpoint]
            params['chunks'] = chunks
            endpoint_batches = _make_date_range_endpoint_batch(endpoint, params)
        elif endpoint in FILTERED_ENPOINT_MAP:
            params = FILTERED_ENPOINT_MAP[endpoint]
            query = EndpointExport.build_term_filter(**params)
            endpoint_batches.append(
                EndpointExport(endpoint, query=query, chunks=chunks))
        else:
            endpoint_batches.append(EndpointExport(endpoint, chunks=chunks))

        # This is a hack to overcome the shortcoming of the parallel library of
        # only having one mapper process for a tiny, single file input. Since we
        # want to execute these endpoint batches in parallel, we write each task
        # to its own file. It will create a mapper for each file.
        for ep in endpoint_batches:
            partition = ep.partition if ep.partition else 'all'
            if 'enforcement' in ep.endpoint:
                partition = ep.endpoint.replace('enforcement', '').replace('/', '')
            elif 'label' in ep.endpoint:
                partition = ep.endpoint.replace('label', '').replace('/', '')

            output_dir = join(self.output().path, index_name)
            common.shell_cmd('mkdir -p %s', output_dir)
            file_name = join(output_dir, partition + '.json')
            with open(file_name, 'w') as json_out:
                json_dict = json.dumps(ep.__dict__)
                json_out.write(json_dict + '\n')
def run(self):
    schema_file = self.get_schemafile()
    assert os.path.exists(schema_file), \
        'No schema file available for index %s' % self.index_name

    es_client = elasticsearch.Elasticsearch(config.es_host())
    endpoints = self.get_endpoints()

    # Get all of the endpoints served by this index.
    # Create an `EndpointExport` object for each endpoint in order to export
    # each endpoint properly.
    #
    # Endpoint exports can be:
    #   date range based (quarterly output)
    #   filter based (index serves many endpoints)
    #   vanilla (endpoint is 1 to 1 with index and it is exported all at once)
    endpoint_batches = []
    for endpoint in endpoints:
        chunks = CUSTOM_CHUNKS.get(endpoint, DEFAULT_CHUNKS)
        if endpoint in RANGE_ENDPOINT_MAP:
            params = RANGE_ENDPOINT_MAP[endpoint]
            params['chunks'] = chunks
            endpoint_batches = _make_date_range_endpoint_batch(endpoint, params)
        elif endpoint in FILTERED_ENPOINT_MAP:
            params = FILTERED_ENPOINT_MAP[endpoint]
            query = EndpointExport.build_term_filter(**params)
            endpoint_batches.append(
                EndpointExport(endpoint, query=query, chunks=chunks))
        else:
            endpoint_batches.append(EndpointExport(endpoint, chunks=chunks))

    # Dump each of the `EndpointExport` objects in the list
    for ep in endpoint_batches:
        # The output_dir will be the same for all outputs, once you factor out
        # the endpoint, so we can safely look at the first one only.
        output_dir = dirname(dirname(self.output()[0].path))
        endpoint_dir = join(output_dir, ep.endpoint[1:])
        index_util.dump_index(es_client,
                              ep.index_name,
                              ep.endpoint,
                              join(endpoint_dir, ep.partition),
                              cleaner=omit_internal_keys,
                              query=ep.query,
                              chunks=ep.chunks)
        common.shell_cmd('cp %s %s', schema_file, endpoint_dir)
def run(self):
    sync_path = join(BASE_DIR, self.date_str)
    target_bucket = 's3://%s/%s/' % (self.download_bucket, self.date_str)
    s3_cmd = [
        'aws',
        '--profile', config.aws_profile(),
        's3', 'sync',
        sync_path,
        target_bucket,
        '--exclude "*"',
        '--include "*.zip"',
        '--include "*schema.json"']
    common.shell_cmd(' '.join(s3_cmd))
    common.shell_cmd('touch %s', self.output().path)
def run(self):
    output_dir = dirname(self.output().path)
    common.shell_cmd('mkdir -p %s', output_dir)
    end = arrow.get(self.batch)
    start = end.shift(days=-6)
    id_list = self._fetch_ids(start, end)
    if len(id_list) >= 500:
        # We have tried to get the entire list of weekly results in one shot,
        # but apparently there are more than 500 hits, and CDRH recall search
        # neither supports more than 500 results per search nor paging.
        # This occurrence is rare, but when it does happen we need to
        # re-retrieve the results day by day.
        for r in arrow.Arrow.range('day', start, end):
            id_list = id_list + self._fetch_ids(r, r)

    df = pd.DataFrame(data=list(set(id_list)), columns=['id'])
    df.to_csv(self.output().path, index=False)
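# A small standalone sketch of the day-by-day fallback above: arrow.Arrow.range
# yields one Arrow timestamp per day from start through end (inclusive), so each
# day can be queried individually when the weekly query hits the 500-result cap.
# fetch_one_day below is a hypothetical stand-in for self._fetch_ids(day, day).
import arrow

end = arrow.get('2024-01-07')
start = end.shift(days=-6)

def fetch_one_day(day):
    # Hypothetical placeholder returning the ids found for a single day.
    return ['%s-example-id' % day.format('YYYYMMDD')]

ids = []
for r in arrow.Arrow.range('day', start, end):  # 7 days: Jan 1 .. Jan 7
    ids += fetch_one_day(r)
print(len(ids))  # 7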
def map(self, key, value, output):
    es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=120)
    ep = common.ObjectDict(value)
    schema_file = join(SCHEMA_DIR, ep.index_name + '_schema.json')
    endpoint_dir = join(self.output_dir, ep.endpoint[1:])
    target_dir = join(endpoint_dir, ep.partition)
    common.shell_cmd('mkdir -p %s', target_dir)
    index_util.dump_index(es_client,
                          ep.index_name,
                          ep.endpoint,
                          target_dir,
                          cleaner=omit_internal_keys,
                          query=ep.query,
                          chunks=ep.chunks)
    # Copy the current JSON schema to the zip location so that it is included
    # in the sync to s3
    common.shell_cmd('cp %s %s', schema_file, endpoint_dir)
def map(self, key, value, output):
    es_client = elasticsearch.Elasticsearch(config.es_host())
    ep = common.ObjectDict(value)
    schema_file = join(SCHEMA_DIR, ep.index_name + '_schema.json')
    endpoint_dir = join(self.output_dir, ep.endpoint[1:])
    target_dir = join(endpoint_dir, ep.partition)
    common.shell_cmd('mkdir -p %s', target_dir)
    index_util.dump_index(es_client,
                          ep.index_name,
                          ep.endpoint,
                          target_dir,
                          cleaner=omit_internal_keys,
                          query=ep.query,
                          chunks=ep.chunks)
    # Copy the current JSON schema to the zip location so that it is included
    # in the sync to s3
    common.shell_cmd('cp %s %s', schema_file, endpoint_dir)
def map(self, key, value, output):
    es_client = elasticsearch.Elasticsearch(config.es_host(), timeout=120)
    ep = common.ObjectDict(value)
    schema_file = join(SCHEMA_DIR, ep.index_name + '_schema.json')
    endpoint_dir = join(self.output_dir, ep.endpoint[1:])
    target_dir = join(endpoint_dir, ep.partition)
    common.shell_cmd('mkdir -p %s', target_dir)
    index_util.dump_index(es_client,
                          ep.index_name,
                          ep.endpoint,
                          target_dir,
                          cleaner=omit_internal_keys,
                          query=ep.query,
                          chunks=ep.chunks)
    # Copy the current JSON schema to the zip location so that it is included
    # in the sync to s3. flock is required to avoid a race condition when
    # copying the schema file.
    common.shell_cmd_quiet('flock --verbose %s cp %s %s',
                           schema_file, schema_file, endpoint_dir)
def run(self):
    output_file = self.output().path
    input_file = self.input()[1].path
    es = elasticsearch.Elasticsearch(self.es_host)

    index_util.start_index_transaction(es, 'recall', self.epoch)
    parallel.mapreduce(
        input_collection=parallel.Collection.from_sharded(input_file),
        mapper=index_util.LoadJSONMapper(self.es_host,
                                         'recall',
                                         'enforcementreport',
                                         self.epoch,
                                         docid_key='@id',
                                         version_key='@version'),
        reducer=parallel.NullReducer(),
        output_prefix='/tmp/loadjson.recall',
        num_shards=1,
        map_workers=1)
    index_util.commit_index_transaction(es, 'recall')
    common.shell_cmd('touch %s', output_file)
def _run(self):
    common.shell_cmd('mkdir -p %s', self.output().path)

    # Get all of the endpoints served by this index.
    # Create an `EndpointExport` object for each endpoint in order to export
    # each endpoint properly.
    #
    # Endpoint exports can be:
    #   date range based (quarterly output)
    #   filter based (index serves many endpoints)
    #   vanilla (endpoint is 1 to 1 with index and it is exported all at once)
    for endpoint, index_name in ENDPOINT_INDEX_MAP.items():
        endpoint_batches = []
        chunks = CUSTOM_CHUNKS.get(endpoint, DEFAULT_CHUNKS)
        if endpoint in RANGE_ENDPOINT_MAP:
            params = RANGE_ENDPOINT_MAP[endpoint]
            params['chunks'] = chunks
            endpoint_batches = _make_date_range_endpoint_batch(endpoint, params)
        elif endpoint in FILTERED_ENPOINT_MAP:
            params = FILTERED_ENPOINT_MAP[endpoint]
            query = EndpointExport.build_term_filter(**params)
            endpoint_batches.append(
                EndpointExport(endpoint, query=query, chunks=chunks)
            )
        else:
            endpoint_batches.append(EndpointExport(endpoint, chunks=chunks))

        # This is a hack to overcome the shortcoming of the parallel library of
        # only having one mapper process for a tiny, single file input. Since we
        # want to execute these endpoint batches in parallel, we write each task
        # to its own file. It will create a mapper for each file.
        for ep in endpoint_batches:
            partition = ep.partition if ep.partition else 'all'
            if 'enforcement' in ep.endpoint:
                partition = ep.endpoint.replace('enforcement', '').replace('/', '')

            output_dir = join(self.output().path, index_name)
            common.shell_cmd('mkdir -p %s', output_dir)
            file_name = join(output_dir, partition + '.json')
            with open(file_name, 'w') as json_out:
                json_dict = json.dumps(ep.__dict__)
                json_out.write(json_dict + '\n')
def _run(self):
    output_dir = self.output().path
    common.shell_cmd("mkdir -p %s", output_dir)
    change_log = csv.reader(open(self.change_log_file, "r"))
    batches = collections.defaultdict(list)

    for row in change_log:
        spl_id, spl_type, spl_date = row

        # Only grab the human labels for this index
        if spl_type.lower().find("human") != -1:
            # All blank dates are treated as the week of June 1, 2009
            if not spl_date:
                spl_date = "20090601120000"
            date = arrow.get(spl_date, "YYYYMMDDHHmmss")
            batches[date.ceil("week")].append(spl_id)

    for batch_date, ids in batches.items():
        batch_file = "%s.ids" % batch_date.format("YYYYMMDD")
        batch_out = open(join(output_dir, batch_file), "w")
        unique_ids = list(set(ids))
        batch_out.write("\n".join(unique_ids))
def _run(self):
    output_dir = self.output().path
    common.shell_cmd('mkdir -p %s', output_dir)
    change_log = csv.reader(open(self.change_log_file, 'r'))
    batches = collections.defaultdict(list)

    for row in change_log:
        spl_id, spl_type, spl_date = row

        # Only grab the human labels for this index
        if spl_type.lower().find('human') != -1:
            # All blank dates are treated as the week of June 1, 2009
            if not spl_date:
                spl_date = '20090601120000'
            date = arrow.get(spl_date, 'YYYYMMDDHHmmss')
            batches[date.ceil('week')].append(spl_id)

    for batch_date, ids in batches.items():
        batch_file = '%s.ids' % batch_date.format('YYYYMMDD')
        batch_out = open(join(output_dir, batch_file), 'w')
        unique_ids = list(set(ids))
        batch_out.write('\n'.join(unique_ids))
def run(self):
    for filename in glob.glob(SPL_S3_DIR + '/*/*.xml'):
        src_dir = dirname(filename)
        barcode_target = join(src_dir, 'barcodes')
        xml_out = join(barcode_target, 'otc-bars.xml')
        json_out = xml_out.replace('.xml', '.json')
        if not os.path.exists(xml_out):
            common.shell_cmd('mkdir -p %s', barcode_target)
            logging.info('Zbarimg on directory %s', src_dir)
            cmd = ('find %(src_dir)s -name "*.jpg" -size +0 '
                   '-exec zbarimg -q --xml {} \\; > %(xml_out)s') % locals()
            os.system(cmd)
        if common.is_older(json_out, xml_out):
            logging.info('%s does not exist, producing...', json_out)
            process_barcodes.XML2JSON(xml_out)
        else:
            logging.debug('%s already exists, skipping', xml_out)

    common.shell_cmd('touch %s', self.output().path)
def run(self):
    output_dir = self.output().path
    common.shell_cmd('mkdir -p %s', output_dir)
    input_dir = self.input()[0].path
    supplemental_dir = self.input()[1].path
    download_util.extract_and_clean(input_dir, 'ISO-8859-1', 'UTF-8', 'txt')

    # One of the files needs to be remapped from one column (submission_number)
    # to two columns (pma_number and k_number) depending on the prefix.
    file_name = 'registration_listing.txt'
    output_file = join(output_dir, 'remapped_' + file_name)
    remap_supplemental_files(join(output_dir, file_name),
                             join(supplemental_dir, file_name),
                             output_file)

    # There are a handful of files with floats for keys.
    # This step can be removed once it is fixed on the source system.
    for fix_file in self.problem_files:
        with open(join(output_dir, fix_file), 'r') as needs_fixing:
            lines = needs_fixing.readlines()
        with open(join(output_dir, fix_file), 'w') as gets_fixing:
            for line in lines:
                gets_fixing.write(re.sub(r'\.0', '', line))
def run(self):
    output_dir = self.output().path
    common.shell_cmd('mkdir -p %s', output_dir)
    for i in range(len(self.input())):
        input_dir = self.input()[i].path
        for zip_filename in glob.glob(input_dir + '/*.zip'):
            txt_name = zip_filename.replace('zip', 'txt')
            txt_name = txt_name.replace('raw', 'extracted')
            common.shell_cmd('mkdir -p %s', dirname(txt_name))
            cmd = 'unzip -p %s | iconv -f "ISO-8859-1//TRANSLIT" -t UTF8 -c > %s'
            logging.info('Unzipping and converting %s', zip_filename)
            common.shell_cmd(cmd, zip_filename, txt_name)
import os
from os.path import dirname, join

import arrow
import simplejson as json

from openfda.tasks import DependencyTriggeredTask
from openfda import common, config, parallel, spl
from openfda.annotation_table import unii_harmonization
from openfda.spl import process_barcodes, extract

RUN_DIR = dirname(dirname(os.path.abspath(__file__)))
data_dir = config.data_dir('harmonization')
BATCH_DATE = arrow.utcnow().ceil('week').format('YYYYMMDD')
BASE_DIR = config.data_dir('harmonization/batches/%s' % BATCH_DATE)
SPL_S3_DIR = config.data_dir('spl/s3_sync')
TMP_DIR = config.tmp_dir()

common.shell_cmd('mkdir -p %s', data_dir)
common.shell_cmd('mkdir -p %s', BASE_DIR)
common.shell_cmd('mkdir -p %s', TMP_DIR)

SPL_SET_ID_INDEX = join(BASE_DIR, 'spl_index.db')
DAILYMED_PREFIX = 'ftp://public.nlm.nih.gov/nlmdata/.dailymed/'
PHARM_CLASS_DOWNLOAD = \
    DAILYMED_PREFIX + 'pharmacologic_class_indexing_spl_files.zip'
RXNORM_DOWNLOAD = \
    DAILYMED_PREFIX + 'rxnorm_mappings.zip'
NDC_DOWNLOAD_PAGE = \
    'http://www.fda.gov/drugs/informationondrugs/ucm142438.htm'
def run(self):
    input_dir = join(self.input().path, 'events')
    output_dir = self.output().path
    fh_dict = {}
    common.shell_cmd('mkdir -p %s', output_dir)

    # Headers need to be written to the start of each partition. The patient
    # and foitext headers are manually created. The headers for mdrfoi and
    # foidev are detected from the source and placed into the header dictionary.
    header = {}
    header['patient'] = PATIENT_KEYS
    header['foitext'] = TEXT_KEYS
    header['mdrfoi'] = MDR_KEYS
    header['foidev'] = DEVICE_KEYS

    for i in range(PARTITIONS):
        for category in CATEGORIES:
            filename = str(i) + '.' + category + '.txt'
            filename = join(output_dir, filename)
            logging.info('Creating file handles for writing %s', filename)
            output_handle = open(filename, 'w')
            csv_writer = csv.writer(output_handle, delimiter='|')
            csv_writer.writerow(header[category])
            fh_dict[category + str(i)] = output_handle

    # Because we download all zips from the site, we need to ignore some of the
    # files for the partitioning process. Remove if files are excluded from
    # download.
    for filename in glob.glob(input_dir + '/*.txt'):
        logging.info('Processing: %s', filename)
        skip = False
        for ignore in IGNORE_FILES:
            if ignore in filename:
                skip = True
        if skip:
            logging.info('Skipping: %s', filename)
            continue

        for category in CATEGORIES:
            if category in filename:
                file_category = category

        # MAUDE files do not escape quote characters; we just hope that no
        # pipe characters occur in records...
        file_handle = csv.reader(open(filename, 'r'),
                                 quoting=csv.QUOTE_NONE,
                                 delimiter='|')
        partitioned = collections.defaultdict(list)
        for i, row in enumerate(file_handle):
            # Skip header rows
            if (i == 0) and ('MDR_REPORT_KEY' in row):
                continue
            # Only work with rows that have data and whose first column is a number
            if row and row[0].isdigit():
                partitioned[int(row[0]) % PARTITIONS].append(row)
            else:
                logging.warning('Skipping row: %s', row)

        for partnum, rows in partitioned.items():
            output_handle = fh_dict[file_category + str(partnum)]
            csv_writer = csv.writer(output_handle, delimiter='|')
            logging.info('Writing: %s %s %s %s',
                         partnum, file_category + str(partnum),
                         output_handle, len(rows))
            csv_writer.writerows(rows)
def run(self):
    output_dir = self.output().path
    common.shell_cmd('mkdir -p %s', output_dir)
    input_dir = self.local_dir
    for zip_filename in glob.glob(input_dir + '/*.zip'):
        common.shell_cmd('unzip -ou %s -d %s', zip_filename, output_dir)
def run(self):
    output_dir = self.output().path
    common.shell_cmd('mkdir -p %s', output_dir)
    input_dir = self.input().path
    download_util.extract_and_clean(input_dir, 'ISO-8859-1', 'UTF-8', 'txt')
import os
from os.path import dirname, join

import elasticsearch
import luigi
import requests
import simplejson as json

from openfda import common, config, download_util, elasticsearch_requests, index_util, parallel
from openfda.device_harmonization.pipeline import (Harmonized2OpenFDA,
                                                   DeviceAnnotateMapper)
from openfda.device_pma import transform
from openfda.tasks import AlwaysRunTask

RUN_DIR = dirname(dirname(os.path.abspath(__file__)))
# A directory for holding files that track Task state
META_DIR = config.data_dir('device_pma/meta')
common.shell_cmd('mkdir -p %s', META_DIR)

DEVICE_PMA_ZIP = 'https://www.accessdata.fda.gov/premarket/ftparea/pma.zip'


class DownloadPMA(luigi.Task):
    def requires(self):
        return []

    def output(self):
        return luigi.LocalTarget(config.data_dir('device_pma/raw'))

    def run(self):
        output_filename = join(self.output().path, DEVICE_PMA_ZIP.split('/')[-1])
        common.download(DEVICE_PMA_ZIP, output_filename)


class ExtractAndCleanDownloadsPMA(luigi.Task):
import csv
import re
import logging
import os
from os.path import dirname, join

import arrow
import datetime
import luigi

from openfda import common, config, index_util, parallel

RUN_DIR = dirname(dirname(os.path.abspath(__file__)))
BASE_DIR = config.data_dir('caers')
common.shell_cmd('mkdir -p %s', BASE_DIR)

S3_BUCKET = 's3://openfda-data-caers/'
S3_LOCAL_DIR = config.data_dir('caers/s3_sync')

# TODO(hansnelsen): initiate and resolve naming convention for this file and
#                   s3 bucket. Currently, the file is downloaded from
#                   s3://openfda-lonnie/caers/ (the naming of this file is
#                   not consistent). The pipeline engineer downloads it,
#                   renames it and then uploads it manually to the above bucket.
CAERS_FILE = 'caers.csv'

logging.info('Using S3 local dir: %s', S3_LOCAL_DIR)
common.shell_cmd('mkdir -p %s', S3_LOCAL_DIR)

RENAME_MAP = {
    'Report #': 'report_number',
    'Created Date': 'date_created',
    'Event Start Date': 'date_started',
def test_shell_cmd(self):
    tmpFile = '/tmp/' + (''.join(random.choice(string.ascii_uppercase + string.digits)
                                 for _ in range(32)))
    common.shell_cmd('touch %(tmpFile)s' % locals())
    assert len(common.shell_cmd('ls %(tmpFile)s' % locals())) > 0
    assert common.shell_cmd('ls %(tmpFile)s' % locals()).startswith(tmpFile)
import os
from os.path import dirname, join

from bs4 import BeautifulSoup
import elasticsearch
import luigi
import pandas
import requests
import simplejson as json
import urllib2

from openfda import common, config, elasticsearch_requests, index_util, parallel
from openfda import download_util
from openfda.index_util import AlwaysRunTask, ResetElasticSearch
from openfda.device_harmonization.pipeline import Harmonized2OpenFDA, DeviceAnnotateMapper

RUN_DIR = dirname(dirname(os.path.abspath(__file__)))
BASE_DIR = "./data/registration"
common.shell_cmd("mkdir -p %s", BASE_DIR)
# A directory for holding files that track Task state
META_DIR = join(BASE_DIR, "meta")
common.shell_cmd("mkdir -p %s", META_DIR)

DEVICE_REG_PAGE = (
    "http://www.fda.gov/MedicalDevices/"
    "DeviceRegulationandGuidance/HowtoMarketYourDevice/"
    "RegistrationandListing/ucm134495.htm"
)

S3_BUCKET = "s3://openfda-data-reglist/"
S3_LOCAL_DIR = join(BASE_DIR, "s3_sync")
common.shell_cmd("mkdir -p %s", S3_LOCAL_DIR)
import os
from os.path import dirname, join

import arrow
import elasticsearch
import luigi

from openfda import common, elasticsearch_requests, index_util, parallel
from openfda.annotation_table.pipeline import CombineHarmonization
from openfda.index_util import AlwaysRunTask
from openfda.spl import annotate
from openfda.parallel import IdentityReducer

RUN_DIR = dirname(dirname(os.path.abspath(__file__)))
BASE_DIR = './data'
META_DIR = join(BASE_DIR, 'spl/meta')
# Ensure meta directory is available for task tracking
common.shell_cmd('mkdir -p %s', META_DIR)

SPL_JS = join(RUN_DIR, 'spl/spl_to_json.js')
LOINC = join(RUN_DIR, 'spl/data/sections.csv')

SPL_S3_BUCKET = 's3://openfda.spl.data/data/'
SPL_S3_LOCAL_DIR = join(BASE_DIR, 'spl/s3_sync')
SPL_S3_CHANGE_LOG = join(SPL_S3_LOCAL_DIR, 'change_log/SPLDocuments.csv')
SPL_BATCH_DIR = join(META_DIR, 'batch')
SPL_PROCESS_DIR = join(BASE_DIR, 'spl/batches')

common.shell_cmd('mkdir -p %s', SPL_S3_LOCAL_DIR)
common.shell_cmd('mkdir -p %s', SPL_PROCESS_DIR)

ES_HOST = luigi.Parameter('localhost:9200', is_global=True)
SPL_S3_PROFILE = luigi.Parameter(default='openfda', is_global=True)
import os
from os.path import dirname, join

import arrow
import elasticsearch
import luigi

from openfda import config, common, elasticsearch_requests, index_util, parallel
from openfda.annotation_table.pipeline import CombineHarmonization
from openfda.tasks import AlwaysRunTask
from openfda.spl import annotate
from openfda.parallel import IdentityReducer

RUN_DIR = dirname(dirname(os.path.abspath(__file__)))
META_DIR = config.data_dir("spl/meta")
# Ensure meta directory is available for task tracking
common.shell_cmd("mkdir -p %s", META_DIR)

SPL_JS = join(RUN_DIR, "spl/spl_to_json.js")
LOINC = join(RUN_DIR, "spl/data/sections.csv")

SPL_S3_BUCKET = "s3://openfda-data-spl/data/"
SPL_S3_LOCAL_DIR = config.data_dir("spl/s3_sync")
SPL_S3_CHANGE_LOG = join(SPL_S3_LOCAL_DIR, "change_log/SPLDocuments.csv")
SPL_BATCH_DIR = join(META_DIR, "batch")
SPL_PROCESS_DIR = config.data_dir("spl/batches")

common.shell_cmd("mkdir -p %s", SPL_S3_LOCAL_DIR)
common.shell_cmd("mkdir -p %s", SPL_PROCESS_DIR)


class SyncS3SPL(luigi.Task):