class BuildDrugCanon():
    '''
    Determine which package or product NDC is considered _the_
    representative for the same proprietary name
    '''

    def __init__(self):
        '''
        Create the white-list builder (used for its `title` method) and
        the OpenFDA source (used for its `acquire_labels` method)
        '''
        self.sourceNames = BuildNdcWhiteList()
        self.sourceLabels = AcquireOpenFda()

    def __str__(self):
        '''
        Short human readable name of this step
        '''
        return 'Determine Canonical Drugs'

    # -------------------------------------------------------------------------

    def _map(self, label):
        '''
        A generator function that produces multiple elements from each
        OpenFDA record

        One element is produced for every (brand name, product NDC)
        combination in the record. Each element is a dict with the keys
        `proprietary_name`, `ndc` and `size`, where `size` is the length of
        the whole record serialized as JSON. Records without an `openfda`
        section holding both `product_ndc` and `brand_name` produce nothing.
        '''
        if 'openfda' not in label:
            return
        openfda = label['openfda']
        if 'product_ndc' not in openfda or 'brand_name' not in openfda:
            return

        # The serialized length is the score that `_reduce` compares
        size = len(json.dumps(label))
        for rawName in openfda['brand_name']:
            brand = self.sourceNames.title(rawName)
            for rawNdc in openfda['product_ndc']:
                ndc = ProductNdc.parse(rawNdc).format()
                yield {'proprietary_name': brand, 'ndc': ndc, 'size': size}

    def _reduce(self, dicts):
        '''
        Selects the "best" element from a list

        The best element is the one with the largest `size`; on a tie the
        earliest element wins. Returns None for an empty list.
        '''
        if not dicts:
            return None
        # max() returns the first maximal item, which matches a strict
        # "replace only when larger" scan
        return max(dicts, key=lambda node: node['size'])

    # -------------------------------------------------------------------------

    def _mapLabels(self):
        '''
        A generator function that internally calls `_map` for each label
        '''
        for record in self.sourceLabels.acquire_labels():
            yield from self._map(record)

    def _reduceToCanon(self, partitions):
        '''
        A generator that internally calls `_reduce` for each entry

        `partitions` maps a name to a list of elements made by `_map`.
        Yields `(name, ndc)` tuples in sorted name order, skipping names
        for which `_reduce` selected nothing.
        '''
        for name in sorted(partitions):
            result = self._reduce(partitions[name])
            if result:
                yield (name, result['ndc'])

    # -------------------------------------------------------------------------

    def run(self):
        '''
        Use the size of a record in the FDA data set to determine which
        package or product NDC is considered _the_ representative for the
        same proprietary name

        Reads the tab separated white list, marks every row with an
        `is_canon` column ('true' or 'false'), writes the result to a
        second file and then replaces the white list with it.
        '''
        print('Loading White List')
        whiteListFileName = io.relativeToAbsolute('../../data/product_ndc.txt')
        # newline='' lets the csv module do its own line-ending handling
        with open(whiteListFileName, newline='') as f:
            # csv.DictReader files the surplus fields of an over-long row
            # under the key None; `if k` drops that column
            records = [{k: v for k, v in row.items() if k}
                       for row in csv.DictReader(f, dialect=csv.excel_tab)]

        partitions = {x['proprietary_name']: [] for x in records}
        products = {x['product_ndc'] for x in records if x['proprietary_name']}

        print('Mapping Labels')
        for node in self._mapLabels():
            nameKey = node['proprietary_name']
            prodKey = node['ndc']
            if nameKey in partitions and prodKey in products:
                partitions[nameKey].append(node)

        print('Reducing to Canon')
        canon = set(self._reduceToCanon(partitions))

        print('Updating NDC Whitelist')
        for row in records:
            key = (row['proprietary_name'], row['product_ndc'])
            if key in canon:
                # consume because multiple package codes map to this key
                canon.remove(key)
                row['is_canon'] = 'true'
            else:
                row['is_canon'] = 'false'

        print('Saving')
        tempRelative = '../../data/product_ndc_canon.txt'
        tempName = io.relativeToAbsolute(tempRelative)
        # NOTE(review): the relative path is handed to saveAsTabbedText while
        # the rename below uses the absolute one; presumably the helper
        # resolves it the same way - confirm
        io.saveAsTabbedText(records, tempRelative)

        # no errors, swap the new file in. os.replace overwrites the
        # destination in a single step, so the white list is never missing
        # between a separate remove and rename
        os.replace(tempName, whiteListFileName)
def __init__(self):
    '''
    Create the two collaborators and keep them on the instance
    '''
    # NOTE(review): this repeats BuildDrugCanon.__init__ outside of any
    # class - looks like an extraction leftover, confirm before relying on it
    # BuildDrugCanon calls `title` on this object to normalize brand names
    self.sourceNames = BuildNdcWhiteList()
    # BuildDrugCanon iterates `acquire_labels()` of this object
    self.sourceLabels = AcquireOpenFda()
class ExtractOpenFdaFeatures():
    '''
    Extract attributes from the OpenFDA label data set and perform some
    initial cleaning of the data
    '''

    def __init__(self):
        '''
        Create the label source, the white list and the feature list
        '''
        self.source = AcquireOpenFda()
        self.whiteList = BuildNdcWhiteList()
        # must come last: _buildFeatureSet reads self.whiteList
        self.features = self._buildFeatureSet()

    def __str__(self):
        '''
        Short human readable name of this step
        '''
        return 'Extract Features from OpenFDA Labels'

    # -------------------------------------------------------------------------

    def _buildFeatureSet(self):
        '''
        Create the list of features to extract

        Every entry is a dict with three keys: `feature` (the Feature that
        collects the values), `column` (header of the value column in the
        output file) and `transform` (functions applied, in order, to each
        value before it is written).
        '''
        return [
            {
                'feature': Feature('openfda.manufacturer_name'),
                'column': 'name',
                'transform': [],
            },
            {
                'feature': Feature('openfda.pharm_class_cs'),
                'column': 'class_name',
                'transform': [self.whiteList.title, self.trimPharmClassCs],
            },
            {
                'feature': Feature('openfda.pharm_class_epc'),
                'column': 'class_name',
                'transform': [self.whiteList.title, self.trimPharmClass3],
            },
            {
                'feature': Feature('openfda.pharm_class_moa'),
                'column': 'class_name',
                'transform': [self.whiteList.title, self.trimPharmClass3],
            },
            {
                'feature': Feature('openfda.pharm_class_pe'),
                'column': 'class_name',
                'transform': [self.whiteList.title, self.trimPharmClass2],
            },
            {
                'feature': Feature('openfda.product_type'),
                'column': 'type_name',
                'transform': [self.titleCaseIgnoreSmall],
            },
            {
                'feature': Feature('openfda.route'),
                'column': 'route',
                'transform': [self.whiteList.title],
            },
            {
                'feature': Feature('openfda.substance_name'),
                'column': 'name',
                'transform': [self.titleCaseIgnoreSmall],
            },
            {
                'feature': Feature('openfda.brand_name'),
                'column': 'name',
                'transform': [self.titleCaseIgnoreSmall],
            },
            {
                'feature': Feature('openfda.generic_name'),
                'column': 'name',
                'transform': [self.titleCaseIgnoreSmall],
            },
            {
                'feature': Feature('active_ingredient'),
                'column': 'text',
                'transform': [],
            },
            {
                'feature': Feature('inactive_ingredient'),
                'column': 'text',
                'transform': [],
            },
        ]

    # -------------------------------------------------------------------------
    # Monad-like functions for processing strings
    #

    def trimPharmClassCs(self, s):
        '''
        Drop the last 22 characters when `s` contains
        '[chemical/ingredient]', otherwise return `s` unchanged
        '''
        # 22 is the length of the tag plus one preceding character.
        # NOTE(review): the tag is looked for anywhere in the string but
        # the cut is always taken from the end - confirm it is always last
        execute = '[chemical/ingredient]' in s
        return s[:-22] if execute else s

    def trimPharmClass2(self, s):
        '''
        Drop the last 5 characters when `s` ends with ']' (presumably a
        two letter tag such as ' [PE]'), otherwise return `s` unchanged
        '''
        # endswith is safe on an empty string, unlike s[-1]
        return s[:-5] if s.endswith(']') else s

    def trimPharmClass3(self, s):
        '''
        Drop the last 6 characters when `s` ends with ']' (presumably a
        three letter tag such as ' [EPC]'), otherwise return `s` unchanged
        '''
        # endswith is safe on an empty string, unlike s[-1]
        return s[:-6] if s.endswith(']') else s

    def titleCaseIgnoreSmall(self, s):
        '''
        Capitalize the first word and every later word of 4 or more
        characters; later words shorter than that are left untouched

        Words are separated by single spaces. `capitalize` also lower
        cases the remainder of each word it is applied to.
        '''
        first, *rest = s.split(' ')
        final = [first.capitalize()]
        final.extend(word if len(word) < 4 else word.capitalize()
                     for word in rest)
        return " ".join(final)

    # -------------------------------------------------------------------------

    def run(self):
        '''
        Make a key-value map of certain attributes in the Open FDA dataset

        Every label record with an `openfda.product_ndc` list is handed to
        each feature once per product NDC. Afterwards one tab separated
        file per feature is written, named after the feature's fields.
        '''
        print('Acquiring Records')
        for record in self.source.acquire_labels():
            if 'openfda' not in record or 'product_ndc' not in record['openfda']:
                continue
            for entry in record['openfda']['product_ndc']:
                productId = ProductNdc.parse(entry).format()
                for op in self.features:
                    op['feature'].accumulate(productId, record)

        print('Writing Features')
        for op in self.features:
            feature = op['feature']
            baseName = '-'.join(feature.fields)
            fileName = io.relativeToAbsolute('../../data/' + baseName + '.txt')
            with open(fileName, 'w', encoding='utf-8') as f:
                print('product_ndc', op['column'], sep='\t', file=f)
                # rows are ordered by the raw value; the transforms run
                # after sorting
                for pair in sorted(feature.data, key=itemgetter(0, 1)):
                    value = pair[1]
                    for fn in op['transform']:
                        value = fn(value)
                    print(pair[0], value, sep='\t', file=f)
def __init__(self):
    '''
    Create the label source, the white list and the feature list
    '''
    # NOTE(review): this repeats ExtractOpenFdaFeatures.__init__ outside of
    # any class - looks like an extraction leftover, confirm before relying
    # on it
    # ExtractOpenFdaFeatures iterates `acquire_labels()` of this object
    self.source = AcquireOpenFda()
    # ExtractOpenFdaFeatures uses the `title` method of this object as a
    # transform
    self.whiteList = BuildNdcWhiteList()
    # In ExtractOpenFdaFeatures, _buildFeatureSet reads self.whiteList, so
    # this assignment has to stay after the one above
    self.features = self._buildFeatureSet()
class ExtractOpenFdaFeatures():
    '''
    Extract attributes from the OpenFDA label data set and perform some
    initial cleaning of the data
    '''

    def __init__(self):
        '''
        Create the label source, the white list and the feature list
        '''
        self.source = AcquireOpenFda()
        self.whiteList = BuildNdcWhiteList()
        # must come last: _buildFeatureSet reads self.whiteList
        self.features = self._buildFeatureSet()

    def __str__(self):
        '''
        Short human readable name of this step
        '''
        return 'Extract Features from OpenFDA Labels'

    # -------------------------------------------------------------------------

    def _buildFeatureSet(self):
        '''
        Create the list of features to extract

        Every entry is a dict with three keys: `feature` (the Feature that
        collects the values), `column` (header of the value column in the
        output file) and `transform` (functions applied, in order, to each
        value before it is written).
        '''
        return [
            {
                'feature': Feature('openfda.manufacturer_name'),
                'column': 'name',
                'transform': [],
            },
            {
                'feature': Feature('openfda.pharm_class_cs'),
                'column': 'class_name',
                'transform': [self.whiteList.title, self.trimPharmClassCs],
            },
            {
                'feature': Feature('openfda.pharm_class_epc'),
                'column': 'class_name',
                'transform': [self.whiteList.title, self.trimPharmClass3],
            },
            {
                'feature': Feature('openfda.pharm_class_moa'),
                'column': 'class_name',
                'transform': [self.whiteList.title, self.trimPharmClass3],
            },
            {
                'feature': Feature('openfda.pharm_class_pe'),
                'column': 'class_name',
                'transform': [self.whiteList.title, self.trimPharmClass2],
            },
            {
                'feature': Feature('openfda.product_type'),
                'column': 'type_name',
                'transform': [self.titleCaseIgnoreSmall],
            },
            {
                'feature': Feature('openfda.route'),
                'column': 'route',
                'transform': [self.whiteList.title],
            },
            {
                'feature': Feature('openfda.substance_name'),
                'column': 'name',
                'transform': [self.titleCaseIgnoreSmall],
            },
            {
                'feature': Feature('openfda.brand_name'),
                'column': 'name',
                'transform': [self.titleCaseIgnoreSmall],
            },
            {
                'feature': Feature('openfda.generic_name'),
                'column': 'name',
                'transform': [self.titleCaseIgnoreSmall],
            },
            {
                'feature': Feature('active_ingredient'),
                'column': 'text',
                'transform': [],
            },
            {
                'feature': Feature('inactive_ingredient'),
                'column': 'text',
                'transform': [],
            },
        ]

    # -------------------------------------------------------------------------
    # Monad-like functions for processing strings
    #

    def trimPharmClassCs(self, s):
        '''
        Drop the last 22 characters when `s` contains
        '[chemical/ingredient]', otherwise return `s` unchanged
        '''
        # 22 is the length of the tag plus one preceding character.
        # NOTE(review): the tag is looked for anywhere in the string but
        # the cut is always taken from the end - confirm it is always last
        execute = '[chemical/ingredient]' in s
        return s[:-22] if execute else s

    def trimPharmClass2(self, s):
        '''
        Drop the last 5 characters when `s` ends with ']' (presumably a
        two letter tag such as ' [PE]'), otherwise return `s` unchanged
        '''
        # endswith is safe on an empty string, unlike s[-1]
        return s[:-5] if s.endswith(']') else s

    def trimPharmClass3(self, s):
        '''
        Drop the last 6 characters when `s` ends with ']' (presumably a
        three letter tag such as ' [EPC]'), otherwise return `s` unchanged
        '''
        # endswith is safe on an empty string, unlike s[-1]
        return s[:-6] if s.endswith(']') else s

    def titleCaseIgnoreSmall(self, s):
        '''
        Capitalize the first word and every later word of 4 or more
        characters; later words shorter than that are left untouched

        Words are separated by single spaces. `capitalize` also lower
        cases the remainder of each word it is applied to.
        '''
        first, *rest = s.split(' ')
        final = [first.capitalize()]
        final.extend(word if len(word) < 4 else word.capitalize()
                     for word in rest)
        return " ".join(final)

    # -------------------------------------------------------------------------

    def run(self):
        '''
        Make a key-value map of certain attributes in the Open FDA dataset

        Every label record with an `openfda.product_ndc` list is handed to
        each feature once per product NDC. Afterwards one tab separated
        file per feature is written, named after the feature's fields.
        '''
        print('Acquiring Records')
        for record in self.source.acquire_labels():
            if 'openfda' not in record or 'product_ndc' not in record['openfda']:
                continue
            for entry in record['openfda']['product_ndc']:
                productId = ProductNdc.parse(entry).format()
                for op in self.features:
                    op['feature'].accumulate(productId, record)

        print('Writing Features')
        for op in self.features:
            feature = op['feature']
            baseName = '-'.join(feature.fields)
            fileName = io.relativeToAbsolute('../../data/' + baseName + '.txt')
            with open(fileName, 'w', encoding='utf-8') as f:
                print('product_ndc', op['column'], sep='\t', file=f)
                # rows are ordered by the raw value; the transforms run
                # after sorting
                for pair in sorted(feature.data, key=itemgetter(0, 1)):
                    value = pair[1]
                    for fn in op['transform']:
                        value = fn(value)
                    print(pair[0], value, sep='\t', file=f)