Exemple #1
0
class BuildDrugCanon():
    ''' Determine which product NDC is considered _the_ representative
    ("canonical") entry among all products sharing a proprietary name.

    The work is organised as a small map/reduce: every OpenFDA label is
    mapped to (name, ndc, size) nodes, the nodes are grouped by proprietary
    name, and the node with the largest serialized label wins each group.
    '''
    def __init__(self):
        # Supplies `title()`, used to normalise brand names in `_map`.
        self.sourceNames = BuildNdcWhiteList()
        # Supplies `acquire_labels()`, the stream of label records.
        self.sourceLabels = AcquireOpenFda()

    def __str__(self):
        return 'Determine Canonical Drugs'
    # -------------------------------------------------------------------------

    def _map(self, label):
        ''' A generator function that produces multiple elements from each
        OpenFDA record.

        Yields one dict per (brand_name, product_ndc) combination with the
        keys `proprietary_name`, `ndc` and `size`, where `size` is the length
        of the JSON serialization of the whole label. Records without an
        `openfda` section carrying both `product_ndc` and `brand_name`
        yield nothing.
        '''
        if 'openfda' not in label:
            return
        openfda = label['openfda']
        if 'product_ndc' not in openfda or 'brand_name' not in openfda:
            return

        # The serialized length is the same for every node of this label.
        size = len(json.dumps(label))
        for rawBrand in openfda['brand_name']:
            brand = self.sourceNames.title(rawBrand)
            for rawNdc in openfda['product_ndc']:
                ndc = ProductNdc.parse(rawNdc).format()
                yield {'proprietary_name': brand,
                       'ndc': ndc,
                       'size': size}

    def _reduce(self, dicts):
        ''' Selects the "best" element from a list: the one with the largest
        `size`. On ties the earliest element wins. Returns None for an empty
        list.
        '''
        if not dicts:
            return None
        # max() returns the first maximal item, matching a strict `<` scan.
        return max(dicts, key=lambda node: node['size'])

    # -------------------------------------------------------------------------

    def _mapLabels(self):
        ''' A generator function that internally calls `_map` for each label
        delivered by `self.sourceLabels.acquire_labels()`.
        '''
        for record in self.sourceLabels.acquire_labels():
            for node in self._map(record):
                yield node

    def _reduceToCanon(self, partitions):
        ''' A generator that internally calls `_reduce` for each entry.

        `partitions` maps a proprietary name to its list of nodes. Names are
        visited in sorted order and a `(name, ndc)` tuple is yielded for each
        name that has at least one node.
        '''
        for name in sorted(partitions):
            result = self._reduce(partitions[name])
            if result:
                yield (name, result['ndc'])

    # -------------------------------------------------------------------------

    def run(self):
        ''' Use the size of a record in the FDA data set to determine which
        product NDC is considered _the_ representative for the same
        proprietary name.

        Reads the tab-separated whitelist, flags each row with an `is_canon`
        column ('true' / 'false'), writes the result to a sibling file and
        then swaps it in place of the original whitelist.
        '''
        print('Loading White List')
        whiteListFileName = io.relativeToAbsolute('../../data/product_ndc.txt')
        with open(whiteListFileName) as f:
            # csv.DictReader collects surplus fields of a row (more values
            # than headers) under the key None; drop that and any other
            # empty key.
            records = [{k: v for k, v in row.items() if k}
                       for row in csv.DictReader(f, dialect=csv.excel_tab)]

        partitions = {x['proprietary_name']: [] for x in records}
        products = {x['product_ndc'] for x in records if x['proprietary_name']}

        print('Mapping Labels')
        for node in self._mapLabels():
            nameKey = node['proprietary_name']
            prodKey = node['ndc']
            # Keep only nodes that refer to a whitelisted name and product.
            if nameKey in partitions and prodKey in products:
                partitions[nameKey].append(node)

        print('Reducing to Canon')
        canon = set(self._reduceToCanon(partitions))

        print('Updating NDC Whitelist')
        for row in records:
            key = (row['proprietary_name'], row['product_ndc'])
            if key in canon:
                # consume because multiple package codes map to this key;
                # only the first matching row is flagged as canonical
                canon.remove(key)
                row['is_canon'] = 'true'
            else:
                row['is_canon'] = 'false'

        print('Saving')
        canonRelPath = '../../data/product_ndc_canon.txt'
        tempName = io.relativeToAbsolute(canonRelPath)
        # NOTE(review): the helper receives the relative path while the swap
        # below uses the resolved one; presumably saveAsTabbedText resolves
        # it the same way -- confirm.
        io.saveAsTabbedText(records, canonRelPath)

        # no errors: swap the new file in. os.replace overwrites the target
        # in a single step, so a failure cannot leave the whitelist deleted
        # (as a separate remove + rename could).
        os.replace(tempName, whiteListFileName)