def parse_import_cosmetic(self, table: dict):
    """Build product, company and delegate records from an imported-cosmetic row.

    Args:
        table: Mapping from the source table's column headers to cell values.

    Returns:
        A dict with the keys ``product``, ``company`` and ``delegate``.

    Raises:
        KeyError: If an expected column is absent from ``table``.
    """
    item = create_product()
    chinese_name = table['产品名称(中文)']
    english_name = table['产品名称(英文)']
    # The name passes the Chinese title first, the abstract passes the
    # English title first (use_nonempty presumably picks the first
    # non-empty argument -- confirm against its definition).
    item['name'] = self.use_nonempty(chinese_name, english_name)
    item['abs'] = self.use_nonempty(english_name, chinese_name)
    item['intro'] = table['产品名称备注']
    item['asset']['lic'] = [table['批准文号']]

    maker_address = table['生产企业地址']
    origin = table['生产国(地区)']
    for field, value in (('line1', maker_address),
                         ('city', 'Unknown'),
                         ('country', origin)):
        item['addr'][field] = value

    item['tag'] = [table['产品类别'], 'Cosmetic']
    approval = {}
    for column in ('批件状态', '批准日期', '批件有效期', '备注'):
        approval[column] = table[column]
    item['asset']['tech'] = self._dict_to_markdown_table(approval)

    # The manufacturer shares the product's address line and country.
    maker = create_company()
    maker['name'] = self.use_nonempty(table['生产企业(中文)'],
                                      table['生产企业(英文)'])
    for field, value in (('city', 'Unknown'),
                         ('line1', maker_address),
                         ('country', origin)):
        maker['addr'][field] = value

    # The delegate's name column is also used as its address line.
    agent = create_company()
    agent_name = table['在华申报责任单位']
    agent['name'] = agent_name
    for field, value in (('city', 'Unknown'),
                         ('line1', agent_name),
                         ('country', 'China')):
        agent['addr'][field] = value

    return dict(product=item, company=maker, delegate=agent)
def parse_domestic_device(self, table: dict):
    """Build product, company and delegate records from a domestic-device row.

    Args:
        table: Mapping from the source table's column headers to cell values.

    Returns:
        A dict with the keys ``product``, ``company`` and ``delegate``.

    Raises:
        KeyError: If an expected column is absent from ``table``.
    """
    market_columns = ('型号、规格', '其他内容', '备注', '批准日期', '有效期至',
                      '变更日期', '变更情况', '审批部门')
    tech_columns = ('主要组成成分(体外诊断试剂)', '预期用途(体外诊断试剂)',
                    '产品储存条件及有效期(体外诊断试剂)')

    device = create_product()
    # The trailing space inside these two headers is part of the key.
    device['name'] = table['产品名称 ']
    device['abs'] = table['适用范围 ']
    device['intro'] = table['结构及组成']
    device['asset']['lic'] = [table['注册证编号'], table['产品标准']]
    for field, value in (('line1', table['生产地址']),
                         ('city', 'Unknown'),
                         ('country', '中国'),
                         ('zip', table['邮编'])):
        device['addr'][field] = value
    device['tag'] = [table['剂型(中文)'], table['产品类别'], 'Medical Device']

    market = {}
    for column in market_columns:
        market[column] = table[column]
    device['asset']['market'] = self._dict_to_markdown_table(market)

    tech = {}
    for column in tech_columns:
        tech[column] = table[column]
    device['asset']['tech'] = self._dict_to_markdown_table(tech)

    # Registrant (the company) and agent (the delegate) share one layout.
    registrant = create_company()
    registrant['name'] = table['注册人名称']
    for field, value in (('city', 'Unknown'),
                         ('line1', table['注册人住所']),
                         ('country', 'China')):
        registrant['addr'][field] = value

    agent = create_company()
    agent['name'] = table['代理人名称']
    for field, value in (('city', 'Unknown'),
                         ('line1', table['代理人住所']),
                         ('country', 'China')):
        agent['addr'][field] = value

    return dict(product=device, company=registrant, delegate=agent)
def _format_device_timestamp(value):
    """Return ``value`` as a ``"%a, %d %b %Y %H:%M:%S GMT"`` string, or None.

    ``value`` may already be a ``datetime.datetime``; anything else is handed
    to ``parser.parse``. None is returned when the value cannot be parsed.
    """
    layout = "%a, %d %b %Y %H:%M:%S GMT"
    if isinstance(value, datetime.datetime):
        return value.strftime(layout)
    try:
        return parser.parse(value).strftime(layout)
    except (ValueError, TypeError, OverflowError, AttributeError):
        # Best effort: empty or malformed date cells are expected, so the
        # caller simply leaves the field untouched.
        return None


def parse_device(cells):
    """Turn one medical-device row into a ``(product, company)`` pair.

    Args:
        cells: Indexable row of cell values. Indices 0, 1, 3, 4, 5, 6, 9
            and 10 are read.

    Returns:
        Tuple ``(p, a)``: the product record and the applicant company record.
        The company reuses the product's ``addr`` and ``tag`` objects.

    Raises:
        IndexError: If ``cells`` has fewer than 11 entries.
    """
    review_category = {
        '1': 'Ophthalmology and otorhinolaryngology',
        '2': 'dentistry',
        '3': 'cerebral, cardiovascular, respiratory, psychiatric, and neurological field',
        '3-1': 'Intervention devices mainly in cerebral, cardiovascular, respiratory, psychiatric, and neurological field',
        '3-2': 'Non-intervention devices mainly in cerebral, cardiovascular, respiratory, psychiatric, and neurological field',
        '4': 'cerebral, cardiovascular, respiratory, psychiatric, and neurological field',
        '5': 'gastrointestinal and urinary systems, obstetrics and gynecology',
        '6': 'orthopedic/plastic surgery and dermatology',
        '7': 'laboratory tests, in vitro diagnostics',
        '8': 'multicategory medical devices, advanced electronic medical devices, and other uncategorized medical devices',
    }

    p = create_product()
    p['name'] = cells[4]

    # Dates are only set when they can be formatted; otherwise the value
    # provided by create_product() is kept.
    created = _format_device_timestamp(cells[3])
    if created is not None:
        p['created'] = created
    updated = _format_device_timestamp(cells[1])
    if updated is not None:
        p['updated'] = updated

    p['tag'] = remove_empty_string_from_array(
        [cells[5], cells[6], 'Japan PMDA', 'Medical Device'])
    p['asset']['lic'] = p['tag']
    p['asset']['stat'] = 2

    # Normalise the category cell to text so that numeric cells (e.g. 1)
    # still hit the lookup table and an empty cell does not crash.
    raw_category = cells[10]
    if isinstance(raw_category, str):
        category_key = raw_category
    elif raw_category is None:
        category_key = ''
    else:
        category_key = str(raw_category)
    category = review_category.get(category_key, category_key)
    # Fall back to the product name when no category text is available.
    p['abs'] = category if category else p['name']

    p['asset']['market'] = cells[9]
    p['addr']['country'] = 'Japan'
    p['addr']['city'] = 'Unknown'

    a = create_company()
    a['name'] = cells[0]
    a['abs'] = 'A Medical Device Company'
    a['addr'] = p['addr']
    a['tag'] = p['tag']
    return p, a
def parse_import_drug(self, table: dict):
    """Build product, company and manufacturer records from an imported-drug row.

    Args:
        table: Mapping from the source table's column headers to cell values.

    Returns:
        A dict with the keys ``product``, ``company`` and ``manufacture``.

    Raises:
        KeyError: If an expected column is absent from ``table``.
    """
    tech_columns = ('产品名称(英文)', '包装规格(中文)', '规格(中文)',
                    '发证日期', '有效期截止日', '药品本位码备注')

    drug = create_product()
    drug['name'] = self.use_nonempty(table['商品名(中文)'],
                                     table['商品名(英文)'])
    drug['abs'] = self.use_nonempty(table['产品名称(中文)'],
                                    table['产品名称(英文)'])
    drug['intro'] = table['商品名(英文)']
    licence_columns = ('注册证号', '原注册证号', '分包装批准文号', '药品本位码')
    drug['asset']['lic'] = [table[column] for column in licence_columns]
    for field, value in (('line1', table['生产地址']),
                         ('city', 'Unknown'),
                         ('country', '中国')):
        drug['addr'][field] = value
    drug['tag'] = [table['剂型(中文)'], table['产品类别'], 'Drug']

    tech = {}
    for column in tech_columns:
        tech[column] = table[column]
    drug['asset']['tech'] = self._dict_to_markdown_table(tech)

    # For both companies the English column is passed first and the Chinese
    # column second.
    holder = create_company()
    holder['name'] = self.use_nonempty(table['公司名称(英文)'],
                                       table['公司名称(中文)'])
    holder['addr']['city'] = 'Unknown'
    holder['addr']['line1'] = self.use_nonempty(table['地址(英文)'],
                                                table['地址(中文)'])
    holder['addr']['country'] = self.use_nonempty(table['国家/地区(英文)'],
                                                  table['国家/地区(中文)'])

    factory = create_company()
    factory['name'] = self.use_nonempty(table['生产厂商(英文)'],
                                        table['生产厂商(中文)'])
    factory['addr']['city'] = 'Unknown'
    factory['addr']['line1'] = self.use_nonempty(table['厂商地址(英文)'],
                                                 table['厂商地址(中文)'])
    factory['addr']['country'] = self.use_nonempty(table['厂商国家/地区(英文)'],
                                                   table['厂商国家/地区(中文)'])

    return dict(product=drug, company=holder, manufacture=factory)
def parse(self, response):
    """Archive a medicine page and write its product/applicant JSON.

    The raw response body is saved as ``<last URL segment>.html`` in
    ``self.work_directory``. The metadata stored in ``self.data`` under the
    response URL is combined with text scraped from the page, and the result
    is written to ``<last URL segment>.json`` as
    ``{'product': ..., 'applicant': ...}``.

    Args:
        response: The fetched page; ``url``, ``body`` and ``xpath`` are used.

    Raises:
        KeyError: If the URL or an expected field is missing from ``self.data``.
    """
    page_id = response.url.split('/')[-1]
    html_path = os.path.join(self.work_directory, page_id + '.html')
    with open(html_path, 'wb') as html_file:
        html_file.write(response.body)

    record = self.data[response.url]
    medicine = create_product()
    medicine['name'] = record['Medicine name']
    medicine['abs'] = record[
        'International non-proprietary name (INN) / common name']

    # ``tags`` is the very list stored on the product, so later appends are
    # visible through medicine['tag'] as well.
    tags = [record['Category'] + ' Medicine', 'EU', 'Drug']
    medicine['tag'] = tags
    areas = record['Therapeutic area']
    if areas is not None:
        tags.extend(areas.split(", "))
    medicine['website'] = response.url
    medicine['ref'] = response.url
    if record['Patient safety'] == 'no':
        tags.append('Patient Risk')
    if record['Orphan medicine'] == 'yes':
        tags.append('Orphan medicine')
    species = record['Species']
    if species is not None and len(species) > 0:
        tags.append('Species')

    medicine['updated'] = record['Revision date']
    medicine['created'] = record['First published']
    medicine['asset']['lic'] = medicine['tag']
    medicine['asset']['stat'] = 3
    medicine['asset']['type'] = 1

    summary_lines = response.xpath(
        "//div[contains(@class, 'views-field-field-ema-web-summary')]/div/p/text()"
    ).getall()
    medicine['intro'] = '\n'.join(summary_lines)
    medicine['asset']['market'] = dictionary_to_markdown(
        self.extract_market(response))
    tech = {}
    for key in ('Active substance', 'Type of withdrawal',
                'Date of withdrawal'):
        tech[key] = record[key]
    medicine['asset']['tech'] = dictionary_to_markdown(tech)
    medicine['addr']['city'] = 'Unknown'
    medicine['addr']['country'] = 'EU'

    holder = create_company()
    holder['name'] = record['Marketing authorisation holder/company name']
    holder['abs'] = 'A Medicine Company'
    holder['addr']['city'] = 'Unknown'
    holder['addr']['country'] = 'EU'
    holder['entr']['bp'] = response.xpath(
        "//div[contains(@class, 'views-field-view-2')]/span/li/a/@href"
    ).get()

    json_path = os.path.join(self.work_directory, page_id + '.json')
    with open(json_path, 'w') as json_file:
        json.dump({'product': medicine, 'applicant': holder}, json_file)
def parse_drug(cells):
    """Turn one drug row into a ``(product, company)`` pair.

    Args:
        cells: Indexable row of cell values. Indices 0, 2, 3, 5 and 6 are
            read.

    Returns:
        Tuple ``(p, a)``: the product record and the applicant company record.
        The company reuses the product's ``addr`` and ``tag`` objects.

    Raises:
        IndexError: If ``cells`` has fewer than 7 entries.
    """
    review_category = {
        '1': 'Gastrointestinal drugs, dermatologic drugs, immunosuppressive drugs, and others (not classified as other categories)',
        '2': "Cardiovascular drugs, antiparkinsonian drugs, anti-Alzheimer's drugs",
        '3-1': 'Central/peripheral nervous system drugs (excluding anesthetic drugs)',
        '3-2': 'Anesthetic drugs, sensory organ drugs (excluding drugs for inflammatory diseases), narcotics',
        '4': 'Antibacterial drugs, antiviral drugs (excluding AIDS drugs), antifungal drugs, antiprotozoal drugs, anthelmintic drugs',
        '5': 'Reproductive system drugs, drugs for urogenital system, combination drugs',
        '6-1': 'Respiratory tract drugs, anti-allergy drugs (excluding dermatologic drugs), sensory organ drugs (drugs for inflammatory diseases)',
        '6-2': 'Hormone drugs, drugs for metabolic disorders (including diabetes mellitus, osteoporosis, gout, and inborn errors of metabolism)',
    }

    p = create_product()
    p['name'] = cells[3]

    # Resolve the datetime class whether the name ``datetime`` is bound to
    # the module (``import datetime``) or to the class itself
    # (``from datetime import datetime``); isinstance() needs the class.
    datetime_type = getattr(datetime, 'datetime', datetime)
    approved = cells[2]
    if isinstance(approved, datetime_type):
        p['created'] = approved
    else:
        try:
            p['created'] = parser.parse(approved)
        except (ValueError, TypeError, OverflowError, AttributeError):
            # Best effort: an empty or malformed date cell keeps the value
            # provided by create_product().
            pass
    # NOTE(review): 'created' is stored as a datetime object here, not as a
    # formatted string -- confirm that downstream consumers expect that.

    category = review_category.get(cells[6], cells[6])
    p['tag'] = remove_empty_string_from_array([category, 'Japan PMDA', 'Drug'])
    p['asset']['lic'] = p['tag']
    p['asset']['stat'] = 2
    # NOTE(review): cells[0] feeds both the tech field and the company name
    # below -- confirm this is intended.
    p['asset']['tech'] = cells[0]

    # Fall back to the product name when the abstract cell is empty or None.
    p['abs'] = cells[5]
    if not p['abs']:
        p['abs'] = p['name']

    p['addr']['country'] = 'Japan'
    p['addr']['city'] = 'Unknown'

    a = create_company()
    a['name'] = cells[0]
    a['abs'] = 'A Drug Company'
    a['addr'] = p['addr']
    a['tag'] = p['tag']
    return p, a
def parse_domestic_cosmetic(self, table: dict):
    """Build product and manufacturer records from a domestic-cosmetic row.

    Args:
        table: Mapping from the source table's column headers to cell values.

    Returns:
        A dict with the keys ``product`` and ``manufacture``.

    Raises:
        KeyError: If an expected column is absent from ``table``.
    """
    maker_address = table['生产企业地址']

    item = create_product()
    item['name'] = table['产品名称']
    item['abs'] = table['产品名称备注']
    item['asset']['lic'] = [table['批准文号'], table['卫生许可证号']]
    for field, value in (('line1', maker_address),
                         ('city', 'Unknown'),
                         ('country', '中国')):
        item['addr'][field] = value
    item['tag'] = [table['产品类别'], 'Cosmetic']

    approval = {}
    for column in ('批件状态', '批准日期', '批件有效期', '备注'):
        approval[column] = table[column]
    item['asset']['tech'] = self._dict_to_markdown_table(approval)

    maker = create_company()
    # The trailing space inside this header is part of the key.
    maker['name'] = table['生产企业 ']
    for field, value in (('city', 'Unknown'),
                         ('line1', maker_address),
                         ('country', 'China')):
        maker['addr'][field] = value

    return dict(product=item, manufacture=maker)
def parse_list(self, response):
    """Parse a school's listing page and schedule its project pages.

    The raw page is saved as ``<last URL segment>.html`` in
    ``self.work_directory``. The school's project links are read from
    ``<school name>.json`` when that file exists; otherwise they are extracted
    from the page's ``<script>`` tags and written to that file. One request
    is yielded for every link that has no ``<project id>.json`` yet.

    Args:
        response: The fetched listing page.

    Yields:
        Follow-up requests handled by ``self.parse``, carrying the school
        record in ``meta['school']`` (plus a proxy when ``self.with_proxy``).
    """
    self.log('Parse list {}'.format(response.url), level=logging.INFO)
    name = response.url.split('/')[-1]
    with open(os.path.join(self.work_directory, name + '.html'), 'wb') as fo:
        fo.write(response.body)

    # Information about the school itself.
    school = create_company()
    meta = self.get_school_information(response)
    if 'Name' in meta:
        school['name'] = meta['Name']
    if 'URL' in meta:
        school['ref'] = meta['URL']
        school['contact']['website'] = meta['URL']
    if 'Group Type' in meta:
        school['abs'] = meta['Group Type']
    school['addr'] = deepcopy(self.address)
    school['addr']['line1'] = school['name']
    if school['name'] in self.blacklist:
        return

    cache_path = os.path.join(self.work_directory, school['name'] + '.json')
    patent_links = []
    if os.path.exists(cache_path):
        # Use a context manager so the cache file handle is always closed.
        with open(cache_path, 'r') as fi:
            patent_links = json.load(fi)
    else:
        # The project ids are provided inside a <script></script> element.
        for code in response.xpath("//script").getall():
            if 'id_list' not in code:
                continue
            id_arrays = re.findall(r'\[[0-9,]+\]', code)
            if not id_arrays:
                # 'id_list' is mentioned but no numeric array follows; skip
                # this script instead of failing on an empty match list.
                continue
            ids = re.findall(r'[0-9]+', id_arrays[0])
            patent_links = [
                'https://www.flintbox.com/public/project/{}'.format(patentId)
                for patentId in ids
            ]
        with open(cache_path, 'w') as fo:
            json.dump(patent_links, fo)

    for p in patent_links:
        project_id = p.split('/')[-1]
        if os.path.exists(os.path.join(self.work_directory, project_id + '.json')):
            self.log('{} already parsed and will skip'.format(p), level=logging.INFO)
            continue
        yield response.follow(
            url=p,
            callback=self.parse,
            dont_filter=True,
            meta={'proxy': POOL.get(), 'school': school} if self.with_proxy else {'school': school},
            errback=self.handle_failure)
def _record_field(record, key, default=''):
    """Return ``record[key]``, else ``record['openfda'][key]``, else ``default``.

    Unlike ``record.get(key, record['openfda'].get(key, default))`` this does
    not fail when the record has no ``openfda`` section.
    """
    if key in record:
        return record[key]
    return (record.get('openfda') or {}).get(key, default)


def _format_record_date(value):
    """Return ``value`` as a ``"%a, %d %b %Y %H:%M:%S GMT"`` string, or None."""
    try:
        return parser.parse(value).strftime("%a, %d %b %Y %H:%M:%S GMT")
    except (ValueError, TypeError, OverflowError, AttributeError):
        # Best effort: missing or malformed dates are simply not recorded.
        return None


def main():
    """Load the local FDA JSON export and register every record.

    For each entry in ``results`` a product and an applicant company are
    created with ``add_record('entity', ...)`` and then linked through an
    'Applicant' relationship. Failures are logged and do not stop the run.
    """
    product_code = get_product_code()
    log = create_logger('510K')
    source_path = os.path.expanduser(
        '~/work/fda/device-classification-0001-of-0001.json')
    with open(source_path, 'r', encoding='utf-8') as fi:
        result = json.load(fi)
    log.critical(datetime.datetime.now())

    for r in result['results']:
        advisory = _record_field(r, 'advisory_committee_description')
        specialty = _record_field(r, 'medical_specialty_description')
        code_key = _record_field(r, 'product_code')

        p = create_product()
        p['name'] = _record_field(r, 'device_name')
        p['ref'] = _record_field(r, 'k_number')
        for target, source in (('line1', 'address_1'),
                               ('line2', 'address_2'),
                               ('city', 'city'),
                               ('state', 'state'),
                               ('zip', 'zip_code'),
                               ('country', 'country_code')):
            p['addr'][target] = _record_field(r, source)
        p['intro'] = _record_field(r, 'statement_or_summary')
        p['asset']['type'] = 0

        # p['tag'] holds tags readable to common users; p['asset']['lic']
        # holds tags specific to the product.
        p['tag'] = [advisory, specialty, 'FDA', 'Medical Device', '510K']
        p['asset']['lic'] = [
            'FDA',
            '510K',
            _record_field(r, 'clearance_type'),
            advisory,
            specialty,
            code_key,
            _record_field(r, 'regulation_number'),
            _record_field(r, 'decision_description'),
        ]
        p['asset']['lic'].extend(
            third_party(_record_field(r, 'third_party_flag')))

        # An empty, missing or explicit 'N' flag means no expedited review.
        expedited = _record_field(r, 'expedited_review_flag')
        if expedited and expedited != 'N':
            p['asset']['lic'].append('Expedited Review')

        # Submission types '1' and '2' are deliberately not tagged.
        submission_id = _record_field(r, 'submission_type_id')
        if submission_id not in {'1', '2'}:
            submission_label = submission_type(submission_id)
            if submission_label is not None:
                p['asset']['lic'].append(submission_label)
                p['tag'].append(submission_label)

        code = product_code.get(code_key, None)
        if code is not None:
            p['abs'] = code['device_name']
            p['asset']['lic'].extend([
                'Class ' + code['device_class'],
                # gmp_exempt_flag == 'N' means the device is NOT exempt.
                'GMP Required' if code['gmp_exempt_flag'] == 'N' else 'GMP Exempt',
            ])
            p['tag'].append('Class ' + code['device_class'])
            if code['implant_flag'] != 'N':
                p['asset']['lic'].append('Implant')
                p['tag'].append('Implant')
            if code['life_sustain_support_flag'] != 'N':
                p['asset']['lic'].append('Life Sustain Support')
                p['tag'].append('Life Sustain Support')
        else:
            p['abs'] = p['name']

        p['asset']['stat'] = map_status(_record_field(r, 'decision_code'))

        created = _format_record_date(_record_field(r, 'date_received', None))
        if created is not None:
            p['created'] = created
        updated = _format_record_date(_record_field(r, 'decision_date', None))
        if updated is not None:
            p['updated'] = updated

        p['asset']['lic'] = remove_empty_string_from_array(p['asset']['lic'])
        p['tag'] = remove_empty_string_from_array(p['tag'])

        a = create_company()
        a['name'] = _record_field(r, 'applicant')
        a['abs'] = 'A Medical Device Company'
        a['addr'] = p['addr']
        a['tag'] = p['tag']
        a['group']['parentId'] = '000000000000000000000000'
        # contact is just the name of contact

        response = add_record('entity', [p, a])
        if response['_status'] != 'OK':
            log.error('fail to create record for {}'.format(p['name']))
            continue

        # _items follows the order of the request: [product, applicant].
        applicant_product = create_relationship(response['_items'][1]['_id'],
                                                response['_items'][0]['_id'])
        applicant_product['type'] = 7
        applicant_product['name'] = 'Applicant'
        applicant_product['abs'] = 'Applicant'
        response = add_record('relationship', [applicant_product])
        if response['_status'] != 'OK':
            log.error('fail to create relationship for {}'.format(p['name']))
        else:
            log.debug('added {} to the system'.format(p['name']))

    log.critical(datetime.datetime.now())
def parse(self, response):
    """Archive a medicine page and write its product/applicant JSON.

    The raw response body is saved as ``<last URL segment>.html`` in
    ``self.work_directory``. The metadata stored in ``self.data`` under the
    response URL is combined with data scraped from the page, and the result
    is written to ``<last URL segment>.json`` as
    ``{'product': ..., 'applicant': ...}``.

    Args:
        response: The fetched page; ``url``, ``body`` and ``xpath`` are used.

    Raises:
        KeyError: If the URL or an expected field is missing from ``self.data``.
    """
    # Metadata fields that become a tag of the same name when set to 'yes'.
    # Each field is listed once so that no tag is added twice.
    yes_flags = (
        'Additional monitoring',
        'Generic',
        'Biosimilar',
        'Conditional approval',
        'Exceptional circumstances',
        'Accelerated assessment',
        'Orphan medicine',
    )
    # Metadata fields rendered into the product's tech table, in this order.
    tech_fields = (
        'Active substance',
        'Revision number',
        'ATCvet code',
        'First published',
        'Revision date',
        'Decision date',
        'Date of opinion',
        'Marketing authorisation date',
        'Date of refusal of marketing authorisation',
        'ATC code',
        'Product number',
        'Human pharmacotherapeutic group',
        'Vet pharmacotherapeutic group',
        'Condition / indication',
    )

    name = response.url.split('/')[-1]
    with open(os.path.join(self.work_directory, name + '.html'), 'wb') as fo:
        fo.write(response.body)

    data = self.data[response.url]
    p = create_product()
    p['name'] = data['Medicine name']
    p['abs'] = data['International non-proprietary name (INN) / common name']
    p['tag'] = [data['Category'] + ' Medicine', 'EU', 'Drug']
    if data['Therapeutic area'] is not None:
        p['tag'].extend(data['Therapeutic area'].split(", "))
    p['contact']['website'] = response.url
    p['ref'] = response.url

    if data['Patient safety'] == 'no':
        p['tag'].append('Patient Risk')
    for flag in yes_flags:
        if data[flag] == 'yes':
            p['tag'].append(flag)
    if data['Species'] is not None and len(data['Species']) > 0:
        p['tag'].append('Species')

    p['updated'] = data['Decision date']
    p['created'] = data['First published']
    p['asset']['lic'] = p['tag']
    p['asset']['stat'] = self.status_code(data['Authorisation status'])
    p['asset']['type'] = 1
    p['intro'] = '\n'.join(
        response.xpath(
            "//div[contains(@class, 'field-name-field-ema-web-summary')]/div/div/div/p/text()"
        ).getall())
    p['asset']['market'] = dictionary_to_markdown(
        self.extract_market(response))
    p['asset']['tech'] = dictionary_to_markdown(
        {key: data[key] for key in tech_fields})

    a = create_company()
    a['name'] = data['Marketing authorisation holder/company name']
    a['abs'] = 'A Medicine Company'
    a['addr'] = self.extract_address(response)
    a['entr']['bp'] = response.xpath(
        "//div[contains(@class, 'field-name-ema-medicine-all-author-pres-docs')]/div/div/div/section/div/div/div/div/ul/li/a/@href"
    ).get()
    # The product reuses the applicant's address object.
    p['addr'] = a['addr']

    with open(os.path.join(self.work_directory, name + '.json'), 'w') as fo:
        json.dump({'product': p, 'applicant': a}, fo)