def normalize_synonyms(abstract, lookup_dict):
    """Replace chemical-entity mentions in *abstract* with canonical synonyms.

    Runs chemdataextractor over the text; every chemical entity mention (CEM)
    whose lowercased text is a key of ``lookup_dict`` is replaced in place by
    the mapped canonical name.

    Parameters
    ----------
    abstract : str
        Abstract text to normalize.
    lookup_dict : dict
        Maps lowercased entity names to their canonical replacement string.

    Returns
    -------
    str
        The abstract with all matched entities replaced.
    """
    doc = Document(abstract)
    # Collect (start, end, lowercased text) for every CEM present in the lookup.
    matches = [
        (cem.start, cem.end, cem.text.lower())
        for cem in doc.cems
        if cem.text.lower() in lookup_dict
    ]
    # Replace left-to-right; index_change tracks how much earlier replacements
    # have shifted the offsets of later mentions.
    index_change = 0
    for start, end, name in sorted(matches):
        replace_name = lookup_dict[name]
        abstract = (abstract[:start + index_change]
                    + replace_name
                    + abstract[end + index_change:])
        index_change += len(replace_name) - len(name)
    return abstract
def get_img(self, doc):
    """Get images from doc using chemdataextractor.

    Parameters
    ----------
    doc : sequence
        ``doc[0]`` is the article identifier, ``doc[1]`` the article file path.

    Returns
    -------
    list of tuple or None
        ``(article id, image id, image url, flattened caption)`` for each
        figure whose records mention ``self.img_type``; ``None`` when no
        relevant figure is found.
    """
    tem_images = []
    # Load document image data from file; the original leaked the handle,
    # so close it deterministically once the figures are extracted.
    with open(doc[1], "rb") as fh:
        cde_doc = Document.from_file(fh)
        log.info('This article is : %s' % doc[0])
        imgs = cde_doc.figures
        del cde_doc  # release the parsed document early; only figures are needed
    # Identify relevant images from records
    for img in imgs:
        detected = False  # Used to avoid processing the same image twice
        records = img.records
        caption = img.caption
        for record in records:
            if detected is True:
                break
            rec = record.serialize()
            # NOTE(review): rec.values() appears to hold lists, hence the
            # membership test against the one-element list [self.img_type].
            if [self.img_type] in rec.values():
                detected = True
                log.info('%s instance found!' % self.img_type)
                tem_images.append((doc[0], img.id, img.url,
                                   caption.text.replace('\n', ' ')))
    # Preserve the original contract: None rather than an empty list.
    return tem_images if tem_images else None
def test_document_usage(self):
    """Test SpringerHtmlReader used via Document.from_file."""
    # (Docstring previously said RscHtmlReader; the code uses SpringerHtmlReader.)
    fname = '1752-153X-5-55.html'
    path = os.path.join(os.path.dirname(__file__), 'data', 'springer', fname)
    # Close the file handle after parsing (the original left it open).
    with io.open(path, 'rb') as f:
        d = Document.from_file(f, readers=[SpringerHtmlReader()])
    self.assertEqual(len(d.elements), 97)
def test_document_usage(self):
    """Test UsptoXmlReader used via Document.from_file."""
    fname = 'US06840965B2.xml'
    path = os.path.join(os.path.dirname(__file__), 'data', 'uspto', fname)
    # Close the file handle after parsing (the original left it open).
    with io.open(path, 'rb') as f:
        d = Document.from_file(f, readers=[UsptoXmlReader()])
    self.assertEqual(len(d.elements), 112)
def test_document_usage(self):
    """Test ElsevierHtmlReader used via Document.from_file."""
    fname = 'S0143720816310816.html'
    path = os.path.join(os.path.dirname(__file__), 'data', 'elsevier', fname)
    # Close the file handle after parsing (the original left it open).
    with io.open(path, 'rb') as f:
        d = Document.from_file(f, readers=[ElsevierHtmlReader()])
    self.assertEqual(len(d.elements), 246)
def list_chemicals(foin):
    """Tabulate every chemical entity mention found in the file *foin*.

    Parameters
    ----------
    foin : str
        Path of the document to scan with chemdataextractor.

    Returns
    -------
    PrettyTable
        Left-aligned, borderless table with one row per entity:
        filename, running count, start offset, end offset, entity text.
    """
    # Parse the document; close the handle (the original leaked it).
    with open(foin, 'rb') as fchem:
        docchem = Document.from_file(fchem)
    table = PrettyTable(['Filename', 'Entity_count', 'Start', 'End', 'Entity'])
    for count, cem in enumerate(docchem.cems, start=1):
        table.add_row([foin, count, cem.start, cem.end, cem.text])
    table.align = 'l'
    table.border = False
    return table
def normalize_elements(abstract):
    """Replace bare element symbols in *abstract* with full element names.

    Runs chemdataextractor over the text; every chemical entity mention whose
    exact text is an element symbol (e.g. "Fe") is replaced by the element's
    full name (e.g. "iron"). The British spelling "aluminium" is additionally
    mapped to the American "aluminum".

    Parameters
    ----------
    abstract : str
        Abstract text to normalize.

    Returns
    -------
    str
        The abstract with element symbols expanded to names.
    """
    ELEMENTS = ["H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na",
                "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K", "Ca", "Sc", "Ti",
                "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge",
                "As", "Se", "Br", "Kr", "Rb", "Sr", "Y", "Zr", "Nb", "Mo",
                "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te",
                "I", "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm",
                "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu", "Hf",
                "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb",
                "Bi", "Po", "At", "Rn", "Fr", "Ra", "Ac", "Th", "Pa", "U",
                "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No",
                "Lr", "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn",
                "Nh", "Fl", "Mc", "Lv", "Ts", "Og", "Uue"]
    ELEMENT_NAMES = ["hydrogen", "helium", "lithium", "beryllium", "boron",
                     "carbon", "nitrogen", "oxygen", "fluorine", "neon",
                     "sodium", "magnesium", "aluminium", "silicon",
                     "phosphorus", "sulfur", "chlorine", "argon", "potassium",
                     "calcium", "scandium", "titanium", "vanadium", "chromium",
                     "manganese", "iron", "cobalt", "nickel", "copper", "zinc",
                     "gallium", "germanium", "arsenic", "selenium", "bromine",
                     "krypton", "rubidium", "strontium", "yttrium",
                     "zirconium", "niobium", "molybdenum", "technetium",
                     "ruthenium", "rhodium", "palladium", "silver", "cadmium",
                     "indium", "tin", "antimony", "tellurium", "iodine",
                     "xenon", "cesium", "barium", "lanthanum", "cerium",
                     "praseodymium", "neodymium", "promethium", "samarium",
                     "europium", "gadolinium", "terbium", "dysprosium",
                     "holmium", "erbium", "thulium", "ytterbium", "lutetium",
                     "hafnium", "tantalum", "tungsten", "rhenium", "osmium",
                     "iridium", "platinum", "gold", "mercury", "thallium",
                     "lead", "bismuth", "polonium", "astatine", "radon",
                     "francium", "radium", "actinium", "thorium",
                     "protactinium", "uranium", "neptunium", "plutonium",
                     "americium", "curium", "berkelium", "californium",
                     "einsteinium", "fermium", "mendelevium", "nobelium",
                     "lawrencium", "rutherfordium", "dubnium", "seaborgium",
                     "bohrium", "hassium", "meitnerium", "darmstadtium",
                     "roentgenium", "copernicium", "nihonium", "flerovium",
                     "moscovium", "livermorium", "tennessine", "oganesson",
                     "ununennium"]
    # Symbol -> full name, plus the British-to-American spelling fix.
    element_dict = dict(zip(ELEMENTS, ELEMENT_NAMES))
    element_dict['aluminium'] = 'aluminum'
    doc = Document(abstract)
    # Collect (start, end, text) for every CEM that is exactly an element key.
    matches = [
        (cem.start, cem.end, cem.text)
        for cem in doc.cems
        if cem.text in element_dict
    ]
    # Replace left-to-right; index_change tracks how much earlier replacements
    # have shifted the offsets of later mentions.
    index_change = 0
    for start, end, name in sorted(matches):
        replacement = element_dict[name]
        abstract = (abstract[:start + index_change]
                    + replacement
                    + abstract[end + index_change:])
        index_change += len(replacement) - len(name)
    return abstract
def find_all_unique_entities(abstracts):
    """Collect the unique primary entity names across a list of abstracts.

    Parameters
    ----------
    abstracts : list of str
        Abstract texts to scan with chemdataextractor.

    Returns
    -------
    list of str
        De-duplicated first ``'names'`` entry of every serialized record
        (order not guaranteed, as with the original set() round-trip).
    """
    entities = []
    for i, abstract in enumerate(abstracts):
        # Progress indicator every 10 abstracts.
        if i % 10 == 0:
            print('{} %'.format(round(i / len(abstracts) * 100, 3)))
        doc = Document(abstract)
        for record in doc.records:
            try:
                entities.append(record.serialize()['names'][0])
            except (KeyError, IndexError):
                # Record has no 'names' entry (or it is empty); skip it.
                # (Original used a bare except; narrowed to the realistic failures.)
                pass
    return list(set(entities))
def remove_abbreviations(abstract):
    """Expand defined abbreviations in *abstract* to their long forms.

    Uses chemdataextractor's abbreviation detection. For each detected
    abbreviation: the first parenthesized occurrence — e.g. "(PEG)" — is
    deleted together with its parentheses, and every remaining standalone
    occurrence is replaced by the long form via a regex substitution.
    Offsets into the evolving string are tracked with ``index_change``.

    Parameters
    ----------
    abstract : str
        Abstract text to process.

    Returns
    -------
    str
        The abstract with abbreviations expanded (unchanged if none found).
    """
    doc = Document(abstract)
    abbvs = doc.abbreviation_definitions
    cems = doc.cems
    if len(abbvs) > 0:
        # abbv_dict maps abbreviation text -> [long form, earliest start offset]
        abbv_dict = {}
        for abbv in abbvs:
            cem_starts = []
            cem_ends = []
            # abbv appears to be ((short tokens,), long tokens, type);
            # skip entries with no type. TODO(review): confirm tuple layout
            # against the chemdataextractor version in use.
            if abbv[-1] is not None:
                abbv_dict[abbv[0][0]] = [' '.join(abbv[1])]
                # Find every CEM whose text equals the abbreviation.
                for cem in cems:
                    if cem.text == abbv[0][0]:
                        cem_starts.append(cem.start)
                        cem_ends.append(cem.end)
                # Record the earliest start offset (0 if never seen as a CEM).
                if len(cem_starts) > 0:
                    low_idx = cem_starts[np.argmin(cem_starts)]
                else:
                    low_idx = 0
                abbv_dict[abbv[0][0]].append(low_idx)
        # Process abbreviations in order of first appearance so the running
        # offset correction below stays valid.
        abbv_dict = {k: v for k, v in sorted(abbv_dict.items(), key=lambda item: item[1][1])}
        # Cumulative shift caused by deletions already applied to `abstract`.
        index_change = 0
        for abbv in abbv_dict.keys():
            non_abbv = abbv_dict[abbv][0]
            # Offset 0 means the abbreviation was never located as a CEM; skip.
            if abbv_dict[abbv][1] != 0:
                # NOTE(review): replacement_delta is computed but never used.
                replacement_delta = len(non_abbv) - len(abbv)
                cem_starts = []
                cem_ends = []
                for cem in cems:
                    if cem.text == abbv:
                        cem_starts.append(cem.start)
                        cem_ends.append(cem.end)
                if len(cem_starts) == 1:
                    # Single occurrence: if it is wrapped in parentheses,
                    # remove " (ABBV)" entirely (the -2/+1 span also eats the
                    # preceding space and both parentheses).
                    if abstract[cem_starts[0]+index_change-1]+abstract[cem_ends[0]+index_change] == '()':
                        abstract = abstract[:cem_starts[0]-2+index_change] + abstract[cem_ends[0]+1+index_change:]
                        # Deleted span length is (end - start) + 3 characters.
                        index_change += cem_starts[0] - cem_ends[0] - 3
                    else:
                        pass
                else:
                    # Multiple occurrences: only strip the earliest one if it
                    # is parenthesized (that is where it was defined).
                    low_idx = np.argmin(cem_starts)
                    cem_start_low = cem_starts[low_idx]
                    cem_end_low = cem_ends[low_idx]
                    if abstract[cem_start_low+index_change-1]+abstract[cem_end_low+index_change] == '()':
                        abstract = abstract[:cem_start_low-2+index_change] + abstract[cem_end_low+1+index_change:]
                        index_change += cem_start_low - cem_end_low - 3
                    else:
                        pass
                # Replace remaining standalone occurrences (preceded by
                # whitespace, followed by punctuation/whitespace/end) with the
                # long form. NOTE(review): `abbv` is interpolated unescaped,
                # so regex metacharacters in an abbreviation would misfire.
                abstract = re.sub(r'([\s]){}([.,;\s]|$)'.format(abbv), r' {}\2'.format(non_abbv), abstract)
            else:
                pass
    return abstract
def build_pubchem_synonym_dict(abstracts):
    """Resolve entities found in *abstracts* to PubChem CIDs and group synonyms.

    For every named entity extracted by chemdataextractor, queries PubChem
    (via ``pcp.get_compounds``) once per distinct lowercased entity, records
    its CID, and groups all entity spellings sharing a CID as synonyms.

    Parameters
    ----------
    abstracts : list of str
        Abstract texts to scan.

    Returns
    -------
    tuple
        ``(lookup_dict, entity_to_cid, cid_to_synonyms)`` where
        ``lookup_dict`` maps each lowercased entity to the first synonym
        recorded for its CID, ``entity_to_cid`` maps lowercased entity ->
        CID string, and ``cid_to_synonyms`` maps CID string -> list of
        original entity spellings.
    """
    entity_to_cid = {}
    cid_to_synonyms = {}
    for i, abstract in enumerate(abstracts):
        # Progress indicator every 100 abstracts.
        if i % 100 == 0:
            print('{} %'.format(round(i / len(abstracts) * 100, 2)))
        # Gather all named entities in this abstract.
        entities = []
        doc = Document(abstract)
        for record in doc.records:
            try:
                entities.append(record.serialize()['names'][0])
            except (KeyError, IndexError):
                # Record has no 'names' entry (or it is empty); skip it.
                # (Original used a bare except; narrowed to realistic failures.)
                pass
        # Resolve each not-yet-seen entity to a CID and group by CID.
        for entity in entities:
            if entity.lower() in entity_to_cid:
                continue  # already resolved; avoid a duplicate network call
            compounds = pcp.get_compounds(entity, 'name')
            if len(compounds) >= 1:
                try:
                    cid = str(compounds[0].cid)
                    entity_to_cid[entity.lower()] = cid
                    cid_to_synonyms.setdefault(cid, []).append(entity)
                except TimeoutError:
                    # PubChem lookup timed out; best-effort, skip this entity.
                    pass
    # Build the lookup table: each entity maps to its CID's first synonym.
    lookup_dict = {entity: cid_to_synonyms[cid][0]
                   for entity, cid in entity_to_cid.items()}
    return lookup_dict, entity_to_cid, cid_to_synonyms