예제 #1
0
def normalize_synonyms(abstract, lookup_dict):
    """Replace known chemical synonyms in ``abstract`` with their mapped names.

    Chemical mentions are taken from ``Document(abstract).cems``. Every
    mention whose lower-cased text is a key of ``lookup_dict`` is replaced by
    the corresponding value; all other text is copied through unchanged.

    Args:
        abstract: Text to normalize.
        lookup_dict: Mapping from lower-cased mention text to the replacement
            string.

    Returns:
        The abstract with every matched mention replaced.
    """
    doc = Document(abstract)

    # Collect (start, end, key) for every mention that has a replacement.
    # NOTE(review): assumes cem.start / cem.end are character offsets into
    # ``abstract`` itself, as the slicing below requires.
    matches = []
    for cem in doc.cems:
        key = cem.text.lower()
        if key in lookup_dict:
            matches.append((cem.start, cem.end, key))
    matches.sort(key=lambda match: match[0])

    # Rebuild the text left to right from the untouched gaps and the
    # replacements. Working from the original offsets removes the need for a
    # running index correction, and the span width (end - start) is used
    # implicitly instead of len(lower-cased text), which can differ for some
    # Unicode characters.
    pieces = []
    cursor = 0
    for start, end, key in matches:
        if start < cursor:
            # Overlaps a span that was already replaced; splicing it as well
            # would corrupt the output, so keep the earlier replacement only.
            continue
        pieces.append(abstract[cursor:start])
        pieces.append(lookup_dict[key])
        cursor = end
    pieces.append(abstract[cursor:])
    return ''.join(pieces)
예제 #2
0
    def get_img(self, doc):
        """Get images of type ``self.img_type`` from doc using chemdataextractor.

        Args:
            doc: Indexable pair. ``doc[0]`` is the article identifier used in
                the log message and in the output tuples; ``doc[1]`` is the
                path of the file to parse.

        Returns:
            A list of ``(doc[0], figure id, figure url, caption text)`` tuples,
            one per figure that has at least one record whose serialized
            values contain ``[self.img_type]``. Newlines in the caption text
            are replaced by spaces. Returns ``None`` when no figure matches.
        """

        # Load document image data from file. The handle is closed as soon as
        # parsing is done instead of being leaked.
        # NOTE(review): assumes Document.from_file reads the stream eagerly,
        # so the figures stay available after the file is closed.
        with open(doc[1], "rb") as handle:
            cde_doc = Document.from_file(handle)
        log.info('This article is : %s' % doc[0])
        imgs = cde_doc.figures
        del cde_doc

        # Identify relevant images from records
        tem_images = []
        for img in imgs:
            caption = img.caption
            for record in img.records:
                if [self.img_type] in record.serialize().values():
                    log.info('%s instance found!' % self.img_type)
                    tem_images.append((doc[0], img.id, img.url, caption.text.replace('\n', ' ')))
                    # One hit is enough: avoid adding the same image twice.
                    break

        return tem_images if tem_images else None
예제 #3
0
 def test_document_usage(self):
     """Test SpringerHtmlReader used via Document.from_file."""
     fname = '1752-153X-5-55.html'
     path = os.path.join(os.path.dirname(__file__), 'data', 'springer', fname)
     # Close the fixture file deterministically instead of leaking the handle.
     with io.open(path, 'rb') as f:
         d = Document.from_file(f, readers=[SpringerHtmlReader()])
     self.assertEqual(len(d.elements), 97)
 def test_document_usage(self):
     """Test UsptoXmlReader used via Document.from_file."""
     fname = 'US06840965B2.xml'
     path = os.path.join(os.path.dirname(__file__), 'data', 'uspto', fname)
     # Close the fixture file deterministically instead of leaking the handle.
     with io.open(path, 'rb') as f:
         d = Document.from_file(f, readers=[UsptoXmlReader()])
     self.assertEqual(len(d.elements), 112)
예제 #5
0
 def test_document_usage(self):
     """Test ElsevierHtmlReader used via Document.from_file."""
     fname = 'S0143720816310816.html'
     path = os.path.join(os.path.dirname(__file__), 'data', 'elsevier', fname)
     # Close the fixture file deterministically instead of leaking the handle.
     with io.open(path, 'rb') as f:
         d = Document.from_file(f, readers=[ElsevierHtmlReader()])
     self.assertEqual(len(d.elements), 246)
예제 #6
0
def list_chemicals(foin):
    """Tabulate the chemical mentions found in a document file.

    Args:
        foin: Path of the file to parse; it is also written into the
            ``Filename`` column of every row.

    Returns:
        A left-aligned, borderless ``PrettyTable`` with the columns
        ``Filename``, ``Entity_count`` (1-based running number), ``Start``,
        ``End`` and ``Entity``, holding one row per entry of the document's
        ``cems``.
    """
    # Close the input file once it has been parsed instead of leaking it.
    with open(foin, 'rb') as fchem:
        docchem = Document.from_file(fchem)

    t = PrettyTable(['Filename', 'Entity_count', 'Start', 'End', 'Entity'])
    for ct, cem in enumerate(docchem.cems, 1):
        t.add_row([foin, ct, cem.start, cem.end, cem.text])
    t.align = 'l'
    t.border = False
    return t
예제 #7
0
def normalize_elements(abstract):
    """Replace element symbols in ``abstract`` with lower-case element names.

    Chemical mentions are taken from ``Document(abstract).cems``. A mention is
    replaced only when its text is exactly an element symbol from the table
    below (case-sensitive) or the spelling ``aluminium``, which is rewritten
    to ``aluminum``.

    Args:
        abstract: Text to normalize.

    Returns:
        The abstract with every matched mention replaced.
    """
    ELEMENTS = ["H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne", "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar", "K",
                "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "Ge", "As", "Se", "Br", "Kr",
                "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd", "In", "Sn", "Sb", "Te", "I",
                "Xe", "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb",
                "Lu", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb", "Bi", "Po", "At", "Rn", "Fr",
                "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk", "Cf", "Es", "Fm", "Md", "No", "Lr", "Rf",
                "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Nh", "Fl", "Mc", "Lv", "Ts", "Og", "Uue"]

    ELEMENT_NAMES = ["hydrogen", "helium", "lithium", "beryllium", "boron", "carbon", "nitrogen", "oxygen", "fluorine",
                     "neon", "sodium", "magnesium", "aluminium", "silicon", "phosphorus", "sulfur", "chlorine", "argon",
                     "potassium", "calcium", "scandium", "titanium", "vanadium", "chromium", "manganese", "iron",
                     "cobalt", "nickel", "copper", "zinc", "gallium", "germanium", "arsenic", "selenium", "bromine",
                     "krypton", "rubidium", "strontium", "yttrium", "zirconium", "niobium", "molybdenum", "technetium",
                     "ruthenium", "rhodium", "palladium", "silver", "cadmium", "indium", "tin", "antimony", "tellurium",
                     "iodine", "xenon", "cesium", "barium", "lanthanum", "cerium", "praseodymium", "neodymium",
                     "promethium", "samarium", "europium", "gadolinium", "terbium", "dysprosium", "holmium", "erbium",
                     "thulium", "ytterbium", "lutetium", "hafnium", "tantalum", "tungsten", "rhenium", "osmium",
                     "iridium", "platinum", "gold", "mercury", "thallium", "lead", "bismuth", "polonium", "astatine",
                     "radon", "francium", "radium", "actinium", "thorium", "protactinium", "uranium", "neptunium",
                     "plutonium", "americium", "curium", "berkelium", "californium", "einsteinium", "fermium",
                     "mendelevium", "nobelium", "lawrencium", "rutherfordium", "dubnium", "seaborgium", "bohrium",
                     "hassium", "meitnerium", "darmstadtium", "roentgenium", "copernicium", "nihonium", "flerovium",
                     "moscovium", "livermorium", "tennessine", "oganesson", "ununennium"]

    element_dict = dict(zip(ELEMENTS, ELEMENT_NAMES))
    # NOTE(review): "Al" maps to "aluminium" while the written-out "aluminium"
    # maps to "aluminum"; kept as in the original table - confirm which
    # spelling is wanted.
    element_dict['aluminium'] = 'aluminum'

    doc = Document(abstract)

    # Collect (start, end, text) for every mention that has a replacement.
    # NOTE(review): assumes cem.start / cem.end are character offsets into
    # ``abstract`` itself, as the slicing below requires.
    matches = []
    for cem in doc.cems:
        if cem.text in element_dict:
            matches.append((cem.start, cem.end, cem.text))
    matches.sort(key=lambda match: match[0])

    # Rebuild the text left to right from the untouched gaps and the
    # replacements; working from the original offsets removes the need for a
    # running index correction.
    pieces = []
    cursor = 0
    for start, end, text in matches:
        if start < cursor:
            # Overlaps a span that was already replaced; splicing it as well
            # would corrupt the output, so keep the earlier replacement only.
            continue
        pieces.append(abstract[cursor:start])
        pieces.append(element_dict[text])
        cursor = end
    pieces.append(abstract[cursor:])
    return ''.join(pieces)
예제 #8
0
def find_all_unique_entities(abstracts):
    """Collect the distinct primary entity names found in ``abstracts``.

    Each abstract is parsed with ``Document``; for every record the first
    entry of the serialized ``'names'`` list is collected. Records without a
    usable ``'names'`` entry are skipped. Progress is printed as a percentage
    every 10 abstracts.

    Args:
        abstracts: Sized sequence of abstract texts.

    Returns:
        A list of the unique names, in no particular order.
    """
    entities = set()
    for i, abstract in enumerate(abstracts):
        if i % 10 == 0:
            print('{} %'.format(round(i / len(abstracts) * 100, 3)))
        # Read the records once per abstract; indexing ``doc.records`` inside
        # the loop evaluated the attribute again for every record.
        for record in Document(abstract).records:
            try:
                entities.add(record.serialize()['names'][0])
            except (KeyError, IndexError, TypeError):
                # No 'names' entry, or an empty/None one: nothing to collect.
                continue
    return list(entities)
예제 #9
0
def remove_abbreviations(abstract):
    """Expand chemical abbreviations in ``abstract`` to their long forms.

    Abbreviation definitions come from ``Document(abstract)``. A definition is
    used only when its last element (the entity tag) is not ``None`` and its
    first abbreviation token also occurs as a chemical mention (``doc.cems``)
    starting at an offset other than 0. For each such abbreviation:

    1. If its earliest mention is directly wrapped in parentheses, that
       parenthetical and the character before the "(" are removed, e.g.
       " (THF)".
    2. Every occurrence that follows a whitespace character and is followed
       by ``.``, ``,``, ``;``, whitespace or the end of the text is replaced
       by the long form (its tokens joined with single spaces).

    Args:
        abstract: Text to process.

    Returns:
        The text with defining parentheticals removed and abbreviations
        expanded; the input unchanged when there are no definitions.
    """
    doc = Document(abstract)
    definitions = doc.abbreviation_definitions
    if not definitions:
        return abstract

    # Earliest (start, end) span of every chemical mention, keyed by its text.
    first_span = {}
    for cem in doc.cems:
        known = first_span.get(cem.text)
        if known is None or cem.start < known[0]:
            first_span[cem.text] = (cem.start, cem.end)

    # abbreviation -> (long form, start, end) of its earliest mention.
    expansions = {}
    for definition in definitions:
        if definition[-1] is None:
            continue
        short = definition[0][0]
        span = first_span.get(short)
        # As before, offset 0 doubles as "not found": such abbreviations are
        # left alone entirely.
        if span is None or span[0] == 0:
            continue
        expansions[short] = (' '.join(definition[1]), span[0], span[1])

    ordered = sorted(expansions.items(), key=lambda item: item[1][1])

    # Pass 1: drop the defining parentheticals. All removals are done before
    # any substitution so that ``offset`` (characters removed so far) keeps
    # the original mention offsets valid; substituting in between changed the
    # text length without updating the offset.
    offset = 0
    for short, (long_form, start, end) in ordered:
        open_at = start + offset - 1
        close_at = end + offset
        if open_at < 0 or close_at >= len(abstract):
            # Mention touches the text boundary, so it cannot be parenthesized.
            continue
        if abstract[open_at] == '(' and abstract[close_at] == ')':
            # Also drop the character before "(" (normally a space), unless
            # "(" is the very first character.
            cut_from = max(open_at - 1, 0)
            abstract = abstract[:cut_from] + abstract[close_at + 1:]
            offset -= close_at + 1 - cut_from

    # Pass 2: expand the remaining occurrences. The abbreviation is escaped
    # so regex metacharacters in it match literally, and the long form is
    # inserted through a function so backslashes in it are not read as
    # group references. The leading whitespace character is replaced by a
    # single space, as before.
    for short, (long_form, start, end) in ordered:
        pattern = re.compile(r'([\s]){}([.,;\s]|$)'.format(re.escape(short)))
        abstract = pattern.sub(
            lambda match, text=long_form: ' ' + text + match.group(2),
            abstract)
    return abstract
예제 #10
0
def build_pubchem_synonym_dict(abstracts):
    """Build synonym lookup tables by resolving entity names through PubChem.

    Each abstract is parsed with ``Document``; the first serialized name of
    every record is looked up with ``pcp.get_compounds(name, 'name')``, and
    the CID of the first hit is recorded. Names whose lower-cased form is
    already resolved are not queried again. Progress is printed as a
    percentage every 100 abstracts.

    Args:
        abstracts: Sized sequence of abstract texts.

    Returns:
        A tuple ``(lookup_dict, entity_to_cid, cid_to_synonyms)``:
        ``entity_to_cid`` maps lower-cased names to CID strings,
        ``cid_to_synonyms`` maps each CID string to the names (original
        casing) that resolved to it, in discovery order, and ``lookup_dict``
        maps each lower-cased name to the first synonym recorded for its CID.
    """
    entity_to_cid = {}
    cid_to_synonyms = {}
    for i, abstract in enumerate(abstracts):
        if i % 100 == 0:
            print('{} %'.format(round(i / len(abstracts) * 100, 2)))

        # Gather all named entities. Read the records once per abstract;
        # indexing ``doc.records`` inside the loop evaluated the attribute
        # again for every record.
        entities = []
        for record in Document(abstract).records:
            try:
                entities.append(record.serialize()['names'][0])
            except (KeyError, IndexError, TypeError):
                # No 'names' entry, or an empty/None one: nothing to collect.
                continue

        # Gather synonyms for each CID
        for entity in entities:
            key = entity.lower()
            if key in entity_to_cid:
                continue
            try:
                # The lookup is the step that can time out, so it sits inside
                # the try; previously the handler only wrapped attribute
                # access on an already-fetched result.
                compounds = pcp.get_compounds(entity, 'name')
            except TimeoutError:
                continue
            if not compounds:
                continue
            cid = str(compounds[0].cid)
            entity_to_cid[key] = cid
            cid_to_synonyms.setdefault(cid, []).append(entity)

    # Build lookup table for each named entity with synonyms
    lookup_dict = {}
    for entity, cid in entity_to_cid.items():
        lookup_dict[entity] = cid_to_synonyms[cid][0]

    return lookup_dict, entity_to_cid, cid_to_synonyms