Example #1
    def test_cems_stoplist(self):
        """Test Document cems removes words in stoplist, ncluding words entirely made up of ignore prefix/suffix.

        GitHub issue #12.
        """
        self.assertEqual([Span('benzene', 0, 7)],
                         Document('benzene-aromatic').cems)
        self.assertEqual([], Document('-aromatic').cems)
        self.assertEqual([], Document('non-aromatic').cems)
Example #2
 def test_parse_control_character(self):
     """Test control character in text is handled correctly."""
     # The parser doesn't like control characters because it is built on lxml, so the text must be XML compatible.
     d = Document(
         Paragraph('Yielding 2,4,6-trinitrotoluene,\n m.p. 20 \x0eC.'))
     expected = [{'names': ['2,4,6-trinitrotoluene']}]
     self.assertEqual(expected, d.records.serialize())
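As a side note to the comment above: a small, hypothetical pre-cleaning helper (not part of ChemDataExtractor) could strip C0 control characters before the text reaches the lxml-backed parser, using only the Document/Paragraph API shown in this example:

import re

from chemdataextractor.doc import Document, Paragraph

def strip_control_chars(text):
    # Drop C0 control characters except tab, newline and carriage return,
    # so the string stays XML-compatible for the lxml-based parser.
    return re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', text)

d = Document(Paragraph(strip_control_chars('Yielding 2,4,6-trinitrotoluene,\n m.p. 20 \x0eC.')))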
Example #3
    def get_sentence(self, paragraph, para_id, specials, refs, sec_title=''):
        sents = []

        elements = self.els_xml_reader._parse_element(paragraph,
                                                      specials=specials,
                                                      refs=refs)
        doc = Document(*elements)
        for para in doc.paragraphs:  # Document object doesn't have direct access to sentences.
            for sent in para.sentences:
                token = []
                start = []
                end = []
                for tok in sent.tokens:
                    token.append(tok.text)
                    start.append(tok.start - sent.start)
                    end.append(tok.end - sent.start)

                pos = sent.pos_tags

                cems = []
                for cem in sent.cems:
                    cems.append([
                        cem.text, cem.start - sent.start, cem.end - sent.start
                    ])

                sents.append({
                    'section_title': sec_title,
                    'para_id': para_id,
                    'sent': sent.text,
                    'token_pos': list(zip(token, start, end, pos)),
                    'chemical_entity': cems
                })

        return sents
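The offsets collected above are sentence-relative (token start minus sentence start). A minimal standalone sketch of the same idea, assuming only the chemdataextractor.doc API used in this example:

from chemdataextractor.doc import Document, Paragraph

doc = Document(Paragraph('Benzene was dissolved in toluene and heated.'))
for para in doc.paragraphs:  # Document has no direct sentence access
    for sent in para.sentences:
        for tok in sent.tokens:
            # offsets relative to the start of the sentence
            print(tok.text, tok.start - sent.start, tok.end - sent.start)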
Example #4
 def test_document_usage(self):
     """Test RscHtmlReader used via Document.from_file."""
     fname = '10.1039_C6OB02074G.html'
     f = io.open(
         os.path.join(os.path.dirname(__file__), 'data', 'rsc', fname),
         'rb')
     d = Document.from_file(f, readers=[RscHtmlReader()])
     self.assertEqual(len(d.elements), 60)
Example #5
 def test_document_usage(self):
     """Test UsptoXmlReader used via Document.from_file."""
     fname = 'US06840965B2.xml'
     f = io.open(
         os.path.join(os.path.dirname(__file__), 'data', 'uspto', fname),
         'rb')
     d = Document.from_file(f, readers=[UsptoXmlReader()])
     self.assertEqual(len(d.elements), 112)
Example #6
 def test_document_usage(self):
     """Test AcsHtmlReader used via Document.from_file."""
     fname = 'acs.jmedchem.6b00723.html'
     f = io.open(
         os.path.join(os.path.dirname(__file__), 'data', 'acs', fname),
         'rb')
     d = Document.from_file(f, readers=[AcsHtmlReader()])
     self.assertEqual(len(d.elements), 198)
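Examples #4 to #6 differ only in the reader class; as a hedged consolidation sketch, Document.from_file also accepts several readers and tries them in turn (the path below is illustrative):

import io
import os

from chemdataextractor.doc import Document
from chemdataextractor.reader import AcsHtmlReader, RscHtmlReader, UsptoXmlReader

with io.open(os.path.join('data', 'paper.html'), 'rb') as f:  # illustrative path
    d = Document.from_file(f, readers=[RscHtmlReader(), AcsHtmlReader(), UsptoXmlReader()])
print(len(d.elements))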
Example #7
 def extract_chemdata(self, text):
     doc = Document(text)
     cems = doc.cems
     chem_mentions = doc.records.serialize()
     materials = []
     for chem in chem_mentions:
         if 'names' in chem.keys():
             materials.append(chem["names"])
     return materials
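A short usage sketch for the helper above; `extractor` is a hypothetical instance of whatever class defines extract_chemdata, and the sample sentence is illustrative:

materials = extractor.extract_chemdata('The film was spin-coated from a solution of benzene and toluene.')
# expected shape: a list of name lists, e.g. [['benzene'], ['toluene']], depending on the CDE models
print(materials)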
Example #8
    def get_sentence(self,
                     elem,
                     para_id_prefix,
                     start_para_idx,
                     specials,
                     refs,
                     sec_title=''):
        sents = []

        elements = self.rsc_html_reader._parse_element(elem,
                                                       specials=specials,
                                                       refs=refs)
        doc = Document(*elements)
        para_idx = start_para_idx
        for para in doc.paragraphs:
            for sent in para.sentences:
                token = []
                start = []
                end = []
                for tok in sent.tokens:
                    token.append(tok.text)
                    start.append(tok.start - sent.start)
                    end.append(tok.end - sent.start)

                pos = sent.pos_tags

                cems = []
                for cem in sent.cems:
                    cems.append([
                        cem.text, cem.start - sent.start, cem.end - sent.start
                    ])

                sents.append({
                    'section_title': sec_title,
                    'para_id': para_id_prefix + '_para_' + str(para_idx),
                    'sent': sent.text,
                    'token_pos': list(zip(token, start, end, pos)),
                    'chemical_entity': cems
                })
            para_idx += 1

        return para_idx, sents
Example #9
    def parse(self, html_file):
        """
        TODO: clean body texts. 02-11-2020
        Unlike other XML files, the tags for body text are not quite consistent.
        For now, use CDE's reader to get the body text, but it returns not only the body text but also preceding text such as the abstract.
        CDE's scraper can extract only body text (scrape.paragraphs), but those are plain strings rather than Sentence instances.
        -> Exclude the abstract and its preceding text from the body text by locating the last sentence of the abstract within the body text. 02-18-2020
        """
        htmlstring = open(html_file).read()
        '''
        Remove the encoding declaration since it causes the following error when Selector reads the string:
            -> ValueError: Unicode strings with encoding declaration are not supported. Please use bytes input or XML fragments without declaration.
        '''
        htmlstring = re.sub(r'<\?xml.*\?>', '', htmlstring)

        tree = etree.parse(html_file, self.html_parser)
        root = tree.getroot()

        # clean xml and extract essential elements.
        specials, refs = self.rsc_html_reader.preprocess(root)

        document = html.fromstring(htmlstring)

        # this title is only used to filter out the following error; the title from scrape below is used for the JSON file.
        title = document.findtext('.//title')
        if title.strip() == 'RSC - Page load error':  # e.g., 101039c1jm11358e.html
            logger.error('RSC - Page load error')
            return None

        abstract_element = document.find_class("abstract")
        abstract = []
        start_para_idx = 1
        for abs_elem in abstract_element:
            para_id_prefix = 'abs'
            start_para_idx, sents = self.get_sentence(abs_elem, para_id_prefix,
                                                      start_para_idx, specials,
                                                      refs)
            abstract.extend(sents)
        ''' Body Text '''
        f = open(html_file, 'rb')
        doc = Document.from_file(f, readers=[self.rsc_html_reader])

        body_text = []
        sec_title = ''
        para_id_prefix = 'body'
        para_idx = 1
        for elem in doc.elements:
            if isinstance(elem, Heading):
                sec_title = elem.text
            elif isinstance(elem, Paragraph):
                for sent in elem.sentences:
                    token = []
                    start = []
                    end = []
                    for tok in sent.tokens:
                        token.append(tok.text)
                        start.append(tok.start - sent.start)
                        end.append(tok.end - sent.start)

                    pos = sent.pos_tags

                    cems = []
                    for cem in sent.cems:
                        cems.append([
                            cem.text, cem.start - sent.start,
                            cem.end - sent.start
                        ])

                    body_text.append({
                        'section_title': sec_title,
                        'para_id': para_id_prefix + '_para_' + str(para_idx),
                        'sent': sent.text,
                        'token_pos': list(zip(token, start, end, pos)),
                        'chemical_entity': cems
                    })
                para_idx += 1

        # Exclude abstract and its preceding text from body text. 02-18-2020
        cut_off = -1
        #if len(abstract) != 0 and len(body_text) != 0 and all(elem in body_text for elem in abstract): # Sometimes, abstract and body have different whitespaces. e.g., 101039c005501h.json
        if len(abstract) != 0 and len(body_text) != 0:
            if len(abstract) < 3:  # debugging
                print('Abstract is a single sentence!!')
            for idx in range(len(body_text)):
                # compare only the sentence text with whitespace stripped; abstract and body sometimes differ in whitespace, e.g., 101039c005501h.json
                # comparing the two sentences preceding the last one would increase accuracy; some abstracts are a single sentence, e.g., 101039c2cp23070d.html
                #if abstract[-1]['sent'].strip() == body_text[idx]['sent'].strip() and abstract[-2]['sent'].strip() == body_text[idx-1]['sent'].strip() and abstract[-3]['sent'].strip() == body_text[idx-2]['sent'].strip():
                # ignore sentences with no alphabetic characters, such as '.' or '\n'
                if len(re.sub(r"[^a-zA-Z]", '', abstract[-1]['sent'])) > 0:
                    if re.sub(r'\s+', '', abstract[-1]['sent']) == re.sub(r'\s+', '', body_text[idx]['sent']):
                        cut_off = idx + 1
                        break

        if cut_off != -1:
            body_text = body_text[cut_off:]
        ''' Figures '''
        sel = Selector.from_text(htmlstring)
        scrape = RscHtmlDocument(sel)

        figures = []
        for fig in scrape.figures:
            id = fig.reference if fig.reference is not None else fig.label
            label = fig.label

            if id is None:  # e.g., 101039b918103b.html has an image having only url information.
                print('figure id is none.')
                continue

            fig_file = html_file.rsplit('/', 1)[0] + '/' + fig.url.rsplit('/', 1)[1]

            caption = []
            #cap = Text(fig.caption)
            #print(cap.sentences)
            if fig.caption is not None:
                for sent in Text(fig.caption):
                    token = []
                    start = []  # start offset
                    end = []  # end offset
                    for tok in sent.tokens:
                        token.append(tok.text)
                        start.append(tok.start - sent.start)
                        end.append(tok.end - sent.start)

                    pos = sent.pos_tags

                    cems = []
                    for cem in sent.cems:
                        cems.append([
                            cem.text, cem.start - sent.start,
                            cem.end - sent.start
                        ])

                    caption.append({
                        'sent': sent.text,
                        'token_pos': list(zip(token, start, end, pos)),
                        'chemical_entity': cems
                    })

            figures.append({
                'fig_id': id,
                'label': label,
                'caption': caption,
                'fig_file': fig_file
            })

        data = {}
        data['uid'] = scrape.doi
        data['publisher'] = scrape.publisher + (
            ' - ' + scrape.journal if scrape.journal is not None else '')
        data['type'] = 'journal' if scrape.journal is not None else ''
        data['title'] = scrape.title
        data['year'] = ''
        if scrape.published_date is not None:
            data['year'] = scrape.published_date.strftime("%Y")
        elif scrape.online_date is not None:
            data['year'] = scrape.online_date.strftime("%Y")
        data['author'] = scrape.authors
        data['keywords'] = []
        data['abstract'] = abstract
        data['body_text'] = body_text
        data['figures'] = figures

        # debug
        '''
        if data['year'] == '':
            print('year is unknown.')
            input('enter')

        if data['type'] == '':
            print('journal is unknown!!')  # e.g., 101039c5md00579e.html has no journal value and only an abstract.
            input('enter')
        '''

        # write data to file
        output_filename = html_file.replace('.html', '.json')
        if output_filename == html_file:
            logger.error('>> HTML file does NOT exist!!')
            sys.exit()

        with open(output_filename, 'w') as outfile:
            json.dump(data, outfile)

        return scrape.doi
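The abstract cut-off above reduces to a whitespace-insensitive sentence comparison. A standalone sketch of that logic, with hypothetical helper names, under the same assumptions as the code above:

import re

def same_sentence(a, b):
    # Compare two sentences while ignoring all whitespace differences.
    return re.sub(r'\s+', '', a) == re.sub(r'\s+', '', b)

def find_cut_off(abstract_sents, body_sents):
    # Index in body_sents just after the last abstract sentence, or -1 if not found.
    last = abstract_sents[-1]
    if not re.sub(r'[^a-zA-Z]', '', last):  # skip purely non-alphabetic sentences
        return -1
    for idx, sent in enumerate(body_sents):
        if same_sentence(last, sent):
            return idx + 1
    return -1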
Example #10
    def parse(self, xml_file):
        '''
        The document encoding is ISO-8859-1; if resolve_entities is set to True, an XMLSyntaxError occurs.
        '''
        xml_parser = etree.XMLParser(encoding='ISO-8859-1',
                                     resolve_entities=False)
        tree = etree.parse(xml_file, xml_parser)

        #try:
        #	tree = etree.parse(xml_file, self.parser)
        #except etree.XMLSyntaxError:
        #	pass

        # debug
        docinfo = tree.docinfo
        encoding_info = docinfo.encoding

        if encoding_info != 'ISO-8859-1':
            print(encoding_info)
            input("Press Enter to continue...")

        root = tree.getroot()

        title = root.find('.//title_full')
        title = etree.tostring(title).decode("utf-8")  # retrieve the original value to show in the TDM webpage.
        title = title.split(">", 1)[1]  # TODO: find a better way to remove the top tag.
        title = title.rsplit("</", 1)[0]
        doi = root.findtext('.//doi')

        # year checking priority: (1) date_history[epub] (2) date_cover (3) date_online[header]
        year = None
        year_elem = root.find('.//date_history')
        if year_elem is not None:
            year = year_elem.get('epub')
            if year is not None:
                year = year.split('-')[0]
        if year is None:
            year_elem = root.find('.//date_cover')
            if year_elem is not None:
                year = root.findtext('.//date_cover')
                year = year.split('-')[0]
        if year is None:
            year_elem = root.find('.//date_online')
            if year_elem is not None:
                year = year_elem.get('header')
                if year is not None:
                    year = year.split('-')[0]

        authors = []
        #for author in root.findall('.//author_granular'):
        #	given = author.findtext('given')
        #	surname = author.findtext('surname')
        #	authors.append(given + ' ' + surname)
        for author in root.findall('.//author'):  # not every article has an <author_granular> tag
            authors.append(author.text)

        abstract = []
        abstract_element = root.find('.//header_text')  # not every article has [heading="Abstract"]
        #abstract = ''
        #if abstract_element is not None:
        #	abstract = ' '.join(abstract_element.itertext())
        #	abstract = abstract.strip()

        if abstract_element is not None:
            # TODO: Fix the CDE related error - AttributeError: 'cython_function_or_method' object has no attribute 'lower'
            # e.g., jopt13_9_090201.pdf, jopt13_11_114001.pdf
            try:
                elements = self.nxml_reader._parse_element(abstract_element)
                doc = Document(*elements)
                for para in doc.paragraphs:
                    for sent in para.sentences:
                        #print(sent.tokens)
                        #print(sent.pos_tagged_tokens)
                        #abstract.append(sent.serialize())
                        token = []
                        start = []
                        end = []
                        for tok in sent.tokens:
                            token.append(tok.text)
                            start.append(tok.start - sent.start)
                            end.append(tok.end - sent.start)

                        pos = sent.pos_tags

                        cems = []
                        for cem in sent.cems:
                            cems.append([
                                cem.text, cem.start - sent.start,
                                cem.end - sent.start
                            ])

                        abstract.append({
                            'sent': sent.text,
                            #'tokens': sent.tokens,
                            #'tokens': sent.tagged_tokens,
                            #'tags': sent.tags,
                            #'tokens': sent.raw_tokens,
                            #'pos_tagged_tokens': sent.pos_tagged_tokens,
                            #'ner_tagged_tokens': sent.ner_tagged_tokens
                            'token_pos': list(zip(token, start, end, pos)),
                            'chemical_entity': cems
                        })
            except Exception:
                print('>> Error:', xml_file)

        copyright_text = root.findtext('.//copyright_text')  # used later to remove the copyright text from the text extracted from the PDF.

        if doi is None:
            input("Press Enter to continue...")

        logger.debug(  #f'\n>>> Journal Title: {journal_title}\n'
            #f'>>> Publisher: {publisher}\n'
            #f'>>> Article Type: {article_type}\n'
            f'\n>>> Encoding: {encoding_info}\n'
            f'>>> Title: {title}\n'
            f'>>> Year: {year}\n'
            f'>>> UID: {doi}\n'
            f'>>> Authors: {authors}\n'
            f'>>> Abstract:\n{abstract}\n'
            f'>>> CopyRight:\n{copyright_text}\n')
        #f'>>> Keywords: {keywords}\n')
        #f'>>> Body Text:\n{body_text}\n')

        metadata = {}
        metadata['uid'] = doi
        metadata['publisher'] = 'IOP'
        metadata['type'] = 'journal-article'  # TODO: exclude non-journal articles.
        metadata['title'] = title
        metadata['year'] = year
        metadata['author'] = authors
        metadata['abstract'] = abstract

        return [metadata, copyright_text]
Example #11
    writer.writerow([
        "number", "Source", "compound_from_rowheader", 'compound_from_caption',
        'refractive_index_value', 'row_headers', 'specifier', 'caption',
        'wavelength_from_caption', 'wavelength_from_headers'
    ])

s = 'the refractive index is measured at 485 nm'
#print (get_wavelength_fromcaption(s))

count = 0
if True:
    for i in range(0, 168999):
        path = r'F:\el_refractive_index_volumn_2000-2020\{}.xml'.format(i)
        try:
            f = open(path, 'rb')
            d = Document.from_file(f)
            DOI = str(d.metadata.serialize())
            # f = open(path, 'rb')
            # f1 = open(path, 'rb').read()
            # d = Document.from_file(f)
            # root = ET.fromstring(f1)
            # Journal = 'None'
            # DOI = 'None'
            # for child in root:
            #     for cchild in child:
            #         if cchild.tag == '{http://prismstandard.org/namespaces/basic/2.0/}publicationName':
            #             Journal = cchild.text[:]
            #         elif cchild.tag == '{http://prismstandard.org/namespaces/basic/2.0/}doi':
            #             DOI = cchild.text[:]

            for t in d.tables:
Example #12
from chemdataextractor.doc import Document, Heading, Paragraph
from chemdataextractor.scrape import Selector
from chemdataextractor.scrape.pub.rsc import RscHtmlDocument
from chemdataextractor.reader import AcsHtmlReader, RscHtmlReader, PdfReader
import os
import sys
import csv

with open('file_name', 'rb') as file:
    doc = Document.from_file(file)

# initialise with an empty dictionary
compoundInfo = {}
# Produce the list of dictionaries
doc_records = doc.records.serialize()
# filter to only ratio information
ratio_doc_records = [record for record in doc_records if 'ratio' in record]

# using a loop extract the ratio information within ratio_doc_records
for record in ratio_doc_records:
    for key, value in record.items():
        compoundInfo[key] = value
        # Only keep name and ratio information; don't show any other attributes
        if key in ('nmr_spectra', 'ir_spectra', 'melting_points', 'labels', 'roles'):
            del compoundInfo[key]

# Open a new CSV file and append this information
    with open('csv_filename', 'a', newline='') as f:
def annotate(doi_pmid, text):
    global count
    global t0

    t1 = time.time()
    if (count % 10 == 0):
        with open("{}.log".format(out_name), "a") as f:
            f.write("\n")
            f.write("{} out of {} completed\n".format(count,
                                                      len(text_files.keys())))
            f.write("elapsed time: " + str(time.time() - start_time) + "\n")

        igem.save_json(cache_name, smiles_cache)

    print()
    print("{} out of {} completed".format(count, len(text_files.keys())))
    print(t1 - t0)

    t0 = t1
    try:
        sentences = [
            p.sentences for p in Document.from_string(text.encode())
            if hasattr(p, 'sentences')
        ]  # this has character-based indices
    except Exception:
        sentences = [[]]
    sentence_found = []
    starts = []
    ends = []
    indices = []
    tagged = []
    chemicals_found = []
    bio_entities = []
    bio_entities_with_pos = []

    names_found = []
    smiles_found = []
    names_and_smiles = []

    sentences = sentences[0]  # weird nesting from CDE, do not change
    tot = time.time()
    times = 0
    span_total = 0
    successful_spans = 0

    for i in range(len(sentences)):  #TODO: change this to all sentences
        s = sentences[i]
        t_s_0 = time.time()

        # Part of Speech Tagger (used later for NLP)
        try:
            pos = (s.pos_tagged_tokens)
        except Exception as e:
            pos = cpt.tag(s.split())

        spans = s.cems  # generating here for enzyme finding
        span_names = [c.text for c in spans]

        # Enzymes in sentence (using regex)
        # attempt to get full enzyme names:
        enzyme_names = []
        enzyme_names_locs = []
        for i_w in range(len(pos)):
            word = pos[i_w][0]
            for m in re.finditer(r'[a-zA-Z]+ase\b', word):
                enzyme = m.group(0)
                i_l = i_w
                while i_l > 0:
                    prev_word = pos[i_l][0]
                    prev_pos = pos[i_l][1]
                    if prev_word in span_names:
                        enzyme = prev_word + " " + enzyme
                    elif prev_pos not in ":;{}|,./<>?!":
                        break
                    i_l -= 1
                enzyme_names.append(enzyme)
                enzyme_names_locs.append((enzyme, i_l, i_w))

        spans_sent = []
        smiles_sent = []
        names_sent = []
        names_smiles_sent = []
        for r in range(len(spans)):
            span = spans[r]
            c = span.text

            # Try to get SMILES for the entire string; if that fails, handle the case where c is a conglomerate of chemicals separated by spaces.
            name_smiles_tuples = get_smiles(s, c)
            print(name_smiles_tuples)
            print()

            # Ignore chemical if not found
            if not name_smiles_tuples or (len(name_smiles_tuples) == 1
                                          and not name_smiles_tuples[0][0]):
                continue
            successful_spans += len(name_smiles_tuples)

            for name, smiles in name_smiles_tuples:
                if name:
                    span_dict = {
                        "text": name,
                        "start": span.start,
                        "end": span.end,
                        "smiles": smiles
                    }

                    # Indexing through pos tokens to find chemical entities
                    p = 0
                    while p < len(pos):
                        token = pos[p][0]
                        if token == span.text:
                            span_dict["pos"] = pos[p][1]
                            break
                        p += 1
                    spans_sent.append(span_dict)
                    names_sent.append(name)
                    smiles_sent.append(smiles)
                    names_smiles_sent.append((name, smiles))

        # Leave for loop and add entries for each sentence in a given literature to lists
        sentence_found.append(s.text)
        chemicals_found.append(spans_sent)

        names_found.append(
            ", ".join(names_sent)
        )  # two commas and a space for redundancy, since IUPAC has commas
        smiles_found.append(", ".join(smiles_sent))
        names_and_smiles.append(names_smiles_sent)

        starts.append(s.start)
        ends.append(s.end)
        indices.append(i)
        bio_entities.append(", ".join(enzyme_names))
        bio_entities_with_pos.append(enzyme_names_locs)
        tagged.append(pos)

        if len(spans) > 0:
            times += time.time() - t_s_0
            span_total += len(spans)
            #print(time.time()-t_s_0)

    # Create a dataframe with  annotations from a given literature.
    print()
    print("Average time per span (one identified chemical entity): " +
          str(times / (span_total + 0.01)))
    t_an = time.time()
    print("Time for all sentences in text: " + str(t_an - tot))
    print("Successfully classified span percent in paper: " +
          str(successful_spans / (span_total + 0.01)))

    # put all lists into a dictionary and coerce to dataframe! good riddance
    annotations = {
        "sentence": sentence_found,
        "start": starts,
        "end": ends,
        "indices": indices,
        "sentence_pos": tagged,
        "enzymes": bio_entities,
        "enzyme_locations": bio_entities_with_pos,
        "chemical_entities_full": chemicals_found,
        "chemical_names": names_found,
        "chemical_smiles": smiles_found,
        "name_smile_tuples": names_and_smiles
    }
    annots_csv = pd.DataFrame(annotations)

    annots_csv["lit_id"] = doi_pmid

    # Reorder our dataframe.
    annots_csv = annots_csv[[
        "lit_id", "indices", "start", "end", "sentence", "sentence_pos",
        "enzymes", "enzyme_locations", "chemical_entities_full",
        "chemical_names", "chemical_smiles", "name_smile_tuples"
    ]]

    # Add the dataframe to our csv_file, appending if it exists and creating a new one if not.
    if os.path.isfile(csv_file):
        annots_csv.to_csv(csv_file, mode='a', header=False, index=False)
    else:
        annots_csv.to_csv(csv_file, index=False)