Example #1
    def test_document_usage(self):
        """Test UsptoXmlReader used via Document.from_file."""
        fname = 'US06840965B2.xml'
        with io.open(
                os.path.join(os.path.dirname(__file__), 'data', 'uspto', fname),
                'rb') as f:
            d = Document.from_file(f, readers=[UsptoXmlReader()])
        self.assertEqual(len(d.elements), 112)
Example #2
    def test_document_usage(self):
        """Test RscHtmlReader used via Document.from_file."""
        fname = '10.1039_C6OB02074G.html'
        with io.open(
                os.path.join(os.path.dirname(__file__), 'data', 'rsc', fname),
                'rb') as f:
            d = Document.from_file(f, readers=[RscHtmlReader()])
        self.assertEqual(len(d.elements), 60)
Example #3
    def test_document_usage(self):
        """Test AcsHtmlReader used via Document.from_file."""
        fname = 'acs.jmedchem.6b00723.html'
        with io.open(
                os.path.join(os.path.dirname(__file__), 'data', 'acs', fname),
                'rb') as f:
            d = Document.from_file(f, readers=[AcsHtmlReader()])
        self.assertEqual(len(d.elements), 198)
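
All three tests follow the same pattern: open the source file in binary mode and hand Document.from_file an explicit reader list. A minimal standalone sketch of that pattern, assuming a local RSC HTML file of your own (the 'article.html' path is hypothetical):

from chemdataextractor import Document
from chemdataextractor.reader import RscHtmlReader

# 'article.html' is a placeholder; substitute any RSC HTML article saved locally.
with open('article.html', 'rb') as f:
    doc = Document.from_file(f, readers=[RscHtmlReader()])

# Each element is a structural unit such as a Title, Heading, Paragraph, or Table.
for element in doc.elements:
    print(type(element).__name__)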
Example #4
    def parse(self, html_file):
        """
        TODO: clean body texts. 02-11-2020
        Unlike other XML files, the tags for body text are not entirely consistent.
        For now, use CDE's reader to get the body text; it returns not only the body
        text but also preceding text such as the abstract. CDE's scraper can extract
        only body text (scrape.paragraphs), but those are plain strings rather than
        Sentence instances.
        -> Exclude the abstract and its preceding text from the body text by locating
        the last sentence of the abstract within the body text. 02-18-2020
        """
        with open(html_file) as fin:
            htmlstring = fin.read()
        # Remove the encoding declaration, since it causes the following error when
        # Selector reads the string:
        #   ValueError: Unicode strings with encoding declaration are not supported.
        #   Please use bytes input or XML fragments without declaration.
        htmlstring = re.sub(r'<\?xml.*\?>', '', htmlstring)

        tree = etree.parse(html_file, self.html_parser)
        root = tree.getroot()

        # clean xml and extract essential elements.
        specials, refs = self.rsc_html_reader.preprocess(root)

        document = html.fromstring(htmlstring)

        # This title is only used to detect the page-load error below; the title from
        # the scrape result is what goes into the JSON file.
        title = document.findtext('.//title')
        if title is not None and title.strip() == 'RSC - Page load error':  # e.g., 101039c1jm11358e.html
            logger.error('RSC - Page load error')
            return None

        abstract_element = document.find_class("abstract")
        abstract = []
        start_para_idx = 1
        for abs_elem in abstract_element:  # avoid shadowing the builtin abs()
            para_id_prefix = 'abs'
            start_para_idx, sents = self.get_sentence(abs_elem, para_id_prefix,
                                                      start_para_idx, specials,
                                                      refs)
            abstract.extend(sents)
        ''' Body Text '''
        with open(html_file, 'rb') as f:
            doc = Document.from_file(f, readers=[self.rsc_html_reader])

        body_text = []
        sec_title = ''
        para_id_prefix = 'body'
        para_idx = 1
        for elem in doc.elements:
            if isinstance(elem, Heading):
                sec_title = elem.text
            elif isinstance(elem, Paragraph):
                for sent in elem.sentences:
                    token = []
                    start = []
                    end = []
                    for tok in sent.tokens:
                        token.append(tok.text)
                        start.append(tok.start - sent.start)
                        end.append(tok.end - sent.start)

                    pos = sent.pos_tags

                    cems = []
                    for cem in sent.cems:
                        cems.append([
                            cem.text, cem.start - sent.start,
                            cem.end - sent.start
                        ])

                    body_text.append({
                        'section_title': sec_title,
                        'para_id': para_id_prefix + '_para_' + str(para_idx),
                        'sent': sent.text,
                        'token_pos': list(zip(token, start, end, pos)),
                        'chemical_entity': cems,
                    })
                para_idx += 1

        # Exclude abstract and its preceding text from body text. 02-18-2020
        cut_off = -1
        if len(abstract) != 0 and len(body_text) != 0:
            if len(abstract) < 3:  # debugging
                print('Abstract is a single sentence!!')
            for idx in range(len(body_text)):
                # Compare sentences with all whitespace removed, since the abstract and
                # the body sometimes differ only in whitespace (e.g., 101039c005501h.json).
                # Skip sentences with no alphabetic characters, such as '.' or '\n'.
                # A stricter match on the two preceding sentences was also tried, but
                # some abstracts are a single sentence (e.g., 101039c2cp23070d.html).
                if len(re.sub(r"[^a-zA-Z]", '', abstract[-1]['sent'])) > 0:
                    if re.sub(r'\s+', '', abstract[-1]['sent']) == re.sub(
                            r'\s+', '', body_text[idx]['sent']):
                        cut_off = idx + 1
                        break

        if cut_off != -1:
            body_text = body_text[cut_off:]
        ''' Figures '''
        sel = Selector.from_text(htmlstring)
        scrape = RscHtmlDocument(sel)

        figures = []
        for fig in scrape.figures:
            fig_id = fig.reference if fig.reference is not None else fig.label
            label = fig.label

            if fig_id is None:  # e.g., 101039b918103b.html has an image with only URL information.
                print('figure id is none.')
                continue

            fig_file = html_file.rsplit('/', 1)[0] + '/' + fig.url.rsplit(
                '/', 1)[1]

            caption = []
            if fig.caption is not None:
                for sent in Text(fig.caption):
                    token = []
                    start = []  # start offset
                    end = []  # end offset
                    for tok in sent.tokens:
                        token.append(tok.text)
                        start.append(tok.start - sent.start)
                        end.append(tok.end - sent.start)

                    pos = sent.pos_tags

                    cems = []
                    for cem in sent.cems:
                        cems.append([
                            cem.text, cem.start - sent.start,
                            cem.end - sent.start
                        ])

                    caption.append({
                        'sent': sent.text,
                        'token_pos': list(zip(token, start, end, pos)),
                        'chemical_entity': cems,
                    })

            figures.append({
                'fig_id': fig_id,
                'label': label,
                'caption': caption,
                'fig_file': fig_file
            })

        data = {}
        data['uid'] = scrape.doi
        data['publisher'] = scrape.publisher + (
            ' - ' + scrape.journal if scrape.journal is not None else '')
        data['type'] = 'journal' if scrape.journal is not None else ''
        data['title'] = scrape.title
        data['year'] = ''
        if scrape.published_date is not None:
            data['year'] = scrape.published_date.strftime("%Y")
        elif scrape.online_date is not None:
            data['year'] = scrape.online_date.strftime("%Y")
        data['author'] = scrape.authors
        data['keywords'] = []
        data['abstract'] = abstract
        data['body_text'] = body_text
        data['figures'] = figures

        # debug
        # if data['year'] == '':
        #     print('year is unknown.')
        #     input('enter')
        #
        # if data['type'] == '':
        #     # e.g., 101039c5md00579e.html has no journal value and only an abstract.
        #     print('journal is unknown!!')
        #     input('enter')

        # write data to file
        output_filename = html_file.replace('.html', '.json')
        if output_filename == html_file:
            logger.error('>> Input file is not an .html file!!')
            sys.exit()

        with open(output_filename, 'w') as outfile:
            json.dump(data, outfile)

        return scrape.doi
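
A usage sketch for this method. The enclosing class name and the sample path are assumptions, since the method plainly relies on self.html_parser, self.rsc_html_reader, and self.get_sentence() being set up elsewhere in the class:

# Minimal sketch, assuming parse() belongs to a class (hypothetically named
# RscHtmlParser here) that initialises those attributes in __init__.
parser = RscHtmlParser()
doi = parser.parse('downloads/101039c6ob02074g.html')  # hypothetical path
if doi is None:
    print('Page load error; no JSON written.')
else:
    print('Wrote JSON for', doi)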
Example #5
    writer.writerow([
        'number', 'Source', 'compound_from_rowheader', 'compound_from_caption',
        'refractive_index_value', 'row_headers', 'specifier', 'caption',
        'wavelength_from_caption', 'wavelength_from_headers'
    ])


count = 0
for i in range(0, 168999):
    path = r'F:\el_refractive_index_volumn_2000-2020\{}.xml'.format(i)
    try:
        with open(path, 'rb') as f:
            d = Document.from_file(f)
        DOI = str(d.metadata.serialize())

        for t in d.tables:
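
The loop is truncated at the example boundary; a minimal sketch of one possible body, assuming each table's records are serialized the same way Document.records is in Example #6 (the 'refractive_index' filter is hypothetical):

for t in d.tables:
    for record in t.records:
        serialized = record.serialize()
        # Hypothetical filter: keep only records mentioning a refractive index.
        if 'refractive_index' in str(serialized):
            count += 1
            print(DOI, serialized)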
Example #6
from chemdataextractor.doc import Document, Heading, Paragraph
from chemdataextractor.scrape import Selector
from chemdataextractor.scrape.pub.rsc import RscHtmlDocument
from chemdataextractor.reader import AcsHtmlReader, RscHtmlReader, PdfReader
import os
import sys
import csv

with open('file_name', 'rb') as file:
    doc = Document.from_file(file)

# initialise with an empty dictionary
compoundInfo = {}
# Produce the list of dictionaries
doc_records = doc.records.serialize()
# filter to only ratio information
ratio_doc_records = [record for record in doc_records if 'ratio' in record]

# using a loop, extract the ratio information within ratio_doc_records
for record in ratio_doc_records:
    for key, value in record.items():
        # Only keep name and ratio information; skip the other attributes.
        if key in ('nmr_spectra', 'ir_spectra', 'melting_points', 'labels',
                   'roles'):
            continue
        compoundInfo[key] = value

    # Open a new CSV file and append this information
    with open('csv_filename', 'a', newline='') as f:
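
The example ends inside the with block. A minimal sketch of one way to finish it, assuming compoundInfo holds the kept fields for the current record; the csv.DictWriter usage and header handling are assumptions, not part of the original:

import csv

# Hypothetical completion: append one row per record, writing the header
# only while the file is still empty.
fieldnames = sorted(compoundInfo.keys())
with open('csv_filename', 'a', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    if f.tell() == 0:
        writer.writeheader()
    writer.writerow(compoundInfo)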