Example #1
 def test_melting_point_heading_salt(self):
     """Test extraction of melting point from a heading and paragraphs. Example taken from patent US06840965B2."""
     d = Document(
         Heading(
             'D. Synthesis of 4-Amino-2-(3-thienyl)phenol Hydrochloride'),
         Paragraph(
             '3 g (13.5 mmoles) of 4-nitro-2-(3-thienyl)phenol was dissolved in 40 mL of ethanol and hydrogenated at 25° C. in the presence of 600 mg of a palladium—active carbon catalyst (10%). After the theoretically required amount of hydrogen had been absorbed, the catalyst was filtered off. Following concentration in a rotary evaporator, the reaction mixture was poured onto 20 mL of cold diethyl ether. The precipitated product was filtered off and dried.'
         ),
         Paragraph(
             'This gave 1.95 g (75% of the theoretical) of 4-amino-2-(3-thienyl)phenol hydrochloride with a melting point of 130-132° C.'
         ))
     expected = [{
         'names': ['4-nitro-2-(3-thienyl)phenol']
     }, {
         'names': ['ethanol']
     }, {
         'names': ['palladium']
     }, {
         'names': ['carbon']
     }, {
         'names': ['hydrogen']
     }, {
         'names': ['diethyl ether']
     }, {
         'melting_points': [{
             'units': '°C',
             'value': '130-132'
         }],
         'names': [
             '4-Amino-2-(3-thienyl)phenol Hydrochloride',
             '4-amino-2-(3-thienyl)phenol hydrochloride'
         ],
         'roles': ['product']
     }]
     self.assertEqual(expected, d.records.serialize())
Example #2
    def tokenize(text, cems=False):
        if cems:
            # getting initial annotation
            cde_cem_starts = [cem.start for cem in Document(text).cems]
        else:
            cde_cem_starts = []

        # getting all tokens
        cde_p = Paragraph(text)
        all_tokens = cde_p.tokens
        pos_tokens = cde_p.pos_tagged_tokens  # part of speech tagger
        # building the array for annotation
        tokens = []
        for row_idx, sentence in enumerate(all_tokens):
            tokens.append([])
            for idx, elem in enumerate(sentence):
                tokens[row_idx].append({
                    "id": "token-" + str(elem.start) + "-" + str(elem.end),
                    "annotation": 'CHM' if elem.start in cde_cem_starts else None,
                    "pos": pos_tokens[row_idx][idx][1],
                    "text": elem.text,
                    "start": elem.start,
                    "end": elem.end,
                })
        return tokens
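
A usage sketch (the input sentence is hypothetical): with cems=True, every token whose start offset coincides with a chemical entity mention detected by ChemDataExtractor is annotated as 'CHM'.

annotated = tokenize('2,4,6-Trinitrotoluene melts at 80 °C.', cems=True)
for sentence in annotated:
    for tok in sentence:
        print(tok['text'], tok['pos'], tok['annotation'])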
Example #3
def custom_tokenize(text,
                    lowercase=False,
                    deacc=False,
                    encoding='utf8',
                    errors="strict",
                    to_lower=False,
                    lower=False,
                    cde=True):
    text = to_unicode(text, encoding, errors=errors)
    lowercase = lowercase or to_lower or lower
    if lowercase:
        text = text.lower()
    if deacc:
        text = deaccent(text)
    if cde:
        text = " ".join(text.split())
        cde_p = Paragraph(text)
        # yield non-punctuation tokens, sentence by sentence
        for sentence in cde_p.tokens:
            for tok in sentence:
                if tok.text not in string.punctuation:
                    yield tok.text
    else:
        for match in PAT_ALPHABETIC.finditer(text):
            yield match.group()
Example #4
 def test_parse_control_character(self):
     """Test control character in text is handled correctly."""
     # The parser cannot handle control characters because it uses the lxml document model, so input must be XML-compatible.
     d = Document(
         Paragraph('Yielding 2,4,6-trinitrotoluene,\n m.p. 20 \x0eC.'))
     expected = [{'names': ['2,4,6-trinitrotoluene']}]
     self.assertEqual(expected, d.records.serialize())
Example #5
    def build_abbreviations_dict(self, materials_list, paragraphs):
        """

        :param materials_list: list of found materials entities
        :param paragraphs: list of paragraphs where look for abbreviations
        :return: dictionary abbreviation - corresponding entity
        """

        abbreviations_dict = {
            t: ''
            for t in materials_list
            if self.__is_abbreviation(t.replace(' ', ''))
        }
        not_abbreviations = list(
            set(materials_list) - set(abbreviations_dict.keys()))

        # first pass: resolve abbreviations against the other entities in the materials list
        for abbr in abbreviations_dict.keys():

            for material_name in not_abbreviations:
                if sorted(re.findall('[A-NP-Z]', abbr)) == sorted(
                        re.findall('[A-NP-Z]', material_name)):
                    abbreviations_dict[abbr] = material_name

        # second pass: for abbreviations still unresolved, search the paper text
        for abbr, name in abbreviations_dict.items():

            if name == '':

                sents = ' '.join([
                    s.text for p in paragraphs for s in Paragraph(p).sentences
                    if abbr in s.text
                ]).split(abbr)
                i = 0
                while abbreviations_dict[abbr] == '' and i < len(sents):
                    sent = sents[i]
                    for tok in sent.split(' '):
                        if sorted(re.findall('[A-NP-Z]', tok)) == sorted(
                                re.findall('[A-NP-Z]', abbr)):
                            abbreviations_dict[abbr] = tok
                    i = i + 1

        # third pass: resolve hyphenated abbreviations whose parts are themselves resolved abbreviations
        for abbr in abbreviations_dict.keys():
            parts = re.split('-', abbr)
            if all(p in abbreviations_dict for p in parts
                   ) and abbreviations_dict[abbr] == '' and len(parts) > 1:
                name = ''.join('(' + abbreviations_dict[p] + ')' + '-'
                               for p in parts).rstrip('-')
                abbreviations_dict[abbr] = name

        empty_list = [
            abbr for abbr, name in abbreviations_dict.items() if name == ''
        ]
        for abbr in empty_list:
            del abbreviations_dict[abbr]

        return abbreviations_dict
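
A usage sketch (the extractor instance and the behaviour of __is_abbreviation are assumptions). Note that the regex [A-NP-Z] deliberately skips O, so the capital-letter matching still works for oxygen-containing formulas:

materials = ['LFP', 'LiFePO4', 'carbon black']
paragraphs = ['LiFePO4 (LFP) electrodes were cast onto aluminium foil.']
abbreviations = extractor.build_abbreviations_dict(materials, paragraphs)
# expected: {'LFP': 'LiFePO4'} -- the capitals of 'LFP' match those of 'LiFePO4' once O is ignored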
Example #6
    def tokenize(self, text, split_oxidation=True, keep_sentences=True):
        """Converts a string to a list tokens (words) using a modified chemdataextractor tokenizer.

        Adds a few fixes for inorganic materials science, such as splitting common units from numbers
        and splitting the valence state.

        Args:
            text: input text as a string
            split_oxidation: if True, will split the oxidation state from the element, e.g. iron(II)
                will become iron (II), same with Fe(II), etc.
            keep_sentences: if False, will disregard the sentence structure and return tokens as a
                single list of strings. Otherwise returns a list of lists, each sentence separately.

        Returns:
            A list of strings if keep_sentences is False, otherwise a list of lists of strings, with
            each inner list corresponding to a single sentence.
        """
        def split_token(token, so=split_oxidation):
            """Processes a single token, in case it needs to be split up.

            There are 2 cases when the token is split: A number with a common unit, or an
            element with a valence state.

            Args:
                token: The string to be processed.
                so: If True, split the oxidation (valence) string. Units are always split.

            Returns:
                A list of strings.
            """
            elem_with_valence = self.ELEMENT_VALENCE_IN_PAR.match(
                token) if so else None
            nr_unit = self.NR_AND_UNIT.match(token)
            if nr_unit is not None and nr_unit.group(2) in self.SPLIT_UNITS:
                # Splitting the unit from number, e.g. "5V" -> ["5", "V"].
                return [nr_unit.group(1), nr_unit.group(2)]
            elif elem_with_valence is not None:
                # Splitting the element from its valence state, e.g. "Fe(II)" -> ["Fe", "(II)"].
                return [elem_with_valence.group(1), elem_with_valence.group(2)]
            else:
                return [token]

        cde_p = Paragraph(text)
        tokens = cde_p.tokens
        toks = []
        for sentence in tokens:
            if keep_sentences:
                toks.append([])
                for tok in sentence:
                    toks[-1] += split_token(tok.text, so=split_oxidation)
            else:
                for tok in sentence:
                    toks += split_token(tok.text, so=split_oxidation)
        return toks
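
A usage sketch (the tokenizer instance name is hypothetical, and the exact splits depend on SPLIT_UNITS and the class regexes):

tokenizer = MaterialsTokenizer()  # hypothetical class that defines this tokenize() method
toks = tokenizer.tokenize('Fe(II) oxide shows a plateau at 3.4V.', split_oxidation=True)
# roughly [['Fe', '(II)', 'oxide', 'shows', 'a', 'plateau', 'at', '3.4', 'V', '.']]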
Example #7
 def _find_variables(self, var, raw_text, mp):
     sents = Paragraph(raw_text).sentences
     i = 0
     values = []
     while len(values) == 0 and i < len(sents):
         sent = sents[i]
         try:
             values, mode = mp.get_stoichiometric_values(var, sent.text)
         except ValueError:
             pass
         i += 1
     return values
Example #8
def get_ents(paragraphs):

    # get extractor
    global extractor

    config_path = os.path.join(os.path.realpath('.'), '.env')
    load_dotenv(dotenv_path=config_path)
    models_dir = environ.get('MODELS_DIR')
    model_name = environ.get('ACTIVE_MODEL')
    model_dir = os.path.join(models_dir, model_name)

    if extractor is None:
        extractor = RxnExtractor(model_dir=model_dir)

    # Get sentences
    paragraphs = [Paragraph(p).sentences for p in paragraphs]
    sentences = []

    for par in paragraphs:
        for sent in par:
            sentences.append(str(sent))

    reactions = extractor.get_reactions(sentences)

    # Re-combine sentences into paragraphs
    extractions = []
    off = 0
    for par in paragraphs:
        tokens = []
        recs = []
        for j in range(off, off + len(par)):
            sent_react = reactions[j]
            for r in sent_react["reactions"]:
                r_offset = {}
                for k in r:
                    r_offset[k] = []
                    for e in r[k]:
                        if isinstance(e, (list, tuple)):
                            # shift the token indices in e[1:] by the paragraph offset
                            r_offset[k].append(
                                [tok_idx + len(tokens) for tok_idx in e[1:]])
                        elif isinstance(e, int):
                            r_offset[k].append(e + len(tokens))

                recs.append(r_offset)

            tokens.extend(sent_react['tokens'])

        extractions.append({'tokens': tokens, 'reactions': recs})
        off += len(par)

    return extractions
Example #9
 def tokenize(text):
     """
     Returns a 1d list of tokens using chemdataextractor tokenizer. Removes all punctuation but
     keeps the structure of sentences.
     """
     cde_p = Paragraph(text)
     tokens = cde_p.tokens
     toks = []
     for sentence in tokens:
         toks.append([])
         for tok in sentence:
             toks[-1].append(tok.text)
     return toks
Example #10
def append_cde_mols(text, mol_list, ptable):
    """
    This function uses ChemDataExtractor to find all molecules in a chunk of text.

    Parameters:
        text (str, required): The text to find molecules in

    Returns:
        list: list of all molecules in the text
    """
    para = Paragraph(text)
    new_mols = para.cems  # find all molecules in the text

    for mol in new_mols:
        mol_list.append(mol.text)
        print('appended ', mol)
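
A minimal call sketch (the sentence is hypothetical); the function mutates mol_list in place and does not use ptable:

mol_list = []
append_cde_mols('LiNi0.5Mn1.5O4 was mixed with carbon black.', mol_list, ptable=None)
print(mol_list)  # names of the chemical entity mentions found by ChemDataExtractor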
Example #11
def test_syn_order():
    '''Tests if function syn_order works'''

    paragraph = Paragraph(
        'After drying, the HTM was deposited by spin-coating a solution of spiro-MeOTAD, 4-tert-butylpyridine, \
        lithium bis(trifluoromethylsulphonyl)imide and tris(2-(1H-pyrazol-1-yl)-4-tert-butylpyridine)cobalt(iii) \
        bis(trifluoromethylsulphonyl)imide in chlorobenzene.\
        Annealing the as-deposited films at 100\u2009°C for 45\u2009min in the N2-filled glove box \
        before spin-coating the hole transporter enabled full crystallization of the perovskite, darkening \
        the colour and resulting in an apparent growth of the crystal features visible in the SEM image, \
        as shown in Extended Data Fig. 1.')
    vb_order, vb_dict = order.syn_order(paragraph)

    assert vb_order[0][0] == 'dry', 'First action identified is incorrect'
    assert vb_order[0][1] == 0, \
        'Sentence number where first action is identified is incorrect'
    assert vb_order[2][0] == 'anneal', 'Fails to identify capitalized word'
    assert ['anneal', 'spin-coat'] in vb_dict.values(), \
        'Fails to store all steps found in vb_dict output'
Example #12
    def tokenize(self, text, split_oxidation=True, keep_sentences=True):
        """
        Converts string to a list tokens (words) using chemdataextractor tokenizer, with a couple of fixes
        for inorganic materials science.
        Keeps the structure of sentences.
        :param text: input text as a string
        :param split_oxidation: if True, will split the oxidation state from the element, e.g. iron(II)
        will become iron (II), same with Fe(II), etc.
        :param keep_sentences: if False, will disregard the sentence structure and return tokens as a
        single list of strings. Otherwise returns a list of lists, each sentence separately.
        """
        def split_token(token, so=split_oxidation):
            """
            Process a single token, in case it needs to be split up. There are 2 cases:
            It's a number with a unit, or an element with a valence state.
            """
            elem_with_valence = self.ELEMENT_VALENCE_IN_PAR.match(
                token) if so else None
            nr_unit = self.NR_AND_UNIT.match(token)
            if nr_unit is not None and nr_unit.group(2) in self.SPLIT_UNITS:
                # splitting the unit from number, e.g. "5V" -> ["5", "V"]
                return [nr_unit.group(1), nr_unit.group(2)]
            elif elem_with_valence is not None:
                # splitting the element from its valence state, e.g. "Fe(II)" -> ["Fe", "(II)"]
                return [elem_with_valence.group(1), elem_with_valence.group(2)]
            else:
                return [token]

        cde_p = Paragraph(text)
        tokens = cde_p.tokens
        toks = []
        for sentence in tokens:
            if keep_sentences:
                toks.append([])
                for tok in sentence:
                    toks[-1] += split_token(tok.text, so=split_oxidation)
            else:
                for tok in sentence:
                    toks += split_token(tok.text, so=split_oxidation)
        return toks
Example #13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--annotation-file",
                        type=str,
                        required=True,
                        help="The full annotation file.")
    parser.add_argument("--output-file",
                        type=str,
                        required=True,
                        help="The product recognition data file.")
    args = parser.parse_args()

    # data = []
    with open(args.output_file, "w") as fw:
        with open(args.annotation_file, "r") as fr:
            reader = csv.DictReader(fr, delimiter=',')
            for row in tqdm(reader):
                text = row["description"]
                sents = [s.text for s in Paragraph(text)]
                # data += sents
                for sent in sents:
                    fw.write(f"{sent}\n")
Example #14
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--annotation-file",
                        type=str,
                        required=True,
                        help="The full annotation file.")
    parser.add_argument("--output-file",
                        type=str,
                        required=True,
                        help="The product recognition data file.")
    args = parser.parse_args()

    data = []
    with open(args.annotation_file, "r") as f:
        reader = csv.DictReader(f, delimiter=',')
        for row in reader:
            text = row["description"]
            tokens = text.split(' ')
            product = row["Products"]
            product_tags = row["Products-tag"]

            if product_tags == "":  # skip rows if products is empty
                assert (product == "")
                continue
            product_spans = make_spans(product_tags)

            # verify the correspondence between ${product} and tags
            val = " ".join([
                " ".join(tokens[start:end]) for (start, end) in product_spans
            ])
            assert (val == product)

            sents = [s.text.split(' ') for s in Paragraph(text)]
            # make sure the indexes don't change
            # assert(text == " ".join([" ".join(sent) for sent in sents]))
            if (text != " ".join([" ".join(sent) for sent in sents])):
                print("text not matched after tokenization, skip.")
                continue
            # get sentence boundaries
            sent_boundaries = [
                0,
            ]
            for sent in sents:
                offset = sent_boundaries[-1]
                sent_boundaries.append(offset + len(sent))

            def get_segment(interval, boundaries, window=3):
                """ """
                cxt = int((window - 1) / 2)
                start, end = interval
                for i, b in enumerate(boundaries):
                    if start >= b and (end - 1 < boundaries[i + 1]):
                        sent_id = i
                        break
                segment_start = boundaries[max(0, sent_id - cxt)]
                segment_end = boundaries[min(
                    len(boundaries) - 1, sent_id + cxt + 1)]
                return (segment_start, segment_end)

            for span in product_spans:  # for each product mention, create an individual instance
                seg_start, seg_end = get_segment(span,
                                                 sent_boundaries,
                                                 window=1)

                tagged_text = []
                for p, token in enumerate(tokens):
                    tagged_text.append([token, 'O'])

                # assign B/I- tags to each token
                for field in FOI:
                    fval = row[field]
                    fval_tags = row[field + "-tag"]
                    if fval_tags == "":
                        assert (fval == "")
                        continue
                    fval_spans = make_spans(fval_tags)

                    for fval_span in fval_spans:
                        start, end = fval_span
                        tagged_text[start][1] = f'B-{field}'
                        if end == start + 1:
                            continue
                        for i in range(start + 1, end):
                            tagged_text[i][1] = f'I-{field}'

                prod_span_start, prod_span_end = span
                tagged_text.insert(prod_span_start, ["[P1]", "O"])
                tagged_text.insert(prod_span_end + 1, ["[P2]", "O"])

                tagged_segment = tagged_text[seg_start:(seg_end + 2)]

                data.append(tagged_segment)

    with open(args.output_file, "w") as f:
        for tt in data:
            for token, tag in tt:
                f.write(f"{token}\t{tag}\n")
            f.write("\n")
Example #15
# coding: utf-8

# # Extracting Solubility

# In[ ]:

from chemdataextractor import Document
from chemdataextractor.model import Compound
from chemdataextractor.doc import Paragraph, Heading

# In[5]:

d = Document(
    Paragraph(
        u'The procedure was followed to yield a pale yellow solid Hippeastrine Hydrobromide. ( melting point of Amodiaquine is 137 °C)'
    ))

# In[6]:

d.records.serialize()

# In[37]:

from chemdataextractor.model import BaseModel, StringType, ListType, ModelType


class Solubility(BaseModel):
    value = StringType()
    units = StringType()
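
The notebook stops here; following the pattern used in Examples #17 and #20 below, the next step would presumably be to attach the new model to Compound:

Compound.solubilities = ListType(ModelType(Solubility))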

Example #16
from pprint import pprint

from chemdataextractor.doc import Paragraph
from operations_extractor import OperationsExtractor
# TextCleanUp is provided by a separate text clean-up utility (import omitted here)

oe = OperationsExtractor()
tp = TextCleanUp()

text_sents = [
    "LiNixMn2−xO4 (x=0.05,0.1,0.3,0.5) samples were prepared in either an air or an O2 atmosphere by solid-state reactions.",
    "Mixtures of Li2CO3,MnCO3, and NiO were heated at 700°C for 24 to 48 h with intermittent grinding.",
    "All these samples were cooled to room temperature at a controlled rate of 1°C/min."
]

paragraph_data = []
for sent in text_sents:

    text = tp.cleanup_text(sent)
    sent_toks = [tok for sent in Paragraph(text).raw_tokens for tok in sent]
    # output, sentence, tokens = get_operations(sent)
    operations, spacy_tokens = oe.get_operations(sent_toks)
    updated_operations = oe.operations_correction(spacy_tokens,
                                                  operations,
                                                  parsed_tokens=True)
    updated_operations = oe.find_aqueous_mixing(spacy_tokens,
                                                updated_operations,
                                                parsed_tokens=True)
    paragraph_data.append((spacy_tokens, updated_operations))

paragraph_data_upd = oe.operations_refinement(paragraph_data,
                                              parsed_tokens=True)

pprint(paragraph_data_upd)
Example #17
# -*- coding: utf-8 -*-
"""
Created on Wed Feb 17 09:07:39 2021

@author: Kristian
"""
from chemdataextractor import Document
from chemdataextractor.model import Compound
from chemdataextractor.doc import Paragraph, Heading

# The u prefix marks a unicode string literal (required in Python 2, redundant in Python 3).
# Unicode strings are used because symbols such as the degree sign are not ASCII.
d = Document(
    Heading(u'Synthesis of HKUST-1-AC'),
    Paragraph(
        u'The BET surface area and CO2 uptake capacity values for the HKUST-1–AC composite were 1381 m2 g−1 and 8.1 mmol g−1 (at 273 K and 1 bar), respectively, representing increases of 70% and 39%, respectively, over the reported values for HKUST-1'
    ))

from chemdataextractor.model import BaseModel, StringType, ListType, ModelType
import re
from chemdataextractor.parse import R, I, W, Optional, merge


class Capacity(BaseModel):
    value = StringType()
    units = StringType()


Compound.capacity = ListType(ModelType(Capacity))

prefix = (I(u'capacity') | I(u'CO2') + I(u'uptake')).hide()
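
The snippet is cut off after the prefix rule; a hedged sketch of how the remaining grammar might look, mirroring the boiling-point pattern in Example #20 below (the unit tokens are an assumption about how "8.1 mmol g−1" is tokenized):

units = (R(u'^mmol$') + R(u'^g−1$'))(u'units').add_action(merge)
value = R(u'^\d+(\.\d+)?$')(u'value')
capacity = (prefix + value + units)(u'capacity')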
Example #18
def get_CDE_mols(corpus_path, years, ppy, output_path, mode='fulltext'):
    """
    Samples papers from the corpus, runs ChemDataExtractor over their text, and appends the molecules it finds to the output file.

    Parameters:
        corpus_path (str, required): Path to the corpus

        years (list, required): List of years to find mols for

        ppy (int, required): Papers per year. How many papers to get mols from per
            year

        output_path (str, required): path to place output data to be further analyzed

        mode (str, optional): Either 'fulltext' or 'abstract' or 'both'
    """
    paper_count = 0
    # make sure we have consistent endings
    if not corpus_path.endswith('/'):
        corpus_path += '/'

    # get a list of all the journal directories and remove the README
    journals = os.listdir(corpus_path)
    journals.remove('README.txt')

    random.seed(42)
    random.shuffle(journals)

    # iterate through every journal in corpus
    for journal_name in journals:
        journal_path = corpus_path + journal_name + '/'

        journal_json = journal_path + journal_name + '.json'

        print('On journal ', journal_name)

        # open the entire dictionary corresponding to a single jornal
        with open(journal_json) as json_file:
            journal_dict = json.load(json_file)

        # iterate through the specified years in parameter
        for year in years:
            year_dict = journal_dict[year]
            print(year)
            try:
                # there may not be enough papers in this year for this publication
                paper_idxs = random.sample(range(len(year_dict)), ppy)
            except ValueError:
                continue
            for num in paper_idxs:
                paper_count += 1
                print('On paper ', paper_count, ' of ',
                      len(journals) * len(years) * ppy)
                # grab the paper from this year corresponding to the 'numth' paper
                paper_dict = year_dict[str(num)]

                # get the fulltext out
                try:
                    text = paper_dict['fulltext']
                except KeyError:
                    continue
                if not isinstance(text, str):
                    continue
                # remove nonsense information
                text = clean_paper(text)

                para = Paragraph(text)
                mols = para.cems  # find all molecules in the text

                mols = ['<<NEW_PAPER>>'] + [mol.text for mol in mols]
                with open(output_path, 'a') as file:
                    for entry in mols:
                        file.write(entry + '\n')
                    file.write('\n')
Example #19
    def tokenize(self,
                 texts='default',
                 entities='default',
                 use_entities=True,
                 keep_sentences=True,
                 exclude_punct=False,
                 save=False):
        """
        Takes the set of normalized texts and tokenizes them

        Parameters:
            texts (list): List of texts to tokenize. If `default` then
                          self.normalized_texts will be used
            entities (dict): Dictionary of entity names and index positions. If
                             `default` then self.entities_per_text will be used
            use_entities (bool): If true then entity dict will be used to tokenize
                                 multi-word phrases and chemical entities. Otherwise
                                 all words in text list will be tokenized with the
                                 same algorithm and some entities may be split
            keep_sentences (bool): If true then abstract will be split into list of
                                   lists where each nested list is a single sentence.
                                   Otherwise abstract will be split into a single list
                                   of tokens
            exclude_punct (bool): If true then common punctuation marks will be left out
                                  of token list. Otherwise all punctuation will remain
        """
        if texts == 'default':
            texts = self.normalized_texts
        if entities == 'default':
            entities = self.entities_per_text
        if use_entities:
            assert len(texts) == len(
                entities
            ), "ERROR: SIZE OF ENTITY AND TEXT LISTS DO NOT MATCH. YOU CAN EITHER RUN A NORMALIZATION FUNCTION ON UNPROCESSED TEXT OR LOAD FILES OF MATCHING SIZE"

        ### Instantiate Mat2Vec MaterialsTextProcessor
        MTP = MaterialsTextProcessor()

        ### Iterate through all abstracts and corresponding entities if applicable
        for i in trange(len(texts)):
            text = texts[i]
            entity_spans = []
            if use_entities:
                entry = entities[i]
                for entity in entry:
                    name = entity[0]
                    start = entity[1]
                    stop = entity[2]
                    entity_spans.append((start, stop))
                    new_name = name.replace(' ', '_')
                    text = text[:start] + new_name + text[stop:]

            if not keep_sentences:
                ### Split text into entities vs. non-entities
                token_list = self.extract_entity_tokens(text, entity_spans)

                ### Tokenize non-entities and combine with entities
                tokens, self.entity_idxs[i] = self.process_token_list(
                    token_list)

                ### Use Mat2Vec MaterialsTextProcessor for casing, number normalization, punctuation, etc.
                tokens, _ = MTP.process(tokens,
                                        exclude_punct=exclude_punct,
                                        normalize_materials=False,
                                        split_oxidation=False)
            else:
                ### Split text into sentences
                tokens = []
                self.entity_idxs[i] = []
                para = Paragraph(text)
                prior_split = 0
                for j, sentence in enumerate(para.sentences):
                    split = sentence.end
                    sentence = sentence.text
                    sentence_entities = []
                    for span in entity_spans:
                        if span[1] < split and span[0] >= prior_split:
                            new_span = (span[0] - split, span[1] - split)
                            sentence_entities.append(new_span)
                    prior_split = split

                    ### Make a token_list for each sentence
                    token_list = self.extract_entity_tokens(
                        sentence, sentence_entities)

                    ### Tokenize non-entities and combine with entities
                    sentence_tokens, sentence_entity_idxs = self.process_token_list(
                        token_list)
                    self.entity_idxs[i].append(sentence_entity_idxs)

                    ### Mat2Vec Processing
                    sentence_tokens, _ = MTP.process(
                        sentence_tokens,
                        exclude_punct=exclude_punct,
                        normalize_materials=False,
                        split_oxidation=False)
                    tokens.append(sentence_tokens)
            self.tokenized_texts[i] = tokens

        if save:
            os.makedirs('preprocessor_files', exist_ok=True)

            with io.open('preprocessor_files/tokenized_texts.json',
                         'w',
                         encoding='utf8') as f:
                out_ = json.dumps(self.tokenized_texts,
                                  indent=4,
                                  sort_keys=False,
                                  separators=(',', ': '),
                                  ensure_ascii=False)
                f.write(str(out_))

            with io.open('preprocessor_files/tokenized_entity_idxs.json',
                         'w',
                         encoding='utf8') as f:
                out_ = json.dumps(self.entity_idxs,
                                  indent=4,
                                  sort_keys=False,
                                  separators=(',', ': '),
                                  ensure_ascii=False)
                f.write(str(out_))
Example #20
"""
Created on Tue Feb 16 08:42:34 2021

@author: Kristian
"""

from chemdataextractor import Document
from chemdataextractor.model import Compound
from chemdataextractor.doc import Paragraph, Heading

# The u prefix marks a unicode string literal (required in Python 2, redundant in Python 3).
# Unicode strings are used because symbols such as the degree sign are not ASCII.
d = Document(
    Heading(u'Synthesis of 2,4,6-trinitrotoluene (3a)'),
    Paragraph(
        u'The procedure was followed to yield a pale yellow solid (b.p. 240 °C) and a white solid (b.p. 60 °C)'
    ))

from chemdataextractor.model import BaseModel, StringType, ListType, ModelType


class BoilingPoint(BaseModel):
    value = StringType()
    units = StringType()


Compound.boiling_points = ListType(ModelType(BoilingPoint))

import re
from chemdataextractor.parse import R, I, W, Optional, merge
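The listing is cut off after the parse imports; the standard continuation of this pattern (essentially the ChemDataExtractor property-parser walkthrough) defines the grammar and a parser class roughly as follows:

prefix = (R(u'^b\.?p\.?$', re.I) | I(u'boiling') + I(u'point')).hide()
units = (W(u'°') + Optional(R(u'^[CFK]\.?$')))(u'units').add_action(merge)
value = R(u'^\d+(\.\d+)?$')(u'value')
bp = (prefix + value + units)(u'bp')

from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first


class BpParser(BaseParser):
    root = bp

    def interpret(self, result, start, end):
        compound = Compound(
            boiling_points=[
                BoilingPoint(
                    value=first(result.xpath('./value/text()')),
                    units=first(result.xpath('./units/text()'))
                )
            ]
        )
        yield compound


# a Document built after registering the parser will extract the boiling points from the paragraph above
Paragraph.parsers = [BpParser()]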
Example #21
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--annotation-file",
                        type=str,
                        required=True,
                        help="The full annotation file.")
    parser.add_argument("--output-file",
                        type=str,
                        required=True,
                        help="The product recognition data file.")
    args = parser.parse_args()

    data = defaultdict(list)
    with open(args.annotation_file, "r") as f:
        reader = csv.DictReader(f, delimiter=',')
        for row in reader:
            text = row["description"]
            tokens = text.split(' ')
            product = row["Products"]
            product_tags = row["Products-tag"]
            if product_tags == "":
                assert (product == "")
                continue
            product_tags = list(map(int, product_tags.split(',')))

            # verify the correspondence between ${product} and tags
            product_spans = []
            for i in range(int(len(product_tags) / 2)):
                product_spans.append(
                    (product_tags[i * 2], product_tags[i * 2 + 1]))
            val = " ".join([
                " ".join(tokens[start:end]) for (start, end) in product_spans
            ])
            assert (val == product)

            for span in product_spans:
                if span not in data[text]:
                    data[text].append(span)

    n_sents = 0
    with open(args.output_file, "w") as f:
        # segment each paragraph into sentences, and map indexes correspondingly
        for text, prod_spans in data.items():
            print(prod_spans)
            sents = [s.text.split(' ') for s in Paragraph(text)]
            print("{} sentences detected".format(len(sents)))
            n_sents += len(sents)

            # make sure the indexes don't change
            # assert(text == " ".join([" ".join(sent) for sent in sents]))
            merged_text = " ".join([" ".join(sent) for sent in sents])
            if text != merged_text:
                print("text: %s (len: %d)" % (text, len(text)))
                print("merged text: %s (len: %d)" %
                      (merged_text, len(merged_text)))

            # get sentence boundaries
            sent_boundaries = [
                0,
            ]
            for sent in sents:
                offset = sent_boundaries[-1]
                sent_boundaries.append(offset + len(sent))
            print(sent_boundaries)

            # initialize all tokens with tag "O"
            tagged_text = []
            for p, token in enumerate(text.split(' ')):
                tagged_text.append([token, 'O'])

            # check if a span (interval) crosses any sentence boundary
            def cross_boundary(interval, refs):
                # e.g., interval = [10, 12]
                for i, b in enumerate(refs):
                    if b > interval[0] and b <= interval[1] - 1:
                        return i
                return -1

            # tag Product tokens
            for span in prod_spans:
                # if the span crosses sentence boundaries, skip (or merge the two sents)
                if cross_boundary(span, sent_boundaries) >= 0:
                    print("cross_boundary!")
                    continue

                start, end = span
                tagged_text[start][1] = 'B-Prod'
                if end == start + 1:
                    continue
                for i in range(start + 1, end):
                    tagged_text[i][1] = 'I-Prod'

            # split paragraph to sentences
            tagged_sents = []
            for i in range(len(sents)):
                bos = sent_boundaries[i]
                eos = sent_boundaries[i + 1]
                tagged_sents.append(tagged_text[bos:eos])

            for ts in tagged_sents:
                for token, tag in ts:
                    f.write(f"{token}\t{tag}\n")
                f.write("\n")