Exemple #1
0
 def list_of_cems(self):
     """Build the parse element for a list of chemical entity mentions.

     Matches phrases shaped like "cem, cem, cem and cem": a first
     ``single_cem``, an optional hidden bracketed number, any mix of
     delimiters / further cems / bare numbers, a hidden conjunction
     ("and", "or" or "to"), a final ``single_cem`` with its own optional
     bracketed number, and an optional trailing "compounds" / "samples".
     The whole match is grouped under the name ``cem_list``.
     """
     # Raw string: '\d' inside a plain literal is an invalid escape sequence.
     digits = r'^\d+$'
     return Group(self.single_cem +
                  Optional(lbrct + R(digits) + rbrct).hide() +
                  ZeroOrMore(delim.hide() | self.single_cem | R(digits)) +
                  (I('and') | I('or') | I('to')).hide() + self.single_cem +
                  Optional(lbrct + R(digits) + rbrct).hide() +
                  Optional(I('compounds') | I('samples')))('cem_list')
Exemple #2
0
 def value_phrase(self):
     """Build the parse element that recognises a numeric value.

     The returned expression accepts, in order of preference, a complex-style
     value (``ivalue``), a range (``value_range``) or a single number
     (``value_single``), optionally wrapped in hidden brackets, and rejects
     the match when the next token is one of the listed unit words.

     All regex patterns are raw strings; the previous plain literals relied
     on invalid escape sequences such as ``\\d`` and ``\\.``.
     """
     number = R(r'^[\+\-–−]?\d+(\.\d+)?$')
     # A range written as a single token, e.g. "10-20".
     joined_range = R(r'^[\+\-–−]?\d+(\.\d+)?[\-–−~∼˜]\d+(\.\d+)?$')('raw_value').add_action(merge)
     # Two numbers, with or without a separate dash/tilde token between them.
     spaced_range = (number + (R(r'^[\-–−~∼˜]$') + number | number))('raw_value').add_action(
         merge)
     to_range = (number + I('to') + number)('raw_value').add_action(join)
     # NOTE(review): this one is tagged 'value' while its siblings use
     # 'raw_value' -- confirm that is intentional.
     plusminus_range = (number + R('±') + number)('value').add_action(join)
     value_range = (Optional(R(r'^[\-–−]$')) + (plusminus_range | joined_range | spaced_range | to_range))(
         'raw_value').add_action(merge)
     value_single = (Optional(R(r'^[~∼˜\<\>]$')) + Optional(R(r'^[\-–−]$')) + number)('raw_value').add_action(merge)
     # NOTE(review): the three patterns below are not anchored on both ends
     # and every part is optional, so they may accept far more tokens than a
     # real/imaginary number pair -- verify against the tokenizer output.
     inumber = (R(r'\d*\.?\d*[i]$')).add_action(join)
     # inumber = R('^([-+]?(\d+\.?\d*|\d*\.?\d+)([Ee][-+]?[0-2]?\d{1,2})?[r]?|[-+]?((\d+\.?\d*|\d*\.?\d+)([Ee][-+]?[0-2]?\d{1,2})?)?[i]|[-+]?(\d+\.?\d*|\d*\.?\d+)([Ee][-+]?[0-2]?\d{1,2})?[r]?[-+]((\d+\.?\d*|\d*\.?\d+)([Ee][-+]?[0-2]?\d{1,2})?)?[i])$')
     ivalue = (R(r'\d*\.?\d*$') + R(r'^[\+\-–−]?') + inumber).add_action(join)
     # Negative look-ahead: do not accept the number when a unit token follows.
     value = Optional(lbrct).hide() + (ivalue | value_range | value_single)('raw_value') + Not(I('wt%')|I('vol%')|I('K')|I('times')|I('GPa')|I('wt')|I('vol')|I('%')|I('nm')|I('zF')|W('°')|W('KV')|W('kV')|W('MV')|I('kHz')|I('Hz')|I('GHz')|W('V')|W('J')|W('eV')|I('MHz')) + Optional(rbrct).hide()
     return value
Exemple #3
0
 def specifier_value_cem(self):
     """Build the "specifier + value ... compound" root phrase.

     The operands are named individually and then chained left to right in
     the same order as before; the result is tagged ``root_phrase``.
     """
     lead_in = Optional(I('below') | I('at'))
     spec_and_value = self.specifier_and_value
     reporting_verb = Optional(
         (I('has') + I('been') + I('found') + I('for'))
         | (I('was') + (I('observed') | I('determined') | I('measured') | I('calculated')))
     ).hide()
     preposition = Optional(I('in') | I('for') | I('of')).hide()
     article = Optional(I('the')).hide()
     punctuation = Optional(R('^[:;,]$')).hide()
     material_form = Optional(
         I('bulk') | I('powdered') | I('doped') | I('full') | (I('thin') + I('film'))
     ).hide()
     closing_bracket = Optional(rbrct)
     compound = self.cem_phrase
     phrase = (lead_in + spec_and_value + reporting_verb + preposition
               + article + punctuation + material_form + closing_bracket
               + compound)
     return phrase('root_phrase')
Exemple #4
0
 def cem_value_specifier(self):
     """Build the "compound ... value ... specifier" root phrase.

     The expression is ``cem_phrase``, a run of optional hidden filler
     words, ``value_phrase``, more optional filler and finally
     ``specifier_phrase``; the result is tagged ``root_phrase``.

     Fixes relative to the previous version:
       * ``I('higher') | I('lower') + I('than')`` bound as
         ``higher | (lower than)`` because ``+`` binds tighter than ``|``,
         so "higher than" could not be consumed.  It is now
         ``(higher | lower) + than``.
       * the duplicated ``I('found')`` alternative was removed.
     """
     # NOTE(review): I('calculate') sits among past participles; presumably
     # 'calculated' was meant -- confirm before changing the token.
     return (self.cem_phrase
             + Optional((I('is') | I('was') | I('were')) + Optional(I('reported') | I('found') | I('calculate') | I('measured') | I('shown'))
             + Optional(I('to'))).hide()
             + Optional((I('exhibit') | I('exhibits') | I('exhibiting') | R('^show[s]*$') | I('demonstrates') | I('undergoes') | I('have') | I('has') | I('having') | I('determined') | I('with'))).hide()
             + Optional(I('the') | I('a') | I('an')).hide()
             + Optional(I('value') | I('values')).hide()
             + Optional(I('varies') + I('from')).hide()
             + Optional(W('=') | W('~') | W('≈') | W('≃') | I('was') | I('is') | I('at') | I('as') | I('near') | I('above') | I('below')).hide()
             + Optional(I('in') + I('the') + I('range') | I('ranging')).hide()
             + Optional(I('of') | I('about') | I('from') | I('approximately') | I('around') | (I('high') + I('as')) | ((I('higher') | I('lower')) + I('than'))).hide()
             + self.value_phrase
             + Optional(I('as') | I('of') | I('for')).hide()
             + Optional(I('its') | I('their') | I('the')).hide() + self.specifier_phrase)('root_phrase')
Exemple #5
0
 def cem_specifier_value(self):
     """Build the "compound ... specifier + value" root phrase.

     Each operand is bound to a local name and the operands are then chained
     left to right in their original order; the result is tagged
     ``root_phrase``.
     """
     compound = self.cem_phrase
     separator = Optional(delim).hide()
     sample_noun = Optional(
         I('samples') | I('system') | I('systems') | I('sample'))
     relative_word = Optional(
         I('that') | I('which') | I('was') | I('since') | I('the')).hide()
     adverb = Optional(I('typically')).hide()
     linking_verb = Optional(
         I('exhibits') | I('exhibiting') | R('^show[s]*$')
         | I('demonstrates') | I('undergoes') | I('has') | I('having')
         | I('determined') | I('with') | I('where') | I('orders')
         | (I('is') + Optional(I('classified') + I('as')))).hide()
     reported_to_have = Optional(I('reported') + I('to') + self.have).hide()
     opening_bracket = Optional(lbrct).hide()
     spec_and_value = self.specifier_and_value
     closing_bracket = Optional(rbrct)
     phrase = (compound + separator + sample_noun + relative_word + adverb
               + linking_verb + reported_to_have + opening_bracket
               + spec_and_value + closing_bracket)
     return phrase('root_phrase')
Exemple #6
0
 def prefix(self):
     """Build the specifier-plus-prefix parse element.

     Starts with ``specifier_phrase`` and then allows a long, strictly
     ordered run of optional filler words and punctuation that may sit
     between a specifier and its value.  The ``join`` action is attached to
     the whole expression.

     The order of the operands is significant: each ``Optional`` is tried
     once, in sequence.
     """
     return (self.specifier_phrase
             + Optional(I('values')).hide()
             + Optional(delim).hide()
             # Verbs describing how the property changes.
             + Optional((I('varies') + I('from')) |
                        R('^increase(s|d)?') | I('falls') | I('reaches')).hide()
             + Optional(I('steeply')).hide()
             + Optional(I('recorded') | I('reported')).hide()
             + Optional(I('of') | I('was') | I('is') | I('at') | I('near') |
                        I('above') | I('below') | I('with') | I('to') | I('were') | I('a')).hide()
             + Optional(I('reported') | I('determined') |
                        I('estimated') | I('found') | I('occurs')).hide()
             + Optional(I('temperatures')).hide()
             + Optional(I('as') | (I('to') + I('be'))).hide()
             + Optional(I('in') + I('the') + I('range')).hide()
             # NOTE(review): not hidden, unlike the neighbouring fillers --
             # confirm whether "as high as" is meant to stay in the output.
             + Optional(I('as') + I('high') + I('as'))
             + Optional(I('ranging') + I('from')).hide()
             + Optional(I('of')).hide()
             + Optional(I('rather') | I('quite')).hide()
             + Optional(I('high') | I('low') | I('maximum') | I('minimum')).hide()
             + Optional(I('the')).hide()
             + Optional(delim | lbrct | rbrct)
             # Approximation / comparison words.
             + Optional(
                 I('of') | I('about') | I('approximately') | I('typically') | I('ca.') | I('around') | I('at') | I(
                     'above') | I('below') | I('high') | I('low')
                 | ((I('higher') | I('lower') | I('more') | I('less')) + I('than')) | I('order') | (
                             I('for') + I('instance')) | (I('up') + I('to')) | I('reaching') | I('value')).hide()
             + Optional(I('a') | I('an') | I('as')).hide()
             + Optional(I('maximum')).hide()
             + Optional(I('of')).hide()
             + ZeroOrMore(lbrct | delim | rbrct)
             # A second specifier may follow the filler words.
             + Optional(self.specifier_phrase)
             + Optional(I('of')).hide()
             + Optional(I('the')).hide()
             + Optional(I('order')).hide()
             + Optional((I('up') | I('equal')) + I('to')).hide()
             + Optional(I('of')).hide()
             + ZeroOrMore(lbrct | delim | rbrct)
             # Relation symbol immediately before the value.
             + Optional(W('=') | W('~') | W('≈') |
                        W('≃') | W('>') | W('<')).hide()
             + ZeroOrMore(lbrct | delim | rbrct).hide()).add_action(join)
Exemple #7
0
class Obstructions:
    """Optional filler tokens that may sit between a prefix and its value.

    Each attribute is an ``Optional`` element; ``all`` chains them in the
    order bracket, tilde, "of", hyphen.
    """
    # Raw strings: '\(' , '\~' and '\-' are invalid escapes in plain literals.
    bracket = Optional(R(r'\('))
    curlLine = Optional(R(r'\~'))
    of = Optional(W(u'of'))
    hyphen = Optional(R(r'\-'))
    # Public name kept for callers even though it shadows the builtin here.
    all = bracket + curlLine + of + hyphen
Exemple #8
0
class MpRegex:
    """Parse elements for a melting point phrase.

    ``mp`` is: hidden prefix, the optional ``Obstructions.all`` fillers, a
    numeric value and an optional unit letter.
    """
    # "m.p." / "mp" (case-insensitive) or the words "melting point".
    # Regex patterns are raw strings so '\.' and '\d' are not treated as
    # (invalid) string escapes.
    prefix = (R(r'^m\.?p\.?$', re.I)
              | I(u'melting') + I(u'point')).hide()
    units = Optional(R(r'^[CFK]\.?$'))(u'units').add_action(merge)
    value = R(r'^\d+(\.\d+)?$')(u'value')
    mp = (prefix + Obstructions.all + value + units)(u'mp')
Exemple #9
0
class BpRegex:
    """Parse elements for a boiling point phrase.

    ``bp`` is: hidden prefix, a numeric value, then a degree sign with an
    optional unit letter.
    """
    # "b.p." / "bp" (case-insensitive) or the words "boiling point".
    # Regex patterns are raw strings so '\.' and '\d' are not treated as
    # (invalid) string escapes.
    prefix = (R(r'^b\.?p\.?$', re.I)
              | I(u'boiling') + I(u'point')).hide()
    units = (W(u'°') + Optional(R(r'^[CFK]\.?$')))(u'units').add_action(merge)
    value = R(r'^\d+(\.\d+)?$')(u'value')
    bp = (prefix + value + units)(u'bp')
#from chemdataextractor.doc import Sentence
from chemdataextractor.parse import R, I, W, Optional, merge, join
from chemdataextractor.parse.base import BaseSentenceParser
from chemdataextractor.utils import first

# A decimal or integer token followed by a frequency unit token; the pieces
# are joined and tagged 'frequencyvalue'.  Patterns are raw strings because
# '\d' and '\.' are invalid escapes in plain literals.
frequency_value = ((R(r"^\d+?\.\d+?$") | R(r"^\d+?$")) +
                   (W('kHz') | W('MHz') | W('GHz')
                    | W('Hz')))('frequencyvalue').add_action(join)


class DielectricConstantFrequencyParser(BaseSentenceParser):
    """Sentence parser whose root expression is ``frequency_value``."""
    root = frequency_value

    def interpret(self, result, start, end):
        """Yield one ``self.model`` instance holding the matched frequency text.

        ``start`` and ``end`` are accepted but not used here.
        """
        matched_text = result.xpath('//frequencyvalue/text()')
        yield self.model(frequency=first(matched_text))
#from chemdataextractor.doc import Sentence
from chemdataextractor.parse import R, I, W, Optional, merge, join
from chemdataextractor.parse.base import BaseSentenceParser
from chemdataextractor.utils import first

# Integer nm, decimal μm or decimal nm; the pieces are joined and tagged
# 'wavelengthvalue'.  Patterns are raw strings because '\d' and '\.' are
# invalid escapes in plain literals.
wavelength_value = (
    (R(r"^\d+?$") + W('nm')) | (R(r"^\d+?\.\d+?$") + W('μm')) |
    (R(r"^\d+?\.\d+?$") + W('nm')))('wavelengthvalue').add_action(join)


class RefractiveIndexWavelengthParser(BaseSentenceParser):
    """Sentence parser whose root expression is ``wavelength_value``."""
    root = wavelength_value

    def interpret(self, result, start, end):
        """Yield one ``self.model`` instance holding the matched wavelength text.

        ``start`` and ``end`` are accepted but not used here.
        """
        matched_text = result.xpath('//wavelengthvalue/text()')
        yield self.model(wavelength=first(matched_text))
from chemdataextractor.doc import Paragraph, Heading, Sentence
from lxml import etree

class CurieTemperature(BaseModel):
    """Model record with three string fields: specifier, value and units."""
    # Text of the matched specifier.
    specifier = StringType()
    # Matched value, kept as a string.
    value = StringType()
    # Matched units, kept as a string.
    units = StringType()

# Expose CurieTemperature entries on Compound as a list-typed field.
Compound.curie_temperatures = ListType(ModelType(CurieTemperature))

#%% [markdown]
# Now define parse elements that describe how to identify the entities in text. Think of these as tagging processes.

#%%
# Define a very basic entity tagger
specifier = (I('curie') + I('temperature') + Optional(lrb | delim) + Optional(R('^T(C|c)(urie)?')) + Optional(rrb) | R('^T(C|c)(urie)?'))('specifier').add_action(join)
units = (R(r'^[CFK]\.?$'))('units').add_action(merge)
# Digits, optionally followed by "." or "," and more digits.  The previous
# pattern '(\.\,\d+)?' required the literal two-character sequence ".,", so
# ordinary decimals such as "12.5" could never match.
value = (R(r'^\d+([\.,]\d+)?$'))('value')

#%% [markdown]
# Note we tag each with a unique identifier that will be used later. Now let the entities in a sentence be any ordering of these (or whatever ordering you feel like). Here we specify that the value and units must coincide, but this does not have to be the case.
#
# We also define an extremely general parse phrase; this will be used to identify candidate sentences.

#%%
# Let the entities be any combination of chemical names, specifiers, and values with units
entities = (chemical_name | specifier | value + units)

# Now create a very generic parse phrase that will match any combination of these entities
curie_temperature_phrase = (entities + OneOrMore(entities | Any()))('curie_temperature')
H = Hide
Converter for ignoring the results of a parsed expression

'''
'''
R matches the token text against the regular expression
^ anchors the match at the start of the token, so the token must start with b
\ removes the special meaning from the . character
? matches zero or one time; it marks the preceding . as optional
$ anchors the match at the end of the token
I matches the given word case-insensitively
| matches either b.p. or boiling point
.hide() keeps the prefix out of the output, but the parser
still has to find it in the text
'''
# Raw string for the regex: '\.' is an invalid escape in a plain literal.
prefix = (R(r'^b\.?p\.?$', re.I) | I(u'boiling') + I(u'point')).hide()
'''
W matches the degree symbol exactly
Optional means that the unit letter will be included if it is in the text
R matches the token text against the regular expression
^ anchors the match at the start of the token; [CFK] matches a single C, F or K
'''

# Raw string for the regex: '\.' is an invalid escape in a plain literal.
units = (W(u'°') + Optional(R(r'^[CFK]\.?$')))(u'units').add_action(merge)
'''
R matches the token text against a regular expression
^ matches any string that starts with a digit
\d looks for a number between 0 and 9
+ matches following digits one or more times
\. matches . symbol if it is there
() is the capturing group
import io
import logging
import os
import unittest
import numpy as np

from chemdataextractor.relex.utils import mode_rows, match, KnuthMorrisPratt
from chemdataextractor.doc import Sentence
from chemdataextractor.relex import Relation, Entity, Phrase, Cluster
from chemdataextractor.parse.cem import chemical_name
from chemdataextractor.parse import R, merge

logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger(__name__)

# Raw strings: '\.' and '\d' are invalid escapes in plain literals.
units = (R(r'^[CFK]\.?$'))('units').add_action(merge)
# Digits, optionally followed by "." or "," and more digits.  The previous
# pattern '(\.\,\d+)?' only accepted the literal sequence ".,", so decimal
# values could never match.
value = (R(r'^\d+([\.,]\d+)?$'))('value')


class TestRelexUtils(unittest.TestCase):
    maxDiff = None

    def test_mode_rows(self):
        """Check mode_rows on an array whose only repeated row is [1, 1, 1, 1]."""
        rows = np.array([[1, 1, 1, 1], [1, 2, 3, 4], [1, 1, 1, 1], [3, 4, 5, 6]])
        self.assertListEqual(list(mode_rows(rows)), [1, 1, 1, 1])

    def test_match(self):
        s1 = Sentence('BiFeO3 with 1103 K')
        entities = [
Exemple #15
0
    footflow = StringType(contextual=True)
    footflowunit = StringType(contextual=True)
    pressure = StringType(contextual=True)
    presunits = StringType(contextual=True)
    tofvalue = StringType()
    yieldtest = StringType(contextual=True)
    selectivity = ListType(ModelType(Selectivity, contextual=True),
                           contextual=True)


# Expose Conv entries on Compound as a list-typed field.
Compound.conv = ListType(ModelType(Conv))

# Alternation of support-material names used inside the name regex.
supportstring = "ZnO|YSZ|ceria|alumina|silica|SBA|ZSM|CNT|Al2O3|MgO|CeO2|TiO2|CMK|MnO|Y2O3|ZrO2|Tb4O7|HfO2|La2O3|Co3O4|ThO2|SiO2|Fe2O3|Sm2O3|Mo2C|Gd2O3|Yb2O3|CaO|CuO|NiO"
# NOTE(review): the leading "|" adds an empty alternative to every group this
# string is spliced into -- confirm that is intended.
symbolstring = "|Li|Be|Ne|Na|Mg|Al|Si|Cl|Ox|Ca|Sc|Ti|V|Cr|Mn|Fe|Co|Ni|Cu|Zn|Ga|Ge|As|Se|Br|Kr|Rb|Sr|Y|Zr|Nb|Mo|Tc|Ru|Rh|Pd|Ag|Cd|In|Sn|Sb|Te|Xe|Cs|Ba|La|Ce|Pr|Nd|Pm|Sm|Eu|Gd|Tb|Dy|Ho|Er|Tm|Yb|Lu|Hf|Ta|Re|Os|Ir|Pt|Au|Hg|Tl|Pb|Bi|Po|At|Rn|Fr|Ra|Ac|Th|Pa|Np|Pu|Am|Cm|Bk|Cf|Es|Fm|Md|No|Lr|Rf|Db|Sg|Bh|Hs|Mt|Ds|Rg|Cn|Nh|Fl|Mc|Lv|Ts|Og"
# First token of a catalyst name.  The regex fragments are raw strings so
# '\d', '\/', '\-' and '\.' are not treated as (invalid) string escapes.
name = (R(
    r"^((\d|%)*(" + symbolstring + "|" + supportstring +
    r")(?![a-z])(([A-Zα-ωΑ-Ω0-9\/\-,\.]|(wt.?))[a-zA-Z0-9α-ωΑ-Ω0-9\/\-,\.]*)?)")
        )
# Tokens allowed to follow the first one.
post = R("^([A-Z]|[0-9]|(wt.?)|" + symbolstring +
         r"|\/|-|\.|,|%|\(|\))+$") | T("SYM")
# Full name: first token plus any number of follow-on tokens, merged.
name = (name + ZeroOrMore(post))(u"name").add_action(merge)

catalyst = R(
    "(Catalysts?)|(precursors?)|(abb)|(sample)|(abb)|(materials?)|(composition)",
    re.IGNORECASE)


class CatalystHeadingParser(BaseParser):
    root = catalyst

    def interpret(self, result, start, end):

class Anneal(BaseModel):
    """
    Class for the full list of annealing step parameters for a process.
    """
    # List of AnnealTemp entries.
    temps = ListType(ModelType(AnnealTemp))
    # List of AnnealTime entries.
    times = ListType(ModelType(AnnealTime))


# Associating anneal parameters with a chemical object
Compound.anneal = ListType(ModelType(Anneal))  # currently not working

# Defining object parameters for the AnnealParser parser
# Delimiters (hidden from the output).  Regex patterns below are raw strings
# because '\.' and '\d' are invalid escapes in plain literals.
delim = R(r'^[;:,\./]$').hide()

# Defining formats for annealing temperature and units
tempprefix = (I('at') | I('or')).hide()
tempunits = (W('°') + R(r'^[CFK]\.?$'))('tempunits').add_action(merge)
tempvalue = R(r'^\d{2,4}?$')('tempvalue').add_action(merge) + Optional(delim)

# Defining formats for annealing time and time units
timeprefix = I('for').hide()
timeunits = (
    R('^s?(ec|econds)?$') | R('^m?(in|inute)?(s)?$')
    | R('^h?(ou)?(r)?(s)?$'))('timeunits').add_action(join) + Optional(delim)
timevalue = R(r'^\d{,2}$')('timevalue') + Optional(delim)

# Putting everything together
temp = (tempvalue)('temp')
Exemple #17
0
# Expose Solubility entries on Compound as a list-typed field.
Compound.solubility = ListType(ModelType(Solubility))

# In[38]:

import re
from chemdataextractor.parse import R, I, W, Optional, merge

# prefix = (R(u'^m\.?p\.?$', re.I) | I(u'melting') + I(u'point')).hide()
# Hidden lead-in: "solubility", an optional linking word or "=", and an
# optional "in the range (of)" / "about".
prefix = (I(u'solubility')).hide() + Optional(
    W('=') | I('of') | I('was') | I('is') | I('at')).hide() + Optional(
        I('in') + I('the') + I('range') + Optional(I('of'))
        | I('about')).hide()

# delim = R(u'^[:;\.,]$')
# Raw strings: '\d', '\.', '\w' and '\s' are invalid escapes in plain literals.
value = R(r'^\d+(\.\d+)?$')(u'value')

units = (W(u'nM') | W(u'μM') | W(u'mM') | W(u'μg')
         | W(u'mg'))(u'units').add_action(merge)

# NOTE(review): the optional filler pattern requires whitespace inside a
# single token; presumably tokens never contain spaces, in which case it can
# never match -- verify against the tokenizer.
so = (prefix + Optional(R(r'\w+\s\w+')).hide() + value + units)(u'so')

# units = (W(u'°') + Optional(R(u'^[CFK]\.?$')))(u'units').add_action(merge)
# value = R(u'^\d+(\.\d+)?$')(u'value')
# bp = (prefix + + value + units)(u'bp')

# In[39]:

from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first
Exemple #18
0

def find_subset(items, target, acc=None):
    """Return a sub-sequence of *items* (in order) that sums to *target*.

    Each item is either taken or skipped, trying "take" first, so the first
    solution found in that search order is returned.

    Args:
        items: Sequence of numbers; must support ``len`` and slicing.
        target: Remaining sum to reach.
        acc: Items already chosen.  Defaults to a fresh empty list on every
            call; the previous shared ``[]`` default was handed back to the
            caller when ``target == 0`` and could be polluted by mutation.
            A list passed in by the caller is never modified.

    Returns:
        A list of the chosen items (prefixed by *acc*), or ``None`` when no
        combination reaches *target*.
    """
    if acc is None:
        acc = []
    if target == 0:
        return acc
    if len(items) == 0:
        return None
    head, rest = items[0], items[1:]
    # Branch 1: take the head item (build a new list, leave ``acc`` untouched).
    taken = find_subset(rest, target - head, acc + [head])
    if taken is not None:
        return taken
    # Branch 2: skip the head item.
    return find_subset(rest, target, acc)


# Temperature units: optional "(" and degree sign followed by a unit symbol.
units = (Optional("(") + Optional(R(u'°')) +
         R(u'[CFK℃]')).add_action(merge)(u'units')
# Regex patterns below no longer rely on invalid escapes such as '\d':
# ASCII patterns are raw strings, and the one with non-ASCII text keeps its
# u'' prefix with a doubled backslash.
value = (Optional(R('~')) + R(r'^\d{3,4}')).add_action(merge)(u'value')
value2 = (Optional(R('~')) + R(u'^\\d{3,4}(°C|K)')).add_action(merge)(u'value')
temp1 = (value + units)(u'temp1')
temp2 = value2
temp = (temp1 | value2)(u'tempphrase')
# for catalyst name
supportstring = "ZnO|YSZ|ceria|alumina|silica|SBA|ZSM|CNT|Al2O3|MgO|CeO2|TiO2|CMK|MnO|Y2O3|ZrO2|Tb4O7|HfO2|La2O3|Co3O4|ThO2|SiO2|Fe2O3|Sm2O3|Mo2C|Gd2O3|Yb2O3|CaO|CuO|NiO"
# NOTE(review): the leading "|" adds an empty alternative to every group this
# string is spliced into -- confirm that is intended.
symbolstring = "|Li|Be|Ne|Na|Mg|Al|Si|Cl|Ox|Ca|Sc|Ti|V|Cr|Mn|Fe|Co|Ni|Cu|Zn|Ga|Ge|As|Se|Br|Kr|Rb|Sr|Y|Zr|Nb|Mo|Tc|Ru|Rh|Pd|Ag|Cd|In|Sn|Sb|Te|Xe|Cs|Ba|La|Ce|Pr|Nd|Pm|Sm|Eu|Gd|Tb|Dy|Ho|Er|Tm|Yb|Lu|Hf|Ta|Re|Os|Ir|Pt|Au|Hg|Tl|Pb|Bi|Po|At|Rn|Fr|Ra|Ac|Th|Pa|Np|Pu|Am|Cm|Bk|Cf|Es|Fm|Md|No|Lr|Rf|Db|Sg|Bh|Hs|Mt|Ds|Rg|Cn|Nh|Fl|Mc|Lv|Ts|Og"
name = (R(
    u"^((" + symbolstring + "|" + supportstring +
    r")(?![a-z])(([A-Zα-ωΑ-Ω0-9\/\-,\.]|(wt.?))[a-zA-Z0-9α-ωΑ-Ω0-9\/\-,\.]*)?)")
        )
post = R("^([A-Z]|[0-9]|(wt.?)|" + symbolstring +
         r"|\/|-|\.|,|%|\(|\))+$") | T("SYM")
Exemple #19
0
#from chemdataextractor.model import Compound
from chemdataextractor.parse import R, I, W, Optional, merge
from chemdataextractor.parse.base import BaseSentenceParser
#from chemdataextractor.model import Compound
from chemdataextractor.parse.common import lbrct, dt, rbrct
from chemdataextractor.parse.elements import W, I, R, Optional, Any, OneOrMore, Not, ZeroOrMore, SkipTo
import re
from chemdataextractor.parse.auto import BaseAutoParser,construct_unit_element,Group,match_dimensions_of,value_element,value_element_plain
from chemdataextractor.parse.actions import merge, join
from lxml import etree
import logging
log = logging.getLogger(__name__)



# Single punctuation token.  Raw string: '\.' is an invalid escape in a plain literal.
delim = R(r'^[:;\.,]$')

class PropertyParserTemplate(BaseAutoParser, BaseSentenceParser):
    """Template parser for QuantityModel-type structures.

    Finds Cem, Specifier, Value and Units from single sentences.

    Other entities are merged contextually.
    """

    @property
    def specifier_phrase(self):
        """Parse expression of the model's specifier field, named 'specifier'."""
        specifier_field = self.model.specifier
        return specifier_field.parse_expression('specifier')
Exemple #20
0
from chemdataextractor.model import BaseModel, StringType, ListType, ModelType
import re
from chemdataextractor.parse import R, I, W, Optional, merge


class Capacity(BaseModel):
    """Model record with two string fields: value and units."""
    # Matched value, kept as a string.
    value = StringType()
    # Matched units, kept as a string.
    units = StringType()


# Expose Capacity entries on Compound as a list-typed field.
Compound.capacity = ListType(ModelType(Capacity))

# Hidden lead-in: "capacity" or "CO2 uptake".
prefix = (I(u'capacity') | I(u'CO2') + I(u'uptake')).hide()
# Author's note: the Optional was left in because removing it caused a syntax error.
# NOTE(review): W(u'mmol g-1') contains a space; presumably a single token
# never does, so this may not match -- verify against the tokenizer.
# Regex patterns are raw strings: '\.' and '\d' are invalid escapes otherwise.
units = (W(u'mmol g-1') +
         Optional(R(r'^[CFK]\.?$')))(u'units').add_action(merge)
value = R(r'^\d+(\.\d+)?$')(u'value')

cp = (prefix + value + units)(u'cp')

from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first


class CpParser(BaseParser):
    root = cp

    def interpret(self, result, start, end):
        compound = Compound(capacity=[
            Capacity(value=first(result.xpath('./value/text()')),
                     units=first(result.xpath('./units/text()')))
Exemple #21
0
class SpinCoat(BaseModel):
    """
    Class for the full list of spin-coating step parameters for a process.
    """
    #solvent = StringType(contextual=True)
    # List of SpinSpd entries.
    spds = ListType(ModelType(SpinSpd))
    # List of SpinTime entries.
    times = ListType(ModelType(SpinTime))


# Associate the spin-coating class with a given compound.  May be worth
# getting rid of for our eventual implementation, not yet sure.
Compound.spin_coat = ListType(ModelType(SpinCoat))

# Variable assignments
# Delimiters -- hidden from the output.  Regex patterns below no longer rely
# on invalid escapes such as '\.': ASCII patterns are raw strings, and the one
# with non-ASCII text keeps its u'' prefix with a doubled backslash.
delim = R(r'^[;:,\./]$').hide()

# Defining formats for spin-coating value and units
spdunits = (
    R(r'^r(\.)?p(\.)?m(\.)?$') | R(r'^r(\.)?c(\.)?f(\.)?$')
    | R(u'^([x×]?)(\\s?)?g$'))('spdunits').add_action(join) + ZeroOrMore(delim)
spdvalue = Optional(
    W('(')).hide() + R(r'^\d+(,\d+)?[0][0]$')('spdvalue') + Optional(
        W(')')).hide()

# Defining formats for spin-coating time and time units
timeprefix = I('for').hide()
timeunits = (
    R('^s?(ec|econds)?$') | R('^m?(in|inute)?(s)?$')
    | R('^h?(ou)?(r)?(s)?$'))('timeunits').add_action(join) + Optional(delim)
timevalue = R(r'^\d{,3}$')('timevalue') + Optional(delim)  # up to three digits
Exemple #22
0
#from chemdataextractor.doc import Sentence
from chemdataextractor.parse import R, I, W, Optional, merge, join
from chemdataextractor.parse.base import BaseSentenceParser
from chemdataextractor.utils import first

# Token starting with "0.0" followed by digits, tagged 'dielectricloss'.
# Raw string: '\.' is an invalid escape in a plain literal.
# NOTE(review): the trailing "]?" matches an optional literal "]" and looks
# like a leftover typo; the pattern also has no "$" anchor -- confirm intent.
dielectriclost = R(r'^[0]\.[0][0-9]+]?')('dielectricloss')


class DielectricLossParser(BaseSentenceParser):
    """Sentence parser whose root expression is ``dielectriclost``."""
    root = dielectriclost

    def interpret(self, result, start, end):
        """Yield one ``self.model`` instance holding the matched loss text.

        ``start`` and ``end`` are accepted but not used here.
        """
        matched_text = result.xpath('//dielectricloss/text()')
        yield self.model(dielectricloss=first(matched_text))