def list_of_properties(self):
    """Grammar for two specifier/value pairs joined by a connector.

    The first pair may carry a unit
    (``single_specifier_and_value_with_optional_unit``); the second is
    ``single_specifier_and_value``.  The connector is "and", a delimiter,
    or "that exhibits" and is hidden, as are the optional surrounding
    brackets.  The match is grouped and tagged ``property_list``.
    """
    open_bracket = Optional(lbrct).hide()
    first_property = self.single_specifier_and_value_with_optional_unit
    connector = (I('and') | delim | (I('that') + I('exhibits'))).hide()
    second_property = self.single_specifier_and_value
    close_bracket = Optional(rbrct).hide()

    pair = (
        open_bracket
        + first_property
        + connector
        + second_property
        + close_bracket
    )
    return Group(pair)('property_list')
def multi_entity_phrase_3b(self):
    """Several compounds, one specifier, several values; compounds last.

    e.g. Tc = 750 and 640 K in LaFeO3 and BiFeO3, respectively

    The match is grouped and tagged ``multi_entity_phrase_3``.
    """
    lead_in = self.prefix
    values = self.list_of_values
    # Note: unlike the surrounding connectors in sibling phrases, these two
    # optional parts are not hidden here.
    link_word = Optional(I('in') | I('for'))
    compounds = self.list_of_cems
    closing = Optional(delim + I('respectively'))

    phrase = lead_in + values + link_word + compounds + closing
    return Group(phrase)('multi_entity_phrase_3')
def list_of_cems(self):
    """List of chemical entity mentions, e.g. "cem, cem, cem and cem".

    Structure of the grammar, in order:

    * a first ``single_cem``, optionally followed by a hidden bracketed
      all-digit token;
    * any number of hidden delimiters, further ``single_cem`` matches or
      bare all-digit tokens;
    * a hidden "and" / "or" / "to";
    * a final ``single_cem``, again with an optional hidden bracketed
      all-digit token;
    * an optional trailing "compounds" or "samples".

    The match is grouped and tagged ``cem_list``.
    """
    # Raw string: '\d' is not a valid str escape, so the non-raw spelling
    # triggers an invalid-escape warning on modern Python.  The pattern
    # value itself is unchanged.
    digits = r'^\d+$'

    return Group(
        self.single_cem
        + Optional(lbrct + R(digits) + rbrct).hide()
        + ZeroOrMore(delim.hide() | self.single_cem | R(digits))
        + (I('and') | I('or') | I('to')).hide()
        + self.single_cem
        + Optional(lbrct + R(digits) + rbrct).hide()
        + Optional(I('compounds') | I('samples'))
    )('cem_list')
def multi_entity_phrase_2(self):
    """One compound, one specifier, several values.

    e.g. BiFeO3 shows magnetic transitions at 1093 and 640 K

    The match is grouped and tagged ``multi_entity_phrase_2``.
    """
    def skipped_tokens():
        # One or more arbitrary tokens (hidden), each guarded by a Not(...)
        # on cem / specifier / value.  Built fresh for every use.
        return OneOrMore(
            Not(self.single_cem | self.specifier_phrase | self.value_phrase)
            + Any().hide()
        )

    compound = self.single_cem
    gap_before_specifier = skipped_tokens()
    specifier = self.specifier_phrase
    gap_before_values = skipped_tokens()
    values = self.list_of_values
    trailing_delim = Optional(delim).hide()
    respectively = Optional(I('respectively'))

    phrase = (
        compound
        + gap_before_specifier
        + specifier
        + gap_before_values
        + values
        + trailing_delim
        + respectively
    )
    return Group(phrase)('multi_entity_phrase_2')
def multi_entity_phrase_3a(self):
    """Several compounds, one specifier, several values; compounds first.

    e.g. TC in BiFeO3 and LaFeO3 of 640 and 750 K

    The match is grouped and tagged ``multi_entity_phrase_3``.
    """
    leading_specifier = Optional(self.specifier_phrase)
    link_word = Optional(I('in') | I('for')).hide()
    compounds = self.list_of_cems
    # One or more arbitrary tokens (hidden), each guarded by a Not(...) on
    # cem / specifier / value.
    filler = OneOrMore(
        Not(self.single_cem | self.specifier_phrase | self.value_phrase)
        + Any().hide()
    )
    value_intro = self.prefix
    values = self.list_of_values
    closing = Optional(delim.hide() + I('respectively').hide())

    phrase = (
        leading_specifier
        + link_word
        + compounds
        + filler
        + value_intro
        + values
        + closing
    )
    return Group(phrase)('multi_entity_phrase_3')
def multi_entity_phrase_3c(self):
    """Two compounds, one specifier, a value for each; "X in A to Y in B".

    e.g. curie temperatures from 100 K in MnO to 300 K in NiO

    The match is grouped and tagged ``multi_entity_phrase_3``.
    """
    first_value = self.single_specifier_and_value
    first_link = Optional(I('for') | I('in')).hide()
    first_compound = self.single_cem
    # "to" or "up to", hidden.
    range_word = (Optional(I('up')) + I('to')).hide()
    second_value = self.value_phrase
    second_link = Optional(I('in') | I('for')).hide()
    second_compound = self.single_cem

    phrase = (
        first_value
        + first_link
        + first_compound
        + range_word
        + second_value
        + second_link
        + second_compound
    )
    return Group(phrase)('multi_entity_phrase_3')
def value_specifier_cem_phrase(self):
    """Grammar for "<value> ... <specifier> ... <cem>" sentences.

    A value phrase is followed by a run of optional, hidden connecting
    words ("which", "corresponds", "to", "the", "transition", ...), then
    the specifier phrase, which must not be followed by "=" or by another
    value, then an optional hidden "of"/"in" and the cem phrase.
    The match is tagged ``root_phrase``.
    """
    pieces = [
        self.value_phrase,
        Optional(delim | lbrct | rbrct),
        Optional(I('which') | I('there')).hide(),
        Optional(I('likely') | I('close') | (I('can') + I('be'))).hide(),
        Optional(I('corresponds') | I('associated') | I('corresponding')).hide(),
        Optional(I('to') | I('with') | I('is')).hide(),
        Optional(I('the') | I('a')).hide(),
        Optional(I('transition')).hide(),
        Optional(I('to').hide()),
        Optional(I('a')).hide(),
        self.specifier_phrase,
        Not(I('=')),
        Not(self.value_phrase),
        Optional(I('of') | I('in')).hide(),
        self.cem_phrase,
    ]

    # Chain left to right with "+", exactly as a single long expression would.
    phrase = pieces[0]
    for piece in pieces[1:]:
        phrase = phrase + piece
    return phrase('root_phrase')
def value_specifier_cem(self):
    """Grammar for "<value> [was found to be the] <specifier> of <cem>".

    An optional leading "of" and the value phrase are followed by optional
    hidden connecting words, the specifier phrase, an optional hidden
    "of"/"in", and the cem phrase.  The match is tagged ``root_phrase``.
    """
    pieces = [
        Optional(I('of')),
        self.value_phrase,
        Optional(delim),
        Optional(I('which') | I('that')).hide(),
        Optional(
            (I('has') + I('been')) | I('was') | I('is') | I('were')
        ).hide(),
        Optional(
            I('found')
            | I('observed')
            | I('measured')
            | I('calculated')
            | I('determined')
        ).hide(),
        Optional(I('likely') | I('close') | (I('can') + I('be'))).hide(),
        Optional(I('corresponds') | I('associated')).hide(),
        Optional(
            (I('to') + I('be')) | I('with') | I('is') | I('as')
        ).hide(),
        Optional(I('the')).hide(),
        self.specifier_phrase,
        Optional(I('of') | I('in')).hide(),
        self.cem_phrase,
    ]

    # Chain left to right with "+", exactly as a single long expression would.
    phrase = pieces[0]
    for piece in pieces[1:]:
        phrase = phrase + piece
    return phrase('root_phrase')
def cem_specifier_value(self):
    """Grammar for "<cem> [exhibits ...] <specifier and value>".

    The cem phrase itself is optional.  It may be followed by a sample
    noun ("samples", "system", ...; not hidden), hidden connecting words
    and verbs, an optional hidden opening bracket, the
    ``specifier_and_value`` phrase and an optional closing bracket.
    The match is tagged ``root_phrase``.
    """
    verb = (
        I('exhibits')
        | I('exhibiting')
        | R('^show[s]*$')
        | I('demonstrates')
        | I('undergoes')
        | I('has')
        | I('having')
        | I('determined')
        | I('with')
        | I('where')
        | I('orders')
        | (I('is') + Optional(I('classified') + I('as')))
    )

    pieces = [
        Optional(self.cem_phrase),
        Optional(delim).hide(),
        Optional(I('samples') | I('system') | I('systems') | I('sample')),
        Optional(
            I('that') | I('which') | I('was') | I('since') | I('the')
        ).hide(),
        Optional(I('typically')).hide(),
        Optional(verb).hide(),
        Optional(I('reported') + I('to') + I('have')).hide(),
        Optional(lbrct).hide(),
        self.specifier_and_value,
        Optional(rbrct),
    ]

    # Chain left to right with "+", exactly as a single long expression would.
    phrase = pieces[0]
    for piece in pieces[1:]:
        phrase = phrase + piece
    return phrase('root_phrase')
def value_phrase(self):
    """Grammar for a numeric value: single number, range, or "i"-suffixed form.

    Alternatives, tried in this order:

    * ``ivalue``      - two regex tokens followed by a token ending in "i"
                        (presumably a complex number - TODO confirm);
    * ``value_range`` - "a±b", "a-b" written as one token, "a - b" /
                        "a b" written as separate tokens, or "a to b",
                        with an optional leading dash token;
    * ``value_single``- one number with optional "~"/"<"/">" and dash
                        tokens in front.

    The chosen alternative is tagged ``raw_value``.  Optional surrounding
    brackets are hidden.  A ``Not(...)`` directly after the value lists
    unit-like tokens ("K", "GPa", "%", "nm", "Hz", "eV", ...) that must
    not follow it.

    All regex literals are raw strings: sequences such as ``\\d``, ``\\.``,
    ``\\+`` and ``\\<`` are not valid str escapes and warn on modern Python
    when written in a normal literal.  The pattern values are unchanged.
    """
    # Optionally signed integer or decimal.
    number = R(r'^[\+\-–−]?\d+(\.\d+)?$')

    # Range written as a single token, e.g. "10-20".
    joined_range = R(
        r'^[\+\-–−]?\d+(\.\d+)?[\-–−~∼˜]\d+(\.\d+)?$'
    )('raw_value').add_action(merge)

    # Range split over tokens: number, then (separator + number) or just a
    # second number.  "+" binds tighter than "|" here.
    spaced_range = (
        number + (R(r'^[\-–−~∼˜]$') + number | number)
    )('raw_value').add_action(merge)

    to_range = (number + I('to') + number)('raw_value').add_action(join)
    plusminus_range = (number + R('±') + number)('value').add_action(join)

    value_range = (
        Optional(R(r'^[\-–−]$'))
        + (plusminus_range | joined_range | spaced_range | to_range)
    )('raw_value').add_action(merge)

    value_single = (
        Optional(R(r'^[~∼˜\<\>]$'))
        + Optional(R(r'^[\-–−]$'))
        + number
    )('raw_value').add_action(merge)

    inumber = (R(r'\d*\.?\d*[i]$')).add_action(join)
    # inumber = R('^([-+]?(\d+\.?\d*|\d*\.?\d+)([Ee][-+]?[0-2]?\d{1,2})?[r]?|[-+]?((\d+\.?\d*|\d*\.?\d+)([Ee][-+]?[0-2]?\d{1,2})?)?[i]|[-+]?(\d+\.?\d*|\d*\.?\d+)([Ee][-+]?[0-2]?\d{1,2})?[r]?[-+]((\d+\.?\d*|\d*\.?\d+)([Ee][-+]?[0-2]?\d{1,2})?)?[i])$')
    ivalue = (
        R(r'\d*\.?\d*$') + R(r'^[\+\-–−]?') + inumber
    ).add_action(join)

    # Tokens that must NOT directly follow the value.
    excluded_followers = Not(
        I('wt%') | I('vol%') | I('K') | I('times') | I('GPa') | I('wt')
        | I('vol') | I('%') | I('nm') | I('zF') | W('°') | W('KV')
        | W('kV') | W('MV') | I('kHz') | I('Hz') | I('GHz') | W('V')
        | W('J') | W('eV') | I('MHz')
    )

    value = (
        Optional(lbrct).hide()
        + (ivalue | value_range | value_single)('raw_value')
        + excluded_followers
        + Optional(rbrct).hide()
    )
    return value
def specifier_cem_value(self):
    """Grammar for "<specifier> of <cem> is [about] <value>".

    An optional hidden article, the specifier phrase, an optional hidden
    preposition and qualifier, the cem phrase, several optional hidden
    verb / approximation / relation words, an optional hidden opening
    bracket, the value phrase and an optional closing bracket.
    The match is tagged ``root_phrase``.
    """
    qualifier = (
        I('bulk') | I('powdered') | I('doped') | I('the') | I('a')
        | I('an') | I('these') | I('those') | I('this') | I('that')
    )
    verb = (
        I('is') | I('was') | I('were') | I('occurs') | I('of')
        | (I('can') + I('be') + I('assigned') + Optional(I('at') | I('to')))
    )
    approximation = (
        (I('in') + I('the') + I('range') + I('of'))
        | (I('ranging') + I('from'))
        | I('as')
        | (I('to') + I('be'))
        | I('about')
        | I('over')
        | ((I('higher') | I('lower')) + I('than'))
        | I('above')
    )
    relation = (
        W('=') | W('~') | W('≈') | W('≃') | I('of') | I('was') | I('is')
        | I('at') | I('as') | I('near') | I('above') | I('below')
    )

    pieces = [
        Optional(I('the') | I('a') | I('an') | I('its') | I('with')).hide(),
        self.specifier_phrase,
        Optional(I('of') | I('in') | I('for')).hide(),
        Optional(qualifier).hide(),
        self.cem_phrase,
        Optional(verb).hide(),
        Optional(
            I('observed') | I('determined') | I('measured')
            | I('calculated') | I('found')
        ).hide(),
        Optional(approximation).hide(),
        Optional(relation).hide(),
        Optional(lbrct).hide(),
        self.value_phrase,
        Optional(rbrct),
    ]

    # Chain left to right with "+", exactly as a single long expression would.
    phrase = pieces[0]
    for piece in pieces[1:]:
        phrase = phrase + piece
    return phrase('root_phrase')
def list_of_values(self):
    """Grammar for a list of values, tagged ``value_list``.

    Two layouts are accepted, tried in this order:

    1. items are ``value_with_optional_unit``, and the last item (after a
       hidden "and"/"or") is a ``value_phrase``; this form is wrapped in
       ``Group``;
    2. every item is a ``value_phrase``; the last one follows a hidden
       "and", "or" or delimiter.  This form is not wrapped in ``Group``.
    """
    # Layout 1 -----------------------------------------------------------
    head_1 = self.value_with_optional_unit
    middle_1 = Optional(OneOrMore(delim.hide() + self.value_with_optional_unit))
    before_conj_1 = Optional(delim).hide()
    conj_1 = (I('and') | I('or')).hide()
    after_conj_1 = Optional(delim).hide()
    tail_1 = self.value_phrase
    option_1 = Group(
        head_1 + middle_1 + before_conj_1 + conj_1 + after_conj_1 + tail_1
    )('value_list')

    # Layout 2 -----------------------------------------------------------
    head_2 = self.value_phrase
    middle_2 = Optional(OneOrMore(delim.hide() + self.value_phrase))
    before_conj_2 = Optional(delim).hide()
    conj_2 = (I('and') | I('or') | delim).hide()
    tail_2 = self.value_phrase
    option_2 = (
        head_2 + middle_2 + before_conj_2 + conj_2 + tail_2
    )('value_list')

    return option_1 | option_2
def specifier_before_cem_and_value_phrase(self):
    """Grammar for "<specifier> ... <cem> ... <value>" with free text between.

    Between specifier and cem, and between cem and value, one or more
    arbitrary tokens are skipped (hidden); each is guarded by a Not(...)
    on cem / specifier / value.  An optional ``prefix`` may precede the
    value.  The match is tagged ``root_phrase``.
    """
    def skipped_tokens():
        # Built fresh for every use.
        return OneOrMore(
            Not(self.cem_phrase | self.specifier_phrase | self.value_phrase)
            + Any().hide()
        )

    specifier = self.specifier_phrase
    gap_before_cem = skipped_tokens()
    compound = self.cem_phrase
    gap_before_value = skipped_tokens()
    value_intro = Optional(self.prefix)
    value = self.value_phrase

    phrase = (
        specifier
        + gap_before_cem
        + compound
        + gap_before_value
        + value_intro
        + value
    )
    return phrase('root_phrase')
def specifier_before_cem_and_value_phrase(self):
    """Grammar for "<specifier> ... <cem> ... <value>" with free text between.

    Between specifier and cem, arbitrary tokens are skipped (hidden), each
    guarded by a Not(...) on cem / specifier / value.  Between cem and
    value the guard additionally lists "dielectric constant" and "not".
    An optional ``prefix`` may precede the value.  The match is tagged
    ``root_phrase``.
    """
    specifier = self.specifier_phrase
    gap_before_cem = OneOrMore(
        Not(self.cem_phrase | self.specifier_phrase | self.value_phrase)
        + Any().hide()
    )
    #+ self.connection + Optional(self.article))
    compound = self.cem_phrase
    gap_before_value = OneOrMore(
        Not(
            self.cem_phrase
            | self.specifier_phrase
            | (I('dielectric') + I('constant'))
            | self.value_phrase
            | I('not')
        )
        + Any().hide()
    )
    value_intro = Optional(self.prefix)
    value = self.value_phrase

    phrase = (
        specifier
        + gap_before_cem
        + compound
        + gap_before_value
        + value_intro
        + value
    )
    return phrase('root_phrase')
def specifier_value_cem(self):
    """Grammar for "<specifier and value> [was observed in] <cem>".

    An optional leading "below"/"at" (not hidden) and the
    ``specifier_and_value`` phrase are followed by optional hidden verb
    phrases, prepositions, "the", punctuation and a material qualifier,
    then an optional closing bracket and an optional cem phrase.
    The match is tagged ``root_phrase``.
    """
    verb_phrase = (
        (I('has') + I('been') + I('found') + I('for'))
        | (
            I('was')
            + (I('observed') | I('determined') | I('measured') | I('calculated'))
        )
    )
    qualifier = (
        I('bulk') | I('powdered') | I('doped') | (I('thin') + I('film'))
    )

    pieces = [
        Optional(I('below') | I('at')),
        self.specifier_and_value,
        Optional(verb_phrase).hide(),
        Optional(I('in') | I('for') | I('of')).hide(),
        Optional(I('the')).hide(),
        Optional(R('^[:;,]$')).hide(),
        Optional(qualifier).hide(),
        Optional(rbrct),
        Optional(self.cem_phrase),
    ]

    # Chain left to right with "+", exactly as a single long expression would.
    phrase = pieces[0]
    for piece in pieces[1:]:
        phrase = phrase + piece
    return phrase('root_phrase')
def multi_entity_phrase_3c(self):
    """Two compounds, one specifier, a value for each; "X in A to Y in B".

    e.g. curie temperatures from 100 K in MnO to 300 K in NiO

    The two values are tagged ``value1`` / ``value2`` and the two
    compounds ``cem1`` / ``cem2``.  The match is grouped and tagged
    ``multi_entity_phrase_3c``.
    """
    specifier = self.specifier_phrase
    value_intro = self.pure_prefix
    first_value = self.value_phrase('value1')
    first_link = I('for') | I('in') | I('of')
    first_compound = self.single_cem('cem1')
    # "up to" / "to", hidden.
    range_word = ((Optional(I('up')) + I('to')) | I('to')).hide()
    second_value = self.value_phrase('value2')
    second_link = I('for') | I('in') | I('of')
    second_compound = self.single_cem('cem2')

    phrase = (
        specifier
        + value_intro
        + first_value
        + first_link
        + first_compound
        + range_word
        + second_value
        + second_link
        + second_compound
    )
    return Group(phrase)('multi_entity_phrase_3c')
def cem_value_specifier(self):
    """Grammar for "<cem> has a value of <value> as its <specifier>".

    The cem phrase is followed by optional hidden verb / article /
    relation / approximation words, the value phrase, optional hidden
    "as"/"of"/"for" and "its"/"their"/"the", and finally the specifier
    phrase.  The match is tagged ``root_phrase``.

    Fixes relative to the previous version:

    * ``I('higher') | I('lower') + I('than')`` parsed as
      ``higher | (lower + than)`` because "+" binds tighter than "|", so
      "higher than" was never consumed.  It is now
      ``(I('higher') | I('lower')) + I('than')``.
    * ``I('calculate')`` in the "is/was/were ..." participle list is now
      ``I('calculated')``.
    * The duplicated ``I('found')`` alternative is listed once.
    """
    reported_as = (
        (I('is') | I('was') | I('were'))
        + Optional(
            I('reported') | I('found') | I('calculated')
            | I('measured') | I('shown')
        )
        + Optional(I('to'))
    )
    verb = (
        I('exhibit') | I('exhibits') | I('exhibiting') | R('^show[s]*$')
        | I('demonstrates') | I('undergoes') | I('have') | I('has')
        | I('having') | I('determined') | I('with')
    )
    relation = (
        W('=') | W('~') | W('≈') | W('≃') | I('was') | I('is') | I('at')
        | I('as') | I('near') | I('above') | I('below')
    )
    approximation = (
        I('of') | I('about') | I('from') | I('approximately') | I('around')
        | (I('high') + I('as'))
        | ((I('higher') | I('lower')) + I('than'))
    )

    return (
        self.cem_phrase
        + Optional(reported_as).hide()
        + Optional(verb).hide()
        + Optional(I('the') | I('a') | I('an')).hide()
        + Optional(I('value') | I('values')).hide()
        + Optional(I('varies') + I('from')).hide()
        + Optional(relation).hide()
        + Optional((I('in') + I('the') + I('range')) | I('ranging')).hide()
        + Optional(approximation).hide()
        + self.value_phrase
        + Optional(I('as') | I('of') | I('for')).hide()
        + Optional(I('its') | I('their') | I('the')).hide()
        + self.specifier_phrase
    )('root_phrase')
from chemdataextractor.doc import Paragraph, Heading, Sentence
from lxml import etree


class CurieTemperature(BaseModel):
    """Record model with string fields for specifier, value and units."""

    specifier = StringType()
    value = StringType()
    units = StringType()


# Attach a list of CurieTemperature records to every Compound.
Compound.curie_temperatures = ListType(ModelType(CurieTemperature))

#%% [markdown]
# Now define parse elements that describe how to identify the entities in text. Think of these as tagging processes.

#%%
# Define a very basic entity tagger
# "curie temperature", optionally followed by a bracketed/delimited TC-style
# symbol, or the TC-style symbol on its own.
specifier = (
    I('curie') + I('temperature') + Optional(lrb | delim)
    + Optional(R('^T(C|c)(urie)?')) + Optional(rrb)
    | R('^T(C|c)(urie)?')
)('specifier').add_action(join)
# Raw strings: '\.' and '\d' are not valid str escapes.
units = (R(r'^[CFK]\.?$'))('units').add_action(merge)
# Digits with an optional decimal part introduced by "." or ",".
# The previous pattern, (\.\,\d+)?, required the literal two-character
# sequence ".," and therefore never matched ordinary decimals like "640.5".
value = (R(r'^\d+([\.,]\d+)?$'))('value')

#%% [markdown]
# Note we tag each with a unique identifier that will be used later. Now let the entities in a sentence be any ordering of these (or whatever ordering you feel like). Here we specify that the value and units must coincide, but this does not have to be the case.
#
# We also define an extremely general parse phrase, this will be used to identify candidate sentences.

#%%
# Let the entities be any combination of chemical names, specifier values and units
entities = (chemical_name | specifier | value + units)

# Now create a very generic parse phrase that will match any combination of these entities
curie_temperature_phrase = (entities + OneOrMore(entities | Any()))('curie_temperature')
class Solubility(BaseModel):
    """Record model with string fields for value and units."""

    value = StringType()
    units = StringType()


# Attach a list of Solubility records to every Compound.
Compound.solubility = ListType(ModelType(Solubility))

# In[38]:

import re
from chemdataextractor.parse import R, I, W, Optional, merge

# prefix = (R(u'^m\.?p\.?$', re.I) | I(u'melting') + I(u'point')).hide()
# Hidden "solubility", then an optional hidden relation word, then an
# optional hidden "in the range [of]" or "about".
prefix = (I(u'solubility')).hide() + Optional(
    W('=') | I('of') | I('was') | I('is') | I('at')).hide() + Optional(
    I('in') + I('the') + I('range') + Optional(I('of')) | I('about')).hide()
# delim = R(u'^[:;\.,]$')
# Raw strings below: '\d', '\.', '\w' and '\s' are not valid str escapes,
# so the non-raw spelling warns on modern Python.  Pattern values are
# unchanged.
value = R(r'^\d+(\.\d+)?$')(u'value')
units = (W(u'nM') | W(u'μM') | W(u'mM') | W(u'μg') | W(u'mg'))(u'units').add_action(merge)
# NOTE(review): the optional pattern below requires whitespace inside a
# single token - confirm whether it can ever match.
so = (prefix + Optional(R(r'\w+\s\w+')).hide() + value + units)(u'so')
# units = (W(u'°') + Optional(R(u'^[CFK]\.?$')))(u'units').add_action(merge)
# value = R(u'^\d+(\.\d+)?$')(u'value')
# bp = (prefix + + value + units)(u'bp')
class Obstructions:
    """Optional filler tokens that may sit between a prefix and a value.

    Each attribute is an ``Optional`` element; ``all`` chains them in the
    fixed order bracket, tilde, "of", hyphen.

    The regex literals are raw strings: ``\\(``, ``\\~`` and ``\\-`` are not
    valid str escapes and warn on modern Python when written in a normal
    literal.  The pattern values are unchanged.
    """

    # Class for bracket
    bracket = Optional(R(r'\('))
    curlLine = Optional(R(r'\~'))
    of = Optional(W(u'of'))
    hyphen = Optional(R(r'\-'))
    # Name kept for callers that reference Obstructions.all; it only
    # shadows the builtin inside this class body.
    all = bracket + curlLine + of + hyphen
class MpRegex:
    """Parse elements for a melting-point phrase: prefix, fillers, value, units.

    The regex literals are raw strings: ``\\.`` and ``\\d`` are not valid str
    escapes and warn on modern Python when written in a normal literal.
    The pattern values are unchanged.
    """

    # Hidden "mp" / "m.p." (case-insensitive) or "melting point".
    prefix = (R(r'^m\.?p\.?$', re.I) | I(u'melting') + I(u'point')).hide()
    # Optional single letter C, F or K with an optional trailing dot.
    units = Optional(R(r'^[CFK]\.?$'))(u'units').add_action(merge)
    # Integer or decimal number.
    value = R(r'^\d+(\.\d+)?$')(u'value')
    mp = (prefix + Obstructions.all + value + units)(u'mp')
from chemdataextractor.model import BaseModel, StringType, ListType, ModelType import re from chemdataextractor.parse import R, I, W, Optional, merge class Capacity(BaseModel): value = StringType() units = StringType() Compound.capacity = ListType(ModelType(Capacity)) prefix = (I(u'capacity') | I(u'CO2') + I(u'uptake')).hide() #Left the optional in because if I take it out then there is a syntax error on line 33 units = (W(u'mmol g-1') + Optional(R(u'^[CFK]\.?$')))(u'units').add_action(merge) value = R(u'^\d+(\.\d+)?$')(u'value') cp = (prefix + value + units)(u'cp') from chemdataextractor.parse.base import BaseParser from chemdataextractor.utils import first class CpParser(BaseParser): root = cp def interpret(self, result, start, end): compound = Compound(capacity=[ Capacity(value=first(result.xpath('./value/text()')), units=first(result.xpath('./units/text()')))
def find_subset(items, target, acc=None):
    """Return elements of *items* that sum to *target*, or ``None``.

    Elements are considered in order; for each one the search first tries
    taking it and then tries skipping it, so the first solution found in
    that order is returned.

    Args:
        items: sequence of numbers (must support ``len`` and slicing).
        target: the remaining sum to reach.
        acc: elements already chosen; used by the recursion.  Defaults to
            a fresh empty list (a mutable ``[]`` default would be shared
            between calls and handed back to the caller when *target* is
            already 0).

    Returns:
        A list of chosen elements (``acc`` extended), or ``None`` when no
        subset reaches *target*.
    """
    if acc is None:
        acc = []
    if target == 0:
        return acc
    if len(items) == 0:
        return None

    # Branch 1: take the first element (on a new list, acc is not mutated).
    taken = find_subset(items[1:], target - items[0], acc + [items[0]])
    if taken is not None:
        return taken
    # Branch 2: skip the first element.
    return find_subset(items[1:], target, acc)


# Temperature value / unit elements.  Regex literals containing
# backslashes are raw strings ('\d', '\/', '\-', '\.' are not valid str
# escapes); the pattern values are unchanged.
units = (Optional("(") + Optional(R(u'°')) + R(u'[CFK℃]')).add_action(merge)(u'units')
value = (Optional(R('~')) + R(r'^\d{3,4}')).add_action(merge)(u'value')
value2 = (Optional(R('~')) + R(r'^\d{3,4}(°C|K)')).add_action(merge)(u'value')
temp1 = (value + units)(u'temp1')
temp2 = value2
temp = (temp1 | value2)(u'tempphrase')

#for catalyst name
supportstring = "ZnO|YSZ|ceria|alumina|silica|SBA|ZSM|CNT|Al2O3|MgO|CeO2|TiO2|CMK|MnO|Y2O3|ZrO2|Tb4O7|HfO2|La2O3|Co3O4|ThO2|SiO2|Fe2O3|Sm2O3|Mo2C|Gd2O3|Yb2O3|CaO|CuO|NiO"
# NOTE(review): symbolstring starts with "|", which puts an empty
# alternative into both patterns built from it below - confirm intended.
symbolstring = "|Li|Be|Ne|Na|Mg|Al|Si|Cl|Ox|Ca|Sc|Ti|V|Cr|Mn|Fe|Co|Ni|Cu|Zn|Ga|Ge|As|Se|Br|Kr|Rb|Sr|Y|Zr|Nb|Mo|Tc|Ru|Rh|Pd|Ag|Cd|In|Sn|Sb|Te|Xe|Cs|Ba|La|Ce|Pr|Nd|Pm|Sm|Eu|Gd|Tb|Dy|Ho|Er|Tm|Yb|Lu|Hf|Ta|Re|Os|Ir|Pt|Au|Hg|Tl|Pb|Bi|Po|At|Rn|Fr|Ra|Ac|Th|Pa|Np|Pu|Am|Cm|Bk|Cf|Es|Fm|Md|No|Lr|Rf|Db|Sg|Bh|Hs|Mt|Ds|Rg|Cn|Nh|Fl|Mc|Lv|Ts|Og"
name = (R(
    u"^((" + symbolstring + "|" + supportstring
    + r")(?![a-z])(([A-Zα-ωΑ-Ω0-9\/\-,\.]|(wt.?))[a-zA-Z0-9α-ωΑ-Ω0-9\/\-,\.]*)?)")
)
post = R("^([A-Z]|[0-9]|(wt.?)|" + symbolstring + r"|\/|-|\.|,|%|\(|\))+$") | T("SYM")
root = name def interpret(self, result, start, end): # print("inside cat cell") # print(etree.tostring(result)) # print() # print("test") c = Compound(catalyst_name=[CAT(name=first(result.xpath('text()')), )]) yield c flowunit = R( r"^(h(-|−)\d|ml|s|g|N|min|cm3|l|kg|mol|\\|cat|\/|:|\(|\)|<|>|[|]|,|\.|×)+(-|−)?(\d+)?$", re.IGNORECASE)(u'flowunit') flowname = R("^flow", re.IGNORECASE) + Optional(flowunit) GHSV = (R("GHSV", re.IGNORECASE) + Optional(flowunit))( u"textandunit") #space velocity, (residence time?) space = (R("space", re.IGNORECASE) + R("velocity", re.IGNORECASE) + Optional(flowunit))(u"textandunit") flow = (flowname | GHSV | space)(u"flowphrase") flowheader = (flow + ZeroOrMore(SkipTo(flowunit) + flowunit).add_action(merge)('units'))("phrase") flowvalue = R("^\d+((\.|,)\d+)*")("value") flowcell = (flowvalue + ZeroOrMore(SkipTo(flowunit) + flowunit).add_action(merge)(u"units"))("phrase") class FlowHeadingParser(BaseParser):
times = ListType(ModelType(SpinTime)) # Associate the spin-coating class with a given compound. May be worth # getting rid of for our eventual implementation, not yet sure. Compound.spin_coat = ListType(ModelType(SpinCoat)) # Variable assignments # Deliminators -- hide from tokenization delim = R('^[;:,\./]$').hide() # Defining formats for spin-coating value and units spdunits = ( R(u'^r(\.)?p(\.)?m(\.)?$') | R(u'^r(\.)?c(\.)?f(\.)?$') | R(u'^([x×]?)(\s?)?g$'))('spdunits').add_action(join) + ZeroOrMore(delim) spdvalue = Optional( W('(')).hide() + R(u'^\d+(,\d+)?[0][0]$')('spdvalue') + Optional( W(')')).hide() # Defining formats for spin-coating time and time units timeprefix = I('for').hide() timeunits = ( R('^s?(ec|econds)?$') | R('^m?(in|inute)?(s)?$') | R('^h?(ou)?(r)?(s)?$'))('timeunits').add_action(join) + Optional(delim) timevalue = R('^\d{,3}$')('timevalue') + Optional(delim) #<3 digits # Putting everything together spdprefix = I('at').hide() spd = (spdvalue)('spd') spds = ( spd + ZeroOrMore(ZeroOrMore(spdunits | delim | W('and')).hide() + spd))('spds')
""" temps = ListType(ModelType(AnnealTemp)) times = ListType(ModelType(AnnealTime)) # Associating anneal parameters with a chemical object Compound.anneal = ListType(ModelType(Anneal)) # currently not working # Defining object parameters for the AnnealParser parser # Deliminators delim = R('^[;:,\./]$').hide() # Defining formats for annealing temperature and units tempprefix = (I('at') | I('or')).hide() tempunits = (W('°') + R('^[CFK]\.?$'))('tempunits').add_action(merge) tempvalue = R('^\d{2,4}?$')('tempvalue').add_action(merge) + Optional(delim) # Defining formats for spin-coating time and time units timeprefix = I('for').hide() timeunits = ( R('^s?(ec|econds)?$') | R('^m?(in|inute)?(s)?$') | R('^h?(ou)?(r)?(s)?$'))('timeunits').add_action(join) + Optional(delim) timevalue = R('^\d{,2}$')('timevalue') + Optional(delim) # Putting everything together temp = (tempvalue)('temp') temps = (temp + ZeroOrMore( ZeroOrMore(tempprefix | tempunits | delim | W('and')).hide() + temp))('temps') time = (timevalue)('time') times = (time +
def prefix(self):
    """Grammar for the words linking a specifier to the value that follows.

    Starts with the specifier phrase, then a long run of optional
    connecting words ("values", "varies from", "is", "reported", "in the
    range", "as high as", "about", "up to", ...), brackets/delimiters, an
    optional repeated specifier phrase and an optional relation symbol
    (=, ~, ≈, ≃, >, <).  Most parts are hidden; the ones that are not are
    marked below.  ``join`` is added as an action on the whole chain.
    """
    approximation = (
        I('of') | I('about') | I('approximately') | I('typically')
        | I('ca.') | I('around') | I('at') | I('above') | I('below')
        | I('high') | I('low')
        | ((I('higher') | I('lower') | I('more') | I('less')) + I('than'))
        | I('order')
        | (I('for') + I('instance'))
        | (I('up') + I('to'))
        | I('reaching')
        | I('value')
    )

    pieces = [
        self.specifier_phrase,
        Optional(I('values')).hide(),
        Optional(delim).hide(),
        Optional(
            (I('varies') + I('from')) | R('^increase(s|d)?')
            | I('falls') | I('reaches')
        ).hide(),
        Optional(I('steeply')).hide(),
        Optional(I('recorded') | I('reported')).hide(),
        Optional(
            I('of') | I('was') | I('is') | I('at') | I('near') | I('above')
            | I('below') | I('with') | I('to') | I('were') | I('a')
        ).hide(),
        Optional(
            I('reported') | I('determined') | I('estimated')
            | I('found') | I('occurs')
        ).hide(),
        Optional(I('temperatures')).hide(),
        Optional(I('as') | (I('to') + I('be'))).hide(),
        Optional(I('in') + I('the') + I('range')).hide(),
        Optional(I('as') + I('high') + I('as')),          # not hidden
        Optional(I('ranging') + I('from')).hide(),
        Optional(I('of')).hide(),
        Optional(I('rather') | I('quite')).hide(),
        Optional(I('high') | I('low') | I('maximum') | I('minimum')).hide(),
        Optional(I('the')).hide(),
        Optional(delim | lbrct | rbrct),                  # not hidden
        Optional(approximation).hide(),
        Optional(I('a') | I('an') | I('as')).hide(),
        Optional(I('maximum')).hide(),
        Optional(I('of')).hide(),
        ZeroOrMore(lbrct | delim | rbrct),                # not hidden
        Optional(self.specifier_phrase),
        Optional(I('of')).hide(),
        Optional(I('the')).hide(),
        Optional(I('order')).hide(),
        Optional((I('up') | I('equal')) + I('to')).hide(),
        Optional(I('of')).hide(),
        ZeroOrMore(lbrct | delim | rbrct),                # not hidden
        Optional(
            W('=') | W('~') | W('≈') | W('≃') | W('>') | W('<')
        ).hide(),
        ZeroOrMore(lbrct | delim | rbrct).hide(),
    ]

    # Chain left to right with "+", exactly as a single long expression would.
    phrase = pieces[0]
    for piece in pieces[1:]:
        phrase = phrase + piece
    return phrase.add_action(join)
class BpRegex:
    """Parse elements for a boiling-point phrase: prefix, value, units.

    The regex literals are raw strings: ``\\.`` and ``\\d`` are not valid str
    escapes and warn on modern Python when written in a normal literal.
    The pattern values are unchanged.
    """

    # Hidden "bp" / "b.p." (case-insensitive) or "boiling point".
    prefix = (R(r'^b\.?p\.?$', re.I) | I(u'boiling') + I(u'point')).hide()
    # Degree sign, optionally followed by C, F or K with an optional dot.
    units = (W(u'°') + Optional(R(r'^[CFK]\.?$')))(u'units').add_action(merge)
    # Integer or decimal number.
    value = R(r'^\d+(\.\d+)?$')(u'value')
    bp = (prefix + value + units)(u'bp')