def cem_value_specifier(self):
    """Build the grammar for phrases ordered compound, value, specifier.

    The phrase is ``self.cem_phrase``, then a run of optional linking words
    that are matched but hidden from the result, then ``self.value_phrase``,
    two more optional hidden linking groups and finally
    ``self.specifier_phrase``.

    Returns:
        The combined parser element, tagged ``root_phrase``.
    """
    # "is/was/were", optionally followed by a reporting participle and "to".
    # 'calculate' is kept for backward compatibility; 'calculated' is the
    # form that actually follows "is/was/were".
    reporting_verb = Optional(
        (I('is') | I('was') | I('were'))
        + Optional(I('reported') | I('found') | I('calculate')
                   | I('calculated') | I('measured') | I('shown'))
        + Optional(I('to'))).hide()
    possession_verb = Optional(
        (I('exhibit') | I('exhibits') | I('exhibiting') | R('^show[s]*$')
         | I('demonstrates') | I('undergoes') | I('have') | I('has')
         | I('having') | I('determined') | I('with'))).hide()
    article = Optional(I('the') | I('a') | I('an')).hide()
    value_word = Optional(I('value') | I('values')).hide()
    varies_from = Optional(I('varies') + I('from')).hide()
    relation = Optional(
        W('=') | W('~') | W('≈') | W('≃') | I('was') | I('is') | I('at')
        | I('as') | I('near') | I('above') | I('below')).hide()
    range_intro = Optional(
        I('in') + I('the') + I('range') | I('ranging')).hide()
    # ``+`` binds tighter than ``|``: the comparative alternative must be
    # parenthesised so that "than" is consumed after "higher" as well as
    # after "lower".
    approximation = Optional(
        I('of') | I('about') | I('from') | I('approximately') | I('around')
        | (I('high') + I('as'))
        | ((I('higher') | I('lower')) + I('than'))).hide()
    value_link = Optional(I('as') | I('of') | I('for')).hide()
    determiner = Optional(I('its') | I('their') | I('the')).hide()

    return (self.cem_phrase
            + reporting_verb
            + possession_verb
            + article
            + value_word
            + varies_from
            + relation
            + range_intro
            + approximation
            + self.value_phrase
            + value_link
            + determiner
            + self.specifier_phrase)('root_phrase')
def specifier_cem_value(self):
    """Build the grammar for phrases ordered specifier, compound, value.

    The phrase is an optional hidden determiner, ``self.specifier_phrase``,
    optional hidden linking words, ``self.cem_phrase``, further optional
    hidden verb and relation words, an optional hidden left bracket,
    ``self.value_phrase`` and an optional right bracket (the only optional
    group here that is not hidden).

    Returns:
        The combined parser element, tagged ``root_phrase``.
    """
    leading_determiner = Optional(
        I('the') | I('a') | I('an') | I('its') | I('with')).hide()
    specifier_link = Optional(I('of') | I('in') | I('for')).hide()
    compound_qualifier = Optional(
        I('bulk') | I('powdered') | I('doped') | I('the') | I('a') | I('an')
        | I('these') | I('those') | I('this') | I('that')).hide()
    copula = Optional(
        I('is') | I('was') | I('were') | I('occurs') | I('of')
        | (I('can') + I('be') + I('assigned')
           + Optional(I('at') | I('to')))).hide()
    participle = Optional(
        I('observed') | I('determined') | I('measured') | I('calculated')
        | I('found')).hide()
    # Parentheses only spell out the grouping that operator precedence
    # already gives (``+`` before ``|``).
    range_or_comparison = Optional(
        (I('in') + I('the') + I('range') + I('of'))
        | (I('ranging') + I('from'))
        | I('as')
        | (I('to') + I('be'))
        | I('about')
        | I('over')
        | ((I('higher') | I('lower')) + I('than'))
        | I('above')).hide()
    relation = Optional(
        W('=') | W('~') | W('≈') | W('≃') | I('of') | I('was') | I('is')
        | I('at') | I('as') | I('near') | I('above') | I('below')).hide()
    opening_bracket = Optional(lbrct).hide()
    closing_bracket = Optional(rbrct)

    return (leading_determiner
            + self.specifier_phrase
            + specifier_link
            + compound_qualifier
            + self.cem_phrase
            + copula
            + participle
            + range_or_comparison
            + relation
            + opening_bracket
            + self.value_phrase
            + closing_bracket)('root_phrase')
def value_phrase(self):
    """Build the grammar for a numeric value, range or imaginary number.

    The returned element is an optional hidden left bracket, then one of
    ``ivalue``, ``value_range`` or ``value_single`` tagged ``raw_value``,
    then a ``Not(...)`` guard over a list of unit tokens (wt%, K, GPa, nm,
    Hz, V, eV, ...), then an optional hidden right bracket.

    All regex literals are raw strings so the backslash escapes reach the
    regex engine without relying on Python's deprecated handling of
    unrecognised string escapes; the patterns themselves are unchanged.

    Returns:
        The combined parser element for a value.
    """
    # Optionally signed integer or decimal, e.g. "12", "-3.5".
    number = R(r'^[\+\-–−]?\d+(\.\d+)?$')
    # Two numbers fused into one token by a dash or tilde, e.g. "10-20".
    joined_range = R(r'^[\+\-–−]?\d+(\.\d+)?[\-–−~∼˜]\d+(\.\d+)?$')(
        'raw_value').add_action(merge)
    # A number followed by either "<dash|tilde> number" or a bare number.
    spaced_range = (number + (R(r'^[\-–−~∼˜]$') + number | number))(
        'raw_value').add_action(merge)
    to_range = (number + I('to') + number)('raw_value').add_action(join)
    # Tagged 'value' here, unlike the other range forms ('raw_value').
    plusminus_range = (number + R('±') + number)('value').add_action(join)
    value_range = (
        Optional(R(r'^[\-–−]$'))
        + (plusminus_range | joined_range | spaced_range | to_range)
    )('raw_value').add_action(merge)
    value_single = (
        Optional(R(r'^[~∼˜\<\>]$')) + Optional(R(r'^[\-–−]$')) + number
    )('raw_value').add_action(merge)
    # Token ending in "i" preceded by optional digits and decimal point.
    inumber = (R(r'\d*\.?\d*[i]$')).add_action(join)
    # inumber = R('^([-+]?(\d+\.?\d*|\d*\.?\d+)([Ee][-+]?[0-2]?\d{1,2})?[r]?|[-+]?((\d+\.?\d*|\d*\.?\d+)([Ee][-+]?[0-2]?\d{1,2})?)?[i]|[-+]?(\d+\.?\d*|\d*\.?\d+)([Ee][-+]?[0-2]?\d{1,2})?[r]?[-+]((\d+\.?\d*|\d*\.?\d+)([Ee][-+]?[0-2]?\d{1,2})?)?[i])$')
    # Real part, optional sign token, then the imaginary part.
    ivalue = (R(r'\d*\.?\d*$') + R(r'^[\+\-–−]?') + inumber).add_action(join)
    value = Optional(lbrct).hide() + (
        ivalue | value_range | value_single)('raw_value') + Not(
        I('wt%') | I('vol%') | I('K') | I('times') | I('GPa') | I('wt')
        | I('vol') | I('%') | I('nm') | I('zF') | W('°') | W('KV') | W('kV')
        | W('MV') | I('kHz') | I('Hz') | I('GHz') | W('V') | W('J')
        | W('eV') | I('MHz')) + Optional(rbrct).hide()
    return value
def prefix(self):
    """Return the grammar for a specifier plus the words leading to a value.

    The sequence starts with ``self.specifier_phrase`` and continues with a
    long run of optional linking words, delimiters and brackets. Most of
    them are hidden; the "as high as" group, the first bracket/delimiter
    ``Optional``, the first two ``ZeroOrMore`` bracket/delimiter groups and
    the second (optional) ``self.specifier_phrase`` are not. The ``join``
    action is attached to the whole sequence.
    """
    pieces = (
        self.specifier_phrase,
        Optional(I('values')).hide(),
        Optional(delim).hide(),
        Optional((I('varies') + I('from')) | R('^increase(s|d)?')
                 | I('falls') | I('reaches')).hide(),
        Optional(I('steeply')).hide(),
        Optional(I('recorded') | I('reported')).hide(),
        Optional(I('of') | I('was') | I('is') | I('at') | I('near')
                 | I('above') | I('below') | I('with') | I('to')
                 | I('were') | I('a')).hide(),
        Optional(I('reported') | I('determined') | I('estimated')
                 | I('found') | I('occurs')).hide(),
        Optional(I('temperatures')).hide(),
        Optional(I('as') | (I('to') + I('be'))).hide(),
        Optional(I('in') + I('the') + I('range')).hide(),
        Optional(I('as') + I('high') + I('as')),
        Optional(I('ranging') + I('from')).hide(),
        Optional(I('of')).hide(),
        Optional(I('rather') | I('quite')).hide(),
        Optional(I('high') | I('low') | I('maximum') | I('minimum')).hide(),
        Optional(I('the')).hide(),
        Optional(delim | lbrct | rbrct),
        Optional(
            I('of') | I('about') | I('approximately') | I('typically')
            | I('ca.') | I('around') | I('at') | I('above') | I('below')
            | I('high') | I('low')
            | ((I('higher') | I('lower') | I('more') | I('less'))
               + I('than'))
            | I('order')
            | (I('for') + I('instance'))
            | (I('up') + I('to'))
            | I('reaching')
            | I('value')).hide(),
        Optional(I('a') | I('an') | I('as')).hide(),
        Optional(I('maximum')).hide(),
        Optional(I('of')).hide(),
        ZeroOrMore(lbrct | delim | rbrct),
        Optional(self.specifier_phrase),
        Optional(I('of')).hide(),
        Optional(I('the')).hide(),
        Optional(I('order')).hide(),
        Optional((I('up') | I('equal')) + I('to')).hide(),
        Optional(I('of')).hide(),
        ZeroOrMore(lbrct | delim | rbrct),
        Optional(W('=') | W('~') | W('≈') | W('≃') | W('>')
                 | W('<')).hide(),
        ZeroOrMore(lbrct | delim | rbrct).hide(),
    )
    # Fold strictly left to right so the resulting nesting is the same as
    # writing the pieces as one long ``a + b + c + ...`` chain.
    phrase = pieces[0]
    for piece in pieces[1:]:
        phrase = phrase + piece
    return phrase.add_action(join)
class Solubility(BaseModel):
    """Model with two string fields, ``value`` and ``units``."""
    value = StringType()
    units = StringType()


# Attach a list of Solubility records to the Compound model.
Compound.solubility = ListType(ModelType(Solubility))

# In[38]:

import re
from chemdataextractor.parse import R, I, W, Optional, merge

# prefix = (R(u'^m\.?p\.?$', re.I) | I(u'melting') + I(u'point')).hide()
# "solubility", then an optional relation word, then optionally
# "in the range [of]" or "about"; all hidden from the result.
prefix = (I(u'solubility')).hide() + Optional(
    W('=') | I('of') | I('was') | I('is') | I('at')).hide() + Optional(
    I('in') + I('the') + I('range') + Optional(I('of')) | I('about')).hide()
# delim = R(u'^[:;\.,]$')
# Raw strings keep the regex backslashes intact without relying on
# Python's deprecated handling of unrecognised string escapes.
value = R(r'^\d+(\.\d+)?$')(u'value')
units = (W(u'nM') | W(u'μM') | W(u'mM') | W(u'μg') | W(u'mg'))(
    u'units').add_action(merge)
# NOTE(review): the optional filler pattern needs whitespace inside a single
# token; confirm the tokenizer can emit such tokens, otherwise it never
# matches and is simply skipped.
so = (prefix + Optional(R(r'\w+\s\w+')).hide() + value + units)(u'so')
# units = (W(u'°') + Optional(R(u'^[CFK]\.?$')))(u'units').add_action(merge)
# value = R(u'^\d+(\.\d+)?$')(u'value')
# bp = (prefix + + value + units)(u'bp')
Class for full list of spin-coating step parameters for full process. """ temps = ListType(ModelType(AnnealTemp)) times = ListType(ModelType(AnnealTime)) # Associating anneal parameters with a chemical object Compound.anneal = ListType(ModelType(Anneal)) # currently not working # Defining object parameters for the AnnealParser parser # Deliminators delim = R('^[;:,\./]$').hide() # Defining formats for annealing temperature and units tempprefix = (I('at') | I('or')).hide() tempunits = (W('°') + R('^[CFK]\.?$'))('tempunits').add_action(merge) tempvalue = R('^\d{2,4}?$')('tempvalue').add_action(merge) + Optional(delim) # Defining formats for spin-coating time and time units timeprefix = I('for').hide() timeunits = ( R('^s?(ec|econds)?$') | R('^m?(in|inute)?(s)?$') | R('^h?(ou)?(r)?(s)?$'))('timeunits').add_action(join) + Optional(delim) timevalue = R('^\d{,2}$')('timevalue') + Optional(delim) # Putting everything together temp = (tempvalue)('temp') temps = (temp + ZeroOrMore( ZeroOrMore(tempprefix | tempunits | delim | W('and')).hide() + temp))('temps') time = (timevalue)('time')
class Obstructions:
    """Optional filler elements that may appear inside a phrase.

    Each attribute is wrapped in ``Optional``; ``all`` chains them in the
    fixed order bracket, tilde, "of", hyphen.
    """
    # Class for bracket
    # Raw strings keep the regex backslashes intact without relying on
    # Python's deprecated handling of unrecognised string escapes.
    bracket = Optional(R(r'\('))
    curlLine = Optional(R(r'\~'))
    of = Optional(W(u'of'))
    hyphen = Optional(R(r'\-'))
    # The name shadows the builtin ``all`` inside this class body; it is
    # kept because it is part of the class's public interface.
    all = bracket + curlLine + of + hyphen
class BpRegex:
    """Grammar elements for a boiling-point phrase: prefix, value, units.

    ``bp`` is the sequence ``prefix + value + units`` tagged ``bp``.
    """
    # "b.p." (dots optional, case-insensitive) or "boiling point"; hidden.
    # Regex literals are raw strings so the backslashes reach the regex
    # engine without relying on deprecated string-escape handling.
    prefix = (R(r'^b\.?p\.?$', re.I) | I(u'boiling') + I(u'point')).hide()
    # Degree sign, optionally followed by C, F or K (with optional period).
    units = (W(u'°') + Optional(R(r'^[CFK]\.?$')))(u'units').add_action(merge)
    # Unsigned integer or decimal number.
    value = R(r'^\d+(\.\d+)?$')(u'value')
    bp = (prefix + value + units)(u'bp')
#from chemdataextractor.doc import Sentence
from chemdataextractor.parse import R, I, W, Optional, merge, join
from chemdataextractor.parse.base import BaseSentenceParser
from chemdataextractor.utils import first

# A decimal or integer number token followed by a frequency unit token
# (kHz, MHz, GHz or Hz), tagged 'frequencyvalue' and passed through ``join``.
# Raw strings keep the regex backslashes intact without relying on Python's
# deprecated handling of unrecognised string escapes.
frequency_value = (
    (R(r"^\d+?\.\d+?$") | R(r"^\d+?$"))
    + (W('kHz') | W('MHz') | W('GHz') | W('Hz'))
)('frequencyvalue').add_action(join)


class DielectricConstantFrequencyParser(BaseSentenceParser):
    """Sentence parser whose root grammar is ``frequency_value``."""

    root = frequency_value

    def interpret(self, result, start, end):
        """Yield one ``self.model`` record built from a parse result.

        Args:
            result: Parse result; queried with the XPath
                ``//frequencyvalue/text()``.
            start: Unused here.
            end: Unused here.

        Yields:
            ``self.model(frequency=...)`` where the argument is the first
            text hit of the XPath query, as returned by ``first``.
        """
        raw_value = first(result.xpath('//frequencyvalue/text()'))
        frequency = self.model(frequency=raw_value)
        yield frequency
# Associate the spin-coating class with a given compound. May be worth
# getting rid of for our eventual implementation, not yet sure.
Compound.spin_coat = ListType(ModelType(SpinCoat))

# Variable assignments
# Regex literals below are raw strings (or have doubled backslashes) so the
# escapes reach the regex engine without relying on Python's deprecated
# handling of unrecognised string escapes; the patterns are unchanged.

# Delimiters -- hidden from the parse result
delim = R(r'^[;:,\./]$').hide()

# Defining formats for spin-coating value and units
# Units: rpm / rcf (dots optional) or an optional x/× followed by "g".
# The third pattern stays a unicode literal because of the non-ASCII "×",
# so its backslash is doubled instead of using a raw string.
spdunits = (
    R(r'^r(\.)?p(\.)?m(\.)?$')
    | R(r'^r(\.)?c(\.)?f(\.)?$')
    | R(u'^([x×]?)(\\s?)?g$')
)('spdunits').add_action(join) + ZeroOrMore(delim)
# Value: digits ending in "00", optionally with one ",digits" group, and
# optionally wrapped in hidden parentheses.
spdvalue = (
    Optional(W('(')).hide()
    + R(r'^\d+(,\d+)?[0][0]$')('spdvalue')
    + Optional(W(')')).hide()
)

# Defining formats for spin-coating time and time units
timeprefix = I('for').hide()
timeunits = (
    R('^s?(ec|econds)?$')
    | R('^m?(in|inute)?(s)?$')
    | R('^h?(ou)?(r)?(s)?$')
)('timeunits').add_action(join) + Optional(delim)
# At most 3 digits; ``{,3}`` also accepts zero digits.
timevalue = R(r'^\d{,3}$')('timevalue') + Optional(delim)

# Putting everything together
spdprefix = I('at').hide()
spd = (spdvalue)('spd')
# One speed, then any number of further speeds, each preceded by hidden
# units, delimiters or "and".
spds = (
    spd
    + ZeroOrMore(ZeroOrMore(spdunits | delim | W('and')).hide() + spd)
)('spds')
from chemdataextractor.model import BaseModel, StringType, ListType, ModelType import re from chemdataextractor.parse import R, I, W, Optional, merge class Capacity(BaseModel): value = StringType() units = StringType() Compound.capacity = ListType(ModelType(Capacity)) prefix = (I(u'capacity') | I(u'CO2') + I(u'uptake')).hide() #Left the optional in because if I take it out then there is a syntax error on line 33 units = (W(u'mmol g-1') + Optional(R(u'^[CFK]\.?$')))(u'units').add_action(merge) value = R(u'^\d+(\.\d+)?$')(u'value') cp = (prefix + value + units)(u'cp') from chemdataextractor.parse.base import BaseParser from chemdataextractor.utils import first class CpParser(BaseParser): root = cp def interpret(self, result, start, end): compound = Compound(capacity=[ Capacity(value=first(result.xpath('./value/text()')),
context["footflowunit"] = flowunit if context: # print(context) c.conv.append(Conv(**context)) yield c unit = R("mass|wt|weight|mol|mole|m|vol")(u"value") compname = (R("$composition", re.IGNORECASE))(u"phrase") feedname = (R("mixture", re.IGNORECASE))(u"phrase") conditionname = R("condition", re.IGNORECASE)(u"phrase") conditionname = R("conditions?", re.IGNORECASE)(u"phrase") compprefix = (compname | feedname)(u"prefix") units = (unit + (R(u'%') | W(u"percent")))(u'compunits') value = (R(u"^\d{1,2}(\.\d)?$"))(u'compvalue') name = (R("H2") | R("H2O") | R("CO") | R("N2") | R("He"))(u"compname") # comppre = (compprefix + OneOrMore(SkipTo(name) + name + SkipTo(value) + value + Optional(units)))("phrase") precomp = (OneOrMore(value | name | Any()))(u"phrase") comp = (precomp)('comp') class FootCompParser(BaseParser): root = comp def __init__(self): pass
? Matches either once or zero times; marks the p as optional $ Matches at the end of a line I means the match is case-insensitive | matches either b.p. or boiling point .hide() I think just makes it so the prefix does not show up in the output but it still searches the document for the text ''' prefix = (R(u'^b\.?p\.?$', re.I) | I(u'boiling') + I(u'point')).hide() ''' W matches the degree symbol exactly Optional means that the C, F or K letter will be included if it is in the text R matches the token text to the regular expression ^[CFK] matches a token that starts with C OR F OR K ''' units = (W(u'°') + Optional(R(u'^[CFK]\.?$')))(u'units').add_action(merge) ''' R matches the token text to a regular expression ^ matches any string that starts with a digit \d looks for a number between 0 and 9 + matches following digits one or more times \. matches . symbol if it is there () is the capturing group ? matches the expression zero or one times ''' value = R(u'^\d+(\.\d+)?$')(u'value') bp = (prefix + value + units)(u'bp') from chemdataextractor.parse.base import BaseParser from chemdataextractor.utils import first
def connection(self):
    """Return the grammar for a connecting word or two-word phrase.

    Alternatives, in order: "at", "=", "of", "about", "for", "as regards",
    "attributed to", "concerning". The ``join`` action is attached to the
    resulting element.
    """
    # Two-word connectors; ``+`` binds tighter than ``|``, so these are the
    # same groups the flat expression would form.
    as_regards = I('as') + I('regards')
    attributed_to = I('attributed') + I('to')
    connector = (I('at') | W('=') | I('of') | I('about') | I('for')
                 | as_regards | attributed_to | I('concerning'))
    return connector.add_action(join)
values = [(value[i], names[i]) for i, j in enumerate(value)] context["values"] = values elif value and len(value) == 1 and len(names) < 1: context["values"] = value # print("table comp") print(context) if context: c.comp.append(Comp(**context)) yield c no = (R("calcin", re.IGNORECASE) | R("thermo", re.IGNORECASE) | R("equi", re.IGNORECASE)).hide() prefix = (R("^temp", re.IGNORECASE) | R("^T[^OP]") | R("^T$"))("prefix") units = (Optional(W(u'°')) + R(u'[CFK]'))(u'units').add_action(merge) temp = (Optional(Not(no)) + SkipTo(prefix) + prefix + Optional(SkipTo(units) + units))(u"temp") temphead = (temp)(u"tempphrase") class TempHeadingParser(BaseParser): root = temphead def interpret(self, result, start, end): """""" # print("inside temp") # print(etree.tostring(result)) # print() # print(lollol)
#from chemdataextractor.doc import Sentence
from chemdataextractor.parse import R, I, W, Optional, merge, join
from chemdataextractor.parse.base import BaseSentenceParser
from chemdataextractor.utils import first

# Accepted forms: integer + "nm", decimal + "μm", decimal + "nm". The match
# is tagged 'wavelengthvalue' and passed through ``join``.
# Raw strings keep the regex backslashes intact without relying on Python's
# deprecated handling of unrecognised string escapes.
wavelength_value = (
    (R(r"^\d+?$") + W('nm'))
    | (R(r"^\d+?\.\d+?$") + W('μm'))
    | (R(r"^\d+?\.\d+?$") + W('nm'))
)('wavelengthvalue').add_action(join)


class RefractiveIndexWavelengthParser(BaseSentenceParser):
    """Sentence parser whose root grammar is ``wavelength_value``."""

    root = wavelength_value

    def interpret(self, result, start, end):
        """Yield one ``self.model`` record built from a parse result.

        Args:
            result: Parse result; queried with the XPath
                ``//wavelengthvalue/text()``.
            start: Unused here.
            end: Unused here.

        Yields:
            ``self.model(wavelength=...)`` where the argument is the first
            text hit of the XPath query, as returned by ``first``.
        """
        raw_value = first(result.xpath('//wavelengthvalue/text()'))
        wavelength = self.model(wavelength=raw_value)
        yield wavelength