class SpinCoat(BaseModel):
    """Aggregate record of the step parameters for a full spin-coating process.

    Both attributes are lists of nested models: ``spds`` holds ``SpinSpd``
    records and ``times`` holds ``SpinTime`` records.
    """

    # solvent = StringType(contextual=True)
    spds = ListType(ModelType(SpinSpd))
    times = ListType(ModelType(SpinTime))
class Conv(BaseModel):
    """Flat record of string fields plus a list of ``Selectivity`` models.

    Every field except ``convvalue`` and ``tofvalue`` is declared with
    ``contextual=True``.

    NOTE(review): the meaning of the individual fields is not documented in
    this file; the names suggest catalytic-conversion data (temperature,
    flow, pressure, TOF, yield) -- confirm against the parser that fills
    them in.
    """

    # Temperature
    tempvalue = StringType(contextual=True)
    tempunits = StringType(contextual=True)

    # Conversion
    convvalue = StringType()
    convtype = StringType(contextual=True)
    convunits = StringType(contextual=True)

    # Caption / naming information
    captioninfo = StringType(contextual=True)
    check = StringType(contextual=True)
    catname = StringType(contextual=True)
    captioncatname = StringType(contextual=True)
    comp = StringType(contextual=True)
    unit = StringType(contextual=True)

    # Flow
    flow = StringType(contextual=True)
    flowunit = StringType(contextual=True)
    footflow = StringType(contextual=True)
    footflowunit = StringType(contextual=True)

    # Pressure
    pressure = StringType(contextual=True)
    presunits = StringType(contextual=True)

    # Remaining values
    tofvalue = StringType()
    yieldtest = StringType(contextual=True)
    selectivity = ListType(
        ModelType(Selectivity, contextual=True),
        contextual=True,
    )
from chemdataextractor.model import BaseModel, StringType, ListType, ModelType, Compound
import re
from chemdataextractor.parse import R, I, W, Optional, merge, join, OneOrMore, Any, ZeroOrMore, Start
from chemdataextractor.parse.cem import chemical_name, chemical_label
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.parse.common import lrb, rrb, delim
from chemdataextractor.utils import first
from chemdataextractor.doc import Paragraph, Heading, Sentence
from lxml import etree


class CurieTemperature(BaseModel):
    """Record with three string fields: ``specifier``, ``value`` and ``units``."""

    specifier = StringType()
    value = StringType()
    units = StringType()


# Attach the new property to Compound as a list of CurieTemperature records.
Compound.curie_temperatures = ListType(ModelType(CurieTemperature))

#%% [markdown]
# Now define parse elements that describe how to identify the entities in text. Think of these as tagging processes.

#%%
# Define a very basic entity tagger
# Either the words "curie temperature" optionally followed by a bracketed
# "Tc"/"TC"/"TCurie"-style symbol, or that symbol on its own.
specifier = (
    I('curie') + I('temperature') + Optional(lrb | delim)
    + Optional(R('^T(C|c)(urie)?')) + Optional(rrb)
    | R('^T(C|c)(urie)?')
)('specifier').add_action(join)
# Raw strings keep the regex escapes (\. and \d) from being read as
# (invalid) string escape sequences.
units = (R(r'^[CFK]\.?$'))('units').add_action(merge)
# The decimal separator may be "." or ",". The previous pattern `\.\,`
# demanded the two characters in sequence, so ordinary decimals such as
# "123.4" could never match.
value = (R(r'^\d+([.,]\d+)?$'))('value')

#%% [markdown]
# Note we tag each with a unique identifier that will be used later. Now let the entities in a sentence be any ordering of these (or whatever ordering you feel like). Here we specify that the value and units must coincide, but this does not have to be the case.
#
# We also define an extremely general parse phrase, this will be used to identify candidate sentences.
# In[6]:

d.records.serialize()

# In[37]:

from chemdataextractor.model import BaseModel, StringType, ListType, ModelType


class Solubility(BaseModel):
    """Record with two string fields: ``value`` and ``units``."""

    value = StringType()
    units = StringType()


# Attach the new property to Compound as a list of Solubility records.
Compound.solubility = ListType(ModelType(Solubility))

# In[38]:

import re
from chemdataextractor.parse import R, I, W, Optional, merge

# prefix = (R(u'^m\.?p\.?$', re.I) | I(u'melting') + I(u'point')).hide()
# "solubility", then an optional linking token ("=", "of", "was", "is",
# "at"), then an optional "in the range (of)" or "about". Every part is
# hidden.
prefix = (
    (I(u'solubility')).hide()
    + Optional(W('=') | I('of') | I('was') | I('is') | I('at')).hide()
    + Optional(I('in') + I('the') + I('range') + Optional(I('of')) | I('about')).hide()
)
# delim = R(u'^[:;\.,]$')
# Raw string: \d and \. are regex escapes, not string escapes.
value = R(r'^\d+(\.\d+)?$')(u'value')
class Anneal(BaseModel):
    """Aggregate record of the annealing parameters for a full process.

    Both attributes are lists of nested models: ``temps`` holds
    ``AnnealTemp`` records and ``times`` holds ``AnnealTime`` records.
    """

    temps = ListType(ModelType(AnnealTemp))
    times = ListType(ModelType(AnnealTime))
Class for each spin-coating time in a spin-coating process. """ timevalue = StringType() timeunits = StringType(contextual=True) class Anneal(BaseModel): """ Class for full list of spin-coating step parameters for full process. """ temps = ListType(ModelType(AnnealTemp)) times = ListType(ModelType(AnnealTime)) # Associating anneal parameters with a chemical object Compound.anneal = ListType(ModelType(Anneal)) # currently not working # Defining object parameters for the AnnealParser parser # Deliminators delim = R('^[;:,\./]$').hide() # Defining formats for annealing temperature and units tempprefix = (I('at') | I('or')).hide() tempunits = (W('°') + R('^[CFK]\.?$'))('tempunits').add_action(merge) tempvalue = R('^\d{2,4}?$')('tempvalue').add_action(merge) + Optional(delim) # Defining formats for spin-coating time and time units timeprefix = I('for').hide() timeunits = ( R('^s?(ec|econds)?$') | R('^m?(in|inute)?(s)?$') | R('^h?(ou)?(r)?(s)?$'))('timeunits').add_action(join) + Optional(delim)
from chemdataextractor.model import Compound, UvvisSpectrum, UvvisPeak, BaseModel, StringType, ListType, ModelType from chemdataextractor.parse.common import hyphen,lbrct, dt, rbrct from chemdataextractor.parse.base import BaseParser from chemdataextractor.utils import first from chemdataextractor.parse.actions import strip_stop, merge, join from chemdataextractor.parse.elements import W, I, T, R, Optional, ZeroOrMore, OneOrMore, Or, And, Not, Any from chemdataextractor.parse.cem import chemical_name,cem, chemical_label, lenient_chemical_label, solvent_name from chemdataextractor.doc import Paragraph, Sentence, Caption, Figure,Table, Heading from chemdataextractor.doc.table import Table, Cell class Ff(BaseModel): value = StringType() units = StringType() Compound.ff_pattern = ListType(ModelType(Ff)) common_text = R('(\w+)?\D(\D+)+(\w+)?').hide() units = (W(u'%') | I(u'percent'))(u'units') value = R(u'\d+(\.\d+)?')(u'value') abbrv_prefix = (I(u'FF') | I(u'ff')).hide() words_pref = (I(u'fill') + I(u'factor')).hide() hyphanated_pref = (I(u'fill') + I(u'-') + I('factor')).hide() joined_range = R('^[\+\-–−]?\d+(\.\d+)?[\-–−~∼˜]\d+(\.\d+)?$')('value').add_action(merge) spaced_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (R('^[\-–−~∼˜]$') + R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(merge) to_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (I('to') + R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(join) prefix = Optional(I('a')).hide() + (Optional(lbrct) + W('FF') + Optional(rbrct) | I('fill') + Optional(I('factor'))
timevalue = StringType() timeunits = StringType(contextual=True) class SpinCoat(BaseModel): """ Class for full list of spin-coating step parameters for full process. """ #solvent = StringType(contextual=True) spds = ListType(ModelType(SpinSpd)) times = ListType(ModelType(SpinTime)) # Associate the spin-coating class with a given compound. May be worth # getting rid of for our eventual implementation, not yet sure. Compound.spin_coat = ListType(ModelType(SpinCoat)) # Variable assignments # Deliminators -- hide from tokenization delim = R('^[;:,\./]$').hide() # Defining formats for spin-coating value and units spdunits = ( R(u'^r(\.)?p(\.)?m(\.)?$') | R(u'^r(\.)?c(\.)?f(\.)?$') | R(u'^([x×]?)(\s?)?g$'))('spdunits').add_action(join) + ZeroOrMore(delim) spdvalue = Optional( W('(')).hide() + R(u'^\d+(,\d+)?[0][0]$')('spdvalue') + Optional( W(')')).hide() # Defining formats for spin-coating time and time units timeprefix = I('for').hide()
Heading(u'Synthesis of HKUST-1-AC'), Paragraph( u'The BET surface area and CO2 uptake capacity values for the HKUST-1–AC composite were 1381 m2 g−1 and 8.1 mmol g−1 (at 273 K and 1 bar), respectively, representing increases of 70% and 39%, respectively, over the reported values for HKUST-1' )) from chemdataextractor.model import BaseModel, StringType, ListType, ModelType import re from chemdataextractor.parse import R, I, W, Optional, merge class Capacity(BaseModel): value = StringType() units = StringType() Compound.capacity = ListType(ModelType(Capacity)) prefix = (I(u'capacity') | I(u'CO2') + I(u'uptake')).hide() #Left the optional in because if I take it out then there is a syntax error on line 33 units = (W(u'mmol g-1') + Optional(R(u'^[CFK]\.?$')))(u'units').add_action(merge) value = R(u'^\d+(\.\d+)?$')(u'value') cp = (prefix + value + units)(u'cp') from chemdataextractor.parse.base import BaseParser from chemdataextractor.utils import first class CpParser(BaseParser): root = cp
from chemdataextractor.model import BaseModel, StringType, ListType, ModelType
from chemdataextractor.model import Compound


class BoilingPoint(BaseModel):
    """Record with two string fields: ``value`` and ``units``."""

    value = StringType()
    units = StringType()


# Attach the property to Compound as a list of BoilingPoint records.
Compound.boiling_points = ListType(ModelType(BoilingPoint))


class MeltingPoint(BaseModel):
    """Record with two string fields: ``value`` and ``units``."""

    value = StringType()
    units = StringType()


# Attach the property to Compound as a list of MeltingPoint records.
Compound.melting_points = ListType(ModelType(MeltingPoint))
from chemdataextractor.parse.common import hyphen
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.utils import first
from chemdataextractor.parse.actions import strip_stop
from chemdataextractor.parse.elements import W, I, T, R, Optional, ZeroOrMore, OneOrMore
from chemdataextractor.parse.cem import chemical_name
from chemdataextractor.doc import Paragraph, Sentence

# From my ipynb ...


class Pce(BaseModel):
    """Record with two string fields: ``value`` and ``units``."""

    value = StringType()
    units = StringType()


# Attach the new property to Compound as a list of Pce records.
Compound.pce_pattern = ListType(ModelType(Pce))

# Hidden prefixes: the abbreviation, the three-word form, and the form whose
# first two words arrive as the single token "power-conversion".
abbrv_prefix = (I(u'PCE') | I(u'PCEs') | I(u'pce')).hide()
words_pref = (I(u'power') + I(u'conversion') + I(u'efficiency')).hide()
hyphanated_pref = (I(u'power-conversion') + I(u'efficiency')).hide()
prefix = abbrv_prefix | words_pref | hyphanated_pref

# Regex literals are raw strings so that \w, \D, \d and \. are regex escapes
# rather than (invalid) string escape sequences.
common_text = R(r'(\w+)?\D(\D+)+(\w+)?').hide()
units = (W(u'%') | I(u'percent'))(u'units')
# value = R(u'^\d+(\.\d+)?$')(u'value')
# The active pattern is unanchored; the anchored form is kept above for
# reference.
value = R(r'\d+(\.\d+)?')(u'value')

# Two accepted orders: prefix ... value units, or value units prefix.
pce_first = (prefix + ZeroOrMore(common_text) + value + units)(u'pce')
pce_second = (value + units + prefix)(u'pce')
pce_pattern = pce_first | pce_second
from chemdataextractor.parse.common import hyphen,lbrct, dt, rbrct from chemdataextractor.parse.base import BaseParser from chemdataextractor.utils import first from chemdataextractor.parse.actions import strip_stop, merge, join from chemdataextractor.parse.elements import W, I, T, R, Optional, ZeroOrMore, OneOrMore, Or, And, Not, Any from chemdataextractor.parse.cem import chemical_name,cem, chemical_label, lenient_chemical_label, solvent_name from chemdataextractor.doc import Paragraph, Sentence, Caption, Figure,Table, Heading from chemdataextractor.doc.table import Table, Cell class Eqe(BaseModel): value = StringType() units = StringType() Compound.eqe_pattern = ListType(ModelType(Eqe)) common_text = R('(\w+)?\D(\D+)+(\w+)?').hide() units = (W(u'%') | I(u'percent'))(u'units') value = R(u'\d+(\.\d+)?')(u'value') abbrv_prefix = (I(u'EQE') | I(u'eqe')).hide() words_pref = (I(u'external') + I(u'quantum') + I(u'efficiency')).hide() hyphanated_pref = (I(u'external') + I(u'-') + I('quantum') + I(u'efficiency')).hide() joined_range = R('^[\+\-–−]?\d+(\.\d+)?[\-–−~∼˜]\d+(\.\d+)?$')('value').add_action(merge) spaced_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (R('^[\-–−~∼˜]$') + R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(merge) to_range = (R('^[\+\-–−]?\d+(\.\d+)?$') + Optional(units).hide() + (I('to') + R('^[\+\-–−]?\d+(\.\d+)?$') | R('^[\+\-–−]\d+(\.\d+)?$')))('value').add_action(join) prefix = Optional(I('a')).hide() + (Optional(lbrct) + W('EQE') + Optional(rbrct) | I('external') + Optional(I('quantum')) + Optional((I('efficiency')))
from chemdataextractor.model import BaseModel, StringType, ListType, ModelType, Compound
from chemdataextractor.parse import R, I, W, Optional, merge, ZeroOrMore, join, SkipTo
from chemdataextractor.parse.base import BaseParser
from chemdataextractor.parse import ZeroOrMore, Any, OneOrMore, Start, End, Group, Not, T
from chemdataextractor.utils import first

# print("inside heading")
# print(etree.tostring(result))
# print()


class BET(BaseModel):
    """Record with a string ``value`` and a contextual string ``unit``."""

    value = StringType()
    unit = StringType(contextual=True)


# Attach the property to Compound as a list of BET records.
Compound.bet = ListType(ModelType(BET))


class Flow(BaseModel):
    """Record with a string ``value`` and a contextual string ``unit``."""

    value = StringType()
    unit = StringType(contextual=True)


# Attach the property to Compound as a list of Flow records.
Compound.flow = ListType(ModelType(Flow))


class CAT(BaseModel):
    """Record with a single string field, ``name``."""

    name = StringType()


# Attach the property to Compound as a list of CAT records.
Compound.catalyst_name = ListType(ModelType(CAT))