Example #1
0
def get_wavelength_fromcaption(caption):
    """
    Scan ``caption`` with the ``wavelength_value`` grammar and return the
    first ``wavelengthvalue`` text node of the first match.

    Returns ``None`` when the scan yields no match.  Only the first match is
    ever inspected; if it carries no ``wavelengthvalue`` text, the indexing
    below raises ``IndexError``.
    """
    tokens = Sentence(caption).tagged_tokens
    query = '//' + 'wavelengthvalue' + '/text()'
    for match in wavelength_value.scan(tokens):
        # Deliberately returns on the first iteration.
        return match[0].xpath(query)[0]
    return None
def parse_anneal(anneal_str):
    """
    Parse ``anneal_str`` as a single ``Sentence`` using only ``AnnealParser``
    and return the serialized records (intended to be the annealing
    temperatures and times mentioned in the string).

    Note: the parser list is assigned on the ``Sentence`` class itself, not
    on an instance, so the assignment outlives this call.
    """
    Sentence.parsers = [AnnealParser()]
    parsed = Sentence(anneal_str)
    return parsed.records.serialize()
Example #3
0
def parse_spincoat(spincoat_str):
    """
    Parse ``spincoat_str`` as a single ``Sentence`` using only
    ``SpinCoatParser`` and return the serialized records (intended to be the
    spin-coating speeds and times mentioned in the string).

    Note: the parser list is assigned on the ``Sentence`` class itself, not
    on an instance, so the assignment outlives this call.
    """
    Sentence.parsers = [SpinCoatParser()]
    parsed = Sentence(spincoat_str)
    return parsed.records.serialize()
def get_frequency_fromany(caption):
    """
    Scan ``caption`` with the ``frequency_value`` grammar and return the
    first text node selected from the first match.

    Returns ``None`` when the scan yields no match.  Only the first match is
    ever inspected; if the query selects nothing from it, the indexing below
    raises ``IndexError``.
    """
    tokens = Sentence(caption).tagged_tokens
    # NOTE(review): the query targets the 'wavelengthvalue' tag although the
    # grammar is frequency_value -- confirm the tag name that grammar emits.
    query = '//' + 'wavelengthvalue' + '/text()'
    for match in frequency_value.scan(tokens):
        # Deliberately returns on the first iteration.
        return match[0].xpath(query)[0]
    return None
Example #5
0
def get_compound_fromany(sentence):
    """
    Scan ``sentence`` with the ``chemical`` grammar and return a list with
    one string per match: the match's ``names`` text nodes concatenated
    without a separator.  The list is empty when nothing matches.
    """
    tokens = Sentence(sentence).tagged_tokens
    names_query = '//' + 'names' + '/text()'
    return [
        ''.join(hit[0].xpath(names_query))
        for hit in chemical.scan(tokens)
    ]
Example #6
0
def parse_ff(list_of_sentences):
    """
    Parse every sentence with the parsers registered on ``Sentence`` plus
    ``FfParser`` and return the serialized records.

    NOTE(review): the previous docstring described the target as "quantified
    PCE information"; the parser registered here is ``FfParser`` -- confirm
    which quantity it extracts.

    Args:
        list_of_sentences: Iterable of sentence strings.

    Returns:
        A list with one entry per input sentence, each entry being the
        serialized records of that sentence.
    """
    # ``Sentence.parsers`` lives on the class, so an unconditional append
    # would add one more FfParser on every call and the list would keep
    # growing.  Register it only if no FfParser is present yet.
    if not any(isinstance(parser, FfParser) for parser in Sentence.parsers):
        Sentence.parsers.append(FfParser())

    return [Sentence(sent).records.serialize() for sent in list_of_sentences]
Example #7
0
 def test_match(self):
     # A phrase matched against a cluster that contains only that same
     # phrase is expected to score exactly 1.0.
     sentence = Sentence('BiFeO3 with 1103 K')
     relation = Relation(
         [
             Entity('BiFeO3', chemical_name, 0, 1),
             Entity('1103', value, 2, 3),
             Entity('K', units, 2, 3),
         ],
         1.0,
     )
     phrase = Phrase(sentence.raw_tokens, [relation],
                     prefix_length=1, suffix_length=1)

     cluster = Cluster(label=0, order=phrase.order, learning_rate=0.5)
     cluster.add_phrase(phrase)

     weights = dict(prefix_weight=0.1, middles_weight=0.8, suffix_weight=0.1)
     self.assertEqual(match(phrase, cluster, **weights), 1.0)
# , name_1 has recently attracted much attention due to its high specifier_1 ∼ value_1  units_1 ) with confidence score 1.0
# 
#  name_1 is probably the most studied half metal because of it high specifier_1 ∼ value_1  units_1 ) with confidence score 1.0
# 
# , name_1 has a high spin polarization ( > 95 % )118 and a specifier_1 of value_1  units_1 . with confidence score 1.0
# 
# , name_1 ( name_2 ) has received the most attention , due to its high ferroelectric specifier_1 ∼ value_1  units_1 ) with confidence score 1.0
# 
# 
# 
#%% [markdown]
# Now let's extract a new relationship from a previously unseen sentence. We will save to a different file so we can see the new clusters afterwards. We hope that the sentence will be similar enough to a previously seen sentence in order for us to extract the new relationship.

#%%
# Point the snowball model at a different output name so the clusters
# written for this run can be inspected separately afterwards.
snowball.save_file_name = 'curie_new'
test_sentence = Sentence('BiFeO3 is highly ferromagnetic with a curie temperature of 1103 K and that is extremely interesting')
# `rels` is indexed in a later cell, so extract() is expected to return a
# sequence of relations.
rels = snowball.extract(test_sentence)
print("Found relationship:", rels)

#%% [markdown]
# As we can see, we found the right entities. Let's see how confident we are in this relation.

#%%
# Assumes the extraction above found at least one relation; rels[0] fails
# if `rels` is empty.
print(rels[0].confidence)

#%% [markdown]
# Let's look at the new clusters, which have been updated to reflect the new sentence, in ```curie_test_output_clusters```:
# 
# Cluster 3 contains 2 phrases
# 
# CoS2 is ferromagnetic with a Curie temperature of 116 K and Co9S8 is antiferromagnetic with a Néel temperature above the decomposition temperature.28 The magnetic susceptibility of Ni3S2 was found to be temperature - independent , which is consistent with Pauli paramagnetism.
Example #9
0
def find_wavelength_specifier_index_secondlayer(table, caption, DOI, count):
    """
    Write one record per refractive-index value found in a table whose
    column labels sit in the second header row (``table[1]``).

    Every cell of ``table[1]`` is scanned with the ``specifier`` and
    ``wavelength`` grammars to pick the relevant columns.  Each row from
    ``table[2]`` onwards is then combined with every specifier column (and,
    when any exist, every wavelength column) into a dict, which is passed to
    ``write_into_file`` when the first three characters of the value have
    the form ``<digit>.<digit>``.

    Args:
        table: Sequence of rows, each a sequence of cell strings.  Rows 0
            and 1 are used as header rows; data starts at row 2.
        caption: Table caption text.
        DOI: Identifier stored verbatim in every record.
        count: Number of records written so far.

    Returns:
        ``count`` increased by the number of records written by this call.
    """

    def matching_columns(grammar, query):
        # enumerate() keeps the real column position; list.index() would map
        # every repeated header cell back to its first occurrence.
        columns = []
        for column, cell in enumerate(table[1]):
            for result in grammar.scan(Sentence(cell).tagged_tokens):
                if result[0].xpath(query):
                    columns.append(column)
        return columns

    index_specifier = matching_columns(specifier, '//specifier/text()')
    index_wavelength = matching_columns(wavelength, '//wavelength/text()')

    if not index_specifier or len(table) <= 2:
        return count

    # Raw string: "\d" inside a normal literal is an invalid escape sequence.
    looks_like_value = re.compile(r"^\d+?\.\d+?$").match

    # These depend only on the caption / header, not on the data row, so
    # they are parsed once instead of once per record.
    caption_compounds = ', '.join(get_compound_fromany(caption))
    caption_wavelength = str(get_wavelength_fromcaption(caption))
    wavelength_by_specifier = {
        ind_spe: ('wavelength_fromcaption, ' +
                  str(get_wavelength_fromcaption(table[0][ind_spe])) + ',' +
                  caption_wavelength)
        for ind_spe in dict.fromkeys(index_specifier)
    }

    for row in table[2:]:
        row_compounds = ', '.join(get_compound_fromany(', '.join(row)))
        for ind_spe in index_specifier:
            if index_wavelength:
                # NOTE(review): this branch labels values with the first
                # header row (table[0]) although the column indices were
                # found in table[1]; the branch below uses table[1].  Kept
                # as-is -- confirm which header row is intended.
                for ind_wav in index_wavelength:
                    dic = {
                        'row_headers':
                        table[0][0] + ': ' + row[0] + ', ' +
                        table[0][1] + ': ' + row[1],
                        'compound_from_row_headers': row_compounds,
                        'wavelength_information':
                        table[0][ind_wav] + ': ' + row[ind_wav],
                        'refractive_index': row[ind_spe],
                        'specifier': table[0][ind_spe],
                        'table_caption': caption,
                        'DOI': DOI,
                        'wavelength_fromcaption':
                        wavelength_by_specifier[ind_spe],
                        'compound_fromcaption': caption_compounds,
                    }
                    if looks_like_value(dic['refractive_index'][:3]):
                        count += 1
                        write_into_file(dic)
            else:
                dic = {
                    'row_headers':
                    table[1][0] + ': ' + row[0] + ', ' +
                    table[1][1] + ': ' + row[1],
                    'compound_from_row_headers': row_compounds,
                    'refractive_index': row[ind_spe],
                    'specifier': table[1][ind_spe],
                    'table_caption': caption,
                    'DOI': DOI,
                    'wavelength_fromcaption':
                    wavelength_by_specifier[ind_spe],
                    # Joined to a string so this key has the same type as in
                    # the wavelength branch above (it used to be a list here).
                    'compound_fromcaption': caption_compounds,
                }
                if looks_like_value(dic['refractive_index'][:3]):
                    count += 1
                    write_into_file(dic)
    return count
def find_frequency_specifier_index_secondlayer(table, caption, DOI, count):
    """
    Write one record per dielectric-constant value found in a table whose
    column labels sit in the second header row (``table[1]``).

    Every cell of ``table[1]`` is scanned with the ``specifier`` and
    ``frequency`` grammars to pick the relevant columns.  Each row from
    ``table[2]`` onwards is then combined with every specifier column (and,
    when any exist, every frequency column) into a dict, which is passed to
    ``write_into_file`` when the first character of the value is a digit.

    Args:
        table: Sequence of rows, each a sequence of cell strings.  Rows 0
            and 1 are used as header rows; data starts at row 2.
        caption: Table caption text.
        DOI: Identifier stored verbatim in every record.
        count: Number of records written so far.

    Returns:
        ``count`` increased by the number of records written by this call.
    """

    def matching_columns(grammar, query):
        # enumerate() keeps the real column position; list.index() would map
        # every repeated header cell back to its first occurrence.
        columns = []
        for column, cell in enumerate(table[1]):
            for result in grammar.scan(Sentence(cell).tagged_tokens):
                if result[0].xpath(query):
                    columns.append(column)
        return columns

    index_specifier = matching_columns(specifier, '//specifier/text()')
    index_frequency = matching_columns(frequency, '//frequency/text()')

    if not index_specifier or len(table) <= 2:
        return count

    # Raw string: "\." inside a normal literal is an invalid escape sequence.
    # Only the first character is tested, so this accepts any leading digit.
    looks_like_value = re.compile(r"^[0-9](\.[0-9]+)?$").match

    # These depend only on the caption / header, not on the data row, so
    # they are parsed once instead of once per record.
    caption_compounds = ', '.join(get_compound_fromany(caption))
    caption_frequency = get_frequency_fromany(caption)
    frequency_by_specifier = {
        ind_spe: get_frequency_fromany(table[1][ind_spe])
        for ind_spe in dict.fromkeys(index_specifier)
    }

    for row in table[2:]:
        row_compounds = get_compound_fromany(row[0] + ', ' + row[1])
        for ind_spe in index_specifier:
            if index_frequency:
                # NOTE(review): 'row_headers' is labelled from table[0] here
                # but from table[1] in the branch below.  Kept as-is --
                # confirm which header row is intended.
                for ind_freq in index_frequency:
                    dic = {
                        'row_headers':
                        table[0][0] + ': ' + row[0] + ', ' +
                        table[0][1] + ': ' + row[1],
                        # Copied so records never share one mutable list.
                        'compound_from_row_headers': list(row_compounds),
                        'frequency_information':
                        table[1][ind_freq] + ': ' + row[ind_freq],
                        'dielectric_constant': row[ind_spe],
                        'specifier': table[1][ind_spe],
                        'table_caption': caption,
                        'DOI': DOI,
                        'frequency_from_caption': caption_frequency,
                        'frequency_from_specifier':
                        frequency_by_specifier[ind_spe],
                        'compound_fromcaption': caption_compounds,
                    }
                    if looks_like_value(dic['dielectric_constant'][:1]):
                        count += 1
                        write_into_file(dic)
            else:
                dic = {
                    'row_headers':
                    table[1][0] + ': ' + row[0] + ', ' +
                    table[1][1] + ': ' + row[1],
                    # Copied so records never share one mutable list.
                    'compound_from_row_headers': list(row_compounds),
                    'dielectric_constant': row[ind_spe],
                    'specifier': table[1][ind_spe],
                    'table_caption': caption,
                    'DOI': DOI,
                    'frequency_from_caption': caption_frequency,
                    'frequency_from_specifier':
                    frequency_by_specifier[ind_spe],
                    # Joined to a string so this key has the same type as in
                    # the frequency branch above (it used to be a list here).
                    'compound_fromcaption': caption_compounds,
                }
                if looks_like_value(dic['dielectric_constant'][:1]):
                    count += 1
                    write_into_file(dic)
    return count