def get_wavelength_fromcaption(caption):
    """Return the first wavelength value text found in *caption*.

    The text is wrapped in a ``Sentence``, its tagged tokens are scanned with
    the module-level ``wavelength_value`` grammar, and each match is queried
    for ``wavelengthvalue`` text nodes.

    Args:
        caption: Text to search (callers pass table captions and header
            cells).

    Returns:
        The first text node of the first match that has one, or ``None``
        when nothing usable was found.
    """
    tagged_tokens = Sentence(caption).tagged_tokens
    for result in wavelength_value.scan(tagged_tokens):
        text_list = result[0].xpath('//wavelengthvalue/text()')
        # A match without any text node would make text_list[0] raise
        # IndexError, so move on to the next match instead.
        if text_list:
            return text_list[0]
    # Explicit "nothing found" result; callers wrap the value in str().
    return None
def parse_anneal(anneal_str):
    """Parse annealing parameters out of a string.

    ``Sentence.parsers`` is replaced with a list holding a single
    ``AnnealParser``. The assignment is made on ``Sentence`` itself, not on
    an instance, and it happens before the sentence is built.

    Args:
        anneal_str: Text describing an annealing step.

    Returns:
        The serialized records of the ``Sentence`` built from *anneal_str*
        (per the original description: annealing temperatures and times).
    """
    Sentence.parsers = [AnnealParser()]
    sentence = Sentence(anneal_str)
    return sentence.records.serialize()
def parse_spincoat(spincoat_str):
    """Parse spin-coating parameters out of a string.

    ``Sentence.parsers`` is replaced with a list holding a single
    ``SpinCoatParser``. The assignment is made on ``Sentence`` itself, not on
    an instance, and it happens before the sentence is built.

    Args:
        spincoat_str: Text describing a spin-coating step.

    Returns:
        The serialized records of the ``Sentence`` built from *spincoat_str*
        (per the original description: spin-coating speeds and times).
    """
    Sentence.parsers = [SpinCoatParser()]
    sentence = Sentence(spincoat_str)
    return sentence.records.serialize()
def get_frequency_fromany(caption):
    """Return the first frequency value text found in *caption*.

    The text is wrapped in a ``Sentence``, its tagged tokens are scanned with
    the module-level ``frequency_value`` grammar, and each match is queried
    for text nodes.

    Args:
        caption: Text to search (callers pass table captions and header
            cells).

    Returns:
        The first text node of the first match that has one, or ``None``
        when nothing usable was found.
    """
    tagged_tokens = Sentence(caption).tagged_tokens
    for result in frequency_value.scan(tagged_tokens):
        # NOTE(review): the XPath selects ``wavelengthvalue`` although the
        # grammar scanned is ``frequency_value`` - confirm which tag that
        # grammar actually emits. The tag is kept unchanged here.
        text_list = result[0].xpath('//wavelengthvalue/text()')
        # A match without any text node would make text_list[0] raise
        # IndexError, so move on to the next match instead.
        if text_list:
            return text_list[0]
    # Explicit "nothing found" result.
    return None
def get_compound_fromany(sentence):
    """List the chemical names found in *sentence*.

    The text is wrapped in a ``Sentence`` and its tagged tokens are scanned
    with the module-level ``chemical`` grammar. For every match, the
    ``names`` text nodes are concatenated (no separator) into one string.

    Args:
        sentence: Text to search.

    Returns:
        A list with one string per match; empty when nothing matched.
    """
    tagged_tokens = Sentence(sentence).tagged_tokens
    return [
        ''.join(match[0].xpath('//' + 'names' + '/text()'))
        for match in chemical.scan(tagged_tokens)
    ]
def parse_ff(list_of_sentences):
    """Serialize the records parsed from each sentence, with ``FfParser`` active.

    An ``FfParser`` is added to the class-level ``Sentence.parsers`` list
    (any parsers already registered there stay in place), then every input
    string is wrapped in a ``Sentence`` and its records are serialized.

    Args:
        list_of_sentences: Iterable of sentence strings.

    Returns:
        A list with one serialized-records entry per input sentence, in
        input order.
    """
    # Register the parser only once. Appending unconditionally made the
    # shared list grow by one FfParser on every call.
    if not any(isinstance(parser, FfParser) for parser in Sentence.parsers):
        Sentence.parsers.append(FfParser())
    return [Sentence(sent).records.serialize() for sent in list_of_sentences]
def test_match(self):
    """A phrase matched against a cluster holding only that phrase scores 1.0."""
    sentence = Sentence('BiFeO3 with 1103 K')
    # NOTE(review): the 'K' entity reuses the (2, 3) span of '1103' - confirm
    # this is intended.
    relation = Relation(
        [
            Entity('BiFeO3', chemical_name, 0, 1),
            Entity('1103', value, 2, 3),
            Entity('K', units, 2, 3),
        ],
        1.0,
    )
    phrase = Phrase(
        sentence.raw_tokens,
        [relation],
        prefix_length=1,
        suffix_length=1,
    )

    cluster = Cluster(label=0, order=phrase.order, learning_rate=0.5)
    cluster.add_phrase(phrase)

    similarity = match(
        phrase,
        cluster,
        prefix_weight=0.1,
        middles_weight=0.8,
        suffix_weight=0.1,
    )
    self.assertEqual(similarity, 1.0)
# , name_1 has recently attracted much attention due to its high specifier_1 ∼ value_1 units_1 ) with confidence score 1.0 # # name_1 is probably the most studied half metal because of it high specifier_1 ∼ value_1 units_1 ) with confidence score 1.0 # # , name_1 has a high spin polarization ( > 95 % )118 and a specifier_1 of value_1 units_1 . with confidence score 1.0 # # , name_1 ( name_2 ) has received the most attention , due to its high ferroelectric specifier_1 ∼ value_1 units_1 ) with confidence score 1.0 # # # #%% [markdown] # Now let's extract a new relationship from a previously unseen sentence. We will save to a different file so we can see the new clusters afterwards. We hope that the sentence will be similar enough to a previously seen sentence in order for us to extract the new relationship. #%% snowball.save_file_name = 'curie_new' test_sentence = Sentence('BiFeO3 is highly ferromagnetic with a curie temperature of 1103 K and that is extremely interesting') rels = snowball.extract(test_sentence) print("Found relationship:", rels) #%% [markdown] # As we can see, we found the right entities. Lets see how confident we are in this relation #%% print(rels[0].confidence) #%% [markdown] # Lets look at the new clusters that have updated to reflect the new sentence: in ```curie_test_output_clusters``` # # Cluster 3 contains 2 phrases # # CoS2 is ferromagnetic with a Curie temperature of 116 K and Co9S8 is antiferromagnetic with a Néel temperature above the decomposition temperature.28 The magnetic susceptibility of Ni3S2 was found to be temperature - independent , which is consistent with Pauli paramagnetism.
def find_wavelength_specifier_index_secondlayer(table, caption, DOI, count):
    """Write refractive-index records for a table whose second row is scanned.

    Every cell of ``table[1]`` is scanned with the module-level ``specifier``
    and ``wavelength`` grammars to find the matching column positions. For
    each row from index 2 onward and each specifier column, one record is
    built per wavelength column, or a single record without wavelength
    information when no wavelength column was found. A record is passed to
    ``write_into_file`` only when the first three characters of its
    ``'refractive_index'`` cell have the form digit, dot, digit.

    Args:
        table: Sequence of rows, each a sequence of cell strings. Rows 0 and
            1 are used as header rows, the remaining rows as data.
        caption: Table caption text, stored in each record and scanned for
            wavelength and compound information.
        DOI: Identifier stored unchanged in each record.
        count: Running total of records written so far.

    Returns:
        ``count`` increased by the number of records written by this call.
    """
    header_row = table[1]

    # enumerate() yields the real column position. list.index() would map
    # every repeated header cell back to its first occurrence, so duplicated
    # second-layer headers ended up pointing at the wrong column.
    index_specifier = []
    for column, token in enumerate(header_row):
        for r in specifier.scan(Sentence(token).tagged_tokens):
            if r[0].xpath('//specifier/text()'):
                index_specifier.append(column)

    index_wavelength = []
    for column, token in enumerate(header_row):
        for r in wavelength.scan(Sentence(token).tagged_tokens):
            if r[0].xpath('//wavelength/text()'):
                index_wavelength.append(column)

    # Without a specifier column there is nothing to extract.
    if not index_specifier:
        return count

    top_row = table[0]
    for row_number in range(2, len(table)):
        row = table[row_number]
        for ind_spe in index_specifier:
            if index_wavelength:
                for ind_wav in index_wavelength:
                    # NOTE(review): this branch labels values with the top
                    # header row (table[0]) although the columns were located
                    # in table[1]; the branch below uses table[1]. Confirm
                    # which row is intended.
                    dic = {
                        'row_headers': top_row[0] + ': ' + row[0] + ', '
                                       + top_row[1] + ': ' + row[1],
                        'compound_from_row_headers': ', '.join(
                            get_compound_fromany(', '.join(row))),
                        'wavelength_information': top_row[ind_wav] + ': '
                                                  + row[ind_wav],
                        'refractive_index': row[ind_spe],
                        'specifier': top_row[ind_spe],
                        'table_caption': caption,
                        'DOI': DOI,
                        'wavelength_fromcaption': 'wavelength_fromcaption, '
                                                  + str(get_wavelength_fromcaption(top_row[ind_spe]))
                                                  + ','
                                                  + str(get_wavelength_fromcaption(caption)),
                        'compound_fromcaption': ', '.join(
                            get_compound_fromany(caption)),
                    }
                    # Raw string: "\d" and "\." are invalid escapes in a
                    # normal string literal.
                    if re.match(r"^\d+?\.\d+?$", dic['refractive_index'][:3]):
                        count += 1
                        write_into_file(dic)
            else:
                # NOTE(review): 'compound_fromcaption' is a list here but a
                # joined string in the branch above - confirm what
                # write_into_file expects.
                dic = {
                    'row_headers': header_row[0] + ': ' + row[0] + ', '
                                   + header_row[1] + ': ' + row[1],
                    'compound_from_row_headers': ', '.join(
                        get_compound_fromany(', '.join(row))),
                    'refractive_index': row[ind_spe],
                    'specifier': header_row[ind_spe],
                    'table_caption': caption,
                    'DOI': DOI,
                    'wavelength_fromcaption': 'wavelength_fromcaption, '
                                              + str(get_wavelength_fromcaption(top_row[ind_spe]))
                                              + ','
                                              + str(get_wavelength_fromcaption(caption)),
                    'compound_fromcaption': get_compound_fromany(caption),
                }
                if re.match(r"^\d+?\.\d+?$", dic['refractive_index'][:3]):
                    count += 1
                    write_into_file(dic)
    return count
def find_frequency_specifier_index_secondlayer(table, caption, DOI, count):
    """Write dielectric-constant records for a table whose second row is scanned.

    Every cell of ``table[1]`` is scanned with the module-level ``specifier``
    and ``frequency`` grammars to find the matching column positions. For
    each row from index 2 onward and each specifier column, one record is
    built per frequency column, or a single record without frequency
    information when no frequency column was found. A record is passed to
    ``write_into_file`` only when the first character of its
    ``'dielectric_constant'`` cell is a digit.

    Args:
        table: Sequence of rows, each a sequence of cell strings. Rows 0 and
            1 are used as header rows, the remaining rows as data.
        caption: Table caption text, stored in each record and scanned for
            frequency and compound information.
        DOI: Identifier stored unchanged in each record.
        count: Running total of records written so far.

    Returns:
        ``count`` increased by the number of records written by this call.
    """
    header_row = table[1]

    # enumerate() yields the real column position. list.index() would map
    # every repeated header cell back to its first occurrence, so duplicated
    # second-layer headers ended up pointing at the wrong column.
    index_specifier = []
    for column, token in enumerate(header_row):
        for r in specifier.scan(Sentence(token).tagged_tokens):
            if r[0].xpath('//specifier/text()'):
                index_specifier.append(column)

    index_frequency = []
    for column, token in enumerate(header_row):
        for r in frequency.scan(Sentence(token).tagged_tokens):
            if r[0].xpath('//frequency/text()'):
                index_frequency.append(column)

    # Without a specifier column there is nothing to extract.
    if not index_specifier:
        return count

    top_row = table[0]
    for row_number in range(2, len(table)):
        row = table[row_number]
        for ind_spe in index_specifier:
            if index_frequency:
                for ind_freq in index_frequency:
                    # NOTE(review): 'row_headers' is labelled with table[0]
                    # here but with table[1] in the branch below - confirm
                    # which row is intended.
                    dic = {
                        'row_headers': top_row[0] + ': ' + row[0] + ', '
                                       + top_row[1] + ': ' + row[1],
                        'compound_from_row_headers': get_compound_fromany(
                            row[0] + ', ' + row[1]),
                        'frequency_information': header_row[ind_freq] + ': '
                                                 + row[ind_freq],
                        'dielectric_constant': row[ind_spe],
                        'specifier': header_row[ind_spe],
                        'table_caption': caption,
                        'DOI': DOI,
                        'frequency_from_caption': get_frequency_fromany(caption),
                        'frequency_from_specifier': get_frequency_fromany(
                            header_row[ind_spe]),
                        'compound_fromcaption': ', '.join(
                            get_compound_fromany(caption)),
                    }
                    # Raw string: "\." is an invalid escape in a normal
                    # string literal. Only the first character is tested, so
                    # the optional fractional group never takes part.
                    if re.match(r"^[0-9](\.[0-9]+)?$", dic['dielectric_constant'][:1]):
                        count += 1
                        write_into_file(dic)
            else:
                # NOTE(review): 'compound_fromcaption' is a list here but a
                # joined string in the branch above - confirm what
                # write_into_file expects.
                dic = {
                    'row_headers': header_row[0] + ': ' + row[0] + ', '
                                   + header_row[1] + ': ' + row[1],
                    'compound_from_row_headers': get_compound_fromany(
                        row[0] + ', ' + row[1]),
                    'dielectric_constant': row[ind_spe],
                    'specifier': header_row[ind_spe],
                    'table_caption': caption,
                    'DOI': DOI,
                    'frequency_from_caption': get_frequency_fromany(caption),
                    'frequency_from_specifier': get_frequency_fromany(
                        header_row[ind_spe]),
                    'compound_fromcaption': get_compound_fromany(caption),
                }
                if re.match(r"^[0-9](\.[0-9]+)?$", dic['dielectric_constant'][:1]):
                    count += 1
                    write_into_file(dic)
    return count