class AllAttributes:
    """Merge extracted triples with their source-file names into one Excel sheet.

    Reads the triples workbook written earlier in the pipeline (``triple_path``),
    copies its five columns into an ``all_attributes`` sheet, labels each
    document's first row with its source .txt name, and re-writes a
    pre-processed copy of every full text under ``<text_path>/proprecess``.
    The merged workbook is saved to ``out_path``.
    """

    def __init__(self, prop_name, txt_name, text_path, triple_path, out_path, c_path):
        # txt_name is nested: txt_name[0][k] is the name of the k-th document.
        self.txt_name = txt_name
        self.text_path = text_path      # directory holding the full-text .txt files
        self.triple_path = triple_path  # xlrd-readable workbook of extracted triples
        self.prop_name = prop_name      # target property name (unused here; kept for callers)
        self.out_path = out_path        # destination of the merged workbook
        self.c_path = c_path            # config path forwarded to PreProcessor
        self.log_wp = LogWp()

    def get_toexcel(self):
        """Build and save the merged "all_attributes" workbook.

        Side effects: creates ``<text_path>/proprecess`` when missing and
        writes a pre-processed copy of every text file into it.
        """
        xls = xlwt.Workbook()
        sheet = xls.add_sheet("all_attributes")
        # Header row.
        headers = ('txt_name', 'Full_text', 'Target_sentences', 'alloy', 'property', 'value')
        for col, title in enumerate(headers):
            sheet.write(0, col, title)

        # Copy the five triple columns into output columns 1..5.
        data_triples = xlrd.open_workbook(self.triple_path)
        table_triples = data_triples.sheet_by_index(0)
        # Hoisted out of the loops: col_values() rebuilds the whole column
        # list on every call, which made the original loop quadratic.
        columns = [table_triples.col_values(c) for c in range(5)]
        for x in range(len(columns[0])):
            for col in range(5):
                sheet.write(x + 1, col + 1, str(columns[col][x]))

        # Column 0: source-file name.  A non-empty first cell in the triples
        # sheet marks the first row of a new document, so advance k only there.
        k = 0
        for i, first_cell in enumerate(columns[0]):
            if first_cell:
                sheet.write(i + 1, 0, str(self.txt_name[0][k]))
                k += 1

        # Re-write a pre-processed copy of every full text under 'proprecess'.
        txt_names = os.listdir(self.text_path)
        process_text = []
        path_2 = self.text_path + '/proprecess'
        # exist_ok collapses the original duplicated if/else branches.
        os.makedirs(path_2, exist_ok=True)
        for name in txt_names:
            src = self.text_path + '/' + str(name)
            # Skip the 'proprecess' directory itself (and anything else that
            # is not a regular file) — the original crashed on a re-run
            # because listdir() picked up the directory it had created.
            if not os.path.isfile(src):
                continue
            # 'with' closes the handles; the original leaked both the read
            # handle and the write handle (the write might never flush).
            with open(src, 'r', encoding='utf-8') as f:
                sole_text = f.read()
            pre_processor = PreProcessor(sole_text, self.c_path)
            filter_txt = pre_processor.pre_processor()
            text_path = path_2 + '/' + str(name)
            with open(text_path, 'w', encoding='utf-8') as f:
                f.write(filter_txt)
            process_text.append(text_path)
        self.log_wp.excel_save(xls, self.out_path)
def get_extraction_outcome(xml_path, save_path, config_path):
    """Extract tables from every XML paper in *xml_path* into per-file workbooks.

    For each ``<doi>.xml`` file: parse its tables, build table objects, and
    save one ``<index>end.xlsx`` workbook per source file under *save_path*.

    Returns:
        (all_error_file, n_errors): the DOIs that failed at any stage and
        their count.
    """
    TableExtractor_m = TableExtractorModifiedtoalloy(xml_path, save_path, config_path)
    all_error_file = []
    xml_name = os.listdir(xml_path)
    log_wp = LogWp()
    # enumerate() replaces the original re-listing of the directory on every
    # loop iteration (os.listdir was called len+1 times).
    for file_i, xml_n in enumerate(xml_name):
        tables = None
        captions = None  # defined up-front so a parse failure can't leave it unbound
        all_tables = []
        # File names encode the DOI with the first '/' replaced by '-'.
        doi = xml_n.replace(".xml", "")
        doi = doi.replace("-", "/", 1)
        file = xml_path + '/' + str(xml_n)
        try:
            tables, captions = TableExtractor_m.get_xml_tables(doi, file)
        except Exception:
            # Narrowed from a bare `except:` (which also swallowed
            # KeyboardInterrupt/SystemExit).  Parsing failures are recorded
            # and the file is skipped — best-effort by design.
            all_error_file.append(doi)
        if tables:
            cols, rows, col_inds, row_inds = TableExtractor_m.get_headers(tables, doi)
            tab = []
            for table, row_ind, col_ind in zip(tables, row_inds, col_inds):
                curr, error_file = (TableExtractor_m.construct_table_object(doi, table, row_ind, col_ind))
                if curr:
                    tab.append(curr)
                if error_file:
                    all_error_file.append(str(doi))
            # Attach metadata so downstream consumers can identify each table.
            for i, (t, caption) in enumerate(zip(tab, captions)):
                if t is not None:
                    t['order'] = i
                    t['_id'] = ObjectId()
                    t['caption'] = caption
                    t['paper_doi'] = doi
                    all_tables.append(t)
            log_wp.print_log('Success: Extracted Tables from : %s', doi)
        # One workbook per source file; each table goes on its own sheet
        # (row 1 = DOI, row 2 = caption, data from row 3 down).
        xls = openpyxl.Workbook()
        sheet_id = 1
        if all_tables:
            for table in all_tables:
                sht_new = xls.create_sheet(str(sheet_id))
                act_table = table['act_table']
                caption = table['caption']
                row_len = len(act_table[0])
                doi = table['paper_doi']
                sht_new.cell(1, 1, str(doi))
                sht_new.cell(2, 1, str(caption))
                start_row = 3
                for row in act_table:
                    len_row = len(row)
                    for index in range(len_row):
                        sht_new.cell(start_row, index + 1, row[index])
                    start_row += 1
                sheet_id += 1
        # Drop openpyxl's default sheet.  NOTE(review): when all_tables is
        # empty this saves a workbook with zero sheets — kept as original
        # behavior; verify openpyxl accepts it in your version.
        del xls['Sheet']
        log_wp.excel_save(xls, save_path + '/' + str(file_i) + "end.xlsx")
    return all_error_file, len(all_error_file)
class File_IO:
    """Round-trip target sentences through an Excel file.

    ``out_to_excel`` writes one row per document (name + stringified sentence
    dict); ``data_from_excel`` reads the column back into lists of sentences.
    """

    def __init__(self, target_sents, out_path, txt_name):
        self.target_sents = target_sents  # list of str(dict) per document; '{}' means none
        self.out_path = out_path          # Excel path used by both directions
        self.txt_name = txt_name          # kept for callers (unused internally)
        self.log_wp = LogWp()

    def out_to_excel(self):
        """Write one row per document: '<i>.txt' | sentence dict (or placeholder)."""
        length_sent = len(self.target_sents)
        xls = xlwt.Workbook()
        sht1 = xls.add_sheet("Sheet1")
        for w in range(length_sent):
            sht1.write(w, 0, str(w) + '.txt')
            if self.target_sents[w] == '{}':
                # Placeholder recognized by data_from_excel().
                sht1.write(w, 1, 'no target sentence')
            else:
                sht1.write(w, 1, self.target_sents[w])
        self.log_wp.excel_save(xls, self.out_path)

    def data_from_excel(self):
        """Read column 1 back; returns a list of sentence lists ([] for placeholders)."""
        all_data = []
        file = xlrd.open_workbook(self.out_path)
        sheet = file.sheet_by_index(0)
        col_value = sheet.col_values(1)
        # Fixed: the original wrote `for k in range(0, k)`, shadowing the loop
        # bound with the index — iterate the values directly instead.
        for str_sent in col_value:
            if str_sent == 'no target sentence':
                all_data.append([])
            else:
                # SECURITY NOTE: eval() executes arbitrary code from the
                # spreadsheet.  The cells are produced by out_to_excel above,
                # but if the file can be edited externally, switch to
                # ast.literal_eval (safe for literal-only dicts).
                dict_sent = eval(str_sent)
                # Keys are sentence indices; only the sentences are needed.
                all_data.append(list(dict_sent.values()))
        return all_data
class GetTargetInfo:
    """Write per-material extraction results (elements or properties) to Excel.

    ``related_allinfo`` maps file name -> list of per-material dicts.  Both
    writer methods MUTATE those dicts (``pop``) as they consume keys; whatever
    remains afterwards is dumped into an "other info" column.
    """

    def __init__(self, all_info, out_path, c_path):
        self.related_allinfo = all_info        # {file_name: [material dict, ...]}
        self.out_path = out_path               # destination .xlsx path
        self.log_wp = LogWp()
        self.dict_info = Dictionary(c_path)
        # Per-property regex patterns used to recognize property-name keys.
        self.prop_pattern = self.dict_info.table_prop_pattern

    def structure_ele(self):
        """Write element/composition info: one row per material.

        Columns: file name, DOI, cleaned material name, percentage, and a
        str() dump of whatever keys remain after the known ones are popped.
        """
        xls = openpyxl.Workbook()
        sht = xls.create_sheet(index=0)
        sht.cell(1, 1, "File_name")
        sht.cell(1, 2, "DOIs")
        sht.cell(1, 3, "Material")
        sht.cell(1, 4, "Percentage")
        sht.cell(1, 5, "Element_info and other_info")
        start_row = 2
        for file, ele in self.related_allinfo.items():
            for material in ele:
                sht.cell(start_row, 1, file)
                if 'doi' in material.keys():
                    sht.cell(start_row, 2, material['doi'])
                    material.pop('doi')
                if 'material' in material.keys():
                    material_name = material['material']
                    # Strip citation-style noise like " [12]" from the name.
                    noisy = re.findall('\s*\[.+\]', str(material_name))
                    if noisy:
                        for puc in noisy:
                            material_name = str(material_name).replace(puc, '')
                    sht.cell(start_row, 3, material_name)
                    material.pop('material')
                if 'percentage' in material.keys():
                    sht.cell(start_row, 4, material['percentage'])
                    material.pop('percentage')
                # Anything left (element values etc.) goes into column 5 as-is.
                if material:
                    sht.cell(start_row, 5, str(material))
                start_row += 1
        self.log_wp.excel_save(xls, self.out_path)

    def structure_prop(self, prop_name_s):
        """Write property info for *prop_name_s*: one row per material.

        Known metadata keys are popped first; the remaining key(s) are treated
        as candidate property-name/value pairs.  With exactly one leftover key
        it is written directly; with several, the first key matching the
        property's regex patterns wins.
        """
        xls = openpyxl.Workbook()
        sht = xls.create_sheet(0)
        sht.cell(1, 1, "File_name")
        sht.cell(1, 2, "DOIs")
        sht.cell(1, 3, "Table_topic")
        sht.cell(1, 4, "Material")
        sht.cell(1, 5, "unit")
        sht.cell(1, 6, "Property_name")
        sht.cell(1, 7, "Property_value")
        sht.cell(1, 8, "Child_tag")
        sht.cell(1, 9, "Other_info")
        start_row = 2
        for file, ele in self.related_allinfo.items():
            for material in ele:
                get_prop = None  # set True once a property match is written
                sht.cell(start_row, 1, file)
                if 'doi' in material.keys():
                    sht.cell(start_row, 2, material['doi'])
                    material.pop('doi')
                if 'table_topic' in material.keys():
                    sht.cell(start_row, 3, material['table_topic'])
                    material.pop('table_topic')
                if 'material' in material.keys():
                    material_name = material['material']
                    # Strip citation-style noise like " [12]" from the name.
                    noisy = re.findall('\s*\[.+\]', str(material_name))
                    if noisy:
                        for puc in noisy:
                            material_name = str(material_name).replace(puc, '')
                    sht.cell(start_row, 4, material_name)
                    material.pop('material')
                if 'unit' in material.keys():
                    # Normalize the ASCII 'degC' spelling to the degree symbol.
                    unit_replace = material['unit'].replace('degC', '°C')
                    sht.cell(start_row, 5, unit_replace)
                    material.pop('unit')
                if 'child_tag' in material.keys():
                    sht.cell(start_row, 8, str(material['child_tag']))
                    material.pop('child_tag')
                if 'other_info' in material.keys():
                    sht.cell(start_row, 9, str(material['other_info']))
                    material.pop('other_info')
                if len(material) == 1:
                    # Exactly one key left: it must be the property.
                    for prop_name, value in material.items():
                        sht.cell(start_row, 6, str(prop_name))
                        sht.cell(start_row, 7, str(value))
                elif len(material) >= 1:
                    # Several candidates: first key matching the target
                    # property's patterns is written, then stop.
                    for prop_name, value in material.items():
                        for pattern in self.prop_pattern[prop_name_s]:
                            prop_search = re.findall(pattern, str(prop_name))
                            if prop_search:
                                sht.cell(start_row, 6, str(prop_name))
                                sht.cell(start_row, 7, str(value))
                                get_prop = True
                                break
                        if get_prop:
                            break
                start_row += 1
        self.log_wp.excel_save(xls, self.out_path)
# NOTE(review): orphaned fragment — the enclosing `def` is missing from this
# file.  It duplicates the tail of relation_extraction() (see below) except
# that AllAttributes is called WITHOUT the trailing `dois` argument and with
# lowercase `c_path`.  Indentation below is reconstructed to mirror that
# function; confirm against the original source before relying on it.
                triple_lines += 1
            # Collect this sentence's triples and remember how many there were.
            n_triple = 0
            for index, v in all_outcome.items():
                out_unit.append(v)
                n_triple += 1
            # Repeat the sentence text once per extracted triple.
            for n in range(0, n_triple):
                sht2.write(num_of_lines + n, 1, sent)
            num_of_lines = num_of_lines + n_triple
        # Write the accumulated triples (alloy, property, value) for this file.
        for s in range(0, len(out_unit)):
            sht2.write(triple_lines + s, 2, out_unit[s][0])
            sht2.write(triple_lines + s, 3, out_unit[s][1])
            sht2.write(triple_lines + s, 4, out_unit[s][2])
        if out_unit:
            triple_lines = triple_lines + len(out_unit)
        else:
            triple_lines += 1
        file_index += 1
    else:
        # Document had no target sentences at all: emit placeholder row.
        out = 'no target triples'
        sht2.write(triple_lines, 2, out)
        sht2.write(triple_lines, 3, 'None')
        sht2.write(triple_lines, 4, 'None')
        sht2.write(num_of_lines, 1, 'no target sentence')
        num_of_lines += 1
        triple_lines += 1
        file_index += 1
log_wp.excel_save(xls, triple_path)
attributes = AllAttributes(prop_name, txt_name, text_path, triple_path, out_path, c_path)
attributes.get_toexcel()
def relation_extraction(self, C_path, origin_text_path, prop_name, triple_path, out_path, m_path):
    """Full text -> target sentences -> (alloy, property, value) triples.

    Pipeline: filter the raw texts into ``<m_path>/full_text``, locate target
    sentences per document (saved to ``<m_path>/sent.xls``), extract triples
    from each sentence, write them to *triple_path*, then merge everything via
    AllAttributes into *out_path*.

    NOTE(review): the row counters below (``triple_lines`` for columns 0/2-4,
    ``num_of_lines`` for column 1, ``file_index`` into ``dois``) are
    order-critical; the nesting here is reconstructed from a whitespace-mangled
    source — verify against the original before refactoring.
    """
    log_wp = LogWp()
    # The path to the folder where the full-text text is stored
    text_path = os.path.join(m_path, "full_text")
    # Locate the obtained target corpus
    TS_path = os.path.join(m_path, "sent.xls")
    # Filter to get the full text
    FT = FilterText(origin_text_path, text_path)
    txt_name, dois = FT.process()
    # Get the target corpus
    all_x = []
    txt_name2 = []
    length = len(os.listdir(text_path))
    for i in range(0, length):
        n_path = text_path + '/' + str(os.listdir(text_path)[i])
        with open(n_path, 'r', encoding='utf-8') as file:
            data = file.read()
        # Two-stage cleanup, then sentence positioning for prop_name.
        pre_processor = PreProcessor(data, C_path)
        filter_data = pre_processor.pre_processor()
        processor = TPreProcessor(filter_data, prop_name, C_path)
        filter_data = processor.processor()
        positioner = SentencePositioner(filter_data, prop_name, C_path)
        target_sents = positioner.target_sent()
        all_x.append(str(target_sents))
        txt_name2.append(n_path)
    FI_out = FI(all_x, TS_path, txt_name2)
    FI_out.out_to_excel()
    # Extraction of triples; data[i] is the sentence list for document i.
    data = FI_out.data_from_excel()
    xls = xlwt.Workbook()
    sht2 = xls.add_sheet("triple_extracion")
    triple_lines = 0   # next output row for DOI/triple columns (0, 2-4)
    file_index = 0     # document index into `dois`
    num_of_lines = 0   # next output row for the sentence column (1)
    for item in data:
        doi = dois[file_index].replace("doi:", "")
        sht2.write(triple_lines, 0, doi)
        if item != []:
            out_unit = []   # triples accumulated across this document
            sent_out = {}
            l_sent = []
            for sent in item:
                processor = TPreProcessor(sent, prop_name, C_path)
                filter_data = processor.processor()
                parse = PhraseParse(filter_data, prop_name, C_path)
                sub_order, sub_id, object_list = parse.alloy_sub_search()
                RE = RelationExtraciton(prop_name, filter_data, sub_order, sub_id, object_list, C_path)
                all_outcome = RE.triple_extraction()
                if not all_outcome:
                    # Sentence produced no triples: placeholder row.
                    out = 'no target triples'
                    sht2.write(triple_lines, 2, out)
                    sht2.write(triple_lines, 3, 'None')
                    sht2.write(triple_lines, 4, 'None')
                    sht2.write(num_of_lines, 1, 'no target sentence')
                    num_of_lines += 1
                    triple_lines += 1
                # Collect this sentence's triples; repeat the sentence text
                # once per triple in column 1.
                n_triple = 0
                for index, v in all_outcome.items():
                    out_unit.append(v)
                    n_triple += 1
                for n in range(0, n_triple):
                    sht2.write(num_of_lines + n, 1, sent)
                num_of_lines = num_of_lines + n_triple
            # Write the document's accumulated (alloy, property, value) rows.
            for s in range(0, len(out_unit)):
                sht2.write(triple_lines + s, 2, out_unit[s][0])
                sht2.write(triple_lines + s, 3, out_unit[s][1])
                sht2.write(triple_lines + s, 4, out_unit[s][2])
            if out_unit:
                triple_lines = triple_lines + len(out_unit)
            else:
                triple_lines += 1
            file_index += 1
        else:
            # Document had no target sentences: single placeholder row.
            out = 'no target triples'
            sht2.write(triple_lines, 2, out)
            sht2.write(triple_lines, 3, 'None')
            sht2.write(triple_lines, 4, 'None')
            sht2.write(num_of_lines, 1, 'no target sentence')
            num_of_lines += 1
            triple_lines += 1
            file_index += 1
    log_wp.excel_save(xls, triple_path)
    # Merge triples + source texts into the final workbook.
    attributes = AllAttributes(prop_name, txt_name, text_path, triple_path, out_path, C_path, dois)
    attributes.get_toexcel()
class AcquireTargetInfo:
    """Top-level driver: combine text-mined and table-mined property data.

    For each property in ``prop_list``, ``run()`` extracts triples from raw
    texts, extracts property/composition info from table workbooks, merges
    the two by DOI and material name, and writes one .xlsx per property.

    NOTE(review): many methods below MUTATE their input dicts via ``pop``;
    later ``len()`` checks depend on that.  Structure reconstructed from a
    whitespace-mangled source.
    """

    def __init__(self, c_path, origin_text_path, prop_list, excels_path, out_path):
        self.c_path = c_path                      # config path for Dictionary/processors
        self.prop_list = prop_list                # property names to process
        self.origin_text_path = origin_text_path  # directory of raw .txt papers
        self.excels_path = excels_path            # directory of table workbooks
        self.dict_info = Dictionary(self.c_path)
        self.out_path = out_path                  # output directory
        self.log_wp = LogWp()

    def mkdir(self, file_name):
        """(Re)create ``<cwd>\\file_name`` as an empty directory.

        If it already exists, its entire contents are deleted first.
        Windows-specific: uses a backslash join.
        """
        pathd = os.getcwd() + '\\' + file_name
        if os.path.exists(pathd):
            # Bottom-up walk so files go before their parent dirs.
            for root, dirs, files in os.walk(pathd, topdown=False):
                for name in files:
                    os.remove(os.path.join(root, name))
                for name in dirs:
                    os.rmdir(os.path.join(root, name))
            os.rmdir(pathd)
        os.mkdir(pathd)

    def get_doi_fromtxt(self, txt_path):
        """Recover a DOI from a file name: strip '.txt', first '-' -> '/'."""
        text_name = txt_path.replace(".txt", "")
        doi = text_name.replace("-", "/", 1)
        return doi

    def get_abrre(self, text, prop_name):
        """Map alloy abbreviations to their full names within *text*.

        Looks for words matching an alloy-name pattern that appear
        parenthesized, e.g. "SomeAlloyName (ABC)", and pairs the abbreviation
        with the token immediately before the parenthesis — provided that
        token also matches an alloy pattern.
        """
        processor = TPreProcessor(text, prop_name, self.c_path)
        text = processor.processor()
        sentences = nltk.sent_tokenize(text)
        # Space-split of the WHOLE text, used to find "(ABBR)" occurrences.
        sentences_split = text.split(" ")
        alloy_write_type = self.dict_info.alloy_writing_type  # regex list
        len_type = len(alloy_write_type)
        abbre_to_alloy = {}
        for sent in sentences:
            processor = TPreProcessor(sent, prop_name, self.c_path)
            filter_data = processor.processor()
            words = nltk.word_tokenize(filter_data)
            for word in words:
                for type_i in range(0, len_type):
                    outcome = re.findall(alloy_write_type[type_i], word)
                    outcome_alloy = None
                    if outcome:
                        abbre = "(" + word + ")"
                        if abbre in sentences_split:
                            # Token right before "(word)" is the candidate
                            # full alloy name.
                            index_alloy = sentences_split.index(abbre) - 1
                            alloy = sentences_split[index_alloy]
                            for type_j in range(0, len_type):
                                outcome_alloy = re.findall(alloy_write_type[type_j], alloy)
                                if outcome_alloy:
                                    abbre_to_alloy[word] = alloy
                                    break
                    if outcome_alloy:
                        break
        return abbre_to_alloy

    def get_text_triple(self, prop_name):
        """Extract (material, property, value) triples from all raw texts.

        Returns a flat list of dicts with keys 'doi', 'material',
        'prop_name', 'prop_value'.  Side effect: rebuilds the
        ``output_tt\\full_text`` scratch directory.
        """
        self.mkdir('output_tt')
        text_path = r"output_tt\full_text"
        self.mkdir(text_path)
        ft = FilterText(self.origin_text_path, text_path)
        txt_name = ft.process()
        length = len(os.listdir(self.origin_text_path))
        all_txt_info = []
        for i in range(0, length):
            n_path = os.listdir(self.origin_text_path)[i]
            doi = self.get_doi_fromtxt(n_path)
            # Filtered copy for sentence positioning...
            file = open(text_path + '/' + n_path, 'r', encoding='utf-8')
            data = file.read()
            pre_processor = PreProcessor(data, self.c_path)
            filter_txt = pre_processor.pre_processor()
            # ...but abbreviations are mined from the ORIGINAL text.
            file_origin = open(self.origin_text_path + '/' + n_path, 'r', encoding='utf-8')
            data_origin = file_origin.read()
            abbre_pairs = self.get_abrre(data_origin, prop_name)
            positioner = SentencePositioner(filter_txt, prop_name, self.c_path)
            target_sents = positioner.target_sent()
            for index, sent in target_sents.items():
                processor = TPreProcessor(sent, prop_name, self.c_path)
                filter_data = processor.processor()
                parse = PhraseParse(filter_data, prop_name, self.c_path)
                sub_order, sub_id, object_list = parse.alloy_sub_search()
                ree = RelationExtraciton(prop_name, filter_data, sub_order, sub_id, object_list, self.c_path, abbre_pairs)
                all_outcome = ree.triple_extraction()
                if all_outcome:
                    # info is (material, property name, property value).
                    for id_m, info in all_outcome.items():
                        sole_info = dict()
                        sole_info['doi'] = doi
                        sole_info['material'] = info[0]
                        sole_info['prop_name'] = info[1]
                        sole_info['prop_value'] = info[2]
                        all_txt_info.append(sole_info)
        return all_txt_info

    def gather_tableinfo_textinfo(self, all_txt_info, table_info, prop_name, prop_pattern, unit_pattern_text):
        """Merge table-derived and text-derived property info, keyed by DOI.

        Returns a list of single-key dicts {doi: [triple_info, ...]} where each
        triple_info carries 'source' ('table' or 'text'), material, unit,
        prop_name/value, etc.  MUTATES the per-material dicts in *table_info*
        (pops consumed keys) and appends text triples into the result.
        """
        gather_outcome = list()
        # --- Pass 1: normalize table info per file. ---
        for file_name, t_info in table_info.items():
            sole_info = dict()
            all_triple_info = list()
            sole_doi = list()
            for sole_m_info in t_info:
                triple_info = dict()
                triple_info['source'] = 'table'
                if 'doi' in sole_m_info.keys():
                    plus_doi = sole_m_info['doi']
                    sole_doi.append(plus_doi)
                    sole_m_info.pop('doi')
                if 'material' in sole_m_info.keys():
                    sole_material = sole_m_info['material']
                    # Strip citation-style noise like " [12]".
                    noisy = re.findall('\s*\[.+\]', str(sole_material))
                    if noisy:
                        for puc in noisy:
                            sole_material = str(sole_material).replace(puc, '')
                    triple_info['material'] = sole_material
                    sole_m_info.pop('material')
                if 'unit' in sole_m_info.keys():
                    sole_unit = sole_m_info['unit']
                    triple_info['unit'] = sole_unit
                    sole_m_info.pop('unit')
                if 'other_info' in sole_m_info.keys():
                    sole_other_info = sole_m_info['other_info']
                    triple_info['other_prop_info'] = sole_other_info
                    sole_m_info.pop('other_info')
                if 'child_tag' in sole_m_info.keys():
                    sole_tag_info = sole_m_info['child_tag']
                    triple_info['child_tag'] = sole_tag_info
                    sole_m_info.pop('child_tag')
                if 'table_topic' in sole_m_info.keys():
                    sole_tag_info = sole_m_info['table_topic']
                    triple_info['table_topic'] = sole_tag_info
                    sole_m_info.pop('table_topic')
                if len(sole_m_info) == 1:
                    # Single leftover key: it is the property.
                    for prop_name_t, value in sole_m_info.items():
                        sole_propname = str(prop_name_t)
                        triple_info['prop_name'] = sole_propname
                        sole_value = str(value)
                        triple_info['value'] = sole_value
                elif len(sole_m_info) >= 1:
                    # Multiple candidates: first key matching the target
                    # property's regex patterns wins.
                    get_prop = None
                    for prop_name_t, value in sole_m_info.items():
                        for pattern in prop_pattern[prop_name]:
                            prop_search = re.findall(pattern, str(prop_name_t))
                            if prop_search:
                                sole_propname = str(prop_name_t)
                                triple_info['prop_name'] = sole_propname
                                sole_value = str(value)
                                triple_info['value'] = sole_value
                                get_prop = True
                                break
                        if get_prop:
                            break
                all_triple_info.append(triple_info)
            # File keyed by its (first) DOI.  NOTE(review): set() makes the
            # choice arbitrary if a file held several DOIs.
            if list(set(sole_doi)):
                sole_info[list(set(sole_doi))[0]] = all_triple_info
                gather_outcome.append(sole_info)
        # Diagnostic count of table rows that carry a value.
        gather = 0
        for q in gather_outcome:
            k = tuple(q.keys())[0]
            i = q[k]
            for n in i:
                for w, v in n.items():
                    if w == 'value':
                        gather += 1
        self.log_wp.print_log("gather number :%s", gather)
        # --- Pass 2: fold in text-derived triples. ---
        copy_all_txt_info = copy.copy(all_txt_info)
        if copy_all_txt_info:
            all_text = 0
            all_gather_doi = []
            for info_one in gather_outcome:
                all_gather_doi.append(tuple(info_one.keys())[0])
            for triple_info_sole in copy_all_txt_info:
                if triple_info_sole['doi'] in all_gather_doi:
                    # DOI already present from tables: append the text triple.
                    all_text += 1
                    plus_info = dict()
                    plus_info['source'] = 'text'
                    plus_info['prop_name'] = triple_info_sole['prop_name']
                    prop_value = triple_info_sole['prop_value']
                    plus_info['material'] = triple_info_sole['material']
                    # Split the unit off the value string when recognizable.
                    unit_search = re.findall(unit_pattern_text[prop_name], str(prop_value))
                    if unit_search:
                        plus_info['unit'] = unit_search[0]
                        prop_value = prop_value.replace(unit_search[0], '')
                        plus_info['value'] = prop_value
                    else:
                        plus_info['unit'] = ""
                        plus_info['value'] = prop_value
                    for get_info in gather_outcome:
                        if tuple(get_info.keys())[0] == triple_info_sole['doi']:
                            get_info[triple_info_sole['doi']].append(plus_info)
                if triple_info_sole['doi'] not in all_gather_doi:
                    # New DOI: start a fresh entry for it.
                    all_text += 1
                    plus_info = {}
                    full_info = {}
                    sole_triple = []
                    plus_info['source'] = 'text'
                    plus_info['prop_name'] = triple_info_sole['prop_name']
                    prop_value = triple_info_sole['prop_value']
                    plus_info['material'] = triple_info_sole['material']
                    unit_search = re.findall(unit_pattern_text[prop_name], str(prop_value))
                    if unit_search:
                        plus_info['unit'] = unit_search[0]
                        prop_value = prop_value.replace(unit_search[0], '')
                        plus_info['value'] = prop_value
                    else:
                        plus_info['unit'] = ""
                        plus_info['value'] = prop_value
                    if plus_info:
                        sole_triple.append(plus_info)
                        full_info[triple_info_sole['doi']] = sole_triple
                        gather_outcome.append(full_info)
                        all_gather_doi.append(triple_info_sole['doi'])
            self.log_wp.print_log("all_text number :%s", all_text)
        return gather_outcome

    def transform_comp_outcome(self, all_composition):
        """Normalize composition info to [{doi: [material dict, ...]}, ...].

        Known element keys are lifted to the top level; leftovers are stored
        under 'other_eleinfo'.  MUTATES the input dicts (pop).
        """
        ele_list = self.dict_info.ele_list
        gather_outcome = []
        for file_name, t_info in all_composition.items():
            sole_info = {}
            all_triple_info = []
            for sole_m_info in t_info:
                triple_info = {}
                sole_doi = sole_m_info['doi']
                sole_m_info.pop('doi')
                if 'material' in sole_m_info.keys():
                    sole_material = sole_m_info['material']
                    # Strip citation-style "[12]" noise.
                    noisy = re.findall('\[.+\]', str(sole_material))
                    if noisy:
                        for puc in noisy:
                            sole_material = str(sole_material).replace(puc, '')
                    triple_info['material'] = sole_material
                    sole_m_info.pop('material')
                for element in ele_list:
                    if element in sole_m_info.keys():
                        triple_info[element] = sole_m_info[element]
                        sole_m_info.pop(element)
                if sole_m_info:
                    triple_info["other_eleinfo"] = sole_m_info
                all_triple_info.append(triple_info)
            sole_info[sole_doi] = all_triple_info
            gather_outcome.append(sole_info)
        return gather_outcome

    def allinfo_dependencyparse(self, comp_info, prop_info):
        """Join composition entries with property entries by DOI + material.

        Matching strategy: material names are compared directly and via their
        whitespace-split tokens; matched pairs are merged into one dict,
        unmatched entries pass through.  Property-only DOIs are appended at
        the end from ``prop_info_modified``.
        """
        all_ele_doi = []
        all_prop_doi = []
        outcome = []
        for doi_info_ele in comp_info:
            ele_doi = tuple(doi_info_ele.keys())[0]
            all_ele_doi.append(ele_doi)
        for doi_info_prop in prop_info:
            prop_doi = tuple(doi_info_prop.keys())[0]
            all_prop_doi.append(prop_doi)
        # Shallow copy: entries consumed by a join are removed so only
        # property-only DOIs remain at the end.
        prop_info_modified = copy.copy(prop_info)
        for doi_info_ele in comp_info:
            ele_doi = tuple(doi_info_ele.keys())[0]
            if ele_doi in all_prop_doi:
                for doi_info_prop in prop_info:
                    prop_doi = tuple(doi_info_prop.keys())[0]
                    plus_info = {}
                    all_doi_info = []
                    if ele_doi == prop_doi:
                        if doi_info_prop in prop_info_modified:
                            prop_info_modified.remove(doi_info_prop)
                        ele_doi_fullinfo = doi_info_ele[ele_doi]
                        ele_allname = []
                        prop_allname = []
                        pop_name = []   # composition materials consumed by a join
                        for one_material_ele in ele_doi_fullinfo:
                            if 'material' in one_material_ele.keys():
                                ele_m_name = one_material_ele['material']
                                ele_allname.append(ele_m_name)
                        # Multi-word names match both whole and per-token.
                        modified_ele_allname = []
                        for name in ele_allname:
                            space_search = re.findall('\s', str(name))
                            if space_search:
                                name_list = str(name).split()
                                modified_ele_allname.append(str(name))
                                for name_sepe in name_list:
                                    modified_ele_allname.append(name_sepe)
                            else:
                                modified_ele_allname.append(name)
                        for one_material_prop in doi_info_prop[prop_doi]:
                            if 'material' in one_material_prop.keys():
                                prop_m_name = one_material_prop['material']
                                prop_allname.append(prop_m_name)
                                if prop_m_name not in modified_ele_allname and len(ele_doi_fullinfo) == 1:
                                    # No name match but only one composition:
                                    # assume it applies (table source only).
                                    if one_material_prop['source'] == 'table':
                                        combine_info = {}
                                        for prop_name, prop_value in one_material_prop.items():
                                            combine_info[prop_name] = prop_value
                                        for ele_name, ele_value in ele_doi_fullinfo[0].items():
                                            combine_info[ele_name] = ele_value
                                        all_doi_info.append(combine_info)
                                    else:
                                        all_doi_info.append(one_material_prop)
                                if prop_m_name not in modified_ele_allname and len(ele_doi_fullinfo) != 1:
                                    # Ambiguous: keep the property entry alone.
                                    all_doi_info.append(one_material_prop)
                                if prop_m_name in modified_ele_allname:
                                    # Name match: merge property + composition.
                                    for one_material_ele in ele_doi_fullinfo:
                                        if 'material' in one_material_ele.keys():
                                            ele_m_name = one_material_ele['material']
                                            space_search = re.findall('\s', str(ele_m_name))
                                            if space_search:
                                                ele_m_name_split = ele_m_name.split()
                                                if prop_m_name in ele_m_name_split or prop_m_name == ele_m_name:
                                                    pop_name.append(ele_m_name)
                                                    combine_info = {}
                                                    for prop_name, prop_value in one_material_prop.items():
                                                        combine_info[prop_name] = prop_value
                                                    for ele_name, ele_value in one_material_ele.items():
                                                        combine_info[ele_name] = ele_value
                                                    all_doi_info.append(combine_info)
                                            else:
                                                if prop_m_name == ele_m_name:
                                                    combine_info = {}
                                                    for prop_name, prop_value in one_material_prop.items():
                                                        combine_info[prop_name] = prop_value
                                                    for ele_name, ele_value in one_material_ele.items():
                                                        combine_info[ele_name] = ele_value
                                                    all_doi_info.append(combine_info)
                        # Composition entries never joined nor named in the
                        # property list pass through unchanged.
                        for one_material_ele in ele_doi_fullinfo:
                            if 'material' in one_material_ele.keys():
                                ele_m_name = one_material_ele['material']
                                if ele_m_name not in pop_name:
                                    if ele_m_name not in prop_allname:
                                        all_doi_info.append(one_material_ele)
                        if all_doi_info:
                            plus_info[ele_doi] = all_doi_info
                            outcome.append(plus_info)
            else:
                # Composition-only DOI: pass through.
                outcome.append(doi_info_ele)
        for extra_prop in prop_info_modified:
            outcome.append(extra_prop)
        return outcome

    def structureinfo_toexcel(self, all_structureinfo, out_path):
        """Write the merged per-DOI records to *out_path* (one row each)."""
        ele_list = self.dict_info.ele_list
        xls = openpyxl.Workbook()
        sht = xls.create_sheet("0")
        # NOTE(review): immediately rebound — the "0" sheet above is created
        # then abandoned; kept to preserve the original workbook layout.
        sht = xls.create_sheet(index=0)
        sht.cell(1, 1, "Source")
        sht.cell(1, 2, "DOIs")
        sht.cell(1, 3, "table_topic")
        sht.cell(1, 4, "material")
        sht.cell(1, 5, "Property_name")
        sht.cell(1, 6, "Property_value")
        sht.cell(1, 7, "Unit")
        col_n = 8
        row_now = 2
        sht.cell(1, col_n, str("other_element_info"))
        col_n += 1
        sht.cell(1, col_n, str("other_property_info"))
        col_n += 1
        sht.cell(1, col_n, str("child_tag"))
        col_n += 1
        # One column per known element, from column 11 onward.
        for ele in ele_list:
            sht.cell(1, col_n, ele)
            col_n += 1
        for m_info in all_structureinfo:
            doi = tuple(m_info.keys())[0]
            length_m_info = m_info[doi]
            for index_m in range(len(length_m_info)):
                sht.cell(row_now, 2, doi)
                material_now = length_m_info[index_m]
                if 'source' in material_now.keys():
                    sht.cell(row_now, 1, str(material_now['source']))
                if 'table_topic' in material_now.keys():
                    sht.cell(row_now, 3, str(material_now['table_topic']))
                if 'material' in material_now.keys():
                    sht.cell(row_now, 4, str(material_now['material']))
                if 'prop_name' in material_now.keys():
                    sht.cell(row_now, 5, str(material_now['prop_name']))
                if 'value' in material_now.keys():
                    sht.cell(row_now, 6, str(material_now['value']))
                if 'unit' in material_now.keys():
                    sht.cell(row_now, 7, str(material_now['unit']))
                if "other_eleinfo" in material_now.keys():
                    sht.cell(row_now, 8, str(material_now['other_eleinfo']))
                if "other_prop_info" in material_now.keys():
                    sht.cell(row_now, 9, str(material_now['other_prop_info']))
                if "child_tag" in material_now.keys():
                    sht.cell(row_now, 10, str(material_now["child_tag"]))
                col_ele = 11
                for ele in ele_list:
                    if ele in material_now.keys():
                        sht.cell(row_now, col_ele, material_now[ele])
                    col_ele += 1
                row_now += 1
        # Drop openpyxl's default sheet.
        del xls['Sheet']
        self.log_wp.excel_save(xls, out_path)

    def run(self):
        """Process every property in prop_list end-to-end; one .xlsx each."""
        prop_pattern = self.dict_info.table_prop_pattern
        unit_pattern_text = self.dict_info.table_unit_pattern_text
        for prop_name in self.prop_list:
            self.mkdir('output_tt')
            text_path = r"output_tt\full_text"
            self.mkdir(text_path)
            all_txt_info = self.get_text_triple(prop_name)
            target_property = prop_name  # 'density' 'liquidus' 'solidus' 'solvus'
            te = TableExtraction(self.excels_path, self.c_path, prop_name=target_property)
            info_all = te.property_info_extraction()
            # i_l: diagnostic count of table property records (unused below).
            i_l = 0
            for k, v in info_all.items():
                i_l += len(v)
            all_composition = te.composition_triple_extraction()
            gather_outcome = self.gather_tableinfo_textinfo(
                all_txt_info, info_all, prop_name, prop_pattern, unit_pattern_text)
            # gather: diagnostic count of merged records (unused below).
            gather = 0
            for q in gather_outcome:
                k = tuple(q.keys())[0]
                i = q[k]
                gather += len(i)
            ele_transform = self.transform_comp_outcome(all_composition)
            all_structureinfo = self.allinfo_dependencyparse(ele_transform, gather_outcome)
            # b: diagnostic count of final records carrying a value (unused).
            b = 0
            for a in all_structureinfo:
                k = tuple(a.keys())[0]
                i = a[k]
                for n in i:
                    for w, v in n.items():
                        if w == 'value':
                            b += 1
            out_path = self.out_path + '/' + str(prop_name) + '.xlsx'
            self.structureinfo_toexcel(all_structureinfo, out_path)