Example 1

class AllAttributes:
    def __init__(self, prop_name, txt_name, text_path, triple_path, out_path,
                 c_path):
        self.txt_name = txt_name
        self.text_path = text_path
        self.triple_path = triple_path
        self.prop_name = prop_name
        self.out_path = out_path
        self.c_path = c_path
        self.log_wp = LogWp()

    def get_toexcel(self):
        xls = xlwt.Workbook()
        sheet = xls.add_sheet("all_attributes")
        sheet.write(0, 0, 'txt_name')
        sheet.write(0, 1, 'Full_text')
        sheet.write(0, 2, 'Target_sentences')
        sheet.write(0, 3, 'alloy')
        sheet.write(0, 4, 'property')
        sheet.write(0, 5, 'value')
        # triples_tuple
        data_triples = xlrd.open_workbook(self.triple_path)
        table_triples = data_triples.sheet_by_index(0)
        for x in range(0, len(table_triples.col_values(0))):
            sheet.write(x + 1, 1, str(table_triples.col_values(0)[x]))
            sheet.write(x + 1, 2, str(table_triples.col_values(1)[x]))
            sheet.write(x + 1, 3, str(table_triples.col_values(2)[x]))
            sheet.write(x + 1, 4, str(table_triples.col_values(3)[x]))
            sheet.write(x + 1, 5, str(table_triples.col_values(4)[x]))
        # txt_name+doi_list
        k = 0
        for i in range(0, len(table_triples.col_values(0))):
            if table_triples.col_values(0)[i]:
                sheet.write(i + 1, 0, str(self.txt_name[0][k]))
                k += 1
        # text + filtered_text_1 + Target sentences
        txt_name = os.listdir(self.text_path)
        process_text = []
        for i in range(0, len(txt_name)):
            with open(self.text_path + '/' + str(txt_name[i]), 'r', encoding='utf-8') as file:
                sole_text = file.read()
            pre_processor = PreProcessor(sole_text, self.c_path)
            filter_txt = pre_processor.pre_processor()
            path_2 = self.text_path + '/proprecess'
            os.makedirs(path_2, exist_ok=True)
            text_path = path_2 + '/' + str(txt_name[i])
            with open(text_path, 'w', encoding='utf-8') as file:
                file.write(filter_txt)
            process_text.append(text_path)
        self.log_wp.excel_save(xls, self.out_path)
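
A minimal call-site sketch for the class above; the paths and the nested txt_name list are assumptions for illustration, and LogWp, PreProcessor and the workbook behind triple_path come from the surrounding project:

# Hypothetical inputs -- only the shapes matter: txt_name is indexed as txt_name[0][k],
# and triple_path must point at the .xls written by the relation-extraction step.
txt_name = [['paper_01.txt', 'paper_02.txt']]
attributes = AllAttributes(prop_name='density',
                           txt_name=txt_name,
                           text_path='output_tt/full_text',
                           triple_path='output_tt/triple.xls',
                           out_path='output_tt/all_attributes.xls',
                           c_path='config/dictionary.ini')
attributes.get_toexcel()  # merges the triple workbook and the file names into one sheet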
Example 2

def get_extraction_outcome(xml_path, save_path, config_path):
    TableExtractor_m = TableExtractorModifiedtoalloy(xml_path, save_path, config_path)
    all_error_file = []
    xml_name = os.listdir(xml_path)
    log_wp = LogWp()
    for file_i in range(len(os.listdir(xml_path))):
        tables = None
        all_tables = []
        doi = xml_name[file_i].replace(".xml", "")
        doi = doi.replace("-", "/", 1)
        xml_n = xml_name[file_i]
        file = xml_path + '/' + str(xml_n)
        try:
            tables, captions = TableExtractor_m.get_xml_tables(doi, file)
        except Exception:
            all_error_file.append(doi)
        if tables:
            cols, rows, col_inds, row_inds = TableExtractor_m.get_headers(tables, doi)
            tab = []
            for table, row_ind, col_ind in zip(tables, row_inds, col_inds):
                curr, error_file = (TableExtractor_m.construct_table_object(doi, table, row_ind, col_ind))
                if curr:
                    tab.append(curr)
                if error_file:
                    all_error_file.append(str(doi))
            for i, (t, caption) in enumerate(zip(tab, captions)):
                if t is not None:
                    t['order'] = i
                    t['_id'] = ObjectId()
                    t['caption'] = caption
                    t['paper_doi'] = doi
                    all_tables.append(t)
                    log_wp.print_log('Success: Extracted Tables from : %s', doi)
            xls = openpyxl.Workbook()
            sheet_id = 1
            if all_tables:
                for table in all_tables:
                    sht_new = xls.create_sheet(str(sheet_id))
                    act_table = table['act_table']
                    caption = table['caption']
                    row_len = len(act_table[0])
                    doi = table['paper_doi']
                    sht_new.cell(1, 1, str(doi))
                    sht_new.cell(2, 1, str(caption))
                    start_row = 3
                    for row in act_table:
                        len_row = len(row)
                        for index in range(len_row):
                            sht_new.cell(start_row, index + 1, row[index])
                        start_row += 1
                    sheet_id += 1
                del xls['Sheet']
                log_wp.excel_save(xls, save_path + '/' + str(file_i) + "end.xlsx")
    return all_error_file, len(all_error_file)
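
A sketch of how this function might be driven; the directory names are assumptions, not part of the original example:

error_dois, n_errors = get_extraction_outcome(xml_path='papers/xml',
                                              save_path='papers/tables',
                                              config_path='config/dictionary.ini')
# error_dois lists the DOIs whose XML could not be parsed into table objects
print('failed files:', n_errors)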
Example 3

class File_IO:
    def __init__(self, target_sents, out_path, txt_name):
        self.target_sents = target_sents
        self.out_path = out_path
        self.txt_name = txt_name
        self.log_wp = LogWp()

    def out_to_excel(self):
        length_sent = len(self.target_sents)
        xls = xlwt.Workbook()
        sht1 = xls.add_sheet("Sheet1")
        for w in range(0, length_sent):
            sht1.write(w, 0, str(w) + '.txt')
            if self.target_sents[w] == '{}':
                out = 'no target sentence'
                sht1.write(w, 1, out)
            else:
                sht1.write(w, 1, self.target_sents[w])
        self.log_wp.excel_save(xls, self.out_path)

    def data_from_excel(self):
        all_data = []
        file = xlrd.open_workbook(self.out_path)
        sheet = file.sheet_by_index(0)
        col_value = sheet.col_values(1)
        num_rows = len(col_value)
        for k in range(0, num_rows):
            unit_data = []
            str_sent = col_value[k]
            if str_sent == 'no target sentence':
                all_data.append([])
            else:
                dict_sent = eval(str_sent)
                for i, sent in dict_sent.items():
                    unit_data.append(sent)
                all_data.append(unit_data)
        return all_data
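
A small round-trip sketch for File_IO; the sentence strings are hypothetical and mimic the '{index: sentence}' / '{}' format the class expects, and LogWp.excel_save is assumed to write the xlwt workbook to out_path:

sents = ["{0: 'The solidus temperature is 1350 degC.'}", '{}']
io = File_IO(target_sents=sents, out_path='output_tt/sent.xls', txt_name=['a.txt', 'b.txt'])
io.out_to_excel()                  # one row per file: name in column 0, sentences in column 1
recovered = io.data_from_excel()   # [['The solidus temperature is 1350 degC.'], []]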
Example 4

class GetTargetInfo:
    def __init__(self, all_info, out_path, c_path):
        self.related_allinfo = all_info
        self.out_path = out_path
        self.log_wp = LogWp()
        self.dict_info = Dictionary(c_path)
        self.prop_pattern = self.dict_info.table_prop_pattern

    def structure_ele(self):
        xls = openpyxl.Workbook()
        sht = xls.create_sheet(index=0)
        sht.cell(1, 1, "File_name")
        sht.cell(1, 2, "DOIs")
        sht.cell(1, 3, "Material")
        sht.cell(1, 4, "Percentage")
        sht.cell(1, 5, "Element_info and other_info")
        start_row = 2
        for file, ele in self.related_allinfo.items():
            for material in ele:
                sht.cell(start_row, 1, file)
                if 'doi' in material.keys():
                    sht.cell(start_row, 2, material['doi'])
                    material.pop('doi')
                if 'material' in material.keys():
                    material_name = material['material']
                    noisy = re.findall(r'\s*\[.+\]', str(material_name))
                    if noisy:
                        for puc in noisy:
                            material_name = str(material_name).replace(puc, '')
                    sht.cell(start_row, 3, material_name)
                    material.pop('material')
                if 'percentage' in material.keys():
                    sht.cell(start_row, 4, material['percentage'])
                    material.pop('percentage')
                if material:
                    sht.cell(start_row, 5, str(material))
                start_row += 1
        self.log_wp.excel_save(xls, self.out_path)

    def structure_prop(self, prop_name_s):
        xls = openpyxl.Workbook()
        sht = xls.create_sheet(index=0)
        sht.cell(1, 1, "File_name")
        sht.cell(1, 2, "DOIs")
        sht.cell(1, 3, "Table_topic")
        sht.cell(1, 4, "Material")
        sht.cell(1, 5, "unit")
        sht.cell(1, 6, "Property_name")
        sht.cell(1, 7, "Property_value")
        sht.cell(1, 8, "Child_tag")
        sht.cell(1, 9, "Other_info")
        start_row = 2
        for file, ele in self.related_allinfo.items():
            for material in ele:
                get_prop = None
                sht.cell(start_row, 1, file)
                if 'doi' in material.keys():
                    sht.cell(start_row, 2, material['doi'])
                    material.pop('doi')
                if 'table_topic' in material.keys():
                    sht.cell(start_row, 3, material['table_topic'])
                    material.pop('table_topic')
                if 'material' in material.keys():
                    material_name = material['material']
                    noisy = re.findall(r'\s*\[.+\]', str(material_name))
                    if noisy:
                        for puc in noisy:
                            material_name = str(material_name).replace(puc, '')
                    sht.cell(start_row, 4, material_name)
                    material.pop('material')
                if 'unit' in material.keys():
                    unit_replace = material['unit'].replace('degC', '°C')
                    sht.cell(start_row, 5, unit_replace)
                    material.pop('unit')
                if 'child_tag' in material.keys():
                    sht.cell(start_row, 8, str(material['child_tag']))
                    material.pop('child_tag')
                if 'other_info' in material.keys():
                    sht.cell(start_row, 9, str(material['other_info']))
                    material.pop('other_info')
                if len(material) == 1:
                    for prop_name, value in material.items():
                        sht.cell(start_row, 6, str(prop_name))
                        sht.cell(start_row, 7, str(value))
                elif len(material) >= 1:
                    for prop_name, value in material.items():
                        for pattern in self.prop_pattern[prop_name_s]:
                            prop_search = re.findall(pattern, str(prop_name))
                            if prop_search:
                                sht.cell(start_row, 6, str(prop_name))
                                sht.cell(start_row, 7, str(value))
                                get_prop = True
                                break
                        if get_prop:
                            break
                start_row += 1
        self.log_wp.excel_save(xls, self.out_path)
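
A usage sketch for GetTargetInfo; the input dictionary below is a hypothetical output of the table-parsing step, and c_path is assumed to point at the project's dictionary/config file:

all_info = {'tables_01.xlsx': [{'doi': '10.1016/j.example', 'material': 'AA7075 [1]',
                                'percentage': 'wt.%', 'Zn': '5.6', 'Mg': '2.5'}]}
gti = GetTargetInfo(all_info, out_path='output_tt/elements.xlsx', c_path='config/dictionary.ini')
gti.structure_ele()              # File_name / DOIs / Material / Percentage / remaining element info
# gti.structure_prop('density')  # the property-table variant of the same routine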
Example 5

                    triple_lines += 1
                n_triple = 0
                for index, v in all_outcome.items():
                    out_unit.append(v)
                    n_triple += 1
                for n in range(0, n_triple):
                    sht2.write(num_of_lines + n, 1, sent)
                num_of_lines = num_of_lines + n_triple

            for s in range(0, len(out_unit)):
                sht2.write(triple_lines + s, 2, out_unit[s][0])
                sht2.write(triple_lines + s, 3, out_unit[s][1])
                sht2.write(triple_lines + s, 4, out_unit[s][2])
            if out_unit:
                triple_lines = triple_lines + len(out_unit)
            else:
                triple_lines += 1
            file_index += 1
        else:
            out = 'no target triples'
            sht2.write(triple_lines, 2, out)
            sht2.write(triple_lines, 3, 'None')
            sht2.write(triple_lines, 4, 'None')
            sht2.write(num_of_lines, 1, 'no target sentence')
            num_of_lines += 1
            triple_lines += 1
            file_index += 1
    log_wp.excel_save(xls, triple_path)
    attributes = AllAttributes(prop_name, txt_name, text_path, triple_path, out_path, c_path)
    attributes.get_toexcel()
Example 6

    def relation_extraction(self, C_path, origin_text_path, prop_name,
                            triple_path, out_path, m_path):
        log_wp = LogWp()
        # The path to the folder where the full-text text is stored
        text_path = os.path.join(m_path, "full_text")
        # Locate the obtained target corpus
        TS_path = os.path.join(m_path, "sent.xls")
        # Filter to get the full text
        FT = FilterText(origin_text_path, text_path)
        txt_name, dois = FT.process()
        # Get the target corpus
        all_x = []
        txt_name2 = []
        file_names = os.listdir(text_path)
        for name in file_names:
            n_path = text_path + '/' + str(name)
            with open(n_path, 'r', encoding='utf-8') as file:
                data = file.read()
            pre_processor = PreProcessor(data, C_path)
            filter_data = pre_processor.pre_processor()
            processor = TPreProcessor(filter_data, prop_name, C_path)
            filter_data = processor.processor()
            positioner = SentencePositioner(filter_data, prop_name, C_path)
            target_sents = positioner.target_sent()

            # print(target_sents)
            all_x.append(str(target_sents))
            txt_name2.append(n_path)
        FI_out = FI(all_x, TS_path, txt_name2)
        FI_out.out_to_excel()

        # Extraction of triples
        data = FI_out.data_from_excel()
        xls = xlwt.Workbook()
        sht2 = xls.add_sheet("triple_extraction")
        triple_lines = 0  # next output row for the triple columns
        file_index = 0  # index of the current document
        num_of_lines = 0  # next output row for the sentence column
        for item in data:
            doi = dois[file_index].replace("doi:", "")
            sht2.write(triple_lines, 0, doi)
            if item != []:
                out_unit = []
                sent_out = {}
                l_sent = []
                for sent in item:
                    processor = TPreProcessor(sent, prop_name, C_path)
                    filter_data = processor.processor()
                    parse = PhraseParse(filter_data, prop_name, C_path)
                    sub_order, sub_id, object_list = parse.alloy_sub_search()
                    RE = RelationExtraciton(prop_name, filter_data, sub_order,
                                            sub_id, object_list, C_path)
                    all_outcome = RE.triple_extraction()
                    if not all_outcome:
                        out = 'no target triples'
                        sht2.write(triple_lines, 2, out)
                        sht2.write(triple_lines, 3, 'None')
                        sht2.write(triple_lines, 4, 'None')
                        sht2.write(num_of_lines, 1, 'no target sentence')
                        num_of_lines += 1
                        triple_lines += 1
                    n_triple = 0
                    for index, v in all_outcome.items():
                        out_unit.append(v)
                        n_triple += 1
                    for n in range(0, n_triple):
                        sht2.write(num_of_lines + n, 1, sent)
                    num_of_lines = num_of_lines + n_triple
                for s in range(0, len(out_unit)):
                    sht2.write(triple_lines + s, 2, out_unit[s][0])
                    sht2.write(triple_lines + s, 3, out_unit[s][1])
                    sht2.write(triple_lines + s, 4, out_unit[s][2])
                if out_unit:
                    triple_lines = triple_lines + len(out_unit)
                else:
                    triple_lines += 1
                file_index += 1
            else:
                out = 'no target triples'
                sht2.write(triple_lines, 2, out)
                sht2.write(triple_lines, 3, 'None')
                sht2.write(triple_lines, 4, 'None')
                sht2.write(num_of_lines, 1, 'no target sentence')
                num_of_lines += 1
                triple_lines += 1
                file_index += 1
        log_wp.excel_save(xls, triple_path)
        attributes = AllAttributes(prop_name, txt_name, text_path, triple_path,
                                   out_path, C_path, dois)
        attributes.get_toexcel()
class AcquireTargetInfo:
    def __init__(self, c_path, origin_text_path, prop_list, excels_path,
                 out_path):
        self.c_path = c_path
        self.prop_list = prop_list
        self.origin_text_path = origin_text_path
        self.excels_path = excels_path
        self.dict_info = Dictionary(self.c_path)
        self.out_path = out_path
        self.log_wp = LogWp()

    def mkdir(self, file_name):
        pathd = os.getcwd() + '\\' + file_name
        if os.path.exists(pathd):
            for root, dirs, files in os.walk(pathd, topdown=False):
                for name in files:
                    os.remove(os.path.join(root, name))
                for name in dirs:
                    os.rmdir(os.path.join(root, name))
            os.rmdir(pathd)
        os.mkdir(pathd)

    def get_doi_fromtxt(self, txt_path):
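        # Hypothetical file name, shown only to illustrate the conversion:
        # "10.1016-j.jallcom.2020.154000.txt" -> "10.1016/j.jallcom.2020.154000"
        # (only the first '-' is turned back into '/').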
        text_name = txt_path.replace(".txt", "")
        doi = text_name.replace("-", "/", 1)
        return doi

    def get_abrre(self, text, prop_name):
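        # For a parenthesised token "(X)" whose content matches one of the
        # alloy-name patterns, record X -> the token immediately before "(X)"
        # (provided that token also matches an alloy pattern) in abbre_to_alloy.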
        processor = TPreProcessor(text, prop_name, self.c_path)
        text = processor.processor()
        sentences = nltk.sent_tokenize(text)
        sentences_split = text.split(" ")
        alloy_write_type = self.dict_info.alloy_writing_type
        len_type = len(alloy_write_type)
        abbre_to_alloy = {}
        for sent in sentences:
            processor = TPreProcessor(sent, prop_name, self.c_path)
            filter_data = processor.processor()
            words = nltk.word_tokenize(filter_data)
            for word in words:
                for type_i in range(0, len_type):
                    outcome = re.findall(alloy_write_type[type_i], word)
                    outcome_alloy = None
                    if outcome:
                        abbre = "(" + word + ")"
                        if abbre in sentences_split:
                            index_alloy = sentences_split.index(abbre) - 1
                            alloy = sentences_split[index_alloy]
                            for type_j in range(0, len_type):
                                outcome_alloy = re.findall(
                                    alloy_write_type[type_j], alloy)
                                if outcome_alloy:
                                    abbre_to_alloy[word] = alloy
                                    break
                    if outcome_alloy:
                        break
        return abbre_to_alloy

    def get_text_triple(self, prop_name):
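        # Returns a flat list with one dict per extracted triple:
        # {'doi': ..., 'material': ..., 'prop_name': ..., 'prop_value': ...},
        # covering every file under origin_text_path.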
        self.mkdir('output_tt')
        text_path = r"output_tt\full_text"
        self.mkdir(text_path)
        ft = FilterText(self.origin_text_path, text_path)
        txt_name = ft.process()
        file_names = os.listdir(self.origin_text_path)
        all_txt_info = []
        for n_path in file_names:
            doi = self.get_doi_fromtxt(n_path)
            with open(text_path + '/' + n_path, 'r', encoding='utf-8') as file:
                data = file.read()
            pre_processor = PreProcessor(data, self.c_path)
            filter_txt = pre_processor.pre_processor()
            with open(self.origin_text_path + '/' + n_path, 'r', encoding='utf-8') as file_origin:
                data_origin = file_origin.read()
            abbre_pairs = self.get_abrre(data_origin, prop_name)
            positioner = SentencePositioner(filter_txt, prop_name, self.c_path)
            target_sents = positioner.target_sent()
            for index, sent in target_sents.items():
                processor = TPreProcessor(sent, prop_name, self.c_path)
                filter_data = processor.processor()
                parse = PhraseParse(filter_data, prop_name, self.c_path)
                sub_order, sub_id, object_list = parse.alloy_sub_search()
                ree = RelationExtraciton(prop_name, filter_data, sub_order,
                                         sub_id, object_list, self.c_path,
                                         abbre_pairs)
                all_outcome = ree.triple_extraction()
                if all_outcome:
                    for id_m, info in all_outcome.items():
                        sole_info = dict()
                        sole_info['doi'] = doi
                        sole_info['material'] = info[0]
                        sole_info['prop_name'] = info[1]
                        sole_info['prop_value'] = info[2]
                        all_txt_info.append(sole_info)
        return all_txt_info

    def gather_tableinfo_textinfo(self, all_txt_info, table_info, prop_name,
                                  prop_pattern, unit_pattern_text):
        gather_outcome = list()
        for file_name, t_info in table_info.items():
            sole_info = dict()
            all_triple_info = list()
            sole_doi = list()
            for sole_m_info in t_info:
                triple_info = dict()
                triple_info['source'] = 'table'
                if 'doi' in sole_m_info.keys():
                    plus_doi = sole_m_info['doi']
                    sole_doi.append(plus_doi)
                    sole_m_info.pop('doi')
                if 'material' in sole_m_info.keys():
                    sole_material = sole_m_info['material']
                    noisy = re.findall(r'\s*\[.+\]', str(sole_material))
                    if noisy:
                        for puc in noisy:
                            sole_material = str(sole_material).replace(puc, '')
                    triple_info['material'] = sole_material
                    sole_m_info.pop('material')
                if 'unit' in sole_m_info.keys():
                    sole_unit = sole_m_info['unit']
                    triple_info['unit'] = sole_unit
                    sole_m_info.pop('unit')
                if 'other_info' in sole_m_info.keys():
                    sole_other_info = sole_m_info['other_info']
                    triple_info['other_prop_info'] = sole_other_info
                    sole_m_info.pop('other_info')
                if 'child_tag' in sole_m_info.keys():
                    sole_tag_info = sole_m_info['child_tag']
                    triple_info['child_tag'] = sole_tag_info
                    sole_m_info.pop('child_tag')
                if 'table_topic' in sole_m_info.keys():
                    sole_tag_info = sole_m_info['table_topic']
                    triple_info['table_topic'] = sole_tag_info
                    sole_m_info.pop('table_topic')
                if len(sole_m_info) == 1:
                    for prop_name_t, value in sole_m_info.items():
                        sole_propname = str(prop_name_t)
                        triple_info['prop_name'] = sole_propname
                        sole_value = str(value)
                        triple_info['value'] = sole_value
                elif len(sole_m_info) >= 1:
                    get_prop = None
                    for prop_name_t, value in sole_m_info.items():
                        for pattern in prop_pattern[prop_name]:
                            prop_search = re.findall(pattern, str(prop_name_t))
                            if prop_search:
                                sole_propname = str(prop_name_t)
                                triple_info['prop_name'] = sole_propname
                                sole_value = str(value)
                                triple_info['value'] = sole_value
                                get_prop = True
                                break
                        if get_prop:
                            break
                all_triple_info.append(triple_info)
            if list(set(sole_doi)):
                sole_info[list(set(sole_doi))[0]] = all_triple_info
                gather_outcome.append(sole_info)
        gather = 0
        for q in gather_outcome:
            k = tuple(q.keys())[0]
            i = q[k]
            for n in i:
                for w, v in n.items():
                    if w == 'value':
                        gather += 1
        self.log_wp.print_log("gather number :%s", gather)
        copy_all_txt_info = copy.copy(all_txt_info)
        if copy_all_txt_info:
            all_text = 0
            all_gather_doi = []
            for info_one in gather_outcome:
                all_gather_doi.append(tuple(info_one.keys())[0])
            for triple_info_sole in copy_all_txt_info:
                if triple_info_sole['doi'] in all_gather_doi:
                    all_text += 1
                    plus_info = dict()
                    plus_info['source'] = 'text'
                    plus_info['prop_name'] = triple_info_sole['prop_name']
                    prop_value = triple_info_sole['prop_value']
                    plus_info['material'] = triple_info_sole['material']
                    unit_search = re.findall(unit_pattern_text[prop_name],
                                             str(prop_value))
                    if unit_search:
                        plus_info['unit'] = unit_search[0]
                        prop_value = prop_value.replace(unit_search[0], '')
                        plus_info['value'] = prop_value
                    else:
                        plus_info['unit'] = ""
                        plus_info['value'] = prop_value
                    for get_info in gather_outcome:
                        if tuple(get_info.keys())[0] == triple_info_sole['doi']:
                            get_info[triple_info_sole['doi']].append(plus_info)
                if triple_info_sole['doi'] not in all_gather_doi:
                    all_text += 1
                    plus_info = {}
                    full_info = {}
                    sole_triple = []
                    plus_info['source'] = 'text'
                    plus_info['prop_name'] = triple_info_sole['prop_name']
                    prop_value = triple_info_sole['prop_value']
                    plus_info['material'] = triple_info_sole['material']
                    unit_search = re.findall(unit_pattern_text[prop_name],
                                             str(prop_value))
                    if unit_search:
                        plus_info['unit'] = unit_search[0]
                        prop_value = prop_value.replace(unit_search[0], '')
                        plus_info['value'] = prop_value
                    else:
                        plus_info['unit'] = ""
                        plus_info['value'] = prop_value
                    if plus_info:
                        sole_triple.append(plus_info)
                        full_info[triple_info_sole['doi']] = sole_triple
                        gather_outcome.append(full_info)
                        all_gather_doi.append(triple_info_sole['doi'])
            self.log_wp.print_log("all_text number :%s", all_text)
        return gather_outcome

    def transform_comp_outcome(self, all_composition):
        ele_list = self.dict_info.ele_list
        gather_outcome = []
        for file_name, t_info in all_composition.items():
            sole_info = {}
            all_triple_info = []
            for sole_m_info in t_info:
                triple_info = {}
                sole_doi = sole_m_info['doi']
                sole_m_info.pop('doi')
                if 'material' in sole_m_info.keys():
                    sole_material = sole_m_info['material']
                    noisy = re.findall(r'\[.+\]', str(sole_material))
                    if noisy:
                        for puc in noisy:
                            sole_material = str(sole_material).replace(puc, '')
                    triple_info['material'] = sole_material
                    sole_m_info.pop('material')
                for element in ele_list:
                    if element in sole_m_info.keys():
                        triple_info[element] = sole_m_info[element]
                        sole_m_info.pop(element)
                if sole_m_info:
                    triple_info["other_eleinfo"] = sole_m_info
                all_triple_info.append(triple_info)
            sole_info[sole_doi] = all_triple_info
            gather_outcome.append(sole_info)
        return gather_outcome

    def allinfo_dependencyparse(self, comp_info, prop_info):
        all_ele_doi = []
        all_prop_doi = []
        outcome = []
        for doi_info_ele in comp_info:
            ele_doi = tuple(doi_info_ele.keys())[0]
            all_ele_doi.append(ele_doi)
        for doi_info_prop in prop_info:
            prop_doi = tuple(doi_info_prop.keys())[0]
            all_prop_doi.append(prop_doi)
        prop_info_modified = copy.copy(prop_info)
        for doi_info_ele in comp_info:
            ele_doi = tuple(doi_info_ele.keys())[0]
            if ele_doi in all_prop_doi:
                for doi_info_prop in prop_info:
                    prop_doi = tuple(doi_info_prop.keys())[0]
                    plus_info = {}
                    all_doi_info = []
                    if ele_doi == prop_doi:
                        if doi_info_prop in prop_info_modified:
                            prop_info_modified.remove(doi_info_prop)
                        ele_doi_fullinfo = doi_info_ele[ele_doi]
                        ele_allname = []
                        prop_allname = []
                        pop_name = []
                        for one_material_ele in ele_doi_fullinfo:
                            if 'material' in one_material_ele.keys():
                                ele_m_name = one_material_ele['material']
                                ele_allname.append(ele_m_name)
                        modified_ele_allname = []
                        for name in ele_allname:
                            space_search = re.findall(r'\s', str(name))
                            if space_search:
                                name_list = str(name).split()
                                modified_ele_allname.append(str(name))
                                for name_sepe in name_list:
                                    modified_ele_allname.append(name_sepe)
                            else:
                                modified_ele_allname.append(name)
                        for one_material_prop in doi_info_prop[prop_doi]:
                            if 'material' in one_material_prop.keys():
                                prop_m_name = one_material_prop['material']
                                prop_allname.append(prop_m_name)
                                if prop_m_name not in modified_ele_allname and len(ele_doi_fullinfo) == 1:
                                    if one_material_prop['source'] == 'table':
                                        combine_info = {}
                                        for prop_name, prop_value in one_material_prop.items():
                                            combine_info[prop_name] = prop_value
                                        for ele_name, ele_value in ele_doi_fullinfo[0].items():
                                            combine_info[ele_name] = ele_value
                                        all_doi_info.append(combine_info)
                                    else:
                                        all_doi_info.append(one_material_prop)
                                if prop_m_name not in modified_ele_allname and len(ele_doi_fullinfo) != 1:
                                    all_doi_info.append(one_material_prop)
                                if prop_m_name in modified_ele_allname:
                                    for one_material_ele in ele_doi_fullinfo:
                                        if 'material' in one_material_ele.keys():
                                            ele_m_name = one_material_ele['material']
                                            space_search = re.findall(r'\s', str(ele_m_name))
                                            if space_search:
                                                ele_m_name_split = ele_m_name.split()
                                                if prop_m_name in ele_m_name_split or prop_m_name == ele_m_name:
                                                    pop_name.append(ele_m_name)
                                                    combine_info = {}
                                                    for prop_name, prop_value in one_material_prop.items():
                                                        combine_info[prop_name] = prop_value
                                                    for ele_name, ele_value in one_material_ele.items():
                                                        combine_info[ele_name] = ele_value
                                                    all_doi_info.append(combine_info)
                                            else:
                                                if prop_m_name == ele_m_name:
                                                    combine_info = {}
                                                    for prop_name, prop_value in one_material_prop.items():
                                                        combine_info[prop_name] = prop_value
                                                    for ele_name, ele_value in one_material_ele.items():
                                                        combine_info[ele_name] = ele_value
                                                    all_doi_info.append(combine_info)
                        for one_material_ele in ele_doi_fullinfo:
                            if 'material' in one_material_ele.keys():
                                ele_m_name = one_material_ele['material']
                                if ele_m_name not in pop_name:
                                    if ele_m_name not in prop_allname:
                                        all_doi_info.append(one_material_ele)
                    if all_doi_info:
                        plus_info[ele_doi] = all_doi_info
                        outcome.append(plus_info)
            else:
                outcome.append(doi_info_ele)
        for extra_prop in prop_info_modified:
            outcome.append(extra_prop)
        return outcome

    def structureinfo_toexcel(self, all_structureinfo, out_path):
        ele_list = self.dict_info.ele_list
        xls = openpyxl.Workbook()
        sht = xls.create_sheet("0")
        sht = xls.create_sheet(index=0)
        sht.cell(1, 1, "Source")
        sht.cell(1, 2, "DOIs")
        sht.cell(1, 3, "table_topic")
        sht.cell(1, 4, "material")
        sht.cell(1, 5, "Property_name")
        sht.cell(1, 6, "Property_value")
        sht.cell(1, 7, "Unit")
        col_n = 8
        row_now = 2
        sht.cell(1, col_n, str("other_element_info"))
        col_n += 1
        sht.cell(1, col_n, str("other_property_info"))
        col_n += 1
        sht.cell(1, col_n, str("child_tag"))
        col_n += 1
        for ele in ele_list:
            sht.cell(1, col_n, ele)
            col_n += 1
        for m_info in all_structureinfo:
            doi = tuple(m_info.keys())[0]
            length_m_info = m_info[doi]
            for index_m in range(len(length_m_info)):
                sht.cell(row_now, 2, doi)
                material_now = length_m_info[index_m]
                if 'source' in material_now.keys():
                    sht.cell(row_now, 1, str(material_now['source']))
                if 'table_topic' in material_now.keys():
                    sht.cell(row_now, 3, str(material_now['table_topic']))
                if 'material' in material_now.keys():
                    sht.cell(row_now, 4, str(material_now['material']))
                if 'prop_name' in material_now.keys():
                    sht.cell(row_now, 5, str(material_now['prop_name']))
                if 'value' in material_now.keys():
                    sht.cell(row_now, 6, str(material_now['value']))
                if 'unit' in material_now.keys():
                    sht.cell(row_now, 7, str(material_now['unit']))
                if "other_eleinfo" in material_now.keys():
                    sht.cell(row_now, 8, str(material_now['other_eleinfo']))
                if "other_prop_info" in material_now.keys():
                    sht.cell(row_now, 9, str(material_now['other_prop_info']))
                if "child_tag" in material_now.keys():
                    sht.cell(row_now, 10, str(material_now["child_tag"]))
                col_ele = 11
                for ele in ele_list:
                    if ele in material_now.keys():
                        sht.cell(row_now, col_ele, material_now[ele])
                    col_ele += 1
                row_now += 1
        del xls['Sheet']
        self.log_wp.excel_save(xls, out_path)

    def run(self):
        prop_pattern = self.dict_info.table_prop_pattern
        unit_pattern_text = self.dict_info.table_unit_pattern_text
        for prop_name in self.prop_list:
            self.mkdir('output_tt')
            text_path = r"output_tt\full_text"
            self.mkdir(text_path)
            all_txt_info = self.get_text_triple(prop_name)
            target_property = prop_name  # 'density' 'liquidus'  'solidus'  'solvus'
            te = TableExtraction(self.excels_path,
                                 self.c_path,
                                 prop_name=target_property)
            info_all = te.property_info_extraction()
            i_l = 0
            for k, v in info_all.items():
                i_l += len(v)
            all_composition = te.composition_triple_extraction()
            gather_outcome = self.gather_tableinfo_textinfo(
                all_txt_info, info_all, prop_name, prop_pattern,
                unit_pattern_text)
            gather = 0
            for q in gather_outcome:
                k = tuple(q.keys())[0]
                i = q[k]
                gather += len(i)
            ele_transform = self.transform_comp_outcome(all_composition)
            all_structureinfo = self.allinfo_dependencyparse(
                ele_transform, gather_outcome)
            b = 0
            for a in all_structureinfo:
                k = tuple(a.keys())[0]
                i = a[k]
                for n in i:
                    for w, v in n.items():
                        if w == 'value':
                            b += 1
            out_path = self.out_path + '/' + str(prop_name) + '.xlsx'
            self.structureinfo_toexcel(all_structureinfo, out_path)
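
A minimal driver sketch for the class above; every path below is an assumption, and run() also creates its own output_tt working directory under the current working directory:

if __name__ == '__main__':
    acquire = AcquireTargetInfo(c_path='config/dictionary.ini',
                                origin_text_path='papers/txt',
                                prop_list=['density', 'solidus'],
                                excels_path='papers/tables',
                                out_path='output')
    acquire.run()  # writes one <property>.xlsx per entry of prop_list into out_path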