def get_extraction_outcome(xml_path, save_path, config_path):
    table_extractor_m = TableExtractorModifiedtoalloy(xml_path, save_path, config_path)
    all_error_file = []
    xml_name = os.listdir(xml_path)
    log_wp = LogWp()
    for file_i in range(len(os.listdir(xml_path))):
        tables = None
        all_tables = []
        doi = xml_name[file_i].replace(".xml", "")  # choose target file according to doi
        doi = doi.replace("-", "/", 1)
        xml_n = xml_name[file_i]
        file = xml_path + '/' + str(xml_n)
        try:
            tables, captions = table_extractor_m.get_xml_tables(doi, file)
        except Exception as e:
            print(e)
            all_error_file.append(doi)
            tables = None
            captions = None
        if tables:
            cols, rows, col_inds, row_inds = table_extractor_m.get_headers(tables, doi)
            tab = []
            for table, row_ind, col_ind in zip(tables, row_inds, col_inds):
                curr, error_file = table_extractor_m.construct_table_object(doi, table, row_ind, col_ind)
                if curr:
                    tab.append(curr)
                if error_file:
                    all_error_file.append(str(doi))
            for i, (t, caption) in enumerate(zip(tab, captions)):
                if t is not None:
                    t['order'] = i
                    t['_id'] = ObjectId()
                    t['caption'] = caption
                    t['paper_doi'] = doi
                    all_tables.append(t)
            log_wp.print_log('Success: Extracted Tables from %s', doi)
        xls = openpyxl.Workbook()
        sheet_id = 1
        if all_tables:
            for table in all_tables:
                sht_new = xls.create_sheet(str(sheet_id))
                act_table = table['act_table']
                caption = table['caption']
                row_len = len(act_table[0])
                doi = table['paper_doi']
                sht_new.cell(1, 1, str(doi))
                sht_new.cell(2, 1, str(caption))
                start_row = 3
                for row in act_table:
                    len_row = len(row)
                    for index in range(len_row):
                        sht_new.cell(start_row, index + 1, row[index])
                    start_row += 1
                sheet_id += 1
            del xls['Sheet']
            xls.save(save_path + '/' + str(file_i) + ".xlsx")
    return all_error_file, len(all_error_file)
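
# Usage sketch for get_extraction_outcome (the three paths below are hypothetical
# placeholders, not files shipped with this module): it walks a folder of
# "<doi>.xml" fulltexts, extracts every table, writes one workbook per paper
# into save_path, and returns the DOIs that failed.
def example_extract_xml_tables():
    error_dois, n_errors = get_extraction_outcome(
        xml_path='data/xml_papers',           # hypothetical folder of XML fulltexts
        save_path='output/xml_tables',        # hypothetical output folder for .xlsx files
        config_path='config/dictionary.ini')  # hypothetical dictionary/config file
    print('DOIs that failed:', error_dois, 'count:', n_errors)
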
class GetTableHtml:
    def __init__(self, doi_path, output_path):
        self.doi_path = doi_path
        self.output_path = output_path
        self.log_wp = LogWp()

    def get_all_url(self, url):
        """
        Return all URLs found on the page.
        :param url: url of one page
        :return: all urls as list
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        req = urllib.request.Request(url=url, headers=headers)
        html = urllib.request.urlopen(req).read().decode("utf-8")
        soup = BeautifulSoup(html, features='html.parser')
        tags = soup.find_all('a')
        all_url = []
        for tag in tags:
            all_url.append(str(tag.get('href')).strip())
        return all_url

    def get_table_url(self, doi_info):
        """
        Return the table URLs on the page for Springer and Nature Publishing Group.
        :param doi_info: doi_info of article
        :return: table urls as list
        """
        all_url = self.get_all_url(doi_info[0])
        table_url = []
        for i in all_url:
            # keep only links that point to article tables
            if "article" in i and "table" in i:
                if "%" not in i and "#" not in i and "?" not in i:
                    if len(i) <= 150:
                        print(str(i))
                        if doi_info[1] in "Springer":
                            table_url.append('https://link.springer.com' + i)
                        else:
                            table_url.append('https://www.nature.com' + i)
        if len(table_url) == 0:
            print("There is no table url in this article!")
        print(str(table_url))
        return table_url

    def doi_info(self, doi_str):
        """
        Get the landing URL and database of a doi.
        :param doi_str: doi as str
        :return: doi_info = [doi_url, doi_database, doi_str]
        """
        global doi_url
        doi_info = []
        if doi_str[0:7] in "10.1016":
            doi_url = "https://doi.org/" + doi_str
            doi_database = "Elsevier"
        elif doi_str[0:7] in ["10.1007", "10.1361", "10.1023"]:
            doi_url = "https://link.springer.com/article/" + doi_str
            doi_database = "Springer"
        elif doi_str[0:7] in "10.1080":
            doi_url = "https://doi.org/" + doi_str
            doi_database = "Taylor & Francis Online"
        elif doi_str[0:7] in ["10.1002", "10.1111"]:
            doi_url = "https://onlinelibrary.wiley.com/doi/" + doi_str
            doi_database = "Wiley Blackwell"
        elif doi_str[0:7] in "10.1115":
            doi_url = "https://doi.org/" + doi_str
            doi_database = "ASME International"
        elif doi_str[0:7] in "10.3390":
            # MDPI DOIs redirect, so resolve the landing page first
            all_url = self.get_all_url("https://doi.org/" + doi_str)
            for url_str in all_url:
                if "htm" in url_str:
                    doi_url = "https://www.mdpi.com/" + url_str
                    break
            doi_database = "MDPI"
        elif doi_str[0:7] == "10.1038":
            doi_url = "https://doi.org/" + doi_str
            doi_database = "Nature Publishing Group"
        else:
            doi_url = "other URL"
            doi_database = "other database"
        doi_info.append(doi_url)
        doi_info.append(doi_database)
        doi_info.append(doi_str)
        return doi_info

    def file_name(self, name):
        # replace characters that are not allowed in file names
        st = '\|/:?*<>;'
        for s in st:
            if s in name:
                name = name.replace(s, '-')
        return name

    def get_table(self, doi_info, path=r'table.xlsx'):
        """
        Get every table on the page and write it to an Excel workbook.
        :param doi_info: [doi_url, doi_database, doi_str]
        :param path: output .xlsx path
        :return: list of table titles
        """
        table_name = []
        if doi_info[1] in ['Springer', 'Nature Publishing Group']:
            table_url = self.get_table_url(doi_info)
            if len(table_url) != 0:
                with pd.ExcelWriter(path) as writer:
                    for p in range(len(table_url)):
                        time.sleep(1)
                        print("Start crawling the page")
                        r = requests.get(table_url[p])
                        rt = r.text
                        try:
                            df = pd.read_html(rt)
                            print("complete!")
                        except Exception as e:
                            print(e)
                            print('format of table ' + str(p) + ' is PDF')
                            data_df = pd.DataFrame()
                            self.log_wp.excel_writer(data_df, writer)
                            continue
                        # parse the table title from the <h1> block
                        start = rt.find("<h1")
                        end = rt.rfind("</h1>")
                        title_str = ''
                        for i in range(start, end + 5):
                            title_str += rt[i]
                        title_start = title_str.find("Table")
                        title_end = title_str.find("</h1>")
                        title = ''
                        for j in range(title_start, title_end):
                            title += title_str[j]
                        table_name.append(title)
                        # first row: doi, second row: title, then the table itself
                        table_te = []
                        row_doi = [doi_info[2]]
                        for j in range(len(df[0].columns) - 1):
                            row_doi.append('')
                        table_te.append(row_doi)
                        row_title = list()
                        row_title.append(title)
                        for j in range(len(df[0].columns) - 1):
                            row_title.append('')
                        table_te.append(row_title)
                        table_te.append(list(df[0].columns))
                        for i in range(len(df[0])):
                            table_te.append(list(df[0].iloc[i]))
                        df[0] = pd.DataFrame(data=table_te)
                        sheet_name = 'table' + str(p + 1)
                        self.log_wp.excel_writer(df[0], writer, sheet_name)
            else:
                self.log_wp.print_log(" Cannot find table in this page:%s", doi_info[0])
        elif doi_info[1] in 'Taylor & Francis Online':
            rt = self.get_rt(doi_info[0])
            page = BeautifulSoup(rt, 'lxml')
            table_name = []
            for page1 in page.find_all('b'):
                name = page1.text
                if 'Table' in name:
                    table_name.append(name)
            # drop the duplicated second half of the title list
            del table_name[int(len(table_name) / 2):len(table_name)]
            # drop duplicated titles if 'Table 1' still appears more than once
            count = 0
            for t in table_name:
                if 'Table 1' in t:
                    count += 1
            if count > 1:
                del table_name[1:(len(table_name)):2]
            if len(table_name) != 0:
                with pd.ExcelWriter(path) as writer:
                    for p in range(len(table_name)):
                        df = pd.read_html(rt)
                        table_te = []
                        row_doi = [doi_info[2]]
                        for j in range(len(df[p].columns) - 1):
                            row_doi.append('')
                        table_te.append(row_doi)
                        row_title = list()
                        row_title.append(table_name[p])
                        for j in range(len(df[p].columns) - 1):
                            row_title.append('')
                        table_te.append(row_title)
                        table_te.append(list(df[p].columns))
                        for i in range(len(df[p])):
                            table_te.append(list(df[p].iloc[i]))
                        df = pd.DataFrame(data=table_te)
                        sheet_name = 'table' + str(p + 1)
                        self.log_wp.excel_writer(df, writer, sheet_name)
            else:
                self.log_wp.print_log(" Cannot find table in this page: %s", doi_info[0])
        elif doi_info[1] in 'MDPI':
            rt = self.get_rt(doi_info[0])
            page = BeautifulSoup(rt, 'lxml')
            table_name = []
            for page1 in page.find_all('caption'):
                name = page1.text
                name = name.replace('\n', '')
                table_name.append(name)
            if len(table_name) != 0:
                with pd.ExcelWriter(path) as writer:
                    time.sleep(1)
                    print("Start crawling the page")
                    r = requests.get(doi_info[0])
                    rt = r.text
                    df = pd.read_html(rt)
                    print("complete!")
                    for p in range(len(table_name)):
                        table_te = []
                        row_doi = [doi_info[2]]
                        for j in range(len(df[p].columns) - 1):
                            row_doi.append('')
                        table_te.append(row_doi)
                        row_title = list()
                        row_title.append(table_name[p])
                        for j in range(len(df[p].columns) - 1):
                            row_title.append('')
                        table_te.append(row_title)
                        table_te.append(list(df[p].columns))
                        for i in range(len(df[p])):
                            table_te.append(list(df[p].iloc[i]))
                        fa = pd.DataFrame(data=table_te)
                        sheet_name = 'table' + str(p + 1)
                        self.log_wp.excel_writer(fa, writer, sheet_name)
            else:
                self.log_wp.print_log(" Cannot find table in this page:%s", doi_info[0])
        elif doi_info[1] in "ASME International":
            rt = self.get_rt(doi_info[0])
            page = BeautifulSoup(rt, 'lxml')
            table_name = []
            for page1 in page.find_all('div'):
                name = page1.text
                if 'Table' in name[0:5]:
                    if ' ' in name[-1]:
                        table_name.append(name)
            if len(table_name) != 0:
                df = pd.read_html(rt)
                print("complete!")
                del df[0:len(df):2]
                with pd.ExcelWriter(path) as writer:
                    for p in range(len(df)):
                        table_te = []
                        row_doi = [doi_info[2]]
                        for j in range(len(df[p].columns) - 1):
                            row_doi.append('')
                        table_te.append(row_doi)
                        row_title = list()
                        row_title.append(table_name[p])
                        for j in range(len(df[p].columns) - 1):
                            row_title.append('')
                        table_te.append(row_title)
                        table_te.append(list(df[p].columns))
                        for i in range(len(df[p])):
                            table_te.append(list(df[p].iloc[i]))
                        fa = pd.DataFrame(data=table_te)
                        sheet_name = 'table' + str(p + 1)
                        self.log_wp.excel_writer(fa, writer, sheet_name)
            else:
                self.log_wp.print_log(" Cannot find table in this page:%s", doi_info[0])
        elif doi_info[1] in "Wiley Blackwell":
            rt = self.get_rt(doi_info[0])
            page = BeautifulSoup(rt, 'lxml')
            table_name = []
            for page1 in page.find_all('header'):
                name = page1.text
                if 'Table' in name:
                    name = ' '.join(name.split())
                    table_name.append(name.replace('\n', ''))
            if len(table_name) != 0:
                df = pd.read_html(rt)
                with pd.ExcelWriter(path) as writer:
                    for p in range(len(table_name)):
                        table_te = []
                        row_doi = [doi_info[2]]
                        for j in range(len(df[p].columns) - 1):
                            row_doi.append('')
                        table_te.append(row_doi)
                        row_title = list()
                        row_title.append(table_name[p])
                        for j in range(len(df[p].columns) - 1):
                            row_title.append('')
                        table_te.append(row_title)
                        table_te.append(list(df[p].columns))
                        for i in range(len(df[p])):
                            table_te.append(list(df[p].iloc[i]))
                        fa = pd.DataFrame(data=table_te)
                        sheet_name = 'table' + str(p + 1)
                        self.log_wp.excel_writer(fa, writer, sheet_name)
            else:
                self.log_wp.print_log(" Cannot find table in this page:%s", doi_info[0])
        else:
            print("Please try other function!")
        return table_name

    def get_rt(self, url):
        """
        :param url: str
        :return: page text
        """
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        time.sleep(1)
        print("Start crawling the page")
        r = requests.get(url, headers=headers)
        r.encoding = 'utf-8'
        rt = r.text
        print("complete!")
        return rt

    def load_doi(self, path):
        import xlrd
        data = xlrd.open_workbook(path)
        table = data.sheet_by_index(0)
        nrows = table.nrows
        doi_li = []
        for row in range(nrows):
            table_doi = table.row_values(row, start_colx=0, end_colx=None)[0]
            doi_li.append(table_doi)
        return doi_li

    def get_html(self, url):
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'}
        req = urllib.request.Request(url=url, headers=headers)
        html = urllib.request.urlopen(req).read()
        return html

    def save_html(self, file_name, file_content):
        self.log_wp.write_tohtml_log(path=file_name + ".html", content=file_content)

    def down_html(self, doi_li, path=''):
        for s in range(len(doi_li)):
            name = self.file_name(doi_li[s])
            url_te = self.doi_info(doi_li[s])[0]
            html = self.get_html(url_te)
            self.save_html(path + name, html)
            print('html_' + str(s + 1) + " download completed!")

    def run(self):
        xls = xlrd.open_workbook(self.doi_path)
        sht = xls.sheet_by_index(0)
        doi_li = sht.col_values(0)
        doi_error = []
        for i in range(len(doi_li)):
            print('***************** text' + str(i + 1) + ' start! *****************')
            doi_ls = self.doi_info(doi_li[i])
            name = self.file_name(doi_ls[2])
            try:
                table_name = self.get_table(doi_ls, self.output_path + '/' + str(name) + '.xlsx')
                print('***************** text' + str(i + 1) + ' finished! ********************\n')
            except Exception as e:
                print(e)
                doi_error.append(str(i + 1))
                print("\033[1;31;40m***************** text " + str(i + 1) + " is error! ********************\n\033[0m")
                print('***************** text' + str(i + 1) + ' is error! ********************\n')
        print('*' * 100)
        print(str(doi_error))
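
# Usage sketch for GetTableHtml (file names are hypothetical): the class reads a
# one-column Excel sheet of DOIs, resolves each DOI to its publisher page,
# scrapes the HTML tables, and writes one workbook per article into output_path.
def example_crawl_tables_by_doi():
    crawler = GetTableHtml(doi_path='data/doi_list.xlsx',     # hypothetical DOI list
                           output_path='output/html_tables')  # hypothetical output folder
    crawler.run()
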
class TableExtractorModifiedtoalloy(object): def __init__(self, xml_path, save_path, config_path): self.xml_path = xml_path self.save_path = save_path self.dict_info = Dictionary(config_path) self.list_of_units = self.dict_info.table_units self.log_wp = LogWp() def get_caption(self, doi, table, format): if format == 'html': if '10.1016' in doi: up = table.parent table_root = up.parent caption = table_root.find('div', 'caption') caption = caption.find('p') caption, ref = self._search_for_reference(caption, format) caption = unidecode.unidecode(HTMLParser().unescape(caption.text)).strip() return caption, ref elif '10.1039' in doi: check = table.parent check = check.parent if check.get('class') == ['rtable__wrapper']: up = table.parent up = up.parent caption = up.previous_sibling if caption is None: return '', [] else: caption = caption.previous_sibling if caption is None: return '', [] else: caption = caption.find('span') caption, ref = self._search_for_reference(caption, format) caption = unidecode.unidecode(HTMLParser().unescape(caption.text)).strip() return caption, ref else: return '', [] elif '10.1002' in doi: up = table.parent caption = up.previous_sibling caption = caption.previous_sibling if caption is not None: caption.span.decompose() caption, ref = self._search_for_reference(caption, format) caption = unidecode.unidecode(HTMLParser().unescape(caption.text)).strip() return caption, ref else: self.log_wp.print_log('No caption') return '', [] elif '10.1021' in doi: up = table.parent if up.get('class') == ['NLM_table-wrap']: caption = up.find('div', 'NLM_caption') else: caption = up.previous_sibling if caption == ' ': caption = caption.previous_sibling if caption is not None: caption, ref = self._search_for_reference(caption, format) caption = unidecode.unidecode(HTMLParser().unescape(caption.text)).strip() return caption, ref else: return '', None elif '10.1007' in doi: up = table.parent caption = up.previous_sibling caption = caption.find('p') caption, ref = self._search_for_reference(caption, format) caption = unidecode.unidecode(HTMLParser().unescape(caption.text)).strip() return caption, ref else: return '', [] elif format == 'xml': if '10.1016' in doi: try: caption = table.find('caption') caption, ref = self._search_for_reference(caption, format) caption = unidecode.unidecode(HTMLParser().unescape(caption.text)).strip() except Exception as e: self.log_wp.print_log(str(e)) return caption, ref elif '10.1021' in doi: caption = table.find('title') if caption is None: up = table.parent caption = table.find('title') if caption is None: caption = up.find('caption') caption, ref = self._search_for_reference(caption, format) caption = unidecode.unidecode(HTMLParser().unescape(caption.text)).strip() return caption, ref return '', [] def get_xml_tables(self, doi, xml): all_tables = [] all_captions = [] soup = BeautifulSoup(open((xml), 'r+', encoding='utf-8'), 'xml') tables = soup.find_all('table') if len(tables) == 0: soup = BeautifulSoup(open(xml, 'r+', encoding='utf-8'), 'lxml') tables = soup.find_all('table-wrap') for w, table in enumerate(tables): try: try: caption, ref = self.get_caption(doi, table, format='xml') except Exception as e: self.log_wp.print_log(str(e)) all_captions.append(caption) tab = [] sup_tab = [] for t in range(150): tab.append([None] * 150) sup_tab.append([None] * 150) rows = table.find_all('row') if len(rows) == 0: rows = table.find_all('oasis:row') num_rows = len(rows) for i, row in enumerate(rows): counter = 0 for ent in row: curr_col = 0 beg = 0 end = 0 
more_row = 0 if type(ent) == type(row): if ent.has_attr('colname'): try: curr_col = int(ent['colname']) except: curr = list(ent['colname']) for c in curr: try: curr_col = int(c) except: continue if ent.has_attr('namest'): try: beg = int(ent['namest']) except: curr = list(ent['namest']) for c in curr: try: beg = int(c) except: continue if ent.has_attr('nameend'): try: end = int(ent['nameend']) except: curr = list(ent['nameend']) for c in curr: try: end = int(c) except: continue if ent.has_attr('morerows'): try: more_row = int(ent['morerows']) except: curr = list(ent['morerows']) for c in curr: try: more_row = int(c) except: continue ent, curr_ref = self._search_for_reference(ent, 'xml') if beg != 0 and end != 0 and more_row != 0: for j in range(beg, end + 1): for k in range(more_row + 1): tab[i + k][j - 1] = unidecode.unidecode( HTMLParser().unescape(ent.get_text())).strip().replace('\n', ' ') sup_tab[i + k][j - 1] = curr_ref elif beg != 0 and end != 0: for j in range(beg, end + 1): tab[i][j - 1] = unidecode.unidecode( HTMLParser().unescape(ent.get_text())).strip().replace('\n', ' ') sup_tab[i][j - 1] = curr_ref elif more_row != 0: for j in range(more_row + 1): tab[i + j][counter] = unidecode.unidecode( HTMLParser().unescape(ent.get_text())).strip().replace('\n', ' ') sup_tab[i + j][counter] = curr_ref elif curr_col != 0: tab[i][curr_col - 1] = unidecode.unidecode( HTMLParser().unescape(ent.get_text())).strip().replace('\n', ' ') sup_tab[i][curr_col - 1] = curr_ref else: counter_ent = counter found = False while not found: if tab[i][counter_ent] is None: tab[i][counter_ent] = unidecode.unidecode( HTMLParser().unescape(ent.get_text())).strip().replace('\n', ' ') sup_tab[i][counter_ent] = curr_ref found = True else: counter_ent += 1 counter = counter_ent counter = counter + 1 + (end - beg) for t, s in zip(tab, sup_tab): for j, k in zip(reversed(t), reversed(s)): if j is None: t.remove(j) s.remove(k) for t, s in zip(reversed(tab), reversed(sup_tab)): if len(t) == 0: tab.remove(t) sup_tab.remove(s) lens = [] for t in tab: lens.append(len(t)) size = stats.mode(lens)[0][0] for t, s in zip(tab, sup_tab): if len(t) != size: for j in range(len(t), size): t.append('') s.append([]) all_tables.append(tab) except Exception as e: self.log_wp.print_log('Failed to extract XML table') table = [[0]] self.log_wp.print_log(str(e)) sup_table = [[None]] all_tables.append(table) tb = sys.exc_info()[-1] self.log_wp.print_log(str(traceback.extract_tb(tb, limit=1)[-1][1])) return all_tables, all_captions def get_headers(self, tables, doi): all_col_headers = [] all_row_headers = [] all_col_indexes = [] all_row_indexes = [] for num, table in enumerate(tables): try: curr = table[0] col_index = 0 for i in range(len(table) - 1): next = table[i + 1] count_curr = 0 count_next = 0 for cell in curr: try: cell, _ = self.value_extractor(cell) fixed = float(cell) except: if cell != '': count_curr += 1 for cell in next: try: cell, _ = self.value_extractor(cell) fixed = float(cell) except: if cell != '': count_next += 1 if count_next > count_curr: curr = next else: col_index = 0 break trans_table = list(map(list, zip(*table))) curr_row = trans_table[0] row_index = 0 for i in range(len(trans_table) - 1): next = trans_table[i + 1] count_curr = 0 count_next = 0 for cell in curr_row: try: cell, _ = self.value_extractor(cell) fixed = float(cell) except: if cell != '': count_curr += 1 for cell in next: try: cell, _ = self.value_extractor(cell) fixed = float(cell) except: if cell != '': count_next += 1 if count_next > count_curr: curr = 
next else: row_index = 0 break row_header = [] col_header = [] for i in range(col_index + 1): col_header.extend(table[i]) for i in range(row_index + 1): row_header.extend(trans_table[i]) indexes = [] curr = col_header[0] for i in range(len(col_header) - 1): next = col_header[i + 1] if curr == next: indexes.append(i) curr = next else: curr = next for i in reversed(indexes): col_header.pop(i) indexes = [] curr = row_header[0] for i in range(len(row_header) - 1): next = row_header[i + 1] if curr == next: indexes.append(i) curr = next else: curr = next for i in reversed(indexes): row_header.pop(i) all_col_headers.append(col_header) all_row_headers.append(row_header) all_col_indexes.append(col_index) all_row_indexes.append(row_index) except IndexError as e: self.log_wp.print_log("FAILURE: Index self.get_headers table #" + str(num) + " from paper " + str(doi)) self.log_wp.print_log('IndexError in get headers') self.log_wp.print_log(str(e)) tb = sys.exc_info()[-1] self.log_wp.print_log(str(traceback.extract_tb(tb, limit=1)[-1][1])) return all_col_headers, all_row_headers, all_col_indexes, all_row_indexes def load_embeddings(self, file_loc=None): if file_loc == None: self.log_wp.print_log('Need to specify path to word embedding model') self.log_wp.print_log('Materials science training word2vec and fasttext are available for download') self.log_wp.print_log('Check the read-me') else: embeddings = keyedvectors.KeyedVectors.load(file_loc) # embeddings.bucket = 2000000 emb_vocab_ft = dict([('<null>', 0), ('<oov>', 1)] + [(k, v.index + 2) for k, v in embeddings.vocab.items()]) emb_weights_ft = np.vstack([np.zeros((1, 100)), np.ones((1, 100)), np.array(embeddings.syn0)]) def _normalize_string(self, string): ret_string = '' for char in string: if re.match(u'[Α-Ωα-ωÅ]', char) is not None: ret_string += str(char) else: ret_string += str(unidecode_expect_nonascii(str(char))) return ret_string def construct_table_object(self, doi, table, row_ind, col_ind): new_table = Table() new_table['act_table'] = table mat_trans_table = np.array(table).T.tolist() mat_table = np.array(table).tolist() error_file = [] try: for i, c in enumerate(mat_table[col_ind][(row_ind + 1):]): entity = Entity() entity['name'] = str(c) entity['descriptor'] = str(mat_table[col_ind][row_ind]) if col_ind > 0: for j in range(col_ind): link = Link() link['name'] = str(mat_table[col_ind - j - 1][i + 1]) if link['name'] != entity['name']: entity['links'] for j, r in enumerate(mat_trans_table[row_ind][(col_ind + 1):]): attr = Attribute() try: potential_units = unit_regex.search(r).group(0)[1:-1] found_units = [u for u in self.list_of_units if u in potential_units] if len(found_units) > 0: attr['unit'] = unit except: pass attr['name'] = str(r) if row_ind > 0: for k in range(row_ind): link = Link() link['name'] = str(mat_trans_table[row_ind - k - 1][j + 1]) if link['name'] != attr['name']: attr['links'].append(link) val, unit = self.value_extractor(str(mat_table[row_ind + j + 1][i + 1])) if type(val) == float: attr['value'] = val else: attr['string_value'] = val if unit is not None: # overwrites previous unit attr['unit'] = unit entity['attributes'].append(attr) new_table['entities'].append(entity) return new_table, set(error_file) except IndexError as e: self.log_wp.print_log("FAILURE: Index construct_table table from paper " + str(doi)) self.log_wp.print_log('IndexError in construct object') self.log_wp.print_log(str(e)) error_file.append(str(doi)) return new_table, set(error_file) def print_table_object(self, table): for ent in 
table['entities']: self.log_wp.print_log('Ent:', ent['name']) self.log_wp.print_log('Links:') for link in ent['links']: self.log_wp.print_log(link['name']) self.log_wp.print_log('Attr:') for att in ent['attributes']: self.log_wp.print_log(att['name']) self.log_wp.print_log(att['value']) for link in att['links']: self.log_wp.print_log(link['name']) self.log_wp.print_log('-------') self.log_wp.print_log('--------------') def value_extractor(self, string): original_string = string[:] extracted_unit = None balance_syn = ['balance', 'bal', 'bal.', 'other.', 'other'] if string.lower() in balance_syn: return 'balance', extracted_unit units = [u for u in self.list_of_units if u in string] if units: extracted_unit = max(units) string = string.replace(extracted_unit, '') # e.g. already in int or float form: 12.5 -> 12.5 try: return float(string), extracted_unit except: pass # e.g. 12.5 - 13.5 -> 13.0 range_regex = re.compile('\d+\.?\d*\s*-\s*\d+\.?\d*') try: ranges = range_regex.search(string).group().split('-') average = (float(ranges[0]) + float(ranges[1])) / 2.0 return average, extracted_unit except: pass # e.g. 12.2 (5.2) -> 12.2 bracket_regex = re.compile('(\d+\.?\d*)\s*\(\d*.?\d*\)') try: extracted_value = float(bracket_regex.search(string).group(1)) return float(extracted_value), extracted_unit except: pass # e.g. 12.3 ± 0.5 -> 12.3 plusmin_regex = re.compile('(\d+\.?\d*)(\s*[±+-]+\s*\d+\.?\d*)') try: extracted_value = float(plusmin_regex.search(string).group(1)) return extracted_value, extracted_unit except AttributeError: pass # e.g. <0.05 -> 0.05 | >72.0 -> 72.0 | ~12 -> 12 lessthan_roughly_regex = re.compile('([<]|[~]|[>])=?\s*\d+\.*\d*') try: extracted_value = lessthan_roughly_regex.search(string).group() num_regex = re.compile('\d+\.*\d*') extracted_value = num_regex.search(extracted_value).group() return float(extracted_value), extracted_unit except: pass # e.g. 
0.4:0.6 (ratios) if ':' in string: split = string.split(":") try: extracted_value = round(float(split[0]) / float(split[1]), 3) return extracted_value, extracted_unit except: pass return original_string, None def load_composition_elements(self, domain=None): # Compositional elements to help in correclty identifiying the orientation of tables in specific domains if domain == 'geopolymers': material_constituents = ['Al2O3', 'SiO2'] constituent_threshold = 2 remaining = None elif domain == 'steel': material_constituents = ['Fe', 'Cr', 'Cu', 'C', 'Ti', 'Ni', 'Mo', 'Mn'] constituent_threshold = 4 remaining = ['Fe'] elif domain == 'titanium': material_constituents = ['Ti', 'Fe', 'C'] constituent_threshold = 2 remaining = ['Fe'] elif domain == 'zeolites': material_constituents = ( ['Si/Ge', 'DMAP/T', 'HF/T', 'H2O/T', '(Si + Ge)/Al', 'SiO2', 'GeO2', 'SDA', 'HF', 'H2O', 'Ge', 'Si', 'SiO2/Al2O3', 'Si/Al', 'R(OH)2/Si', 'F-/Si', '(Si + Ge)/Zr', 'Al', 'SDA/Si', 'H2O/Si', 'OH/Si', 'Si/H2O', 'Si/OH', 'Ge/Si', 'Si/Ti', 'MeO', 'SiO2/GeO2', 'TMHDA', 'TMEDA', 'TEOS', 'NH4F', 'Al/T', 'N,N-Diethylethylenediamine', 'NaGaGeO4', 'NaGaO2', 'Na2GeO3*H2O', 'SOD', 'NaNO2', 'NaOH']) constituent_threshold = 2 remaining = None elif domain == 'aluminum': material_constituents = ['Al', 'Cu', 'Mn', 'Si', 'O', 'Mg'] constituent_threshold = 2 remaining = None elif domain == 'alloys': material_constituents = ['Ag', 'Al', 'Ar', 'As', 'Au', 'B', 'Ba', 'Be', 'Bi', 'Br', 'C', 'Ca', 'Cd', 'Ce', 'Cl', 'Co', 'Cr', 'Cs', 'Cu', 'Dy', 'Er', 'Eu', 'F', 'Fe', 'Ga', 'Gd', 'Ge', 'H', 'Hf', 'Hg', 'Ho', 'I', 'In', 'Ir', 'K', 'La', 'Li', 'Lu', 'Md', 'Mg', 'Mn', 'Mo', 'N', 'Na', 'Nb', 'Nd', 'Ni', 'O', 'Os', 'P', 'Pb', 'Pd', 'Pr', 'Pt', 'Rb', 'Re', 'Rh', 'Ru', 'S', 'Sb', 'Sc', 'Se', 'Si', 'Sm', 'Sn', 'Sr', 'Ta', 'Tb', 'Te', 'Th', 'Ti', 'Tl', 'Tm', 'U', 'V', 'W', 'Y', 'Yb', 'Zn', 'Zr'] constituent_threshold = 2 remaining = ['Fe', 'Al', 'Ti'] def set_balance(self, entity, balance_pos, cumsum): if cumsum < 1: entity['attributes'][balance_pos]['value'] = 1.0 - cumsum else: entity['attributes'][balance_pos]['value'] = 100.0 - cumsum def get_links(self, entity): list_of_names = [] for attr in entity['attributes']: list_of_names.append(attr['name']) if len(set(list_of_names)) < 3: for attr in entity['attributes']: if len(attr['links']) > 0: swapped = attr['name'] attr['name'] = attr['links'][0]['name'] attr['links'][0]['name'] = swapped def check_if_balanced(self, cumsum): if cumsum > 1: if 100 - cumsum < 1.5: return True else: return False else: if 1 - cumsum < 0.015: return True else: return False def _search_for_reference(self, soup, format): if format == 'html': ref = soup.find_all('a') tags = [] if len(ref) == 0: text = soup.text refs = re.findall('\[\D\]', text) if len(refs) == 0: return soup, tags else: text = re.split('\[\D\]', text) text = ''.join(text) soup.string = text return soup, refs else: for r in ref: tag = soup.a.extract() tags.append(tag.text) return soup, tags elif format == 'xml': ref = soup.find_all('xref') tags = [] if len(ref) == 0: if soup.name == 'caption': return soup, tags ref = soup.find_all('sup') for r in ref: text = r.text.split(',') for t in text: if len(t) == 1 and t.isalpha(): tags.append(t) soup.sup.decompose() return soup, tags else: for r in ref: if len(r.text) < 4: tag = soup.xref.extract() tags.append(tag.text) return soup, tags
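
# Usage sketch for TableExtractorModifiedtoalloy, mirroring the call sequence in
# get_extraction_outcome above; the DOI, file name, and config path are
# hypothetical placeholders.
def example_parse_single_xml(xml_file='data/xml_papers/10.1016-j.example.xml',
                             config_path='config/dictionary.ini'):
    extractor = TableExtractorModifiedtoalloy('data/xml_papers', 'output/xml_tables', config_path)
    doi = '10.1016/j.example'  # hypothetical DOI matching the file name
    tables, captions = extractor.get_xml_tables(doi, xml_file)
    cols, rows, col_inds, row_inds = extractor.get_headers(tables, doi)
    parsed = []
    for table, row_ind, col_ind in zip(tables, row_inds, col_inds):
        table_obj, errors = extractor.construct_table_object(doi, table, row_ind, col_ind)
        parsed.append(table_obj)
    return parsed, captions
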
class TableExtraction: def __init__(self, excels_path, c_path, prop_name='solvus'): self.excels_path = excels_path self.c_path = c_path self.prop_name = prop_name self.dict_info = Dictionary(self.c_path) self.ele_list = self.dict_info.ele_list self.e_pattern = self.dict_info.table_e_pattern self.ratio_pattern = self.dict_info.table_ratio_pattern self.prop_pattern = self.dict_info.table_prop_pattern self.unit_pattern = self.dict_info.unit_pattern_table self.number_pattern = self.dict_info.table_number_pattern self.ele_to_abr = self.dict_info.ele_to_abr self.prop_pattern_words = self.dict_info.table_prop_pattern_words self.log_wp = LogWp() def composition_triple_extraction(self): file_list = os.listdir(self.excels_path) composition_all = {} for excel_path in file_list: try: file = xlrd.open_workbook(self.excels_path + '/' + excel_path) all_material = [] for sheet_i in range(len(file.sheets())): try: sheet = file.sheet_by_index(sheet_i) topic = sheet.row_values(1)[0] if 'composition' in topic.lower(): target_ele_row = [] target_ele_col = [] search_outcome = [] ele_loc = None for line_index in range(2, len(sheet.col_values(0))): search_line = sheet.row_values(line_index) unit_i = 0 for unit in search_line: outcome = re.findall(self.e_pattern, str(unit)) if outcome and str(unit) in self.ele_list: target_ele_row.append(line_index) target_ele_col.append(unit_i) search_outcome.append(unit) unit_i += 1 if search_outcome: ele_loc = line_index break if ele_loc: dict_info = Dictionary(self.c_path) alloy_replace = dict_info.table_alloy_to_replace alloy_common_type = dict_info.alloy_writing_type alloy_blank_type = dict_info.alloy_blank_type for alloy_model, replace in alloy_replace.items(): alloy_part = re.findall(alloy_model, str(topic)) for alloy in alloy_part: find_part = re.findall(replace[0], str(alloy)) alloy_out = alloy.replace(find_part[0], replace[1]) topic = topic.replace(alloy, alloy_out) outcome_name = list() topic_tokenize = nltk.word_tokenize(topic) for word in topic_tokenize: for pattern_1 in alloy_common_type: outcome_common = re.findall(pattern_1, str(word)) if outcome_common: outcome_name.append(word) break for pattern_2 in alloy_blank_type: outcome_blank = re.findall(pattern_2, str(topic)) if outcome_blank and outcome_blank[0] not in outcome_name: outcome_name.append(outcome_blank[0]) break len_col = len(sheet.row_values(3)) alloy_name_col = None alloy_name_search = [] if len_col <= 3: for col_i in range(len_col): col_info = sheet.col_values(col_i) if col_i == 0: col_info = sheet.col_values(col_i)[2:] if col_info: for cell in col_info: for pattern_1 in alloy_common_type: outcome_common = re.findall(pattern_1, str(cell)) if outcome_common: alloy_name_col = col_i alloy_name_search.append(col_i) for pattern_2 in alloy_blank_type: outcome_blank = re.findall(pattern_2, str(cell)) if outcome_blank: alloy_name_col = col_i alloy_name_search.append(col_i) else: for col_i in range(3): col_info = sheet.col_values(col_i) if col_i == 0: col_info = sheet.col_values(col_i)[2:] if col_info: for cell in col_info: for pattern_1 in alloy_common_type: outcome_common = re.findall(pattern_1, str(cell)) if outcome_common: alloy_name_col = col_i alloy_name_search.append(col_i) for pattern_2 in alloy_blank_type: outcome_blank = re.findall(pattern_2, str(cell)) if outcome_blank: alloy_name_col = col_i alloy_name_search.append(col_i) if not alloy_name_search: alloy_name_col = 0 else: alloy_name_col = alloy_name_search[0] first_col = sheet.col_values(0) ele_first = [] for unit in first_col: firstcol_search = 
re.findall(self.e_pattern, str(unit)) if firstcol_search: ele_first.append(unit) if len(ele_first) <= 2: if len(first_col) > 4: e_search = re.findall(self.e_pattern, str(sheet.col_values(0)[ele_loc])) if e_search and outcome_name and len(outcome_name) == 1: for index_row in range(ele_loc + 1, len(first_col)): composition_single = {} composition_single['material'] = outcome_name[0].replace('~', ' ') composition_single['doi'] = first_col[0] ratio_find_topic = re.findall(self.ratio_pattern, str(topic)) ratio_find_col = re.findall(self.ratio_pattern, str(first_col[index_row])) for table_head in sheet.row_values(2): ratio_find_head = re.findall(self.ratio_pattern, str(table_head)) if ratio_find_head: composition_single['percentage'] = ratio_find_head[0] break if ratio_find_topic: composition_single['percentage'] = ratio_find_topic[0] elif ratio_find_col: composition_single['percentage'] = ratio_find_col[0] for ele_index in range(len(sheet.row_values(2))): ele_name = sheet.row_values(ele_loc)[ele_index] if ele_name in tuple(self.ele_to_abr.keys()): ele_name = self.ele_to_abr[ele_name] number = sheet.row_values(index_row)[ele_index] composition_single[ele_name] = number all_material.append(composition_single) if not e_search: for index_row in range(ele_loc + 1, len(first_col)): if first_col[index_row]: composition_single = {} name_col = sheet.col_values(alloy_name_col) if outcome_name and len( outcome_name) == 1 and not alloy_name_search: composition_single['material'] = outcome_name[0].replace('~', ' ') else: composition_single['material'] = name_col[index_row] composition_single['doi'] = first_col[0] ratio_find_topic = re.findall(self.ratio_pattern, str(topic)) ratio_find_col = re.findall(self.ratio_pattern, str(first_col[index_row])) for table_head in sheet.row_values(2): ratio_find_head = re.findall(self.ratio_pattern, str(table_head)) if ratio_find_head: composition_single['percentage'] = ratio_find_head[0] break if ratio_find_topic: composition_single['percentage'] = ratio_find_topic[0] elif ratio_find_col: composition_single['percentage'] = ratio_find_col[0] ratio_find_unit = re.findall(self.ratio_pattern, str(first_col[index_row])) if ratio_find_unit: composition_single['percentage'] = ratio_find_unit[0] for ele_index in range(len(sheet.row_values(ele_loc)[1:])): ele_name = sheet.row_values(ele_loc)[1:][ele_index] if ele_name in tuple(self.ele_to_abr.keys()): ele_name = self.ele_to_abr[ele_name] number = sheet.row_values(index_row)[ele_index + 1] composition_single[ele_name] = number all_material.append(composition_single) else: composition_single = {} first_col_1 = sheet.row_values(3)[0] e_search = re.findall(self.e_pattern, str(sheet.col_values(0)[ele_loc])) ratio_find_col = re.findall(self.ratio_pattern, str(first_col_1)) for table_head in sheet.row_values(2): ratio_find_head = re.findall(self.ratio_pattern, str(table_head)) if ratio_find_head: composition_single['percentage'] = ratio_find_head[0] break if ratio_find_col: composition_single['percentage'] = ratio_find_col[0] ratio_find_topic = re.findall(self.ratio_pattern, str(topic)) if ratio_find_topic: composition_single['percentage'] = ratio_find_topic[0] if outcome_name and e_search: composition_single['material'] = outcome_name[0].replace('~', ' ') composition_single['doi'] = first_col[0] for ele_index in range(len(sheet.row_values(2))): ele_name = sheet.row_values(ele_loc)[ele_index] number = sheet.row_values(3)[ele_index] if ele_name in tuple(self.ele_to_abr.keys()): ele_name = self.ele_to_abr[ele_name] 
composition_single[ele_name] = number all_material.append(composition_single) elif outcome_name and not e_search: if len(outcome_name) == 1: composition_single['material'] = outcome_name[0].replace('~', ' ') else: composition_single['material'] = sheet.row_values(ele_loc + 1)[ alloy_name_col] composition_single['doi'] = first_col[0] for ele_index in range(len(sheet.row_values(2)[1:])): ele_name = sheet.row_values(ele_loc)[1:][ele_index] number = sheet.row_values(3)[1:][ele_index] if ele_name in tuple(self.ele_to_abr.keys()): ele_name = self.ele_to_abr[ele_name] composition_single[ele_name] = number all_material.append(composition_single) elif not outcome_name and not e_search: composition_single['material'] = sheet.row_values(ele_loc + 1)[ alloy_name_col] composition_single['doi'] = first_col[0] m_name = sheet.row_values(ele_loc)[0] composition_single[m_name] = first_col[3] for ele_index in range(len(sheet.row_values(2)[1:])): ele_name = sheet.row_values(ele_loc)[1:][ele_index] number = sheet.row_values(3)[1:][ele_index] if ele_name in tuple(self.ele_to_abr.keys()): ele_name = self.ele_to_abr[ele_name] composition_single[ele_name] = number all_material.append(composition_single) elif not outcome_name and e_search: composition_single['material'] = None composition_single['doi'] = first_col[0] for ele_index in range(len(sheet.row_values(2))): ele_name = sheet.row_values(ele_loc)[ele_index] number = sheet.row_values(3)[ele_index] if ele_name in tuple(self.ele_to_abr.keys()): ele_name = self.ele_to_abr[ele_name] composition_single[ele_name] = number all_material.append(composition_single) else: ele_row = sheet.row_values(ele_loc - 1) len_elerow = len(ele_row) for index_col in range(1, len_elerow): if ele_row[index_col]: composition_single = {} if outcome_name and len(outcome_name) == 1 and len_elerow <= 2: material_name = outcome_name[0].replace('~', ' ') else: material_name = ele_row[index_col] composition_single['material'] = material_name composition_single['doi'] = first_col[0] ratio_find_topic = re.findall(self.ratio_pattern, str(topic)) ratio_find_col = re.findall(self.ratio_pattern, str(material_name)) if ratio_find_topic: composition_single['percentage'] = ratio_find_topic[0] elif ratio_find_col: composition_single['percentage'] = ratio_find_col[0] for ele_index in range(len(sheet.col_values(0)[ele_loc:])): ele_name = sheet.col_values(0)[ele_loc:][ele_index] number = sheet.col_values(index_col)[ele_loc + ele_index] if ele_name in tuple(self.ele_to_abr.keys()): ele_name = self.ele_to_abr[ele_name] composition_single[ele_name] = number all_material.append(composition_single) if all_material: break except Exception as e: self.log_wp.print_log("%s", str(e)) self.log_wp.print_log("An error in the %s of %s!", sheet_i, excel_path) if all_material: composition_all[excel_path] = all_material except Exception as e: self.log_wp.print_log("can't open this file, name of file is %s", str(excel_path)) self.log_wp.print_log("Error is %s", str(e)) self.log_wp.print_log("%s", "--" * 25) return composition_all def property_info_extraction(self): file_list = os.listdir(self.excels_path) property_all = {} number_prop = 0 K_path = [] for excel_path in file_list: try: file = xlrd.open_workbook(self.excels_path + '/' + excel_path) all_material = [] for sheet_i in range(len(file.sheets())): try: sheet = file.sheet_by_index(sheet_i) topic = sheet.row_values(1)[0] search_outcome = [] target_prop_row = [] target_prop_col = [] for line_index in range(2, len(sheet.col_values(0))): search_line = 
sheet.row_values(line_index)[1:] unit_i = 1 for unit in search_line: outcome_words = None for pattern in self.prop_pattern[self.prop_name]: outcome = re.findall(pattern, str(unit)) if all(word in str(unit) for word in self.prop_pattern_words[self.prop_name]): outcome_words = unit if outcome: break if outcome or outcome_words: target_prop_row.append(line_index) target_prop_col.append(unit_i) search_outcome.append(unit) unit_i += 1 if any(search_outcome): first_col = sheet.col_values(0) alloy_replace = Dictionary(self.c_path).table_alloy_to_replace for alloy_model, replace in alloy_replace.items(): alloy_part = re.findall(alloy_model, str(topic)) for alloy in alloy_part: find_part = re.findall(replace[0], str(alloy)) alloy_out = alloy.replace(find_part[0], replace[1]) topic = topic.replace(alloy, alloy_out) alloy_common_type = Dictionary(self.c_path).alloy_writing_type alloy_blank_type = Dictionary(self.c_path).alloy_blank_type outcome_name = [] topic_tokenize = nltk.word_tokenize(topic) for word in topic_tokenize: for pattern_1 in alloy_common_type: outcome_common = re.findall(pattern_1, str(word)) if outcome_common: outcome_name.append(word) break for pattern_2 in alloy_blank_type: outcome_blank = re.findall(pattern_2, str(topic)) if outcome_blank: outcome_name.append(outcome_blank[0]) break fc_ns = [] for cell in sheet.col_values(0)[1:]: fc_n = re.findall(self.number_pattern[self.prop_name], str(cell)) alphabet_search = re.findall("[A-Za-z]", str(cell)) if fc_n and not alphabet_search: fc_ns.append(cell) len_col = len(sheet.row_values(3)) alloy_name_col = None alloy_name_search = [] if len_col <= 3: for col_i in range(len_col): col_info = sheet.col_values(col_i) if col_i == 0: col_info = sheet.col_values(col_i)[2:] if col_info: for cell in col_info: for pattern_1 in alloy_common_type: outcome_common = re.findall(pattern_1, str(cell)) if outcome_common: alloy_name_col = col_i alloy_name_search.append(col_i) for pattern_2 in alloy_blank_type: outcome_blank = re.findall(pattern_2, str(cell)) if outcome_blank: alloy_name_col = col_i alloy_name_search.append(col_i) else: for col_i in range(3): col_info = sheet.col_values(col_i) if col_i == 0: col_info = sheet.col_values(col_i)[2:] if col_info: for cell in col_info: for pattern_1 in alloy_common_type: outcome_common = re.findall(pattern_1, str(cell)) if outcome_common: alloy_name_col = col_i alloy_name_search.append(col_i) for pattern_2 in alloy_blank_type: outcome_blank = re.findall(pattern_2, str(cell)) if outcome_blank: alloy_name_col = col_i alloy_name_search.append(col_i) if not alloy_name_search: alloy_name_col = 0 else: alloy_name_col = alloy_name_search[0] if len(first_col) > 4: for prop_i in range(len(target_prop_row)): sub_label = [] curr_col = [] for index_row in range(target_prop_row[prop_i] + 1, len(first_col)): unit_search_parts = [] unit_search_parts.append(topic) if len(fc_ns) == 0: name_col = sheet.col_values(alloy_name_col) material_name = name_col[index_row] property_single = {} number = sheet.row_values(index_row)[target_prop_col[prop_i]] number_inspect = re.findall(self.number_pattern[self.prop_name], str(number)) prop_name = sheet.row_values(target_prop_row[prop_i])[ target_prop_col[prop_i]] unit_search_parts.append(first_col[index_row]) unit_search_parts.append(number) for unit in sheet.row_values(target_prop_row[0]): unit_search_parts.append(unit) for row_s in range(2, target_prop_row[prop_i] + 1): unit_search_parts.append( sheet.row_values(row_s)[target_prop_col[prop_i]]) if number_inspect: one_info = {} for prop_index 
in range(len(sheet.row_values(target_prop_row[prop_i]))): prop_name_line = sheet.row_values(target_prop_row[prop_i])[ prop_index] number_line_line = sheet.row_values(index_row)[prop_index] one_info[prop_name_line] = number_line_line curr_col.append(number) property_single[prop_name] = number property_single['other_info'] = one_info property_single['material'] = material_name property_single['doi'] = first_col[0] if sub_label: property_single['child_tag'] = sub_label for item in unit_search_parts: unit_find = re.findall(self.unit_pattern[self.prop_name], str(item)) if unit_find: property_single['unit'] = unit_find[0].replace('degC', '°C') K_path.append(excel_path) if 'unit' not in property_single.keys(): property_single['unit'] = 'no mentioned' elif not number_inspect and len(curr_col) != 0: property_single['material'] = material_name property_single['doi'] = first_col[0] property_single[prop_name] = number if sub_label: property_single['child_tag'] = sub_label for item in unit_search_parts: unit_find = re.findall(self.unit_pattern[self.prop_name], str(item)) if unit_find: property_single['unit'] = unit_find[0].replace('degC', '°C') K_path.append(excel_path) break if 'unit' not in property_single.keys(): property_single['unit'] = 'no mentioned' elif not number_inspect and len(curr_col) == 0: if number and not property_single: if number != '-' and number != '--': sub_label.append(number) if property_single: property_single['table_topic'] = first_col[1] all_material.append(property_single) if first_col[index_row] and len(fc_ns) != 0 and len(outcome_name) == 1: material_name = outcome_name[0].replace('~', ' ') property_single = {} unit_search_parts.append(first_col[index_row]) for row_s in range(2, target_prop_row[prop_i] + 1): unit_search_parts.append( sheet.row_values(row_s)[target_prop_col[prop_i]]) prop_name = sheet.row_values(target_prop_row[prop_i])[ target_prop_col[prop_i]] number = sheet.row_values(index_row)[target_prop_col[prop_i]] number_inspect = re.findall(self.number_pattern[self.prop_name], str(number)) unit_search_parts.append(number) if number_inspect: property_single[prop_name] = number property_single['material'] = material_name property_single['doi'] = first_col[0] for item in unit_search_parts: unit_find = re.findall(self.unit_pattern[self.prop_name], str(item)) if unit_find: property_single['unit'] = unit_find[0].replace('degC', '°C') K_path.append(excel_path) break if 'unit' not in property_single.keys(): property_single['unit'] = 'no mentioned' elif not number_inspect and len(curr_col) != 0: property_single['material'] = material_name property_single['doi'] = first_col[0] property_single[prop_name] = number if sub_label: property_single['child_tag'] = sub_label for item in unit_search_parts: unit_find = re.findall(self.unit_pattern[self.prop_name], str(item)) if unit_find: property_single['unit'] = unit_find[0].replace('degC', '°C') K_path.append(excel_path) break if 'unit' not in property_single.keys(): property_single['unit'] = 'no mentioned' elif not number_inspect and len(curr_col) == 0: if number and not property_single: sub_label.append(number) if property_single: property_single['table_topic'] = first_col[1] all_material.append(property_single) else: unit_search_parts = [] property_single = {} property_single['table_topic'] = first_col[1] alloy_replace = Dictionary(self.c_path).table_alloy_to_replace for alloy_model, replace in alloy_replace.items(): alloy_part = re.findall(alloy_model, str(topic)) for alloy in alloy_part: find_part = re.findall(replace[0], 
str(alloy)) alloy_out = alloy.replace(find_part[0], replace[1]) topic = topic.replace(alloy, alloy_out) alloy_common_type = Dictionary(self.c_path).alloy_writing_type alloy_blank_type = Dictionary(self.c_path).alloy_blank_type outcome_name = [] topic_tokenize = nltk.word_tokenize(topic) for word in topic_tokenize: for pattern_1 in alloy_common_type: outcome_common = re.findall(pattern_1, str(word)) if outcome_common: outcome_name.append(word) break for pattern_2 in alloy_blank_type: outcome_blank = re.findall(pattern_2, str(topic)) if outcome_blank and outcome_blank[0] not in outcome_name: outcome_name.append(outcome_blank[0]) break unit_search_parts.append(first_col[3]) unit_search_parts.append(topic) for row_s in range(2, 4): for prop_i in range(len(target_prop_row)): unit_search_parts.append(sheet.row_values(row_s)[target_prop_col[prop_i]]) number_search = re.findall(self.number_pattern[self.prop_name], str(sheet.col_values(0)[2])) if outcome_name and number_search: for prop_i in range(len(target_prop_row)): property_single['material'] = outcome_name[0].replace('~', ' ') property_single['doi'] = first_col[0] number = sheet.row_values(3)[target_prop_col[prop_i]] unit_search_parts.append(number) for item in unit_search_parts: unit_find = re.findall(self.unit_pattern[self.prop_name], str(item)) if unit_find: property_single['unit'] = unit_find[0].replace('degC', '°C') K_path.append(excel_path) break if 'unit' not in property_single.keys(): property_single['unit'] = 'no mentioned' prop_name = sheet.row_values(target_prop_row[prop_i])[target_prop_col[prop_i]] property_single[prop_name] = number all_material.append(property_single) elif not outcome_name and not number_search: for prop_i in range(len(target_prop_row)): property_single[sheet.col_values(2)[0]] = first_col[3] property_single['doi'] = first_col[0] number = sheet.row_values(3)[target_prop_col[prop_i]] unit_search_parts.append(number) for item in unit_search_parts: unit_find = re.findall(self.unit_pattern[self.prop_name], str(item)) if unit_find: property_single['unit'] = unit_find[0].replace('degC', '°C') K_path.append(excel_path) break if 'unit' not in property_single.keys(): property_single['unit'] = 'no mentioned' prop_name = sheet.row_values(target_prop_row[prop_i])[target_prop_col[prop_i]] property_single[prop_name] = number all_material.append(property_single) elif not outcome_name and number_search: for prop_i in range(len(target_prop_row)): property_single['material'] = 'no mentioned' property_single['doi'] = first_col[0] number = sheet.row_values(3)[target_prop_col[prop_i]] unit_search_parts.append(number) for item in unit_search_parts: unit_find = re.findall(self.unit_pattern, str(item)) if unit_find: property_single['unit'] = unit_find[0].replace('degC', '°C') K_path.append(excel_path) break if 'unit' not in property_single.keys(): property_single['unit'] = 'no mentioned' prop_name = sheet.row_values(target_prop_row[prop_i])[target_prop_col[prop_i]] property_single[prop_name] = number all_material.append(property_single) except Exception as e: self.log_wp.print_log("An error in file:%s-sheet:%s---%s!", excel_path, sheet_i, e) if all_material: number_prop += 1 property_all[excel_path] = all_material except Exception as e: self.log_wp.print_log("can't open %s ", excel_path) self.log_wp.print_log("%s", str(e)) self.log_wp.print_log("%s", "--" * 25) return property_all
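
# Usage sketch for TableExtraction (paths are hypothetical): it scans the
# workbooks produced by the crawlers above and returns dictionaries of
# composition and property records keyed by source file name.
def example_mine_tables():
    miner = TableExtraction(excels_path='output/html_tables',  # hypothetical folder of .xlsx tables
                            c_path='config/dictionary.ini',    # hypothetical config file
                            prop_name='solvus')
    compositions = miner.composition_triple_extraction()
    properties = miner.property_info_extraction()
    return compositions, properties
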
class GetTInfoFromHtml: def __init__(self, html_path, output_path): self.html_path = html_path self.output_path = output_path self.log_wp = LogWp() def get_all_url(self, url): import urllib.request from bs4 import BeautifulSoup # Masquerading as browser access headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0)\ Gecko/20100101 Firefox/23.0' } req = urllib.request.Request(url=url, headers=headers) html = urllib.request.urlopen(req).read().decode("utf-8") # html = urllib.request.urlopen(url).read().decode("utf-8") soup = BeautifulSoup(html, features='html.parser') tags = soup.find_all('a') all_url = [] for tag in tags: all_url.append(str(tag.get('href')).strip()) return all_url def get_table_url(self, doi_info): all_url = self.get_all_url(doi_info[0]) table_url = [] for i in all_url: if "table" in i: if 'article' in i: self.log_wp.print_log(str(i)) if doi_info[1] in "Springer": table_url.append('https://link.springer.com' + i) else: table_url.append('https://www.nature.com' + i) if len(table_url) == 0: self.log_wp.print_log("There is no table url in this article!") self.log_wp.print_log(str(table_url)) return table_url def doi_info(self, doi_str): global doi_url doi_info = [] if doi_str[0:7] in "10.1016": doi_url = "https://doi.org/" + doi_str doi_database = "Elsevier" elif doi_str[0:7] in ["10.1007", "10.1361", "10.1023"]: doi_url = "https://link.springer.com/article/" + doi_str doi_database = "Springer" elif doi_str[0:7] in "10.1080": doi_url = "https://doi.org/" + doi_str doi_database = "Taylor & Francis Online" elif doi_str[0:7] in ["10.1002", "10.1111"]: doi_url = "https://onlinelibrary.wiley.com/doi/" + doi_str doi_database = "Wiley Blackwell" elif doi_str[0:7] in "10.1115": doi_url = "https://doi.org/" + doi_str doi_database = "ASME International" elif doi_str[0:7] in "10.3390": # 解决MDPI页面跳转 all_url = self.get_all_url("https://doi.org/" + doi_str) for url_str in all_url: if "htm" in url_str: doi_url = "https://www.mdpi.com/" + url_str break doi_database = "MDPI" elif doi_str[0:7] == "10.1038": doi_url = "https://doi.org/" + doi_str doi_database = "Nature Publishing Group" else: doi_url = "other URL" doi_database = "other database" doi_info.append(doi_url) doi_info.append(doi_database) doi_info.append(doi_str) return doi_info def file_name(self, name): st = '\|/:?*<>;' for s in st: if s in name: name = name.replace(s, '-') return name def get_table(self, doi_info, path=r'table.xlsx'): table_name = [] if doi_info[1] in ['Springer', 'Nature Publishing Group']: table_url = self.get_table_url(doi_info) if len(table_url) != 0: with pd.ExcelWriter(path) as writer: # 保存并写入多个sheet for p in range(len(table_url)): time.sleep(1) self.log_wp.print_log("Start crawling the page") r = requests.get(table_url[p]) rt = r.text # 将html转为纯文本, header=None, index_col=None try: df = pd.read_html(rt) self.log_wp.print_log("complete!") except Exception as e: self.log_wp.print_log('format of table ' + str(p) + ' is PDF') continue # 解析表格title start = rt.find("<h1") end = rt.rfind("</h1>") title_str = '' for i in range(start, end + 5): title_str += rt[i] title_start = title_str.find("Table") title_end = title_str.find("</h1>") title = '' for j in range(title_start, title_end): self.log_wp.print_log(str(title_str[j])) title += title_str[j] table_name.append(title) # 读取table的title并写入dataframe首行 table_te = [] # 将文章的doi加入其中 row_doi = [doi_info[2]] for j in range(len(df[0].columns) - 1): row_doi.append('') table_te.append(row_doi) # 将title加入表格其中 # 将title加入表格其中 row_title = [title] for j in 
range(len(df[0].columns) - 1): row_title.append('') table_te.append(row_title) table_te.append(list(df[0].columns)) for i in range(len(df[0])): # te.append() table_te.append(list(df[0].iloc[i])) df[0] = pd.DataFrame(data=table_te) # 写入excel sheet_name = 'table' + str(p + 1) # 写入excel时不加index和columns df[0].to_excel(writer, sheet_name=sheet_name, header=None, index=False) else: self.log_wp.print_log(" Cannot find table in this page: " + self.file_name(doi_info[0])) elif doi_info[1] in 'Taylor & Francis Online': rt = self.get_rt(doi_info[0]) page = BeautifulSoup(rt, 'lxml') # 利用BeautifulSoup取得网页代码 # 寻找页面构造成table_name的list table_name = [] for page1 in page.find_all('b'): name = page1.text if 'Table' in name: table_name.append(name) # 删除重复表格 del table_name[int(len(table_name) / 2):len(table_name)] # 删除表格中重复的title count = 0 for t in table_name: if 'Table 1' in t: count += 1 if count > 1: del table_name[1:(len(table_name)):2] if len(table_name) != 0: with pd.ExcelWriter(path) as writer: for p in range(len(table_name)): df = pd.read_html(rt) # 读取table的title并写入dataframe首行 table_te = [] # 读取文章的doi并写入表格第一行 row_doi = [doi_info[2]] for j in range(len(df[p].columns) - 1): row_doi.append('') table_te.append(row_doi) # 将title加入表格其中 row_title = [table_name[p]] for j in range(len(df[p].columns) - 1): row_title.append('') table_te.append(row_title) table_te.append(list(df[p].columns)) for i in range(len(df[p])): # te.append() table_te.append(list(df[p].iloc[i])) df = pd.DataFrame(data=table_te) # 写入excel sheet_name = 'table' + str(p + 1) # 写入excel时不加index和columns df.to_excel(writer, sheet_name=sheet_name, header=None, index=False) else: self.log_wp.print_log(" Cannot find table in this page: " + self.file_name(doi_info[0])) elif doi_info[1] in 'MDPI': rt = self.get_rt(doi_info[0]) page = BeautifulSoup(rt, 'lxml') # 利用BeautifulSoup取得网页代码 table_name = [] for page1 in page.find_all('caption'): name = page1.text name = name.replace('\n', '') # 清除title中的冗余字段 table_name.append(name) self.log_wp.print_log(str(table_name)) if len(table_name) != 0: with pd.ExcelWriter(path) as writer: # 爬取HTML内容 time.sleep(1) self.log_wp.print_log("Start crawling the page") r = requests.get(doi_info[0]) rt = r.text # 将html转为纯文本, header=None, index_col=None df = pd.read_html(rt) self.log_wp.print_log("complete!") for p in range(len(table_name)): # 读取table的title并写入dataframe首行 table_te = [] # 读取文章的doi并写入表格第一行 row_doi = [doi_info[2]] for j in range(len(df[p].columns) - 1): row_doi.append('') table_te.append(row_doi) # 将title加入表格其中 row_title = [table_name[p]] for j in range(len(df[p].columns) - 1): row_title.append('') table_te.append(row_title) table_te.append(list(df[p].columns)) for i in range(len(df[p])): # te.append() table_te.append(list(df[p].iloc[i])) fa = pd.DataFrame(data=table_te) # 写入excel sheet_name = 'table' + str(p + 1) # 写入excel时不加index和columns fa.to_excel(writer, sheet_name=sheet_name, header=None, index=False) else: self.log_wp.print_log(" Cannot find table in this page: " + self.file_name(doi_info[0])) elif doi_info[1] in "ASME International": rt = self.get_rt(doi_info[0]) page = BeautifulSoup(rt, 'lxml') # 利用BeautifulSoup取得网页代码 # 将html转为纯文本, header=None, index_col=None table_name = [] for page1 in page.find_all('div'): name = page1.text if 'Table' in name[0:5]: if ' ' in name[-1]: table_name.append(name) if len(table_name) != 0: df = pd.read_html(rt) self.log_wp.print_log("complete!") del df[0:len(df):2] with pd.ExcelWriter(path) as writer: for p in range(len(df)): # 读取table的title并写入dataframe首行 table_te = [] # 
读取文章的doi并写入表格第一行 row_doi = [doi_info[2]] for j in range(len(df[p].columns) - 1): row_doi.append('') table_te.append(row_doi) # 将title加入表格其中 row_title = [table_name[p]] for j in range(len(df[p].columns) - 1): row_title.append('') table_te.append(row_title) table_te.append(list(df[p].columns)) for i in range(len(df[p])): # te.append() table_te.append(list(df[p].iloc[i])) fa = pd.DataFrame(data=table_te) # 写入excel sheet_name = 'table' + str(p + 1) # 写入excel时不加index和columns fa.to_excel(writer, sheet_name=sheet_name, header=None, index=False) else: self.log_wp.print_log(" Cannot find table in this page: " + doi_info[0]) elif doi_info[1] in "Wiley Blackwell": rt = self.get_rt(doi_info[0]) page = BeautifulSoup(rt, 'lxml') # 利用BeautifulSoup取得网页代码 # 将html转为纯文本, header=None, index_col=None table_name = [] for page1 in page.find_all('header'): name = page1.text if 'Table' in name: name = ' '.join(name.split()) table_name.append(name.replace('\n', '')) try: if len(table_name) != 0: df = pd.read_html(rt) with pd.ExcelWriter(path) as writer: # 爬取HTML内容 for p in range(len(df)): # 读取table的title并写入dataframe首行 table_te = [] # 读取文章的doi并写入表格第一行 row_doi = [doi_info[2]] for j in range(len(df[p].columns) - 1): row_doi.append('') table_te.append(row_doi) # 将title加入表格其中 row_title = [table_name[p]] for j in range(len(df[p].columns) - 1): row_title.append('') table_te.append(row_title) table_te.append(list(df[p].columns)) for i in range(len(df[p])): # te.append() table_te.append(list(df[p].iloc[i])) fa = pd.DataFrame(data=table_te) # 写入excel sheet_name = 'table' + str(p + 1) # 写入excel时不加index和columns fa.to_excel(writer, sheet_name=sheet_name, header=None, index=False) else: self.log_wp.print_log(" Cannot find table in this page: " + doi_info[0]) except Exception as e: self.log_wp.print_log(" Cannot find table in this page: " + doi_info[0]) else: self.log_wp.print_log("Please try other function!") return table_name def get_rt(self, url): headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0' } time.sleep(1) self.log_wp.print_log("Start crawling the page") r = requests.get(url, headers=headers) r.encoding = 'utf-8' rt = r.text self.log_wp.print_log("complete!") return rt def load_doi(self, path): import xlrd data = xlrd.open_workbook(path) table = data.sheet_by_index(0) # 获取表格所有行数 nrows = table.nrows doi_li = [] # excel文件中的doi处于第一列且省略表头 for row in range(nrows): table_doi = table.row_values(row, start_colx=0, end_colx=None)[0] doi_li.append(table_doi) return doi_li def get_html(self, url): headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0' } req = urllib.request.Request(url=url, headers=headers) html = urllib.request.urlopen(req).read() return html def save_html(self, file_name, file_content): # 注意windows文件命名的禁用符,比如 / # path = 'C:/Users/T_sha/Desktop/DOI_MDPI/' # with open(file_name.replace('/', '_') + path+".html", "wb") as f: with open(file_name + ".html", "wb") as f: # 写文件用bytes而不是str,所以要转码 f.write(file_content) def down_html(self, doi_li, path=''): for s in range(len(doi_li)): name = self.file_name(doi_li[s]) url_te = self.doi_info(doi_li[s])[0] html = self.get_html(url_te) self.save_html(path + name, html) self.log_wp.print_log('html_' + str(s + 1) + " download completed!") def doi_renamed(self, html_name): # 删除‘.html’后缀 name = html_name[0:-5] if len(html_name) > 7: # 将doi中的'-'转为'/' name = self.str_sub(name, 7, '/') else: self.log_wp.print_log('Your file name is wrong!') return name # string指定p位置替换c def str_sub(self, 
string, p, c): new = [] for s in string: new.append(s) new[p] = c return ''.join(new) def load_html(self, html_input_path): # 输入导入html路径 html_name = self.doi_renamed(os.path.basename(html_input_path)) f = open(html_input_path, "r", encoding="utf-8") # 读取文件 ft = f.read() # 把文件内容转化为字符串 return ft, html_name def get_table_html(self, doi_info, rt, path=r'table.xlsx'): if doi_info[1] in 'Taylor & Francis Online': page = BeautifulSoup(rt, 'lxml') # 寻找页面构造成table_name的list table_name = [] for page1 in page.find_all('b'): name = page1.text if 'Table' in name: table_name.append(name) # 删除重复表格 del table_name[int(len(table_name) / 2):len(table_name)] # 删除表格中重复的title count = 0 for t in table_name: if 'Table 1' in t: count += 1 if count > 1: del table_name[1:(len(table_name)):2] if len(table_name) != 0: with pd.ExcelWriter(path) as writer: for p in range(len(table_name)): df = pd.read_html(rt) # 读取table的title并写入dataframe首行 table_te = [] # 读取文章的doi并写入表格第一行 row_doi = [doi_info[2]] for j in range(len(df[p].columns) - 1): row_doi.append('') table_te.append(row_doi) # 将title加入表格其中 row_title = [table_name[p]] for j in range(len(df[p].columns) - 1): row_title.append('') table_te.append(row_title) table_te.append(list(df[p].columns)) for i in range(len(df[p])): # te.append() table_te.append(list(df[p].iloc[i])) df = pd.DataFrame(data=table_te) # 写入excel sheet_name = 'table' + str(p + 1) # 写入excel时不加index和columns df.to_excel(writer, sheet_name=sheet_name, header=None, index=False) else: self.log_wp.print_log(" Cannot find table in this file: " + self.file_name(doi_info[2]) + '.html') elif doi_info[1] in 'MDPI': page = BeautifulSoup(rt, 'lxml') # 利用BeautifulSoup取得网页代码 table_name = [] for page1 in page.find_all('caption'): name = page1.text name = name.replace('\n', '') # 清除title中的冗余字段 table_name.append(name) self.log_wp.print_log(str(table_name)) if len(table_name) != 0: with pd.ExcelWriter(path) as writer: df = pd.read_html(rt) for p in range(len(table_name)): # 读取table的title并写入dataframe首行 table_te = [] # 读取文章的doi并写入表格第一行 row_doi = [doi_info[2]] for j in range(len(df[p].columns) - 1): row_doi.append('') table_te.append(row_doi) # 将title加入表格其中 row_title = [table_name[p]] for j in range(len(df[p].columns) - 1): row_title.append('') table_te.append(row_title) table_te.append(list(df[p].columns)) for i in range(len(df[p])): # te.append() table_te.append(list(df[p].iloc[i])) fa = pd.DataFrame(data=table_te) # 写入excel sheet_name = 'table' + str(p + 1) # 写入excel时不加index和columns fa.to_excel(writer, sheet_name=sheet_name, header=None, index=False) else: self.log_wp.print_log(" Cannot find table in this file: " + self.file_name(doi_info[2]) + '.html') elif doi_info[1] in "ASME International": page = BeautifulSoup(rt, 'lxml') # 利用BeautifulSoup取得网页代码 # 将html转为纯文本, header=None, index_col=None try: df = pd.read_html(rt) except Exception as e: df = 0 if df != 0: table_name = [] for page1 in page.find_all('div'): name = page1.text if 'Table' in name[0:5]: if ' ' in name[-1]: table_name.append(name) if len(table_name) != 0: self.log_wp.print_log("complete!") del df[0:len(df):2] with pd.ExcelWriter(path) as writer: # 爬取HTML内容 for p in range(len(df)): # 读取table的title并写入dataframe首行 table_te = [] # 读取文章的doi并写入表格第一行 row_doi = [doi_info[2]] for j in range(len(df[p].columns) - 1): row_doi.append('') table_te.append(row_doi) # 将title加入表格其中 row_title = [] try: row_title.append(table_name[p]) for j in range(len(df[p].columns) - 1): row_title.append('') table_te.append(row_title) table_te.append(list(df[p].columns)) for i in 
range(len(df[p])): # te.append() table_te.append(list(df[p].iloc[i])) fa = pd.DataFrame(data=table_te) # write to excel sheet_name = 'table' + str(p + 1) # write to excel without index or columns fa.to_excel(writer, sheet_name=sheet_name, header=None, index=False) except Exception as e: print("An error exists:", e) else: table_name = [] self.log_wp.print_log(" Cannot find table in this file: " + self.file_name(doi_info[2]) + '.html') elif doi_info[1] in [ "Springer", "Nature Publishing Group", "Wiley Blackwell" ]: page = BeautifulSoup(rt, 'lxml') # parse the page source with BeautifulSoup # convert the html to plain tables, header=None, index_col=None table_name = [] for page1 in page.find_all('header'): name = page1.text if 'Table' in name: name = ' '.join(name.split()) table_name.append(name.replace('\n', '')) try: if len(table_name) != 0: df = pd.read_html(rt) with pd.ExcelWriter(path) as writer: # crawl the HTML content for p in range(len(df)): # read the table title and write it into the first row of the dataframe table_te = [] # write the article doi into the first row of the table row_doi = [doi_info[2]] for j in range(len(df[p].columns) - 1): row_doi.append('') table_te.append(row_doi) # add the title into the table row_title = [table_name[p]] for j in range(len(df[p].columns) - 1): row_title.append('') table_te.append(row_title) table_te.append(list(df[p].columns)) for i in range(len(df[p])): # te.append() table_te.append(list(df[p].iloc[i])) fa = pd.DataFrame(data=table_te) # write to excel sheet_name = 'table' + str(p + 1) # write to excel without index or columns fa.to_excel(writer, sheet_name=sheet_name, header=None, index=False) else: self.log_wp.print_log(" Cannot find table in this file: " + self.file_name(doi_info[2]) + '.html') except Exception as e: self.log_wp.print_log(" Cannot find table in this file: " + self.file_name(doi_info[2]) + '.html') else: table_name = [] self.log_wp.print_log("This doi belongs to other databases!") return table_name def run(self): html_list = os.listdir(self.doi_path) for html_file in html_list: path = self.doi_path + '/' + html_file rt, html_name = self.load_html(path) excel_name = self.file_name(html_name) output_path = self.output_path + '/' + '%s.xlsx' table_name = self.get_table_html(self.doi_info(html_name), rt, output_path % excel_name)
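# --- Example (not part of the original classes) ----------------------------------
# A minimal sketch of the sheet-writing pattern repeated in every publisher branch
# above: prepend a DOI row and a caption row, then the header and body of the parsed
# table, and write the result to its own Excel sheet. The helper name
# write_table_with_meta and the commented usage values are illustrative assumptions.
import pandas as pd

def write_table_with_meta(writer, df, doi, caption, sheet_name):
    """Write one parsed table to `writer`, preceded by a DOI row and a caption row."""
    n_cols = len(df.columns)
    rows = [[doi] + [''] * (n_cols - 1),      # first row: article DOI
            [caption] + [''] * (n_cols - 1),  # second row: table caption / title
            list(df.columns)]                 # third row: original column header
    rows.extend(list(df.iloc[i]) for i in range(len(df)))
    pd.DataFrame(rows).to_excel(writer, sheet_name=sheet_name, header=None, index=False)

# Usage sketch, mirroring the loops in get_table()/get_table_html():
#   with pd.ExcelWriter('table.xlsx') as writer:
#       for p, table_df in enumerate(pd.read_html(rt)):
#           write_table_with_meta(writer, table_df, doi_info[2], table_name[p], 'table' + str(p + 1))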
class GetTableInfoFromHtml: def __init__(self, html_path, output_path): self.html_path = html_path self.output_path = output_path self.log_wp = LogWp() def get_all_url(self, url): """ return all url on the page :param url:url of one page :return: all url as list """ headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0)\ Gecko/20100101 Firefox/23.0'} req = urllib.request.Request(url=url, headers=headers) html = urllib.request.urlopen(req).read().decode("utf-8") soup = BeautifulSoup(html, features='html.parser') tags = soup.find_all('a') all_url = [] for tag in tags: all_url.append(str(tag.get('href')).strip()) return all_url def get_table_url(self, doi_info): """ return table url on the page for Springer and Nature Publishing Group param url:doi_info of article return: table url as list """ all_url = self.get_all_url(doi_info[0]) table_url = [] for i in all_url: if "table" in i: if 'article' in i: self.log_wp.print_log(str(i)) if doi_info[1] in "Springer": table_url.append('https://link.springer.com' + i) else: table_url.append('https://www.nature.com' + i) if len(table_url) == 0: self.log_wp.print_log("There is no table url in this article!") self.log_wp.print_log(str(table_url)) return table_url def doi_info(self, doi_str): """ get url and database of doi param doi_str: doi as str return: doi_info=[doi_url,doi_database] doi_url: str """ global doi_url doi_info = [] if doi_str[0:7] in "10.1016": doi_url = "https://doi.org/" + doi_str doi_database = "Elsevier" elif doi_str[0:7] in ["10.1007", "10.1361", "10.1023"]: doi_url = "https://link.springer.com/article/" + doi_str doi_database = "Springer" elif doi_str[0:7] in "10.1080": doi_url = "https://doi.org/" + doi_str doi_database = "Taylor & Francis Online" elif doi_str[0:7] in ["10.1002", "10.1111"]: doi_url = "https://onlinelibrary.wiley.com/doi/" + doi_str doi_database = "Wiley Blackwell" elif doi_str[0:7] in "10.1115": doi_url = "https://doi.org/" + doi_str doi_database = "ASME International" elif doi_str[0:7] in "10.3390": all_url = self.get_all_url("https://doi.org/" + doi_str) for url_str in all_url: if "htm" in url_str: doi_url = "https://www.mdpi.com/" + url_str break doi_database = "MDPI" elif doi_str[0:7] == "10.1038": doi_url = "https://doi.org/" + doi_str doi_database = "Nature Publishing Group" else: doi_url = "other URL" doi_database = "other database" doi_info.append(doi_url) doi_info.append(doi_database) doi_info.append(doi_str) return doi_info def filename(self, name): st = '\|/:?*<>;' for s in st: if s in name: name = name.replace(s, '-') return name def get_table(self, doi_info, path=r'table.xlsx'): """ get all table name from the page, param doi_info: [doi_url,1doi_info_name]str param path: r""str return """ table_name = [] if doi_info[1] in ['Springer', 'Nature Publishing Group']: table_url = self.get_table_url(doi_info) if len(table_url) != 0: with pd.ExcelWriter(path) as writer: for p in range(len(table_url)): time.sleep(1) self.log_wp.print_log("Start crawling the page") r = requests.get(table_url[p]) rt = r.text try: df = pd.read_html(rt) self.log_wp.print_log("complete!") except Exception as e: print(e) self.log_wp.print_log('format of table ' + str(p) + ' is PDF') continue start = rt.find("<h1") end = rt.rfind("</h1>") title_str = '' for i in range(start, end + 5): title_str += rt[i] title_start = title_str.find("Table") title_end = title_str.find("</h1>") title = '' for j in range(title_start, title_end): self.log_wp.print_log(str(title_str[j])) title += title_str[j] table_name.append(title) 
table_te = [] row_doi = [doi_info[2]] for j in range(len(df[0].columns) - 1): row_doi.append('') table_te.append(row_doi) row_title = list() row_title.append(title) for j in range(len(df[0].columns) - 1): row_title.append('') table_te.append(row_title) table_te.append(list(df[0].columns)) for i in range(len(df[0])): table_te.append(list(df[0].iloc[i])) df[0] = pd.DataFrame(data=table_te) sheet_name = 'table' + str(p + 1) df[0].to_excel(writer, sheet_name=sheet_name, header=None, index=False) else: self.log_wp.print_log(" Cannot find table in this page: " + self.filename(doi_info[0])) elif doi_info[1] in 'Taylor & Francis Online': rt = self.get_rt(doi_info[0]) page = BeautifulSoup(rt, 'lxml') table_name = [] for page1 in page.find_all('b'): name = page1.text if 'Table' in name: table_name.append(name) del table_name[int(len(table_name) / 2):len(table_name)] count = 0 for t in table_name: if 'Table 1' in t: count += 1 if count > 1: del table_name[1:(len(table_name)):2] if len(table_name) != 0: with pd.ExcelWriter(path) as writer: for p in range(len(table_name)): df = pd.read_html(rt) table_te = [] row_doi = [doi_info[2]] for j in range(len(df[p].columns) - 1): row_doi.append('') table_te.append(row_doi) row_title = list() row_title.append(table_name[p]) for j in range(len(df[p].columns) - 1): row_title.append('') table_te.append(row_title) table_te.append(list(df[p].columns)) for i in range(len(df[p])): table_te.append(list(df[p].iloc[i])) df = pd.DataFrame(data=table_te) sheet_name = 'table' + str(p + 1) df.to_excel(writer, sheet_name=sheet_name, header=None, index=False) else: self.log_wp.print_log(" Cannot find table in this page: " + self.filename(doi_info[0])) elif doi_info[1] in 'MDPI': rt = self.get_rt(doi_info[0]) page = BeautifulSoup(rt, 'lxml') table_name = [] for page1 in page.find_all('caption'): name = page1.text name = name.replace('\n', '') table_name.append(name) self.log_wp.print_log(str(table_name)) if len(table_name) != 0: with pd.ExcelWriter(path) as writer: time.sleep(1) self.log_wp.print_log("Start crawling the page") r = requests.get(doi_info[0]) rt = r.text df = pd.read_html(rt) self.log_wp.print_log("complete!") for p in range(len(table_name)): table_te = [] row_doi = [doi_info[2]] for j in range(len(df[p].columns) - 1): row_doi.append('') table_te.append(row_doi) row_title = list() row_title.append(table_name[p]) for j in range(len(df[p].columns) - 1): row_title.append('') table_te.append(row_title) table_te.append(list(df[p].columns)) for i in range(len(df[p])): table_te.append(list(df[p].iloc[i])) fa = pd.DataFrame(data=table_te) sheet_name = 'table' + str(p + 1) fa.to_excel(writer, sheet_name=sheet_name, header=None, index=False) else: self.log_wp.print_log(" Cannot find table in this page: " + self.filename(doi_info[0])) elif doi_info[1] in "ASME International": import re rt = self.get_rt(doi_info[0]) page = BeautifulSoup(rt, 'lxml') table_name = [] for page1 in page.find_all('div'): name = page1.text if 'Table' in name[0:5]: if ' ' in name[-1]: table_name.append(name) if len(table_name) != 0: df = pd.read_html(rt) self.log_wp.print_log("complete!") del df[0:len(df):2] with pd.ExcelWriter(path) as writer: for p in range(len(df)): table_te = [] row_doi = [doi_info[2]] for j in range(len(df[p].columns) - 1): row_doi.append('') table_te.append(row_doi) row_title = list() row_title.append(table_name[p]) for j in range(len(df[p].columns) - 1): row_title.append('') table_te.append(row_title) table_te.append(list(df[p].columns)) for i in range(len(df[p])): 
table_te.append(list(df[p].iloc[i])) fa = pd.DataFrame(data=table_te) sheet_name = 'table' + str(p + 1) fa.to_excel(writer, sheet_name=sheet_name, header=None, index=False) else: self.log_wp.print_log(" Cannot find table in this page: " + doi_info[0]) elif doi_info[1] in "Wiley Blackwell": rt = self.get_rt(doi_info[0]) page = BeautifulSoup(rt, 'lxml') table_name = [] for page1 in page.find_all('header'): name = page1.text if 'Table' in name: name = ' '.join(name.split()) table_name.append(name.replace('\n', '')) try: if len(table_name) != 0: df = pd.read_html(rt) with pd.ExcelWriter(path) as writer: for p in range(len(df)): table_te = [] row_doi = [doi_info[2]] for j in range(len(df[p].columns) - 1): row_doi.append('') table_te.append(row_doi) row_title = list() row_title.append(table_name[p]) for j in range(len(df[p].columns) - 1): row_title.append('') table_te.append(row_title) table_te.append(list(df[p].columns)) for i in range(len(df[p])): table_te.append(list(df[p].iloc[i])) fa = pd.DataFrame(data=table_te) sheet_name = 'table' + str(p + 1) fa.to_excel(writer, sheet_name=sheet_name, header=None, index=False) else: self.log_wp.print_log(" Cannot find table in this page: " + doi_info[0]) except Exception as e: print(e) self.log_wp.print_log(" Cannot find table in this page: " + doi_info[0]) else: self.log_wp.print_log("Please try another function!") return table_name def get_rt(self, url): """ param url:str return """ headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'} time.sleep(1) self.log_wp.print_log("Start crawling the page") r = requests.get(url, headers=headers) r.encoding = 'utf-8' rt = r.text self.log_wp.print_log("complete!") return rt def load_doi(self, path): import xlrd data = xlrd.open_workbook(path) table = data.sheet_by_index(0) doi_li = [] for row in range(table.nrows): table_doi = table.row_values(row, start_colx=0, end_colx=None)[0] doi_li.append(table_doi) return doi_li def get_html(self, url): headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'} req = urllib.request.Request(url=url, headers=headers) html = urllib.request.urlopen(req).read() return html def save_html(self, file_name, file_content): with open(file_name + ".html", "wb") as f: f.write(file_content) def down_html(self, doi_li, path=''): for s in range(len(doi_li)): name = self.filename(doi_li[s]) url_te = self.doi_info(doi_li[s])[0] html = self.get_html(url_te) self.save_html(path + name, html) self.log_wp.print_log('html_' + str(s + 1) + " download completed!") def doi_renamed(self, html_name): name = html_name[0:-5] if len(html_name) > 7: name = self.str_sub(name, 7, '/') else: self.log_wp.print_log('Your file name is wrong!') return name def str_sub(self, string, p, c): new = [] for s in string: new.append(s) new[p] = c return ''.join(new) def load_html(self, html_input_path): html_name = self.doi_renamed(os.path.basename(html_input_path)) f = open(html_input_path, "r", encoding="utf-8") ft = f.read() return ft, html_name def get_table_html(self, doi_info, rt, path=r'table.xlsx'): """ get all table name from the page, param doi_info: [doi_url, doi_database, doi_str] param rt: requests.get(url).text param path: str return """ if doi_info[1] in 'Taylor & Francis Online': page = BeautifulSoup(rt, 'lxml') table_name = [] for page1 in page.find_all('b'): name = page1.text if 'Table' in name: table_name.append(name) del table_name[int(len(table_name) / 2):len(table_name)] count = 0 for t in table_name: if 'Table 1' in 
t: count += 1 if count > 1: del table_name[1:(len(table_name)):2] if len(table_name) != 0: with pd.ExcelWriter(path) as writer: for p in range(len(table_name)): df = pd.read_html(rt) table_te = [] row_doi = [doi_info[2]] for j in range(len(df[p].columns) - 1): row_doi.append('') table_te.append(row_doi) row_title = list() row_title.append(table_name[p]) for j in range(len(df[p].columns) - 1): row_title.append('') table_te.append(row_title) table_te.append(list(df[p].columns)) for i in range(len(df[p])): table_te.append(list(df[p].iloc[i])) df = pd.DataFrame(data=table_te) sheet_name = 'table' + str(p + 1) df.to_excel(writer, sheet_name=sheet_name, header=None, index=False) else: self.log_wp.print_log(" Cannot find table in this file: " + self.filename(doi_info[2]) + '.html') elif doi_info[1] in 'MDPI': page = BeautifulSoup(rt, 'lxml') table_name = [] for page1 in page.find_all('caption'): name = page1.text name = name.replace('\n', '') table_name.append(name) self.log_wp.print_log(str(table_name)) if len(table_name) != 0: with pd.ExcelWriter(path) as writer: df = pd.read_html(rt) for p in range(len(table_name)): table_te = [] row_doi = [doi_info[2]] for j in range(len(df[p].columns) - 1): row_doi.append('') table_te.append(row_doi) row_title = list() row_title.append(table_name[p]) for j in range(len(df[p].columns) - 1): row_title.append('') table_te.append(row_title) table_te.append(list(df[p].columns)) for i in range(len(df[p])): table_te.append(list(df[p].iloc[i])) fa = pd.DataFrame(data=table_te) sheet_name = 'table' + str(p + 1) fa.to_excel(writer, sheet_name=sheet_name, header=None, index=False) else: self.log_wp.print_log(" Cannot find table in this file: " + self.filename(doi_info[2]) + '.html') elif doi_info[1] in "ASME International": page = BeautifulSoup(rt, 'lxml') table_name = [] for page1 in page.find_all('div'): name = page1.text if 'Table' in name[0:5]: if ' ' in name[-1]: table_name.append(name) if len(table_name) != 0: df = pd.read_html(rt) self.log_wp.print_log("complete!") del df[0:len(df):2] with pd.ExcelWriter(path) as writer: for p in range(len(df)): table_te = [] row_doi = [doi_info[2]] for j in range(len(df[p].columns) - 1): row_doi.append('') table_te.append(row_doi) row_title = list() row_title.append(table_name[p]) for j in range(len(df[p].columns) - 1): row_title.append('') table_te.append(row_title) table_te.append(list(df[p].columns)) for i in range(len(df[p])): table_te.append(list(df[p].iloc[i])) fa = pd.DataFrame(data=table_te) sheet_name = 'table' + str(p + 1) fa.to_excel(writer, sheet_name=sheet_name, header=None, index=False) else: self.log_wp.print_log(" Cannot find table in this file: " + self.filename(doi_info[2]) + '.html') elif doi_info[1] in ["Springer", "Nature Publishing Group", "Wiley Blackwell"]: page = BeautifulSoup(rt, 'lxml') table_name = [] for page1 in page.find_all('header'): name = page1.text if 'Table' in name: name = ' '.join(name.split()) table_name.append(name.replace('\n', '')) try: if len(table_name) != 0: df = pd.read_html(rt) with pd.ExcelWriter(path) as writer: for p in range(len(df)): table_te = [] row_doi = [doi_info[2]] for j in range(len(df[p].columns) - 1): row_doi.append('') table_te.append(row_doi) row_title = list() row_title.append(table_name[p]) for j in range(len(df[p].columns) - 1): row_title.append('') table_te.append(row_title) table_te.append(list(df[p].columns)) for i in range(len(df[p])): table_te.append(list(df[p].iloc[i])) fa = pd.DataFrame(data=table_te) sheet_name = 'table' + str(p + 1) fa.to_excel(writer, 
sheet_name=sheet_name, header=None, index=False) else: self.log_wp.print_log(" Cannot find table in this file: " + self.filename(doi_info[2]) + '.html') except Exception as e: print(e) self.log_wp.print_log(" Cannot find table in this file: " + self.filename(doi_info[2]) + '.html') else: table_name = [] self.log_wp.print_log("This doi belongs to other databases!") return table_name def run(self): html_list = os.listdir(self.html_path) for html_file in html_list: path = os.path.join(self.html_path, html_file) rt, html_name = self.load_html(path) excel_name = self.filename(html_name) output_path = self.output_path + '/' + '%s.xlsx' table_name = self.get_table_html(self.doi_info(html_name), rt, output_path % excel_name)
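# --- Example (not part of the original classes) ----------------------------------
# A minimal sketch of the DOI <-> filename convention used by down_html() and
# load_html() above: characters that Windows forbids in file names are replaced
# with '-' when a page is cached, and the '/' after the 7-character DOI prefix is
# restored (cf. str_sub(name, 7, '/')) when the .html file is loaded again.
# The two helper names and the example DOI are illustrative assumptions only.
def doi_to_filename(doi):
    for ch in '\\|/:?*<>;':      # forbidden characters, as in filename()
        doi = doi.replace(ch, '-')
    return doi + '.html'

def filename_to_doi(html_name):
    name = html_name[:-5]        # strip '.html'
    # restore the separator between DOI prefix and suffix, as doi_renamed() does
    return name[:7] + '/' + name[8:] if len(name) > 7 else name

# e.g. '10.1007/example-doi' -> '10.1007-example-doi.html' -> '10.1007/example-doi'
assert filename_to_doi(doi_to_filename('10.1007/example-doi')) == '10.1007/example-doi'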
class AcquireTargetInfo: def __init__(self, c_path, origin_text_path, prop_list, excels_path, out_path): self.c_path = c_path self.prop_list = prop_list self.origin_text_path = origin_text_path self.excels_path = excels_path self.dict_info = Dictionary(self.c_path) self.out_path = out_path self.log_wp = LogWp() def mkdir(self, file_name): pathd = os.getcwd() + '\\' + file_name if os.path.exists(pathd): for root, dirs, files in os.walk(pathd, topdown=False): for name in files: os.remove(os.path.join(root, name)) for name in dirs: os.rmdir(os.path.join(root, name)) os.rmdir(pathd) os.mkdir(pathd) def get_doi_fromtxt(self, txt_path): text_name = txt_path.replace(".txt", "") doi = text_name.replace("-", "/", 1) return doi def get_abrre(self, text, prop_name): processor = TPreProcessor(text, prop_name, self.c_path) text = processor.processor() sentences = nltk.sent_tokenize(text) sentences_split = text.split(" ") alloy_write_type = self.dict_info.alloy_writing_type len_type = len(alloy_write_type) abbre_to_alloy = {} for sent in sentences: processor = TPreProcessor(sent, prop_name, self.c_path) filter_data = processor.processor() words = nltk.word_tokenize(filter_data) for word in words: for type_i in range(0, len_type): outcome = re.findall(alloy_write_type[type_i], word) outcome_alloy = None if outcome: abbre = "(" + word + ")" if abbre in sentences_split: index_alloy = sentences_split.index(abbre) - 1 alloy = sentences_split[index_alloy] for type_j in range(0, len_type): outcome_alloy = re.findall( alloy_write_type[type_j], alloy) if outcome_alloy: abbre_to_alloy[word] = alloy break if outcome_alloy: break return abbre_to_alloy def get_text_triple(self, prop_name): self.mkdir('output_tt') text_path = r"output_tt\full_text" self.mkdir(text_path) ft = FilterText(self.origin_text_path, text_path) txt_name = ft.process() length = len(os.listdir(self.origin_text_path)) all_txt_info = [] for i in range(0, length): n_path = os.listdir(self.origin_text_path)[i] doi = self.get_doi_fromtxt(n_path) file = open(text_path + '/' + n_path, 'r', encoding='utf-8') data = file.read() pre_processor = PreProcessor(data, self.c_path) filter_txt = pre_processor.pre_processor() file_origin = open(self.origin_text_path + '/' + n_path, 'r', encoding='utf-8') data_origin = file_origin.read() abbre_pairs = self.get_abrre(data_origin, prop_name) positioner = SentencePositioner(filter_txt, prop_name, self.c_path) target_sents = positioner.target_sent() for index, sent in target_sents.items(): processor = TPreProcessor(sent, prop_name, self.c_path) filter_data = processor.processor() parse = PhraseParse(filter_data, prop_name, self.c_path) sub_order, sub_id, object_list = parse.alloy_sub_search() ree = RelationExtraciton(prop_name, filter_data, sub_order, sub_id, object_list, self.c_path, abbre_pairs) all_outcome = ree.triple_extraction() if all_outcome: for id_m, info in all_outcome.items(): sole_info = dict() sole_info['doi'] = doi sole_info['material'] = info[0] sole_info['prop_name'] = info[1] sole_info['prop_value'] = info[2] all_txt_info.append(sole_info) return all_txt_info def gather_tableinfo_textinfo(self, all_txt_info, table_info, prop_name, prop_pattern, unit_pattern_text): gather_outcome = list() for file_name, t_info in table_info.items(): sole_info = dict() all_triple_info = list() sole_doi = list() for sole_m_info in t_info: triple_info = dict() triple_info['source'] = 'table' if 'doi' in sole_m_info.keys(): plus_doi = sole_m_info['doi'] sole_doi.append(plus_doi) sole_m_info.pop('doi') if 'material' in 
sole_m_info.keys(): sole_material = sole_m_info['material'] noisy = re.findall('\s*\[.+\]', str(sole_material)) if noisy: for puc in noisy: sole_material = str(sole_material).replace(puc, '') triple_info['material'] = sole_material sole_m_info.pop('material') if 'unit' in sole_m_info.keys(): sole_unit = sole_m_info['unit'] triple_info['unit'] = sole_unit sole_m_info.pop('unit') if 'other_info' in sole_m_info.keys(): sole_other_info = sole_m_info['other_info'] triple_info['other_prop_info'] = sole_other_info sole_m_info.pop('other_info') if 'child_tag' in sole_m_info.keys(): sole_tag_info = sole_m_info['child_tag'] triple_info['child_tag'] = sole_tag_info sole_m_info.pop('child_tag') if 'table_topic' in sole_m_info.keys(): sole_tag_info = sole_m_info['table_topic'] triple_info['table_topic'] = sole_tag_info sole_m_info.pop('table_topic') if len(sole_m_info) == 1: for prop_name_t, value in sole_m_info.items(): sole_propname = str(prop_name_t) triple_info['prop_name'] = sole_propname sole_value = str(value) triple_info['value'] = sole_value elif len(sole_m_info) >= 1: get_prop = None for prop_name_t, value in sole_m_info.items(): for pattern in prop_pattern[prop_name]: prop_search = re.findall(pattern, str(prop_name_t)) if prop_search: sole_propname = str(prop_name_t) triple_info['prop_name'] = sole_propname sole_value = str(value) triple_info['value'] = sole_value get_prop = True break if get_prop: break all_triple_info.append(triple_info) if list(set(sole_doi)): sole_info[list(set(sole_doi))[0]] = all_triple_info gather_outcome.append(sole_info) gather = 0 for q in gather_outcome: k = tuple(q.keys())[0] i = q[k] for n in i: for w, v in n.items(): if w == 'value': gather += 1 self.log_wp.print_log("gather number :%s", gather) copy_all_txt_info = copy.copy(all_txt_info) if copy_all_txt_info: all_text = 0 all_gather_doi = [] for info_one in gather_outcome: all_gather_doi.append(tuple(info_one.keys())[0]) for triple_info_sole in copy_all_txt_info: if triple_info_sole['doi'] in all_gather_doi: all_text += 1 plus_info = dict() plus_info['source'] = 'text' plus_info['prop_name'] = triple_info_sole['prop_name'] prop_value = triple_info_sole['prop_value'] plus_info['material'] = triple_info_sole['material'] unit_search = re.findall(unit_pattern_text[prop_name], str(prop_value)) if unit_search: plus_info['unit'] = unit_search[0] prop_value = prop_value.replace(unit_search[0], '') plus_info['value'] = prop_value else: plus_info['unit'] = "" plus_info['value'] = prop_value for get_info in gather_outcome: if tuple( get_info.keys())[0] == triple_info_sole['doi']: get_info[triple_info_sole['doi']].append(plus_info) if triple_info_sole['doi'] not in all_gather_doi: all_text += 1 plus_info = {} full_info = {} sole_triple = [] plus_info['source'] = 'text' plus_info['prop_name'] = triple_info_sole['prop_name'] prop_value = triple_info_sole['prop_value'] plus_info['material'] = triple_info_sole['material'] unit_search = re.findall(unit_pattern_text[prop_name], str(prop_value)) if unit_search: plus_info['unit'] = unit_search[0] prop_value = prop_value.replace(unit_search[0], '') plus_info['value'] = prop_value else: plus_info['unit'] = "" plus_info['value'] = prop_value if plus_info: sole_triple.append(plus_info) full_info[triple_info_sole['doi']] = sole_triple gather_outcome.append(full_info) all_gather_doi.append(triple_info_sole['doi']) self.log_wp.print_log("all_text number :%s", all_text) return gather_outcome def transform_comp_outcome(self, all_composition): ele_list = self.dict_info.ele_list 
gather_outcome = [] for file_name, t_info in all_composition.items(): sole_info = {} all_triple_info = [] for sole_m_info in t_info: triple_info = {} sole_doi = sole_m_info['doi'] sole_m_info.pop('doi') if 'material' in sole_m_info.keys(): sole_material = sole_m_info['material'] noisy = re.findall('\[.+\]', str(sole_material)) if noisy: for puc in noisy: sole_material = str(sole_material).replace(puc, '') triple_info['material'] = sole_material sole_m_info.pop('material') for element in ele_list: if element in sole_m_info.keys(): triple_info[element] = sole_m_info[element] sole_m_info.pop(element) if sole_m_info: triple_info["other_eleinfo"] = sole_m_info all_triple_info.append(triple_info) sole_info[sole_doi] = all_triple_info gather_outcome.append(sole_info) return gather_outcome def allinfo_dependencyparse(self, comp_info, prop_info): all_ele_doi = [] all_prop_doi = [] outcome = [] for doi_info_ele in comp_info: ele_doi = tuple(doi_info_ele.keys())[0] all_ele_doi.append(ele_doi) for doi_info_prop in prop_info: prop_doi = tuple(doi_info_prop.keys())[0] all_prop_doi.append(prop_doi) prop_info_modified = copy.copy(prop_info) for doi_info_ele in comp_info: ele_doi = tuple(doi_info_ele.keys())[0] if ele_doi in all_prop_doi: for doi_info_prop in prop_info: prop_doi = tuple(doi_info_prop.keys())[0] plus_info = {} all_doi_info = [] if ele_doi == prop_doi: if doi_info_prop in prop_info_modified: prop_info_modified.remove(doi_info_prop) ele_doi_fullinfo = doi_info_ele[ele_doi] ele_allname = [] prop_allname = [] pop_name = [] for one_material_ele in ele_doi_fullinfo: if 'material' in one_material_ele.keys(): ele_m_name = one_material_ele['material'] ele_allname.append(ele_m_name) modified_ele_allname = [] for name in ele_allname: space_search = re.findall('\s', str(name)) if space_search: name_list = str(name).split() modified_ele_allname.append(str(name)) for name_sepe in name_list: modified_ele_allname.append(name_sepe) else: modified_ele_allname.append(name) for one_material_prop in doi_info_prop[prop_doi]: if 'material' in one_material_prop.keys(): prop_m_name = one_material_prop['material'] prop_allname.append(prop_m_name) if prop_m_name not in modified_ele_allname and len( ele_doi_fullinfo) == 1: if one_material_prop['source'] == 'table': combine_info = {} for prop_name, prop_value in one_material_prop.items( ): combine_info[ prop_name] = prop_value for ele_name, ele_value in ele_doi_fullinfo[ 0].items(): combine_info[ele_name] = ele_value all_doi_info.append(combine_info) else: all_doi_info.append(one_material_prop) if prop_m_name not in modified_ele_allname and len( ele_doi_fullinfo) != 1: all_doi_info.append(one_material_prop) if prop_m_name in modified_ele_allname: for one_material_ele in ele_doi_fullinfo: if 'material' in one_material_ele.keys( ): ele_m_name = one_material_ele[ 'material'] space_search = re.findall( '\s', str(ele_m_name)) if space_search: ele_m_name_split = ele_m_name.split( ) if prop_m_name in ele_m_name_split or prop_m_name == ele_m_name: pop_name.append(ele_m_name) combine_info = {} for prop_name, prop_value in one_material_prop.items( ): combine_info[ prop_name] = prop_value for ele_name, ele_value in one_material_ele.items( ): combine_info[ ele_name] = ele_value all_doi_info.append( combine_info) else: if prop_m_name == ele_m_name: combine_info = {} for prop_name, prop_value in one_material_prop.items( ): combine_info[ prop_name] = prop_value for ele_name, ele_value in one_material_ele.items( ): combine_info[ ele_name] = ele_value all_doi_info.append( combine_info) 
for one_material_ele in ele_doi_fullinfo: if 'material' in one_material_ele.keys(): ele_m_name = one_material_ele['material'] if ele_m_name not in pop_name: if ele_m_name not in prop_allname: all_doi_info.append(one_material_ele) if all_doi_info: plus_info[ele_doi] = all_doi_info outcome.append(plus_info) else: outcome.append(doi_info_ele) for extra_prop in prop_info_modified: outcome.append(extra_prop) return outcome def structureinfo_toexcel(self, all_structureinfo, out_path): ele_list = self.dict_info.ele_list xls = openpyxl.Workbook() sht = xls.create_sheet("0") sht = xls.create_sheet(index=0) sht.cell(1, 1, "Source") sht.cell(1, 2, "DOIs") sht.cell(1, 3, "table_topic") sht.cell(1, 4, "material") sht.cell(1, 5, "Property_name") sht.cell(1, 6, "Property_value") sht.cell(1, 7, "Unit") col_n = 8 row_now = 2 sht.cell(1, col_n, str("other_element_info")) col_n += 1 sht.cell(1, col_n, str("other_property_info")) col_n += 1 sht.cell(1, col_n, str("child_tag")) col_n += 1 for ele in ele_list: sht.cell(1, col_n, ele) col_n += 1 for m_info in all_structureinfo: doi = tuple(m_info.keys())[0] length_m_info = m_info[doi] for index_m in range(len(length_m_info)): sht.cell(row_now, 2, doi) material_now = length_m_info[index_m] if 'source' in material_now.keys(): sht.cell(row_now, 1, str(material_now['source'])) if 'table_topic' in material_now.keys(): sht.cell(row_now, 3, str(material_now['table_topic'])) if 'material' in material_now.keys(): sht.cell(row_now, 4, str(material_now['material'])) if 'prop_name' in material_now.keys(): sht.cell(row_now, 5, str(material_now['prop_name'])) if 'value' in material_now.keys(): sht.cell(row_now, 6, str(material_now['value'])) if 'unit' in material_now.keys(): sht.cell(row_now, 7, str(material_now['unit'])) if "other_eleinfo" in material_now.keys(): sht.cell(row_now, 8, str(material_now['other_eleinfo'])) if "other_prop_info" in material_now.keys(): sht.cell(row_now, 9, str(material_now['other_prop_info'])) if "child_tag" in material_now.keys(): sht.cell(row_now, 10, str(material_now["child_tag"])) col_ele = 11 for ele in ele_list: if ele in material_now.keys(): sht.cell(row_now, col_ele, material_now[ele]) col_ele += 1 row_now += 1 del xls['Sheet'] self.log_wp.excel_save(xls, out_path) def run(self): prop_pattern = self.dict_info.table_prop_pattern unit_pattern_text = self.dict_info.table_unit_pattern_text for prop_name in self.prop_list: self.mkdir('output_tt') text_path = r"output_tt\full_text" self.mkdir(text_path) all_txt_info = self.get_text_triple(prop_name) target_property = prop_name # 'density' 'liquidus' 'solidus' 'solvus' te = TableExtraction(self.excels_path, self.c_path, prop_name=target_property) info_all = te.property_info_extraction() i_l = 0 for k, v in info_all.items(): i_l += len(v) all_composition = te.composition_triple_extraction() gather_outcome = self.gather_tableinfo_textinfo( all_txt_info, info_all, prop_name, prop_pattern, unit_pattern_text) gather = 0 for q in gather_outcome: k = tuple(q.keys())[0] i = q[k] gather += len(i) ele_transform = self.transform_comp_outcome(all_composition) all_structureinfo = self.allinfo_dependencyparse( ele_transform, gather_outcome) b = 0 for a in all_structureinfo: k = tuple(a.keys())[0] i = a[k] for n in i: for w, v in n.items(): if w == 'value': b += 1 out_path = self.out_path + '/' + str(prop_name) + '.xlsx' self.structureinfo_toexcel(all_structureinfo, out_path)
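# --- Example (not part of the original classes) ----------------------------------
# Usage sketch for the end-to-end step implemented by AcquireTargetInfo.run():
# text-mined triples and table-mined triples are merged per DOI, and one
# <property>.xlsx is written to out_path for each entry of prop_list. All paths
# below are placeholders; only the argument names and order follow
# AcquireTargetInfo.__init__, and the property names come from the comment in run().
if __name__ == '__main__':
    pipeline = AcquireTargetInfo(
        c_path='path/to/config',             # dictionary / pattern configuration
        origin_text_path='data/full_text',   # .txt articles named by DOI ('-' instead of '/')
        prop_list=['density', 'solidus'],    # properties processed one at a time in run()
        excels_path='data/tables',           # table workbooks extracted earlier
        out_path='output')                   # receives output/density.xlsx, output/solidus.xlsx, ...
    pipeline.run()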