import os
import shutil

from file_operation import read_file, write_file


def yz():
    result_path = "./zyz"
    for runs in os.listdir(result_path):
        run_path = result_path + "/" + runs
        for lang in os.listdir(run_path):
            # Prefer the "systems" subdirectory when it exists.
            lang_path = run_path + "/" + lang + "/systems"
            if not os.path.exists(lang_path):
                lang_path = run_path + "/" + lang
            ori_lang_path = run_path + "/" + lang
            for file in os.listdir(lang_path):
                file_path = lang_path + "/"
                print(file_path + file)
                if file.endswith(".temp"):
                    shutil.move(file_path + file, file_path + file.replace(".temp", ".txt"))
                    file = file.replace(".temp", ".txt")
                if lang in ["zh", "ja"]:
                    # Chinese and Japanese are not space-delimited; strip the
                    # spaces left over from tokenization.
                    content = read_file(file_path + file)
                    new_content = []
                    for sentence in content:
                        new_content.append(sentence.replace(" ", ""))
                    write_file(new_content, ori_lang_path + "/" + file, False)
                else:
                    content = read_file(file_path + file)
                    write_file(content, ori_lang_path + "/" + file, False)
            # The files have been copied up one level, so drop "systems".
            if os.path.exists(run_path + "/" + lang + "/systems"):
                shutil.rmtree(lang_path)
def cjq():
    result_path = "./multiling2017_summarization"
    for runs in os.listdir(result_path):
        run_path = result_path + "/" + runs
        for lang in os.listdir(run_path):
            lang_path = run_path + "/" + lang
            for file in os.listdir(lang_path):
                file_path = lang_path + "/"
                print(file_path + file)
                if file.endswith(".temp"):
                    shutil.move(file_path + file, file_path + file.replace(".temp", ".txt"))
                    file = file.replace(".temp", ".txt")
                if lang in ["zh", "ja"]:
                    # Rewrite zh/ja files in place with the spaces removed.
                    content = read_file(file_path + file)
                    new_content = []
                    for sentence in content:
                        new_content.append(sentence.replace(" ", ""))
                    write_file(new_content, file_path + file, False)
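# yz() and cjq() rely on read_file and write_file from the project's
# file_operation module, which is not part of this excerpt. A minimal
# sketch consistent with the calls above (write_file(content, path,
# append_flag)) follows; this is an assumption about the helpers, not
# their confirmed implementation. Note that the crawler scripts below
# call a write_file(path, content) with the arguments reversed, so they
# presumably use a different helper.

def read_file(path):
    # Return the file's lines with trailing newlines stripped.
    with open(path, encoding="utf-8") as f:
        return [line.rstrip("\n") for line in f]


def write_file(content, path, append=False):
    # Write one item per line, appending instead of overwriting on request.
    mode = "a" if append else "w"
    with open(path, mode, encoding="utf-8") as f:
        for line in content:
            f.write(line + "\n")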
# keywords and keywords_origin are loaded earlier in this script (not shown).
for q in range(len(keywords)):
    # print(q, keywords[q])
    keywords[q] = keywords[q].lower()
    keywords[q] = keywords[q].strip()

# Delete the phrases that contain one of the original keywords. Build a
# filtered copy rather than calling remove() while iterating over the
# same list, which silently skips elements.
origin_lower = [l.lower() for l in keywords_origin]
keywords = [m for m in keywords if not any(l in m for l in origin_lower)]
print(len(keywords))

# Delete the phrases that contain a generic, non-material term.
un_relevant = ['algorithm', 'learning', 'data', 'design', 'calculation', 'neural network',
               'model', 'simulation', 'structure', 'cluster', 'regression', 'system', 'prediction',
               'throughput', 'theory', 'analysis', 'monte carlo', 'function', 'pca', 'comput', 'equation',
               'lead', 'feature extraction', 'technique', 'loop', 'interface', 'software', 'matrix',
               'network', 'drying', 'thermodynamics', 'monte-carlo', 'method', 'popcorn failure',
               'statistics', 'coefficient', 'classification', 'estimation', 'sampling',
               'modul', 'search', 'k-points', 'probability', 'probabilistic', 'dft', 'software', 'matlab',
               'eulerian', 'first-principles', 'gga', 'first principles', 'experiments', 'approach',
               'mbj', 'lsda', 'strategy', 'rbfnns', 'lda', 'gw', 'lmto', 'aim', 'dna', 'gpu', 'pbe',
               'bte', 'fea', 'test', 'rdf', 'cpa', 'grain', 'program', 'cpu', 'measurement', 'newton', 'negf']
keywords = [n for n in keywords if not any(v in n for v in un_relevant)]
print(len(keywords))

filename = 'Doc_processing/keywords_filter.txt'
write_file(filename, keywords)
from crawler import extract_html_code
from file_operation import read_file, write_file

website = "https://www.nature.com/search?journal=npjcompumats"
order = "//div//span[@class='text-gray-light']/text()"
page_num = extract_html_code(website, order)

# The counter's last whitespace-separated token is the total number of
# hits; the search shows 50 results per page.
links = []
if page_num:
    page_num = int(page_num[0].split()[-1])
    page_num = page_num // 50 + 1
    for j in range(page_num):
        u = website + "&page=" + str(j + 1)
        links.append(u)

# Collect the article titles from every result page.
article = []
for i in range(len(links)):
    titles = extract_html_code(links[i], "//h2/a")
    for j in range(len(titles)):
        title = titles[j].xpath('string(.)').strip()
        article.append(title)
print(len(article))
write_file('Doc_processing_npj/articels_npj.txt', article)
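# extract_html_code lives in the project's crawler module and is not
# shown in this excerpt. Judging from the calls above, it fetches a URL
# and evaluates an XPath expression over the parsed page. A minimal
# sketch using requests and lxml (both assumptions, not the module's
# confirmed internals):

import requests
from lxml import html


def extract_html_code(url, xpath_query):
    # Fetch the page and return whatever the XPath expression selects:
    # strings for a /text() query, element objects otherwise.
    resp = requests.get(url, timeout=30)
    tree = html.fromstring(resp.content)
    return tree.xpath(xpath_query)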
import urllib.request

from bs4 import BeautifulSoup

import file_operation

header = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/12.1.2 Safari/605.1.15'
}


def page_code(url):
    # Fetch the page with a browser-like User-Agent and return the parsed soup.
    req = urllib.request.Request(url, headers=header)
    webpage = urllib.request.urlopen(req)
    html = webpage.read()
    soup = BeautifulSoup(html, 'html.parser')
    return soup


# url, url2 and ks are defined earlier in this script (not shown here).
soup1 = page_code(url)
for i in soup1.find_all('div', class_='li'):
    key = i.get_text().strip()
    if key not in ks:
        ks.append(key)
keywords = ks[27:]  # the first 27 matches are discarded

soup2 = page_code(url2)
for j in soup2.find_all('dt'):  # alternatively: find_all('dt', string='更多') ("more")
    keyword = j.get_text().strip()
    if keyword not in keywords:
        keywords.append(keyword)

filename = 'Doc_processing/' + 'keywords.txt'
file_operation.write_file(filename, keywords)
# new_keywords, read_file, write_file, get_articles, generate_pages and
# generate_research_url are defined or imported earlier in this script.
print(len(new_keywords))

# Drop phrases that contain a generic, non-material term; filter into a
# copy instead of calling remove() while iterating, which skips elements.
un_relevant = ['algorithm', 'learning', 'data', 'design', 'calculation', 'neural network',
               'model', 'simulation', 'structure', 'cluster', 'regression', 'system', 'prediction',
               'throughput', 'theory', 'analysis', 'monte carlo', 'function', 'pca', 'comput', 'equation',
               'lead', 'feature extraction', 'technique', 'loop', 'interface', 'software', 'matrix',
               'network', 'drying', 'thermodynamics', 'monte-carlo', 'method', 'popcorn failure',
               'statistics', 'coefficient', 'classification', 'estimation', 'sampling',
               'modul', 'search', 'k-points', 'probability', 'probabilistic', 'dft', 'software', 'matlab',
               'eulerian', 'first-principles', 'gga', 'first principles', 'experiments', 'approach',
               'mbj', 'lsda', 'strategy', 'rbfnns', 'lda', 'gw', 'lmto', 'aim', 'dna', 'gpu', 'pbe',
               'bte', 'fea', 'test', 'rdf', 'cpa', 'grain', 'program', 'cpu', 'measurement', 'newton', 'negf']
new_keywords = [n for n in new_keywords if not any(v in n for v in un_relevant)]
print(len(new_keywords))
write_file('Doc_processing/additional_keywords.txt', new_keywords)

articles12 = read_file('Doc_processing/articles.txt') + read_file('Doc_processing/additional articles.txt')
links12 = read_file('Doc_processing/articles_link.txt') + read_file('Doc_processing/additional_articles_links.txt')
print(len(articles12), len(links12))

n_articles, n_links = get_articles(generate_pages(generate_research_url(new_keywords)))

# Keep only the articles whose links have not been collected before.
new_articles = []
new_articles_links = []
for n in range(len(n_links)):
    if n_links[n] not in links12:
        new_articles.append(n_articles[n])
        new_articles_links.append(n_links[n])
print(len(new_articles))
# Tail of contract_list(); the assignments below run inside its loop
# over the quotation items (the head of the function is not part of
# this excerpt).
        Quantity = key_err(item, 'GCL')
        ForeignCode = key_err(item, 'ForeignCode')
        Character = key_err(item, 'Character')
        UnitPrice = key_err(item, 'ComUnitPrice')
        coloum_list.append((
            DataSysNo,
            DataType,
            RowType,
            Quantity,
            DataName,
            DataCateName,
            QuotedBasis,
            Model,
            ForeignCode,
            Character,
            UnitName,
            JobContent,
            Remark,
            UnitPrice,
            TotalPrice,
        ))
    coloum_list.sort()
    return coloum_list


if __name__ == '__main__':
    my_list = contract_list()
    write_file('122121', my_list)
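# key_err reads optional fields from each item dict but is not defined
# in this excerpt. A minimal sketch of what it likely does (an
# assumption, not the confirmed implementation): return the value for
# the key, falling back to a default when the field is absent.

def key_err(item, key, default=''):
    # Return item[key], or the default when the key is missing.
    try:
        return item[key]
    except KeyError:
        return default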