import os
import shutil

from file_operation import read_file, write_file


def yz():
    result_path = "./zyz"
    for runs in os.listdir(result_path):
        run_path = result_path + "/" + runs
        for lang in os.listdir(run_path):
            # Prefer the "systems" subdirectory when it exists.
            lang_path = run_path + "/" + lang + "/systems"
            if not os.path.exists(lang_path):
                lang_path = run_path + "/" + lang
            ori_lang_path = run_path + "/" + lang
            for file in os.listdir(lang_path):
                file_path = lang_path + "/"
                print(file_path + file)
                # Rename .temp files to .txt before processing.
                if file.endswith(".temp"):
                    shutil.move(file_path + file, file_path + file.replace(".temp", ".txt"))
                    file = file.replace(".temp", ".txt")
                if lang in ["zh", "ja"]:
                    # Chinese/Japanese: strip the spaces inserted by tokenization.
                    content = read_file(file_path + file)
                    new_content = []
                    for sentence in content:
                        new_content.append(sentence.replace(" ", ""))
                    write_file(new_content, ori_lang_path + "/" + file, False)
                else:
                    content = read_file(file_path + file)
                    write_file(content, ori_lang_path + "/" + file, False)
            # Drop the now-redundant "systems" directory.
            if os.path.exists(run_path + "/" + lang + "/systems"):
                shutil.rmtree(lang_path)
def cjq():
    result_path = "./multiling2017_summarization"
    for runs in os.listdir(result_path):
        run_path = result_path + "/" + runs
        for lang in os.listdir(run_path):
            lang_path = run_path + "/" + lang
            for file in os.listdir(lang_path):
                file_path = lang_path + "/"
                print(file_path + file)
                # Rename .temp files to .txt before processing.
                if file.endswith(".temp"):
                    shutil.move(file_path + file, file_path + file.replace(".temp", ".txt"))
                    file = file.replace(".temp", ".txt")
                if lang in ["zh", "ja"]:
                    # Chinese/Japanese: strip the spaces inserted by tokenization.
                    content = read_file(file_path + file)
                    new_content = []
                    for sentence in content:
                        new_content.append(sentence.replace(" ", ""))
                    write_file(new_content, file_path + file, False)
from file_operation import read_file

# `website` (the search-URL prefix) and `extract_html_code` are assumed to be
# defined elsewhere in this module.


def keywords_cleaning(filename):
    keywords = read_file(filename)
    keywords_url = []
    keywords_filter = []
    # Build one search URL per keyword, showing up to 100 results.
    for i in range(len(keywords)):
        keywords_url.append(website + keywords[i] + '&show=100')
    print("Cleaning Start!")
    for j in range(len(keywords_url)):
        # The page <title> is expected to start with the (comma-formatted)
        # number of matching articles.
        article_num = extract_html_code(keywords_url[j], "//title/text()")
        article_num = article_num[0].split(' ')[0]
        article_num = int(article_num.replace(',', ''))
        if article_num == 0:
            keywords_filter.append(keywords[j])
    # Drop keywords that return no articles, plus two known junk entries.
    for k in keywords_filter:
        keywords.remove(k)
    keywords.remove('code')
    keywords.remove('files')
    return keywords
from file_operation import read_file, write_file
from keywords_clean import Deduplication
import copy

keywords = read_file('Doc_processing/keywords_from_articles.txt')
keywords_origin = read_file('Doc_processing/keywords_clean.txt')
keywords_copy = copy.deepcopy(keywords)
print(len(keywords))

# Normalise a working copy so that near-identical phrases compare equal.
for k in range(len(keywords_copy)):
    keywords_copy[k] = keywords_copy[k].lower()
    keywords_copy[k] = keywords_copy[k].strip()
    keywords_copy[k] = ''.join(e for e in keywords_copy[k] if e.isalnum())

index = []
counts = []
# Record the index of every keyword that is contained in an earlier one.
for i in range(len(keywords_copy)):
    for j in range(i + 1, len(keywords_copy)):
        if keywords_copy[j] in keywords_copy[i]:
            counts.append(j)
index = Deduplication(counts)
# Delete the recorded keywords, from the back so the indices stay valid.
for idx in sorted(index, reverse=True):
    del keywords[idx]
print(len(keywords))
# The code above aims at deleting the same phrases.

for q in range(len(keywords)):
    # print(q, keywords[q])
    keywords[q] = keywords[q].lower()
    keywords[q] = keywords[q].strip()
# import movie_url
# movie_url.get_movie_url(number=120)
import file_operation
import Movie
import connect_mysql

url_list = file_operation.read_file('movie_url.txt')
path = 'movie_url.txt'
n = 0
for url in url_list:
    # Scrape one movie page and its short comments.
    soup = Movie.get_url_html_soup(url)
    movie_id = Movie.get_movie_id(url)
    title = Movie.get_movie_title(soup)
    director = Movie.get_movie_directors(soup)
    screenwriter = Movie.get_movie_screenwriter(soup)
    character = Movie.get_movie_character(soup)
    movie_type = Movie.get_movie_type(soup)
    country = Movie.get_movie_country(soup)
    comment = Movie.get_movie_shortcomment(url)
    # Store the metadata and the comments in MySQL.
    connect_mysql.insert_database(movie_id, title, director, screenwriter,
                                  character, movie_type, country)
    connect_mysql.insert_comment(movie_id, comment)
    n = n + 1
    # Stop after the first 10 movies.
    if n == 10:
        break
import random

import file_operation


def get_proxy():
    # Pick a random IP from ip.txt and build a proxies dict for it.
    ip = file_operation.read_file('ip.txt')
    IP = random.choice(ip)
    proxies = {'http': 'http://' + IP, 'https': 'https://' + IP}
    return proxies
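# A minimal usage sketch, not part of the original code: the proxies dict above
# matches the format the requests library expects, so a request through a random
# proxy could look like this (requests is assumed to be installed, and
# example.com is a placeholder URL).
if __name__ == '__main__':
    import requests
    resp = requests.get('http://example.com', proxies=get_proxy(), timeout=10)
    print(resp.status_code)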
# Step 1 (commented out): extract candidate keywords from the additional article links.
# url_list = read_file('Doc_processing/additional_articles_links.txt')
# res = []
# K = keywords_extraction(url_list)
# K = Deduplication(K)
# for k in K:
#     k = k.lower()
#     k = k.strip()
#     if k not in res:
#         if k not in keywords:
#             res.append(k)
# print(len(res))
# write_file('Doc_processing/additional_keywords.txt', res)

# Step 2: deduplication of new keywords and extraction of articles.
new_keywords = read_file('Doc_processing/additional_keywords.txt')
print(len(new_keywords))
# Generic or irrelevant terms; any keyword containing one of them is dropped.
un_relevant = ['algorithm', 'learning', 'data', 'design', 'calculation', 'neural network',
               'model', 'simulation', 'structure', 'cluster', 'regression', 'system', 'prediction',
               'throughput', 'theory', 'analysis', 'monte carlo', 'function', 'pca', 'comput', 'equation',
               'lead', 'feature extraction', 'technique', 'loop', 'interface', 'software', 'matrix',
               'network', 'drying', 'thermodynamics', 'monte-carlo', 'method', 'popcorn failure',
               'statistics', 'coefficient', 'classification', 'estimation', 'sampling',
               'modul', 'search', 'k-points', 'probability', 'probabilistic', 'dft', 'software', 'matlab',
               'eulerian', 'first-principles', 'gga', 'first principles', 'experiments', 'approach',
               'mbj', 'lsda', 'strategy', 'rbfnns', 'lda', 'gw', 'lmto', 'aim', 'dna', 'gpu', 'pbe',
               'bte', 'fea', 'test', 'rdf', 'cpa', 'grain', 'program', 'cpu', 'measurement', 'newton', 'negf']
for v in un_relevant:
    # Iterate over a copy so that removing items does not skip elements.
    for n in list(new_keywords):
        if v in n:
            new_keywords.remove(n)