import os
import shutil

from file_operation import read_file, write_file


def yz():
    result_path = "./zyz"
    for runs in os.listdir(result_path):
        run_path = result_path + "/" + runs
        for lang in os.listdir(run_path):
            # Prefer the "systems" subfolder when it exists.
            lang_path = run_path + "/" + lang + "/systems"
            if not os.path.exists(lang_path):
                lang_path = run_path + "/" + lang
            ori_lang_path = run_path + "/" + lang
            for file in os.listdir(lang_path):
                file_path = lang_path + "/"
                print(file_path + file)
                # Rename *.temp files to *.txt before processing.
                if file.endswith(".temp"):
                    shutil.move(file_path + file,
                                file_path + file.replace(".temp", ".txt"))
                    file = file.replace(".temp", ".txt")
                if lang in ["zh", "ja"]:
                    # Chinese/Japanese text should not contain spaces, so strip them.
                    content = read_file(file_path + file)
                    new_content = []
                    for sentence in content:
                        new_content.append(sentence.replace(" ", ""))
                    write_file(new_content, ori_lang_path + "/" + file, False)
                else:
                    content = read_file(file_path + file)
                    write_file(content, ori_lang_path + "/" + file, False)
            # Remove the now-redundant "systems" subfolder once its files are copied out.
            if os.path.exists(run_path + "/" + lang + "/systems"):
                shutil.rmtree(lang_path)


def cjq():
    result_path = "./multiling2017_summarization"
    for runs in os.listdir(result_path):
        run_path = result_path + "/" + runs
        for lang in os.listdir(run_path):
            lang_path = run_path + "/" + lang
            for file in os.listdir(lang_path):
                file_path = lang_path + "/"
                print(file_path + file)
                # Rename *.temp files to *.txt before processing.
                if file.endswith(".temp"):
                    shutil.move(file_path + file,
                                file_path + file.replace(".temp", ".txt"))
                    file = file.replace(".temp", ".txt")
                if lang in ["zh", "ja"]:
                    # Chinese/Japanese text should not contain spaces, so strip them in place.
                    content = read_file(file_path + file)
                    new_content = []
                    for sentence in content:
                        new_content.append(sentence.replace(" ", ""))
                    write_file(new_content, file_path + file, False)
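

# Both functions above rely on read_file/write_file from the file_operation
# module, which is not shown in this example. A minimal sketch of what those
# helpers might look like, assuming read_file returns the file's lines and the
# trailing boolean of write_file is an append flag (both signatures are
# inferred from the calls above, not taken from the original implementation):
def read_file(path):
    # Assumed behavior: return a list of lines without trailing newlines.
    with open(path, encoding="utf-8") as f:
        return [line.rstrip("\n") for line in f]


def write_file(lines, path, append):
    # Assumed behavior: write one entry per line, appending when `append` is True.
    mode = "a" if append else "w"
    with open(path, mode, encoding="utf-8") as f:
        for line in lines:
            f.write(line + "\n")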


def keywords_cleaning(filename):
    # `website` (the search-URL prefix) and `extract_html_code` (an XPath
    # scraper) are expected to be defined elsewhere in this project.
    keywords = read_file(filename)
    keywords_url = []
    keywords_filter = []
    for i in range(len(keywords)):
        keywords_url.append(website + keywords[i] + '&show=100')

    print("Cleaning Start!")
    for j in range(len(keywords_url)):
        # The page title starts with the number of articles matching the keyword.
        article_num = extract_html_code(keywords_url[j], "//title/text()")
        article_num = article_num[0].split(' ')[0]
        article_num = int(article_num.replace(',', ''))

        # Collect keywords that return no articles, then drop them.
        if article_num == 0:
            keywords_filter.append(keywords[j])
    for k in keywords_filter:
        keywords.remove(k)
    keywords.remove('code')
    keywords.remove('files')
    return keywords
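

# keywords_cleaning() calls extract_html_code(url, xpath), which is defined
# elsewhere. A minimal sketch of such a helper, assuming requests plus lxml
# (an illustrative assumption, not the project's actual implementation):
import requests
from lxml import etree


def extract_html_code(url, xpath):
    # Fetch the page and return the nodes/strings matching the XPath expression.
    resp = requests.get(url, timeout=10)
    tree = etree.HTML(resp.text)
    return tree.xpath(xpath)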
Example #4
from file_operation import read_file, write_file
from keywords_clean import Deduplication
import copy

keywords = read_file('Doc_processing/keywords_from_articles.txt')
keywords_origin = read_file('Doc_processing/keywords_clean.txt')
keywords_copy = copy.deepcopy(keywords)
print(len(keywords))
for k in range(len(keywords_copy)):
    keywords_copy[k] = keywords_copy[k].lower()
    keywords_copy[k] = keywords_copy[k].strip()
    keywords_copy[k] = ''.join(e for e in keywords_copy[k] if e.isalnum())

# Record the index of every keyword that is contained in an earlier keyword.
counts = []
for i in range(len(keywords_copy)):
    for j in range(i + 1, len(keywords_copy)):
        if keywords_copy[j] in keywords_copy[i]:
            counts.append(j)
index = Deduplication(counts)

# Delete the duplicated keywords by index, highest first so the remaining
# indices stay valid.
for k in sorted(index, reverse=True):
    del keywords[k]
print(len(keywords))
# The code above removes keywords that duplicate (are contained in) earlier ones.

for q in range(len(keywords)):
    # print(q,keywords[q])
    keywords[q] = keywords[q].lower()
    keywords[q] = keywords[q].strip()
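
# Deduplication() is imported from keywords_clean, which is not shown here. A
# minimal sketch consistent with how it is used above (drop repeated entries
# while preserving order); this is an assumption, not the original implementation:
def Deduplication(items):
    seen = set()
    unique = []
    for item in items:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique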
Example #5
#import movie_url
#movie_url.get_movie_url(number=120)

import file_operation
import Movie
import connect_mysql

url_list = file_operation.read_file('movie_url.txt')
path = 'movie_url.txt'
n = 0
for url in url_list:
    # Scrape the metadata and the short comments for each movie page.
    soup = Movie.get_url_html_soup(url)
    movie_id = Movie.get_movie_id(url)
    title = Movie.get_movie_title(soup)
    director = Movie.get_movie_directors(soup)
    screenwriter = Movie.get_movie_screenwriter(soup)
    character = Movie.get_movie_character(soup)
    movie_type = Movie.get_movie_type(soup)
    country = Movie.get_movie_country(soup)
    comment = Movie.get_movie_shortcomment(url)

    connect_mysql.insert_database(movie_id, title, director, screenwriter,
                                  character, movie_type, country)
    connect_mysql.insert_comment(movie_id, comment)
    n = n + 1
    if n == 10:
        # Stop after the first 10 movies.
        break
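
# connect_mysql is a project module not shown in this example. A minimal sketch
# of what insert_database() might look like, assuming a pymysql connection; the
# connection settings, table name, and column names here are illustrative
# assumptions, not the original code:
import pymysql


def insert_database(movie_id, title, director, screenwriter, character,
                    movie_type, country):
    conn = pymysql.connect(host="localhost", user="root", password="",
                           database="movies", charset="utf8mb4")
    try:
        with conn.cursor() as cursor:
            cursor.execute(
                "INSERT INTO movie (id, title, director, screenwriter, "
                "`character`, type, country) VALUES (%s, %s, %s, %s, %s, %s, %s)",
                (movie_id, title, director, screenwriter, character,
                 movie_type, country))
        conn.commit()
    finally:
        conn.close()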
Example #6
import random
import file_operation

def get_proxy():
    # Pick a random proxy from ip.txt and build a requests-style proxies dict.
    ip = file_operation.read_file('ip.txt')
    IP = random.choice(ip)
    proxies = {'http': 'http://' + IP, 'https': 'https://' + IP}
    return proxies
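
# Usage sketch (an illustrative assumption, not part of the original script):
# the proxies dict returned by get_proxy() matches the format expected by the
# requests library, so it can be passed straight to requests.get():
import requests


def fetch_with_proxy(url):
    # Route the request through a randomly chosen proxy from ip.txt.
    return requests.get(url, proxies=get_proxy(), timeout=10)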
#
# url_list = read_file('Doc_processing/additional_articles_links.txt')
# res = []
# K = keywords_extraction(url_list)
# K = Deduplication(K)
# for k in K:
#     k = k.lower()
#     k = k.strip()
#     if k not in res:
#         if k not in keywords:
#             res.append(k)
# print(len(res))
# write_file('Doc_processing/additional_keywords.txt',res)

# Step 2: deduplication of new keywords and extraction of articles
new_keywords = read_file('Doc_processing/additional_keywords.txt')
print(len(new_keywords))
un_relevant = ['algorithm','learning','data','design','calculation','neural network',
               'model','simulation','structure','cluster','regression','system','prediction',
               'throughput','theory','analysis','monte carlo','function','pca','comput','equation',
               'lead','feature extraction','technique','loop','interface','software','matrix',
               'network','drying','thermodynamics','monte-carlo','method','popcorn failure',
               'statistics','coefficient','classification','estimation','sampling',
               'modul','search','k-points','probability','probabilistic','dft','software','matlab',
               'eulerian','first-principles','gga','first principles','experiments','approach',
               'mbj','lsda','strategy','rbfnns','lda','gw','lmto','aim','dna','gpu','pbe',
               'bte','fea','test','rdf','cpa','grain','program','cpu','measurement','newton','negf']
# Drop any keyword that contains an irrelevant term (filter into a new list
# rather than removing items from new_keywords while iterating over it).
new_keywords = [n for n in new_keywords
                if not any(v in n for v in un_relevant)]