def words_count(file):
    with codecs.open(file, 'r', 'UTF-8', 'ignore') as file_in:
        sentence = file_in.read()
        words = Wakati.words_list(sentence)  # all words
        # words = Wakati.words_list_select(sentence)  # selected parts of speech only
        word_count = len(words)
        return word_count
def corpus_to_dictionary(corpus):
    dictionary = {}
    docs = [read_document(x) for x in corpus]
    # zip() has no length, so pass total= explicitly for a proper progress bar
    for doc, name in tqdm(zip(docs, corpus), total=len(corpus)):
        words = Wakati.words_list(doc)
        dictionary[name] = words
    return dictionary
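# Hypothetical usage sketch (not from the original code): map each corpus file
# to its token list. read_document and the directory path are assumptions; the
# file list comes from File_operation as elsewhere in this module.
def demo_corpus_to_dictionary():
    corpus = File_operation.get_all_paths("corpus_dir")  # hypothetical directory
    token_dict = corpus_to_dictionary(corpus)
    print(str(len(token_dict)) + " documents tokenized")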
def vocabs_count(file):
    with codecs.open(file, 'r', 'UTF-8', 'ignore') as file_in:
        sentence = file_in.read()
        words = Wakati.words_list(sentence)  # all words
        # words = Wakati.words_list_select(sentence)  # selected parts of speech only
        counter = Counter(words)
        vocab_count = len(counter)
        return vocab_count
def doc2vec_sim_unknow(model, Target, topn):
    print("「" + Target + "(未知文書)の類似度」")  # "similarity of Target (unseen document)"
    with open(Target, 'r', encoding='UTF-8') as file_in:
        text = file_in.read()
    words_list = Wakati.words_list(text)
    vector = model.infer_vector(words_list)
    # use the topn argument instead of the hard-coded 20 that ignored it
    sims = model.docvecs.most_similar([vector], topn=topn)
    return sims
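# Minimal usage sketch (not from the original code): infer a vector for an
# unseen document and list its nearest training documents. The model file and
# target path below are hypothetical placeholders.
def demo_sim_unknown():
    model = models.Doc2Vec.load("doc2vec.model")  # hypothetical model path
    sims = doc2vec_sim_unknow(model, "target.txt", topn=10)  # hypothetical document
    for name, score in sims:
        print(name, score)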
def words_vocab(dir):
    filelists = File_operation.get_all_paths(dir)
    print("ファイル数:" + str(len(filelists)))  # number of files
    totalwords = []
    for file in filelists:
        with codecs.open(file, 'r', 'UTF-8', 'ignore') as file_in:
            sentence = file_in.read()
            totalwords.extend(Wakati.words_list(sentence))
    counter = Counter(totalwords)
    vocab_count = len(counter)
    return vocab_count
def _infer_from_file(model, path):
    """Read a file, tokenize it, and infer its document vector."""
    with open(path, 'r', encoding='UTF-8') as file_in:
        text = file_in.read()
    return model.infer_vector(Wakati.words_list(text))

def doc2vec_cal_unknown(model, pos1, neg1, pos2, topn):
    pos1_word = pos1.split("\\")[-1]
    neg1_word = neg1.split("\\")[-1]
    pos2_word = pos2.split("\\")[-1]
    print("「" + pos1_word + "-" + neg1_word + "+" + pos2_word + "(未知文書)の演算」")  # "vector arithmetic on unseen documents"
    pos1_vec = _infer_from_file(model, pos1)
    neg1_vec = _infer_from_file(model, neg1)
    pos2_vec = _infer_from_file(model, pos2)
    sims = model.docvecs.most_similar(positive=[pos1_vec, pos2_vec], negative=[neg1_vec], topn=topn)
    return sims
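# Hypothetical usage sketch for the document-vector arithmetic above
# (pos1 - neg1 + pos2); the model file and document paths are placeholders,
# not from the original code.
def demo_cal_unknown():
    model = models.Doc2Vec.load("doc2vec.model")  # hypothetical model path
    sims = doc2vec_cal_unknown(model, "pos1.txt", "neg1.txt", "pos2.txt", topn=10)
    for name, score in sims:
        print(name, score)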
def all_count(file):
    with codecs.open(file, 'r', 'UTF-8', 'ignore') as file_in:
        sentence = file_in.read()
    sentences = sentence.splitlines()  # read once instead of opening the file twice
    words = Wakati.words_list(sentence)  # all words
    # words = Wakati.words_list_select(sentence)  # selected parts of speech only
    word_count = len(words)
    counter = Counter(words)
    vocab_count = len(counter)
    sentence_count = len(sentences)
    return word_count, vocab_count, sentence_count
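# Hypothetical usage sketch: report all three statistics for one corpus file
# ("corpus.txt" is a placeholder path).
def demo_all_count():
    word_count, vocab_count, sentence_count = all_count("corpus.txt")
    print("words:", word_count, "vocab:", vocab_count, "sentences:", sentence_count)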
def concate_wakati(INPUT_DIR, OUTPUT_DIR, name):
    lists = get_all_paths(INPUT_DIR)
    with open(os.path.join(OUTPUT_DIR, name), 'w', encoding='UTF-8') as file_out:
        print("総ファイル数:" + str(len(lists)))  # total number of files
        for file in tqdm(lists):
            with open(file, 'r', encoding='UTF-8-sig') as file_in:
                lines = file_in.readlines()
                for line in lines:
                    text = Wakati.wakati(line)
                    # drop the trailing newline only on the very last line of the last file
                    if line == lines[-1] and file == lists[-1]:
                        text = text.replace("\n", "")
                    file_out.write(text)
for i, file in enumerate(tweets_paths):
    try:
        fi = codecs.open(file, 'r', 'utf8')  # was tweets_paths[i - 1]: an off-by-one that read the wrong file
        tweet_datas = json.load(fi)
        print(str(i + 1) + "×" + str(len(tweet_datas)) + "Tweets")
    except Exception:
        continue  # skip unreadable files instead of silently reusing the previous file's data
    for tweet_data in tweet_datas:
        text = tweet_data["text"].replace("\r", "").replace("\n", "")
        f_txt.write(text + "\n")
        text = text.replace(",", "")
        # delete strings & split into words
        text = Delete.delete_twitter(text)  # delete strings
        text = Wakati.wakati(text)  # wakati tokenization
        f_pre.write(text)
        with open(os.path.join(save_dir_corpus_koko, save_dir_name + "_pre_" + str(filenumber) + ".txt"), 'w', encoding='UTF-8') as file_koko:  # preprocessed (individual files)
            file_koko.write(text)
        filenumber += 1
    fi.close()
f_txt.close()
f_pre.close()
import sys
import os
import codecs
import pickle

from tqdm import tqdm
from gensim import models

sys.path.append("..")
from Preprocessing import File_operation
from Preprocessing import Wakati

"""Load the model"""
INPUT_MODEL = XXXXXXXXXX
model = models.Doc2Vec.load(INPUT_MODEL)

"""Input and output locations"""
keyword = XXXXXXXXXX
INPUT_DIR = XXXXXXXXXX
OUTPUT_DIR = XXXXXXXXXX
filelists = File_operation.get_all_paths(INPUT_DIR)

"""Vectorize"""
dictionary = {}
for file in tqdm(filelists):
    title = file.split("\\")[-1]
    with codecs.open(file, 'r', 'UTF-8', 'ignore') as file_in:
        sentence = file_in.read()
        words = Wakati.words_list(sentence)
        vector = model.infer_vector(words)
        dictionary[title] = vector

"""Save to a pickle file"""
with open(os.path.join(OUTPUT_DIR, keyword + ".pkl"), mode='wb') as f:
    pickle.dump(dictionary, f)
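# Hypothetical sketch of reading the saved vectors back and comparing two titles
# by cosine similarity; the numpy import and the .pkl path are assumptions, not
# part of the original script.
import numpy as np

def load_vectors(pkl_path):
    """Load the {title: vector} dictionary saved above."""
    with open(pkl_path, mode='rb') as f:
        return pickle.load(f)

def cosine_similarity(a, b):
    """Cosine similarity between two 1-D document vectors."""
    return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))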