import linecache

import gensim
import pandas as pd
from gensim.models import Word2Vec

# `configuration`, `Essay` and `glove_to_word2vec` are project-local helpers,
# assumed to be importable from elsewhere in this repo.


def test():
    glove_vector_model_path = configuration.get_config('glove_vector_model_path')
    print(glove_vector_model_path)
    word_vector_model_path = configuration.get_config('word_vector_model_path')
    print(word_vector_model_path)
    glove_to_word2vec(glove_vector_model_path, word_vector_model_path)

def handle_essay2sentences():
    # 0. Read the essays from the CSV file.
    # Columns: [id, author, result, content, feature, title, url]
    path = configuration.get_config('sentences_path')
    content = pd.read_csv(path, encoding='gb18030', usecols=['content'], iterator=True)
    index = 0
    chunk_size = 20
    chunks = []
    loop = True
    while loop:
        try:
            chunk = content.get_chunk(chunk_size)
            start_index = index * chunk_size
            for i in range(chunk.size):
                essay = chunk.at[start_index + i, "content"]
                if isinstance(essay, str):
                    chunks.append(essay)
                    # sentences = cut_sentences(essay)
                    save_sentence([essay])
            index += 1
            if index == 2:  # debug limiter: stop after the first two chunks
                break
        except StopIteration:
            print("read finished.")
            loop = False
    # Split each essay into sentences on the Chinese full stop, then split
    # the fragments again on commas.
    sentences_temp = []
    for chunk in chunks:
        sentences_temp += str(chunk).split("。")
    sentences = []
    for sentence in sentences_temp:
        sentences += sentence.split(",")
    return sentences

def save_sentences_embedding(sentences: list, embedding: list):
    path = configuration.get_config('embedding_path')
    lines = []
    for s, vector in zip(sentences, embedding):
        vector_str = " ".join(str(x) for x in vector)
        lines.append(s + " " + vector_str + "\n")
    with open(path, 'w', encoding='utf-8') as output:
        output.writelines(lines)

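# A minimal usage sketch for save_sentences_embedding. The sentences and the
# 3-dimensional vectors are made-up placeholders; real embeddings would come
# from whatever sentence-embedding step produced them.
def _demo_save_embedding():
    sentences = ["今天天气很好", "我们去公园散步"]
    embedding = [[0.12, -0.08, 0.33], [0.05, 0.21, -0.14]]
    save_sentences_embedding(sentences, embedding)
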
def load_word_vector_model(path=None) -> Word2Vec:
    """
    Load a trained word-vector model.

    Returns
    -------
    The loaded model.
    """
    if path is None:
        path = configuration.get_config('word_vector_model_path')
    print("Loading word vectors from: " + path)
    # Load a word2vec model saved in gensim's binary format.
    word_embedding = gensim.models.Word2Vec.load(path)
    # To load GloVe vectors converted to word2vec text format instead:
    # word_embedding = KeyedVectors.load_word2vec_format(path)
    return word_embedding

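# A minimal lookup sketch. "天气" is just an illustrative query word; whether
# it exists depends on the trained vocabulary, so the access is guarded.
def _demo_word_lookup():
    model = load_word_vector_model()
    word = "天气"
    if word in model.wv:
        print(word, model.wv[word][:5])  # first five vector components
    else:
        print(word + " is not in the vocabulary")
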
def get_essays() -> list:
    essays_path = configuration.get_config('essays_path')
    contents = pd.read_csv(essays_path, encoding='gb18030', usecols=['content', 'title'])
    essays = []
    for _, row in contents.iterrows():
        # Guard against missing cells: pandas yields NaN (a float) for them.
        title = row['title'].strip() if isinstance(row['title'], str) else ''
        content = row['content'].strip() if isinstance(row['content'], str) else ''
        essays.append(Essay(title=title, content=content))
    print("Number of essays loaded: " + str(len(essays)))
    return essays

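# A tiny sketch, assuming Essay exposes the `title` attribute it is
# constructed with: load all essays and peek at the first title.
def _demo_first_essay():
    essays = get_essays()
    if essays:
        print(essays[0].title)
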
def get_words_frequency_dict():
    frequency_file = configuration.get_config('frequency_file')
    print("loading word frequency file.")
    word2weight = {}
    with open(frequency_file, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            parts = line.split()
            if len(parts) == 2:
                word2weight[parts[0]] = float(parts[1])
            else:
                # Malformed line: expected exactly "word frequency".
                print(parts)
    return word2weight

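# The frequency file is expected to hold one whitespace-separated
# "word frequency" pair per line, e.g. (illustrative values only):
#
#     的 123456
#     天气 789
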
def get_sentences(start: int, size: int, is_all: bool):
    path = configuration.get_config("processed_sentences")
    if is_all:
        with open(path, encoding='utf-8') as reader:
            sentences_all = reader.read().split("\n")
        return [sen for sen in sentences_all if len(sen) > 0]
    sentences = []
    if start <= 0 or size <= 0:  # linecache line numbers are 1-based
        print("Invalid arguments, please check: start={}, size={}".format(start, size))
        return sentences
    linecache.clearcache()
    for index in range(size):
        sen = linecache.getline(path, start + index)
        if len(sen.strip()) > 1:
            sentences.append(sen.replace("\n", ""))
    return sentences

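# A paging sketch over the processed-sentence file; the page size of 100 is
# an arbitrary choice. Note it stops at the first page that comes back empty.
def _demo_read_in_pages(page_size: int = 100):
    page = 0
    while True:
        batch = get_sentences(page * page_size + 1, page_size, is_all=False)
        if not batch:
            break
        print("page {}: {} sentences".format(page, len(batch)))
        page += 1
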
def save_sentence(sentences: list):
    path = configuration.get_config("processed_sentences")
    with open(path, 'a+', encoding='utf-8') as output:
        # writelines() adds no separators itself, so make sure every sentence
        # lands on its own line for the line-based readers above.
        output.writelines(s if s.endswith("\n") else s + "\n" for s in sentences)

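if __name__ == '__main__':
    # A hedged end-to-end sketch, assuming the configured CSV and model paths
    # exist: split essays into sentences, persist them, then reload a slice
    # and the trained word vectors.
    split = handle_essay2sentences()
    print("split into {} sentences".format(len(split)))
    print(get_sentences(1, 10, is_all=False))
    model = load_word_vector_model()
    print("vector size: {}".format(model.vector_size))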