Code example #1
import configuration  # project-local config module

def test():
    # Convert a GloVe-format vector file into word2vec format.
    # glove_to_word2vec is a project-local helper defined elsewhere.
    glove_vector_model_path = configuration.get_config('glove_vector_model_path')
    print(glove_vector_model_path)
    word_vector_model_path = configuration.get_config('word_vector_model_path')
    print(word_vector_model_path)
    glove_to_word2vec(glove_vector_model_path, word_vector_model_path)
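For reference, a plausible sketch of the glove_to_word2vec helper called above, assuming it wraps gensim's bundled converter (available in gensim 3.x; the project's actual implementation may differ):

from gensim.scripts.glove2word2vec import glove2word2vec

def glove_to_word2vec(glove_path, word2vec_path):
    # Prepend the "<vocab_size> <vector_dim>" header that the word2vec
    # text format requires; the vectors themselves are copied unchanged.
    glove2word2vec(glove_path, word2vec_path)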
Code example #2
import pandas as pd

import configuration  # project-local config module

def handle_essay2sentences():
    # 0. Read the essays from file.
    # Columns: [id, author, result, content, feature, title, url]
    path = configuration.get_config('sentences_path')
    content = pd.read_csv(path, encoding='gb18030', usecols=['content'], iterator=True)
    index = 0
    chunk_size = 20
    chunks = []
    loop = True
    while loop:
        try:
            chunk = content.get_chunk(chunk_size)
            start_index = index * chunk_size
            # len(chunk) is the row count; row labels continue from the
            # previous chunk, so the global offset addresses each cell.
            for i in range(len(chunk)):
                essay = chunk.at[start_index + i, "content"]
                if isinstance(essay, str):
                    chunks.append(essay)
                    # sentences = cut_sentences(essay)
                    save_sentence([essay])
            index += 1
            if index == 2:
                break  # debug cap: stop after the first two chunks
        except StopIteration:
            print("finished reading.")
            loop = False
    # Split each essay into sentences: first on the full-width Chinese
    # full stop, then on the ASCII comma.
    sentences_temp = []
    for chunk in chunks:
        sentences_temp += str(chunk).split("。")
    sentences = []
    for sentence in sentences_temp:
        sentences += sentence.split(",")
    return sentences
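A quick illustration of the two-stage split at the end of handle_essay2sentences, on a hypothetical essay (note the code splits on the full-width 「。」 but on the ASCII comma):

essay = "今天天气很好。我们去了公园,拍了很多照片。"
pieces = []
for part in essay.split("。"):
    pieces += part.split(",")
print([p for p in pieces if p])
# -> ['今天天气很好', '我们去了公园', '拍了很多照片']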
Code example #3
import configuration  # project-local config module

def save_sentences_embedding(sentences: list, embedding: list):
    # Write one "<sentence> <v1> <v2> ..." line per sentence.
    path = configuration.get_config('embedding_path')
    with open(path, 'w', encoding='utf-8') as output:
        lines = []
        for s, vector in zip(sentences, embedding):
            vector_str = " ".join(str(x) for x in vector)
            lines.append(s + " " + vector_str + "\n")
        output.writelines(lines)
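A hypothetical call, with two sentences and 3-dimensional vectors:

save_sentences_embedding(
    ["第一句", "第二句"],
    [[0.1, 0.2, 0.3], [0.4, 0.5, 0.6]],
)
# Each line in the output file then reads like: 第一句 0.1 0.2 0.3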
Code example #4
import gensim
from gensim.models import Word2Vec

import configuration  # project-local config module

def load_word_vector_model(path=None) -> Word2Vec:
    """Load a previously trained word-vector model.

    Returns
    -------
        The loaded model.
    """
    if path is None:
        path = configuration.get_config('word_vector_model_path')
    print("Loading word vectors from: " + path)
    # Load a word2vec model (saved in binary form).
    word_embedding = gensim.models.Word2Vec.load(path)
    # For a GloVe-converted model (saved as text), use instead:
    # word_embedding = gensim.models.KeyedVectors.load_word2vec_format(path)
    return word_embedding
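Typical use of the loaded model through gensim's .wv interface (the query word here is hypothetical):

model = load_word_vector_model()
vector = model.wv["天气"]                        # the word's embedding vector
similar = model.wv.most_similar("天气", topn=5)  # nearest neighbours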
Code example #5
import pandas as pd

import configuration  # project-local config module

def get_essays() -> list:
    essays_path = configuration.get_config('essays_path')
    contents = pd.read_csv(essays_path, encoding='gb18030', usecols=['content', 'title'])
    essays = []
    for _, row in contents.iterrows():
        # Guard against NaN cells before stripping; casting NaN with str()
        # would produce the literal string "nan".
        title = row['title'].strip() if isinstance(row['title'], str) else ''
        content = row['content'].strip() if isinstance(row['content'], str) else ''
        essays.append(Essay(title=title, content=content))  # Essay: project-local class
    print("Number of essays loaded: " + str(len(essays)))
    return essays
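A short usage sketch, assuming Essay exposes the title and content it was constructed with:

essays = get_essays()
for essay in essays[:3]:  # peek at the first few
    print(essay.title, len(essay.content))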
Code example #6
import configuration  # project-local config module

def get_words_frequency_dict():
    frequency_file = configuration.get_config('frequency_file')
    print("load word frequency file.")
    word2weight = {}
    with open(frequency_file, encoding='utf-8') as f:
        for line in f:  # stream the file instead of loading it whole
            line = line.strip()
            if not line:
                continue
            parts = line.split()
            if len(parts) == 2:
                word2weight[parts[0]] = float(parts[1])
            else:
                print(parts)  # malformed line: report and skip
    return word2weight
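The frequency file is expected to hold one word and weight pair per line; hypothetical contents and lookup:

# frequency file (one pair per line):
#   天气 0.00021
#   公园 0.00008
word2weight = get_words_frequency_dict()
print(word2weight.get("天气"))  # -> 0.00021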
Code example #7
import linecache

import configuration  # project-local config module

def get_sentences(start: int, size: int, is_all: bool):
    path = configuration.get_config("processed_sentences")
    if is_all:
        # Return every non-empty line of the file.
        with open(path, encoding='utf-8') as reader:
            sentences_all = reader.read().split("\n")
            return [sen for sen in sentences_all if len(sen) > 0]
    sentences = []
    if start < 0 or size <= 0:
        print("Invalid arguments, please check: start={}, size={}".format(start, size))
        return sentences
    linecache.clearcache()
    for index in range(size):
        # linecache.getline() uses 1-based line numbers.
        sen = linecache.getline(path, start + index)
        if len(sen.strip()) > 1:
            sentences.append(sen.replace("\n", ""))
    return sentences
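The two read modes in use (line numbers are 1-based in the paged mode):

all_sentences = get_sentences(0, 0, is_all=True)  # whole file, start/size ignored
page = get_sentences(1, 20, is_all=False)         # lines 1 through 20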
Code example #8
import configuration  # project-local config module

def save_sentence(sentences: list):
    path = configuration.get_config("processed_sentences")
    with open(path, 'a', encoding='utf-8') as output:
        # writelines() adds no separators, so append a newline to keep
        # one sentence per line, the format get_sentences reads back.
        output.writelines(s + "\n" for s in sentences)
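A round trip with the reader above (hypothetical sentences):

save_sentence(["第一句", "第二句"])
print(get_sentences(1, 2, is_all=False))  # -> ['第一句', '第二句']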