Example #1
def get_embedding(source):
    # weight = alpha / (alpha + p), where p is the word's relative frequency
    # alpha is a smoothing parameter, typically 1e-3 ~ 1e-5
    alpha = 1e-4

    if os.path.exists(words_frequence_path):
        with open(words_frequence_path, 'rb') as f:
            frequence = pickle.load(f)
    else:
        from word2vec.save_words_frequence import save_words_frequence
        save_words_frequence(words_frequence_path)
        with open(words_frequence_path, 'rb') as f:
            frequence = pickle.load(f)

    max_fre = max(frequence.values())
    words = cut(''.join(token(source)))
    word2vec = get_word2vec(word2vec_path)
    embedding = np.zeros_like(word2vec.wv['测试'])

    # keep only words that the word2vec model knows
    words = [w for w in words if w in word2vec.wv.vocab]

    for w in words:
        weight = alpha / (alpha + frequence.get(w, max_fre))
        embedding += weight * word2vec.wv[w]

    if words:
        embedding /= len(words)

    return embedding
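The weight above down-weights frequent words and leaves rare words with a weight close to 1, so the averaged embedding is dominated by informative words. A minimal, self-contained illustration with made-up relative frequencies (no helpers from the snippet are needed):

alpha = 1e-4
frequence = {'的': 0.05, '新闻': 0.002, '量子计算': 0.00001}  # made-up relative frequencies
for w, p in frequence.items():
    # common words (large p) get weights near 0, rare words get weights near 1
    print(w, alpha / (alpha + p))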
Example #2
 def name(self):
     if self.type == 0:
         if not self._args:
             raise Exception('`table` lack of args')
         return self._args[0]
     if self._name:
         return self._name
     self._name = token()
     return self._name
Example #3
    def set_tokens(self, line):
        text = ''

        counter = 0

        while counter <= len(line) - 1:

            w = line[counter]

            if w in [' ', '\t']:
                text = ''
                counter += 1
                continue

            text += w
            # print(w, ' ', text)

            if text in grammer.keyword:
                tok = util.token(text, 'keyword')
                self.tokens.append(tok)
                text = ''

            if text == ' ':
                text = ''

            if text in grammer.assignment:
                tok = util.token(text, 'assign')
                self.tokens.append(tok)
                text = ''

            if text in ["\'", '\"']:
                callback = self.string_action(line, counter)
                counter = callback['counter']
                text = callback['string']

                tok = util.Token(text, 'string')
                self.tokens.append(tok)
                text = ''
                continue

            counter += 1
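Example #3 depends on grammer.keyword, grammer.assignment, util.token and self.string_action, none of which are shown. A rough, self-contained sketch of the same scan-and-match pattern, with hypothetical keyword and assignment sets and plain (kind, text) tuples standing in for util.token (identifiers are still not emitted, as in the original):

KEYWORDS = {'if', 'else', 'while'}     # hypothetical stand-in for grammer.keyword
ASSIGNMENTS = {'=', '+=', '-='}        # hypothetical stand-in for grammer.assignment

def set_tokens(line):
    tokens, text, counter = [], '', 0
    while counter < len(line):
        w = line[counter]
        if w in (' ', '\t'):           # whitespace ends the current lexeme
            text = ''
            counter += 1
            continue
        text += w
        if text in KEYWORDS:
            tokens.append(('keyword', text))
            text = ''
        elif text in ASSIGNMENTS:
            tokens.append(('assign', text))
            text = ''
        elif text in ("'", '"'):       # scan ahead; assumes the string closes on this line
            end = line.index(text, counter + 1)
            tokens.append(('string', line[counter + 1:end]))
            text, counter = '', end
        counter += 1
    return tokens

print(set_tokens("if x = 'hi'"))       # [('keyword', 'if'), ('assign', '='), ('string', 'hi')]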
Example #4
 def __init__(self):
     super().__init__("postgres")
     self.config["ADMIN_PANEL_PASSWORD"] = token()
     self.config["ADMIN_DB_PASSWORD"] = token()
     self.config["GITEA_DB_PASSWORD"] = token()
     self.config["COURSE_DB_PASSWORD"] = token()
     self.config["AUTH_DB_PASSWORD"] = token()
     self.config["CHAT_DB_PASSWORD"] = token()
Example #5
def get_sentence_cos(original_text, title):
    stop_words, dictionary, lda = get_model()
    sentences = split_sentence(original_text)
    if sentences == []:
        raise NameError
    sentences_cos = {}

    if title:
        original_text += title
    content_ndarray = get_ndarray(cut(''.join(token(original_text))),
                                  stop_words, dictionary, lda)

    for i, sentence in enumerate(sentences):
        sentences_cos[i] = get_cos_with_content(sentence, content_ndarray,
                                                stop_words, dictionary, lda)

    return sentences, sentences_cos
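The get_ndarray helper used here is not included in these examples. One plausible shape for it, assuming it maps a cut (space-separated) sentence onto a dense LDA topic distribution; the toy dictionary, corpus and model below are purely illustrative:

import numpy as np
from gensim.corpora import Dictionary
from gensim.models import LdaModel

def get_ndarray(sentence_cut, stop_words, dictionary, lda):
    # hypothetical sketch of the unshown helper: cut sentence -> topic-probability vector
    words = [w for w in sentence_cut.split() if w not in stop_words]
    bow = dictionary.doc2bow(words)
    vec = np.zeros(lda.num_topics)
    for topic_id, prob in lda.get_document_topics(bow, minimum_probability=0):
        vec[topic_id] = prob
    return vec

texts = [['天气', '晴朗'], ['股市', '上涨'], ['天气', '下雨']]   # toy corpus
dictionary = Dictionary(texts)
lda = LdaModel([dictionary.doc2bow(t) for t in texts], id2word=dictionary, num_topics=2)
print(get_ndarray('天气 很 好', {'很'}, dictionary, lda))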
Example #6
def save_words_frequence(corpus_path):
    news_content = pd.read_csv(news_path, encoding='gb18030')
    news_content.dropna(subset=['content'], inplace=True)
    news_content.drop_duplicates(subset=['content'], inplace=True)
    news_content_cut = [token(n) for n in news_content['content']]
    news_content_cut = [''.join(n) for n in news_content_cut]
    news_content_cut = [cut(n) for n in news_content_cut]

    words = []
    for document in news_content_cut:
        words += [w for w in document.split()]

    if os.path.exists(corpus_path):
        print('文件已存在, 请勿重复写入')  # the file already exists; skip rewriting it
    else:
        with open(corpus_path, 'w', encoding='utf-8') as f:
            for sent_cut in news_content_cut:
                f.write(sent_cut)
                f.write('\n')
Example #7
def save_words_frequence(words_frequence_path):
    news_content = pd.read_csv(news_path, encoding='gb18030')
    news_content.dropna(subset=['content'], inplace=True)
    news_content.drop_duplicates(subset=['content'], inplace=True)
    news_content_cut = [token(n) for n in news_content['content']]
    news_content_cut = [''.join(n) for n in news_content_cut]
    news_content_cut = [cut(n) for n in news_content_cut]


    words = []
    for document in news_content_cut:
        words += [w for w in document.split()]

    words_counter = Counter(words)
    frequence = {w:count/len(words) for w, count in words_counter.items()}

    if os.path.exists(words_frequence_path):
        print('文件已存在, 请勿重复写入')  # the file already exists; skip rewriting it
    else:
        with open(words_frequence_path, 'wb') as f:
            pickle.dump(frequence, f)
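A tiny self-contained illustration of the Counter-to-relative-frequency step with a made-up word list; the resulting frequence dict is what Example #1 later loads from the pickle file:

from collections import Counter

words = '今天 天气 很 好 今天 上班'.split()   # stand-in for the cut news corpus
words_counter = Counter(words)
frequence = {w: count / len(words) for w, count in words_counter.items()}
print(frequence['今天'])   # 2 / 6 ≈ 0.333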
Example #8
    currword = ""
    ch = getchar()
    if not ch:
        break
    currword += ch

    if ch in [str(x) for x in range(10)]:
        while True:
            ch = getchar()
            if not ch:
                break
            if ch not in [str(x) for x in range(10)]:
                pushchar(ch)
                break
            currword += ch
        tokens.append(util.token("NUMBER", currword, linenum))

    elif currword in OPERATORS:
        while True:
            ch = getchar()
            if not ch:
                break
            if ch not in "".join(OPERATORS):
                pushchar(ch)
                break
            currword += ch
        tokens.append(util.token("OPERATOR", currword, linenum))

    elif currword == '"': 
        ending = currword
        while True:
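The fragment stops inside the string branch, and getchar()/pushchar() are not shown. A self-contained sketch of just the number-scanning branch, with a hypothetical CharStream standing in for those two helpers:

class CharStream:
    # hypothetical stand-in for the getchar()/pushchar() used by the fragment
    def __init__(self, text):
        self.chars = list(text)
    def getchar(self):
        return self.chars.pop(0) if self.chars else ''
    def pushchar(self, ch):
        self.chars.insert(0, ch)

stream = CharStream("123+45")
tokens = []
ch = stream.getchar()
currword = ch
if ch.isdigit():                      # same idea as `ch in [str(x) for x in range(10)]`
    while True:
        ch = stream.getchar()
        if not ch:
            break
        if not ch.isdigit():
            stream.pushchar(ch)       # put back the first non-digit character
            break
        currword += ch
    tokens.append(("NUMBER", currword))
print(tokens)                          # [('NUMBER', '123')]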
Example #9
 def table(self):
     if isinstance(self.query['table'], basestring):
         return Table(self.query['table'])
     if not self.query['table'] and isinstance(self._bindobj, Query):
         return Table(self._bindobj._alias or token())
     return clause(self.query['table'])
Example #10
 def run(self):  # pylint: disable=arguments-differ
     print(f"Starting up. Using YoutubeDL [{ydl_version}]")
     super().run(token())
Example #11
def get_cos_with_content(sentence, content_ndarray, stop_words, dictionary,
                         lda):
    sentence_ndarray = get_ndarray(cut(''.join(token(sentence))), stop_words,
                                   dictionary, lda)

    return cosine(sentence_ndarray, content_ndarray)
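The cosine helper is not defined in these snippets. If it is scipy.spatial.distance.cosine, it returns a distance (1 - similarity), so lower values mean the sentence is closer to the document vector; a quick check:

import numpy as np
from scipy.spatial.distance import cosine

a = np.array([1.0, 2.0, 3.0])
b = np.array([1.0, 2.0, 2.5])
print(cosine(a, b))        # distance: 0 means same direction
print(1 - cosine(a, b))    # the corresponding cosine similarity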
Example #12
 def __init__(self):
     super().__init__("auth")
     self.config["AUTH_FLASK_SECRET_KEY"] = token()
Example #13
def get_sentence_cos(original_text, title):
    word2vec = get_word2vec(word2vec_path)
    stop_words = get_stop_words(stopwords_path)
    sentences = split_sentence(original_text)
    if sentences == []:
        raise NameError
    sentences_cut = [cut(''.join(token(n))) for n in sentences]
    sentences_cut_del_stopwords = []

    is_title = False
    # handle the title
    if title:
        title_cut = [cut(''.join(token(title)))]
        words = title_cut[0].split()
        title_cut_del_stopwords = list(set(words) - set(stop_words))
        if title_cut_del_stopwords != []:
            is_title = True

    for s in sentences_cut:
        words = s.split()
        sentence_cut_del_stopwords = list(set(words) - set(stop_words))
        if sentence_cut_del_stopwords != []:
            sentences_cut_del_stopwords.append(sentence_cut_del_stopwords)

    if sentences_cut_del_stopwords == []:
        raise NameError

    # build the document (text) vector and the sentence vectors
    sentences_vec = []
    additional_wordvec = {}
    text_vec = np.zeros_like(word2vec.wv['测试'])
    for i, sentence in enumerate(sentences_cut_del_stopwords):
        sentence_vec = np.zeros_like(word2vec.wv['测试'])
        for word in sentence:
            if word in word2vec.wv.vocab:
                sentence_vec += word2vec.wv[word]
            elif word in additional_wordvec:
                sentence_vec += additional_wordvec[word]
            else:
                # out-of-vocabulary words get a cached random vector
                additional_wordvec[word] = np.random.random(
                    word2vec.wv['测试'].shape)
                sentence_vec += additional_wordvec[word]
        sentence_vec = sentence_vec / len(sentence)
        # the first sentence matters more: count it three times
        if i == 0:
            text_vec += sentence_vec * 3
        else:
            text_vec += sentence_vec
        sentences_vec.append(sentence_vec)

    if is_title:
        title_vec = np.zeros_like(word2vec.wv['测试'])
        for word in title_cut_del_stopwords:
            if word in word2vec.wv.vocab:
                title_vec += word2vec.wv[word]
            elif word in additional_wordvec:
                title_vec += additional_wordvec[word]
            else:
                additional_wordvec[word] = np.random.random(
                    word2vec.wv['测试'].shape)
                title_vec += additional_wordvec[word]
        # count the title three times
        text_vec += title_vec * 3

    # the first sentence was counted two extra times and the title three times
    text_vec /= len(sentences) + 5

    # cosine between each sentence vector and the document vector
    sentences_cos = {}
    for i, sentence_vec in enumerate(sentences_vec):
        sentences_cos[i] = cosine(sentence_vec, text_vec)
    return sentences, sentences_cos
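The step that turns these scores into a summary is not part of the snippets. Assuming higher scores mean a sentence is closer to the document, one way to pick the top sentences (all values below are made up):

sentences = ['第一句。', '第二句。', '第三句。']              # hypothetical split_sentence output
sentences_cos = {0: 0.91, 1: 0.40, 2: 0.75}                  # hypothetical scores
top = sorted(sentences_cos, key=sentences_cos.get, reverse=True)[:2]
summary = ''.join(sentences[i] for i in sorted(top))          # keep the original sentence order
print(summary)                                                # 第一句。第三句。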