def get_embedding(source):
    # SIF-style weighting: weight = alpha / (alpha + p)
    # alpha is a smoothing parameter, typically 1e-3 ~ 1e-5
    alpha = 1e-4
    if os.path.exists(words_frequence_path):
        with open(words_frequence_path, 'rb') as f:
            frequence = pickle.load(f)
    else:
        from word2vec.save_words_frequence import save_words_frequence
        save_words_frequence(words_frequence_path)
        with open(words_frequence_path, 'rb') as f:
            frequence = pickle.load(f)
    max_fre = max(frequence.values())
    words = cut(''.join(token(source))).split()
    word2vec = get_word2vec(word2vec_path)
    embedding = np.zeros_like(word2vec.wv['测试'])
    words = [w for w in words if w in word2vec.wv]
    if not words:
        raise ValueError('no in-vocabulary words in source')
    for w in words:
        # unknown words fall back to the maximum frequency, i.e. the smallest weight
        weight = alpha / (alpha + frequence.get(w, max_fre))
        embedding += weight * word2vec.wv[w]
    embedding /= len(words)
    return embedding
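A minimal usage sketch for get_embedding, assuming the module-level paths and helpers above (words_frequence_path, word2vec_path, cut, token, get_word2vec) are configured; the scipy import and the embedding_similarity helper are illustrative additions, not part of the original module.

# Illustrative only: compare two texts through their frequency-weighted embeddings.
from scipy.spatial.distance import cosine as cosine_distance

def embedding_similarity(text_a, text_b):
    # cosine similarity in [-1, 1]; larger means the texts are closer in embedding space
    return 1 - cosine_distance(get_embedding(text_a), get_embedding(text_b))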
def name(self):
    if self.type == 0:
        if not self._args:
            raise Exception('`table` is missing args')
        return self._args[0]
    if self._name:
        return self._name
    self._name = token()
    return self._name
def set_tokens(self, line):
    text = ''
    counter = 0
    while counter <= len(line) - 1:
        w = line[counter]
        if w in [' ', '\t']:
            text = ''
            counter += 1
            continue
        text += w
        # print(w, ' ', text)
        if text in grammer.keyword:
            tok = util.token(text, 'keyword')
            self.tokens.append(tok)
            text = ''
        if text == ' ':
            text = ''
        if text in grammer.assignment:
            tok = util.token(text, 'assign')
            self.tokens.append(tok)
            text = ''
        if text in ["\'", '\"']:
            callback = self.string_action(line, counter)
            counter = callback['counter']
            text = callback['string']
            tok = util.Token(text, 'string')
            self.tokens.append(tok)
            text = ''
            continue
        counter += 1
def __init__(self): super().__init__("postgres") self.config["ADMIN_PANEL_PASSWORD"] = token() self.config["ADMIN_DB_PASSWORD"] = token() self.config["GITEA_DB_PASSWORD"] = token() self.config["COURSE_DB_PASSWORD"] = token() self.config["AUTH_DB_PASSWORD"] = token() self.config["CHAT_DB_PASSWORD"] = token()
def get_sentence_cos(original_text, title):
    stop_words, dictionary, lda = get_model()
    sentences = split_sentence(original_text)
    if not sentences:
        raise NameError
    sentences_cos = {}
    if title:
        original_text += title
    content_ndarray = get_ndarray(cut(''.join(token(original_text))), stop_words, dictionary, lda)
    for i, sentence in enumerate(sentences):
        sentences_cos[i] = get_cos_with_content(sentence, content_ndarray, stop_words, dictionary, lda)
    return sentences, sentences_cos
def save_words_frequence(corpus_path):
    news_content = pd.read_csv(news_path, encoding='gb18030')
    news_content.dropna(subset=['content'], inplace=True)
    news_content.drop_duplicates(subset=['content'], inplace=True)
    news_content_cut = [token(n) for n in news_content['content']]
    news_content_cut = [''.join(n) for n in news_content_cut]
    news_content_cut = [cut(n) for n in news_content_cut]
    # collected here but unused in this variant, which only writes the cut corpus
    words = []
    for document in news_content_cut:
        words += [w for w in document.split()]
    if os.path.exists(corpus_path):
        print('File already exists, skipping duplicate write')
    else:
        with open(corpus_path, 'w', encoding='utf-8') as f:
            for sent_cut in news_content_cut:
                f.write(sent_cut)
                f.write('\n')
def save_words_frequence(words_frequence_path):
    news_content = pd.read_csv(news_path, encoding='gb18030')
    news_content.dropna(subset=['content'], inplace=True)
    news_content.drop_duplicates(subset=['content'], inplace=True)
    news_content_cut = [token(n) for n in news_content['content']]
    news_content_cut = [''.join(n) for n in news_content_cut]
    news_content_cut = [cut(n) for n in news_content_cut]
    words = []
    for document in news_content_cut:
        words += [w for w in document.split()]
    words_counter = Counter(words)
    # relative frequency of each word across the whole corpus
    frequence = {w: count / len(words) for w, count in words_counter.items()}
    if os.path.exists(words_frequence_path):
        print('File already exists, skipping duplicate write')
    else:
        with open(words_frequence_path, 'wb') as f:
            pickle.dump(frequence, f)
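A small sketch of how the saved frequencies drive the weight = alpha / (alpha + p) term used in get_embedding; it assumes words_frequence_path points at the pickle written above.

# Illustrative only: frequent words receive weights near 0, rare words near 1.
import pickle

with open(words_frequence_path, 'rb') as f:
    frequence = pickle.load(f)

alpha = 1e-4
p_common = max(frequence.values())  # relative frequency of the most common word
p_rare = min(frequence.values())    # relative frequency of the rarest word
print(alpha / (alpha + p_common))   # small weight: common words contribute little
print(alpha / (alpha + p_rare))     # weight near 1: rare words dominate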
currword = "" ch = getchar() if not ch: break currword += ch if ch in [str(x) for x in range(10)]: while True: ch = getchar() if not ch: break if ch not in [str(x) for x in range(10)]: pushchar(ch) break currword += ch tokens.append(util.token("NUMBER", currword, linenum)) elif currword in OPERATORS: while True: ch = getchar() if not ch: break if ch not in "".join(OPERATORS): pushchar(ch) break currword += ch tokens.append(util.token("OPERATOR", currword, linenum)) elif currword == '"': ending = currword while True:
def table(self):
    if isinstance(self.query['table'], basestring):
        return Table(self.query['table'])
    if not self.query['table'] and isinstance(self._bindobj, Query):
        return Table(self._bindobj._alias or token())
    return clause(self.query['table'])
def run(self):  # pylint: disable=arguments-differ
    print(f"Starting up. Using YoutubeDL [{ydl_version}]")
    super().run(token())
def get_cos_with_content(sentence, content_ndarray, stop_words, dictionary, lda):
    sentence_ndarray = get_ndarray(cut(''.join(token(sentence))), stop_words, dictionary, lda)
    return cosine(sentence_ndarray, content_ndarray)
def __init__(self): super().__init__("auth") self.config["AUTH_FLASK_SECRET_KEY"] = token()
def get_sentence_cos(original_text, title):
    word2vec = get_word2vec(word2vec_path)
    stop_words = get_stop_words(stopwords_path)
    sentences = split_sentence(original_text)
    if not sentences:
        raise NameError
    sentences_cut = [cut(''.join(token(n))) for n in sentences]
    sentences_cut_del_stopwords = []
    is_title = False
    # handle the title
    if title:
        title_cut = [cut(''.join(token(title)))]
        words = title_cut[0].split()
        title_cut_del_stopwords = list(set(words) - set(stop_words))
        if title_cut_del_stopwords != []:
            is_title = True
    for s in sentences_cut:
        words = s.split()
        sentence_cut_del_stopwords = list(set(words) - set(stop_words))
        if sentence_cut_del_stopwords != []:
            sentences_cut_del_stopwords.append(sentence_cut_del_stopwords)
        if sentence_cut_del_stopwords == []:
            raise NameError
    # build the document vector and the per-sentence vectors
    sentences_vec = []
    additional_wordvec = {}
    text_vec = np.zeros_like(word2vec.wv['测试'])
    for i, sentence in enumerate(sentences_cut_del_stopwords):
        sentence_vec = np.zeros_like(word2vec.wv['测试'])
        for word in sentence:
            if word in word2vec.wv.vocab:
                sentence_vec += word2vec.wv[word]
            elif word in additional_wordvec:
                sentence_vec += additional_wordvec[word]
            else:
                # out-of-vocabulary word: register a random vector for reuse
                additional_wordvec[word] = np.random.random(
                    word2vec.wv['测试'].shape)
        sentence_vec = sentence_vec / len(sentence)
        # the first sentence matters most, so count it three times
        if i == 0:
            text_vec += sentence_vec * 3
        else:
            text_vec += sentence_vec
        sentences_vec.append(sentence_vec)
    if is_title:
        title_vec = np.zeros_like(word2vec.wv['测试'])
        for word in title_cut_del_stopwords:
            if word in word2vec.wv.vocab:
                title_vec += word2vec.wv[word]
            elif word in additional_wordvec:
                title_vec += additional_wordvec[word]
            else:
                additional_wordvec[word] = np.random.random(
                    word2vec.wv['测试'].shape)
        # the title also counts three times
        text_vec += title_vec * 3
    # the first sentence was added two extra times and the title three times
    text_vec /= len(sentences) + 5
    # cosine between each sentence vector and the document vector
    sentences_cos = {}
    for i, sentence_vec in enumerate(sentences_vec):
        sentences_cos[i] = cosine(sentence_vec, text_vec)
    return sentences, sentences_cos
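A sketch of turning the returned scores into a short extractive summary; the summarize helper and top_k parameter are illustrative, and it assumes the module's cosine is a distance (as in scipy), so smaller scores mean a sentence is closer to the document vector; invert the sort if it returns a similarity instead.

# Illustrative only: keep the top_k sentences closest to the document vector,
# then restore their original order for readability.
def summarize(original_text, title=None, top_k=3):
    sentences, sentences_cos = get_sentence_cos(original_text, title)
    ranked = sorted(sentences_cos, key=sentences_cos.get)  # smallest distance first
    keep = sorted(ranked[:top_k])                          # back to document order
    return ''.join(sentences[i] for i in keep)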