def get_topic_term_frequency(topic_texts, min_df=1):
    vector = CountVectorizer(ngram_range=(1, 1), stop_words='english', min_df=min_df)
    vector.build_analyzer()
    tf = vector.fit_transform(topic_texts)
    return tf.toarray().sum(axis=0), vector
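# Hedged usage sketch (not from the original project): the sample texts below are invented
# and assume get_topic_term_frequency is available as defined above.
def _demo_get_topic_term_frequency():
    texts = ["machine learning for text data", "text mining and machine learning"]
    counts, fitted_vectorizer = get_topic_term_frequency(texts)
    # counts[i] is the corpus-wide frequency of the i-th vocabulary term.
    for term, idx in sorted(fitted_vectorizer.vocabulary_.items()):
        print(term, counts[idx])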
def __init__(self, ngram=False, use_idf=False):
    self.ngram = ngram
    self.use_idf = use_idf
    # Load WordNet synsets and download data if necessary.
    try:
        wordnet_path = nltk.data.find("corpora/wordnet")
    except LookupError:
        nltk.download("wordnet")
        wordnet_path = nltk.data.find("corpora/wordnet")
    self.wn = wordnet.WordNetCorpusReader(wordnet_path)
    # Initialize the two types of n-gram generators.
    pentagram_vectorizer = CountVectorizer(
        ngram_range=(1, 5), token_pattern=r"\b[A-Za-z]+\b", min_df=1, stop_words=stop_list)
    unigram_vectorizer = CountVectorizer(
        ngram_range=(1, 1), token_pattern=r"\b[A-Za-z]+\b", min_df=1, stop_words=stop_list)
    # Function for generating five-grams through unigrams.
    self.pent_analyze = pentagram_vectorizer.build_analyzer()
    # Function for generating just unigrams.
    self.uni_analyze = unigram_vectorizer.build_analyzer()
    # Load IDF scores.
    self.IDF = self.get_idf_scores()
    self.counts = self.get_counts()
def main():
    input_train_file_ptr = sys.argv[1]
    input_test_file_ptr = sys.argv[2]
    # Read the csv files and return pandas dataframes with tweets and sentiment as columns.
    train_tweests_with_sentiments = pre_process_input_data(input_train_file_ptr)
    test_tweets_data = pre_process_input_data(input_test_file_ptr)
    bigram_vectorizer = CountVectorizer(ngram_range=(2, 2), token_pattern=r'\b\w+\b',
                                        min_df=1, lowercase=True)
    tweets_array, sentiments_array = get_tweest_and_sentiments(train_tweests_with_sentiments)
    print("size of tweets array is %s and sentiment array is %s " % (tweets_array.size, sentiments_array.size))
    test_tweets, test_sentiments = get_tweest_and_sentiments(test_tweets_data)
    test_sentiments = test_sentiments.flatten()
    print("size of test tweets array is %s and test sentiment array is %s " % (test_tweets.size, test_sentiments.size))
    parsed_train_tweets = clean_data_to_feed_classifier(tweets_array)
    parsed_test_tweets = clean_data_to_feed_classifier(test_tweets)
    x = bigram_vectorizer.fit_transform(parsed_train_tweets)
    print(x.size)
    # print bigram_vectorizer.get_feature_names()
    bigram_vectorizer.build_analyzer()
    print("done 1")
    res = bigram_vectorizer.transform(parsed_test_tweets)
    print("done 2")
    clf = LinearSVC()
    gnb = MultinomialNB()
    trained_classifier = do_K_fold_cross_validation(clf, gnb, x, sentiments_array.flatten())
    # trained_classifier.fit(x, sentiments_array.flatten())
    print("done 3")
    output = trained_classifier.predict(res)
    # print output
    print(accuracy_score(test_sentiments, output))
def get_count_vectorizer(sentences):
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer()
    x = vectorizer.fit_transform(sentences)
    vectorizer.build_analyzer()
    return pd.DataFrame(x.todense(), columns=vectorizer.get_feature_names())
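# Hedged usage sketch (invented sentences, assumes pandas is imported as pd in this module):
# each row of the returned DataFrame is one document's term-count vector, with columns
# named after the learned vocabulary.
def _demo_get_count_vectorizer():
    df = get_count_vectorizer(["the cat sat", "the dog sat down"])
    print(df)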
def Common_Vectorizer_usage():
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer(min_df=1)
    corpus = [
        'This is the first document.',
        'This is the second second document.',
        'And the third one.',
        'Is this the first document?',
    ]
    analyze = vectorizer.build_analyzer()
    print analyze("This is a text document to analyze.")
    print analyze("This is a text document to analyze.") == ['this', 'is', 'text', 'document', 'to', 'analyze']

    X = vectorizer.fit_transform(corpus)
    print vectorizer.get_feature_names()
    print vectorizer.vocabulary_  # .get('document')
    print vectorizer.transform(['Something completely new.']).toarray()
    print list(X)

    # bigram ========================================================
    bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
    analyze = bigram_vectorizer.build_analyzer()
    print analyze('Bi-grams are cool!')
    X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
    print X_2
    feature_index = bigram_vectorizer.vocabulary_.get('is this')
    print X_2[:, feature_index]

    # marui test
    print '\n\nmarui test====================='

    def t_preprocessor(s):
        return ','.join([x.lower() for x in s.split(' ')])

    stop_words1 = ['is', 'a', 'this']           # is ok: frozenset(['a', 'this', 'is'])
    stop_words2 = {'is': 0, 'a': 1, 'this': 2}  # is ok: converted to frozenset(['a', 'this', 'is'])
    cv = CountVectorizer(preprocessor=t_preprocessor, stop_words=stop_words2)
    params = cv.get_params()
    print 'get_params()', type(params), '---------------'
    for k in params:
        print k, '\t', params[k]
    print 'get_params end--------------'
    print '\nget_stop_words=', cv.get_stop_words()
    cv.fit(corpus)
    print cv.get_feature_names()
    print cv.transform(corpus).toarray()
    print '\ntest preprocessor, result:\t', cv.build_preprocessor()('this is a document')
    print '\ntest tokenizer, result', cv.build_tokenizer()('this is a document')
    print '\ntest tokenizer2, result', cv.build_tokenizer()('th-is is a document')
    print '\ntest tokenizer2, result', cv.build_tokenizer()('th_is is a document')
    print '\ntest tokenizer2, result', cv.build_tokenizer()('th&is is a document')
def get_count(x):
    x = ' '.join(x)
    s_vect1 = CountVectorizer(ngram_range=(0, 1), stop_words='english')
    s_analyzer1 = s_vect1.build_analyzer()
    s_listNgramQuery1 = s_analyzer1(x)
    print(s_listNgramQuery1)
    s_vect2 = CountVectorizer(ngram_range=(2, 4))
    s_analyzer2 = s_vect2.build_analyzer()
    s_listNgramQuery2 = s_analyzer2(x)
    print(s_listNgramQuery2)
    result = s_listNgramQuery1 + s_listNgramQuery2
    print(result)
    # get the main freqdist top 20 words
    return result
def test_classfier_ngram(test, vocabulary, classifiers):
    vectorizer = CountVectorizer(ngram_range=(1, 3))
    correct = 0
    count = [0, 0, 0, 0, 0]
    for phrase in test:
        f = []
        words = phrase[0]
        vector = [0] * (len(classifiers[0]) - 1)
        tokens = vectorizer.build_analyzer()(words)
        for token in tokens:
            if token in vocabulary:
                vector[vocabulary[token]] += 1
        x = np.array([1] + vector)
        for classifier in classifiers:
            f.append(x.dot(classifier))
        estimate_phrase_class = 0
        for i in range(len(f)):
            if f[i] > 0:
                estimate_phrase_class = i + 1
        count[estimate_phrase_class] += 1
        true_phrase_class = int(phrase[1])
        if estimate_phrase_class == true_phrase_class:
            correct += 1
    print("Correct: " + str(correct) + "/" + str(len(test)))
    print(correct / len(test))
    print(count)
def bulidModel(self, filename, topicwordnum):
    corpus = []
    vocab = []
    for line in open(filename, 'r').readlines():
        print(line)
        corpus.append(line.strip())
        vocab += line.split(" ")
    # print corpus
    # Convert the words in the corpus into a term-frequency matrix;
    # element a[i][j] is the frequency of word j in document i.
    vectorizer = CountVectorizer()
    print(vocab)
    print(vectorizer)
    X = vectorizer.fit_transform(corpus)
    analyze = vectorizer.build_analyzer()
    weight = X.toarray()
    print(len(weight))
    print(weight[:5, :5])
    model = lda.LDA(n_topics=5, n_iter=500, random_state=1)
    model.fit(np.asarray(weight))
    topic_word = model.topic_word_
    n_top_words = topicwordnum
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    doc_topic = model.doc_topic_
    print("type(doc_topic): {}".format(type(doc_topic)))
    print("shape: {}".format(doc_topic.shape))
    # Print the most likely topic for the first documents.
    label = []
    for n in range(20):
        topic_most_pr = doc_topic[n].argmax()
        label.append(topic_most_pr)
        print("doc: {} topic: {}".format(n, topic_most_pr))
def ida(articles):
    stopwords = []
    doc_terms = []
    with open('ch_stopwords.txt', 'r') as f:
        stopwords = set(f.read().lower().split('\n'))
    # print('stopwords', stopwords[:10])
    vocab = joblib.load(open('lda-vocab.pkl', 'rb'))
    pkl_file = open('lda-n8-2.pkl', 'rb')
    lda = joblib.load(pkl_file)
    trigram_vectorizer = CountVectorizer(ngram_range=(2, 3),
                                         token_pattern=r'([\u4e00-\u9fa5]{1}|)',
                                         vocabulary=vocab,
                                         stop_words=stopwords,
                                         analyzer='word')
    analyzer = trigram_vectorizer.build_analyzer()
    '''
    for article in articles:
        terms = analyzer(article.Content)
        score = lda.score(terms)
        print(score)
    '''
    article_contents = map(lambda x: x.Content, articles)
    doc_terms = trigram_vectorizer.fit_transform(article_contents)
    test = lda.transform(doc_terms)
    for i, scores in enumerate(test):
        if i % 500 == 0:
            print('update article %i' % i)
        article = articles[i]
        article.update_scores(list(scores))
class Featurizer(object):
    def __init__(self):
        self.sentiment_analyzer = Sentiment('data/AFINN-111.txt')
        self.bow_vectorizer = None
        self.bow_analyzer = None

    def bag_of_words(self, body):
        return self.bow_vectorizer.transform([body]).toarray()

    def text_features(self, comment):
        num_chars = len(comment.get("body"))
        num_links = count_links(comment.get("body"))
        simple_tokens = comment.get("body").split(' ')
        num_words = 0
        avg_word_length = 0
        for token in simple_tokens:
            num_words += 1
            avg_word_length += len(token)
        avg_word_length = float(avg_word_length) / float(num_words)
        sentiment = self.sentiment_analyzer.analyze(
            self.bow_analyzer(comment.get("body")))
        score = comment.get("score")
        return [num_chars, num_links, num_words, num_words, avg_word_length, sentiment]

    def transform_comment(self, comment):
        return numpy.hstack((
            numpy.array([self.text_features(comment)], dtype='float_'),
            self.bag_of_words(comment.get("body"))))

    def score_comment(self, comment):
        return comment.get("score")

    def transform(self, comments):
        """ Returns a Nx(D+1) numpy matrix of features. The first D columns
        correspond to features, where the final column corresponds to the
        scores of each comment"""
        # If it's a single instance, return an array.
        if isinstance(comments, dict):
            return self.transform_comment(comments)
        # http://scikit-learn.org/stable/modules/feature_extraction.html
        self.bow_vectorizer = CountVectorizer(min_df=1)
        self.bow_vectorizer.fit([c.get("body") for c in comments])
        self.bow_analyzer = self.bow_vectorizer.build_analyzer()

        def features_and_label(comment):
            return numpy.hstack((
                self.transform_comment(comment),
                numpy.array([[self.score_comment(comment)]], dtype='float_')))

        return numpy.vstack([features_and_label(c) for c in comments])
def keyword_frequency(keyword, directory):
    freq_table = {}
    for source in glob.glob(os.path.join(directory, '*')):
        words = ''
        vect = CountVectorizer(ngram_range=(1, 3))
        analyzer = vect.build_analyzer()
        for f in glob.glob(os.path.join(source, '*.json')):
            j = json.load(open(f))
            if j['Language'] == 'chinese':
                words += ' '.join(jieba.cut(j['Title']))
                words += ' '.join(jieba.cut(j['Content']))
            elif j['Language'] == 'english':
                words += j['Title']
                words += j['Content']
        ngram_query = analyzer(words)
        fdist = nltk.FreqDist(ngram_query)
        freq = fdist.freq(keyword.lower())
        freq_table[os.path.basename(source)] = freq
    pprint.pprint(freq_table)
    sorted_list = sorted(freq_table, key=freq_table.get, reverse=True)
    print('=================')
    print("%s loves %s most." % (sorted_list[0], keyword))
    plt.bar(range(len(freq_table)), freq_table.values(), align="center")
    plt.xticks(range(len(freq_table)), list(freq_table.keys()))
    plt.show()
def msg2list(msg, ngram_range=(1, 2)):
    from sklearn.feature_extraction.text import CountVectorizer
    vectorizer = CountVectorizer(ngram_range=ngram_range)
    analyze = vectorizer.build_analyzer()
    return analyze(msg)
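# Hedged usage sketch for msg2list (made-up message): with the default ngram_range the
# analyzer returns lowercased unigrams followed by bigrams.
def _demo_msg2list():
    tokens = msg2list("Free entry to win cash now")
    print(tokens)
    # e.g. ['free', 'entry', 'to', 'win', 'cash', 'now', 'free entry', 'entry to', ...]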
def extractFeatures(examples, vocab=None, frequent_ngram_col_idx=None):
    corpus = []
    # Get bags of words for each training example.
    for x, y in examples:
        corpus.append(x)
    # corpus = np.array(examples[:,0])
    vectorizer = CountVectorizer(vocabulary=vocab, ngram_range=(1, 3),
                                 token_pattern=r'\b\w+\b', min_df=1)
    X = vectorizer.fit_transform(corpus)
    analyze = vectorizer.build_analyzer()
    fullfeature = X.toarray()
    print('SHAPE in Fit Model', len(fullfeature), len(fullfeature[0]))
    if not frequent_ngram_col_idx:
        sums = np.sum(fullfeature, axis=0)
        # Specify frequency threshold to include in vocab.
        frequent_ngram_col_idx = np.nonzero([x > freq_threshold for x in sums])
    # Consider passing in a pruned vocab to not need the next line for dev.
    fullfeature = fullfeature[:, frequent_ngram_col_idx[0]]
    print('NEW SHAPE', len(fullfeature), len(fullfeature[0]))
    # TODO: append new features here, especially separating out genre, rating
    return fullfeature, vectorizer.vocabulary_, frequent_ngram_col_idx
def preprocess_tokenize(data, language, ngram=(1, 1), min_df=0.01, max_df=0.9):
    """
    Read a list of strings. Return a list of lists of words, tokenized and with stopwords removed.

    :param data: list (iterable) of text items
    :param language: 'en' or 'fr'
    :param ngram: range of the n-grams to use, mostly (1, 1) or (1, 2)
    :param min_df: cutoff for infrequent words, between 0 and max_df
    :param max_df: cutoff for overly frequent words or context-specific stop words, between min_df and 1
    :return: processed data
    """
    if language == 'en':
        stopwords = stop_words_en
    elif language == 'fr':
        stopwords = stop_words_fr
    else:
        raise ValueError('Wrong language!')
    vectorizer = CountVectorizer(input='content', analyzer='word', ngram_range=ngram,
                                 stop_words=stopwords, min_df=min_df, max_df=max_df)
    analyzer = vectorizer.build_analyzer()
    processed = [analyzer(doc) if (doc not in [np.NaN, np.nan]) else [] for doc in data]
    return processed
class Corpus:
    def __init__(self, tweets):
        self.tweets = tweets
        self.vocab_size = -1
        self.cv = CountVectorizer()
        self.tokenizer = None
        self.build_vocab()

    def build_vocab(self):
        strings = map(lambda tweet: tweet.raw_text, self.tweets)
        self.cv.fit_transform(strings)
        self.tokenizer = self.cv.build_analyzer()
        self.vocab_size = len(self.cv.vocabulary_.keys())
        print("vocabulary size: %d" % self.vocab_size)

    def vocab(self):
        return self.cv.vocabulary_

    def tweet2array(self, tweet):
        assert self.tokenizer is not None
        tokens = self.tokenizer(tweet.raw_text)
        V = self.vocab()
        return map(lambda t: V.get(t), tokens)

    def tokenize(self, tweet):
        return self.tokenizer(tweet.raw_text)
def common_theme_in_article_sections(article, add_more_stopwords):
    '''This function returns the common theme of the article using the bigram method'''
    # Preprocess the text.
    preprocessed = preprocess_articles(article, add_more_stopwords)
    # Find the bigrams in the articles.
    vectorizer = CountVectorizer(ngram_range=(2, 2))
    analyzer = vectorizer.build_analyzer()
    bigrams = []
    for arti in preprocessed:
        bigrams.append(analyzer(arti))
    theme_of_article = []
    # Find the common theme of every article by taking the top 10 bigrams.
    for articles in bigrams:
        theme_of_article.append(Counter(articles).most_common(10))
    # Create a list of potential themes of each individual article.
    potential_theme_in_article_section = []
    for themes in theme_of_article:
        for val in themes:
            potential_theme_in_article_section.append(val[0])
    x = Counter(potential_theme_in_article_section).most_common(5)
    return x
def __init__(self, data, target, target_names, preprocess=False):
    """ Initializes the classifier by training it on the given data.

    :param data: text documents to train the classifier on
    :type data: list
    :param target: category indexes for each text document
    :type target: list
    :param target_names: category names
    :type target_names: list
    """
    self.__clf_data = ClassifierData(data, target, target_names)
    if preprocess:
        # build_analyzer() must be called on a CountVectorizer instance, not on the class.
        analyzer = CountVectorizer().build_analyzer()
        ipp = InputPreprocessor(None)

        def preprocess(doc):
            return [ipp.normalise(word) for word in analyzer(doc)]

        vectorizer = CountVectorizer(analyzer=preprocess)
    else:
        vectorizer = CountVectorizer()
    self.__text_clf = Pipeline([
        ('vect', vectorizer),
        ('tfidf', TfidfTransformer()),
        ('clf', DecisionTreeClassifier(criterion='entropy')),
    ])
def build_vectorizer(self, sequences_lists, stop_w='english', min_df=0):
    vectorizer = CountVectorizer(stop_words=stop_w, min_df=min_df)
    vectorizer.fit(sequences_lists)
    word2index = vectorizer.vocabulary_
    word2index['<PAD>'] = max(word2index.values()) + 1
    tokenizer = vectorizer.build_analyzer()
    return word2index, tokenizer
def constructNGramsBOW(self):
    text = self.docName
    stopsWords = set(stopwords.words('english'))
    text = re.sub(r"[^a-zA-Z]", " ", text.lower())
    text = re.sub(r"\s\s+", " ", text)
    # bigram = Phraser.load('mymodel/bigram_phraser_wikien2017')
    # trigram = Phraser.load('mymodel/trigram_phraser_wikien2017')
    # sent_tokenize_list = sent_tokenize(text)
    with open("data/embed.vocab") as f:
        vocab_list = map(str.strip, f.readlines())
    # for line in sent_tokenize_list:
    #     sent = word_tokenize(line)
    #     line = trigram[bigram[sent]]
    #     line = [w for w in line if not w in stopsWords]
    with open("data/embed.vocab") as f:
        vocab_list = map(str.strip, f.readlines())
    vocab_dict = {w: k for k, w in enumerate(vocab_list)}
    vectorizer = CountVectorizer(ngram_range=(1, 3))
    analyzer = vectorizer.build_analyzer()
    sett = analyzer(text)
    sett = [token.replace(" ", "_") for token in sett]
    BOWNgram = [token for token in sett if token in vocab_dict.keys()]
    BOWNgram = Counter(BOWNgram)
    return OrderedDict(BOWNgram)
def create_topic():
    # Load the corpus: one document per line.
    corpus = []
    for line in open(documentfile, 'r').readlines():
        corpus.append(line.strip())
    print(corpus)
    # Convert the words in the corpus into a term-frequency matrix;
    # element a[i][j] is the frequency of word j in document i.
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)
    analyze = vectorizer.build_analyzer()
    weight = X.toarray()
    print(X.shape)
    # LDA
    model = lda.LDA(n_topics=10, n_iter=500, random_state=1)
    model.fit(np.asanyarray(weight))
    topic_word = model.topic_word_
    print(topic_word)
    n_top_words = 8
    for i, topic_dist in enumerate(topic_word):
        # Map the top term indices back to feature names so they can be joined as strings.
        topic_words = np.array(vectorizer.get_feature_names())[np.argsort(topic_dist)][:-(n_top_words + 1):-1]
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))
    # Document-topic distribution.
    doc_topic = model.doc_topic_
    print("type(doc_topic): {}".format(type(doc_topic)))
    print("shape: {}".format(doc_topic.shape))
    # Print the most likely topic for the first 10 documents.
    label = []
    for n in range(10):
        topic_most_pr = doc_topic[n].argmax()
        label.append(topic_most_pr)
        print("doc: {} topic: {}".format(n, topic_most_pr))
class GRUDataSet(Dataset):
    def __init__(self, path, max_seq_len):
        self.max_seq_len = max_seq_len
        df = pd.read_csv(path)
        self.vectorizer = CountVectorizer(stop_words='english', max_df=0.99, min_df=2)
        self.vectorizer.fit(df.text.tolist())
        self.token2idx = self.vectorizer.vocabulary_
        self.token2idx['<PAD>'] = max(self.token2idx.values()) + 1
        self.tokenizer = self.vectorizer.build_analyzer()
        self.text_encoding_fun = lambda x: [
            self.token2idx[token] for token in self.tokenizer(x)
            if token in self.token2idx
        ]
        self.padding_fun = lambda x: x + (max_seq_len - len(x)) * [self.token2idx['<PAD>']]
        sequences = [self.text_encoding_fun(x)[:max_seq_len] for x in df.text.tolist()]
        self.sequences = [self.padding_fun(sequence) for sequence in sequences]
        self.labels = df.target.tolist()

    def __getitem__(self, i):
        return self.sequences[i], self.labels[i]

    def __len__(self):
        return len(self.sequences)
def buildFeatureMatrixRepresentation(stopwords, corpusRepresentation, corpusList, outPath):
    # featuresMatrix = []
    if (not corpusRepresentation.empty) and corpusList:
        fOutput = open(outPath, "w")
        vectorizer = CountVectorizer(lowercase=True, stop_words=stopwords,
                                     token_pattern='(?u)\\b[\\w+,-]+\\w+\\b|\\b\\w\\w+\\b')
        for abstractPath in corpusList:
            for counter, document in enumerate(glob.iglob(abstractPath)):
                if (counter < MAX_NUM_ABSTRACTS) and document:
                    try:
                        fp = open(document, "r")
                        content = fp.read()
                        fp.close()
                        if content:
                            vector = []
                            # Split each document into tokens.
                            analyser = vectorizer.build_analyzer()
                            tokens = analyser(content)
                            for word in corpusRepresentation.term:
                                if any(word in s for s in tokens):
                                    vector.append(1)
                                else:
                                    vector.append(0)
                            # featuresMatrix.append(vector)
                            fOutput.write(" ".join(str(x) for x in vector) + "\n")
                    except:
                        print "Error trying to build the representation for the document: " + document
        fOutput.close()
def _fit_language(self, X_unmapped: Sequence[str], X: Sequence[str], Y: np.ndarray):
    cv = CountVectorizer(
        max_df=0.95,
        min_df=2,
        lowercase=False,
        ngram_range=(1, self.hyperparams.max_ngram),
        max_features=(self.hyperparams.max_vocab * 18),
        token_pattern='[a-zA-Z0-9$&+,:;=?@_/~#\\[\\]|<>.^*()%!-]+')
    X_vec = cv.fit_transform(trivial_generator(X))
    local_vocab = set()
    for feat in Y.columns:
        res = zip(
            cv.get_feature_names(),
            mutual_info_classif(X_vec, Y[feat], discrete_features=True))
        local_vocab.update(res)
    self.vocab = {
        i[0]
        for i in sorted(local_vocab, key=lambda i: i[1], reverse=True)[:self.hyperparams.max_vocab]
    }
    self._analyzer = cv.build_analyzer()
def feature_extractor(data):
    data = data.decode('utf-8')
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')
    tokens = word_tokenize(data)
    tokens_mod = []
    i = 0
    while i < len(tokens):
        curr = tokens[i]
        if curr == 'no' or curr == 'not':
            if i - 1 >= 0:
                tokens_mod[-1] = tokens_mod[-1] + '+' + curr
            if i + 1 <= len(tokens) - 1:
                tokens_mod.append(curr + '+' + tokens[i + 1])
                i += 1
        else:
            tokens_mod.append(curr)
        i += 1
    data_mod = ''
    for token in tokens_mod:
        data_mod += ' ' + token
    bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), token_pattern=r'\b\w+\b', min_df=1)
    analyze = bigram_vectorizer.build_analyzer()
    bigrams = analyze(data_mod)
    features = {bigram: 1 for bigram in bigrams}
    return features
class AssociationRuleBased(AbstractFindTopicList):
    def __init__(self, analyzer=None, n_gram_boost_map={}):
        self.analyzer = analyzer
        self.n_gram_boost_map = n_gram_boost_map

    def getTopicList(self, doc_list, parm):
        from sklearn.feature_extraction.text import CountVectorizer
        from analysislib.datamining import AssociationRuleMining
        self.rule_miner = AssociationRuleMining.getAssociationRuleMiner("aprior")
        if self.analyzer == None:
            print("No custom Analyser given")
            self.vectorizer = CountVectorizer(lowercase=True, stop_words='english')
            self.analyzer = self.vectorizer.build_analyzer()
        tokenize_doc_list = []
        for doc in doc_list:
            token_list = self.analyzer(doc)
            tokenize_doc_list.append(token_list)
        associatoin_rules = self.rule_miner.getRule(tokenize_doc_list)
        return self.__get_filter_topic_from_rule(associatoin_rules)

    def __get_filter_topic_from_rule(self, association_rules):
        topic_list = []
        for item in association_rules:
            pair = item[0]
            topic = ""
            for x in pair:
                topic += x + " "
            topic_list.append(topic)
        return topic_list
def fit(self, X_unmapped, X, Y, max_vocab=18000, max_features_to_test=180000,
        window=8, dims=32, max_ngram=5):
    cv = CountVectorizer(
        max_df=0.95,
        min_df=2,
        lowercase=False,
        ngram_range=(1, max_ngram),
        max_features=max_features_to_test,
        token_pattern='[a-zA-Z0-9$&+,:;=?@_/~#\\[\\]|<>.^*()%!-]+')
    X_vec = cv.fit_transform(self._smiles_to_trivial_lang(X))
    local_vocab = set()
    for feat in Y.columns:
        res = zip(cv.get_feature_names(),
                  mutual_info_classif(X_vec, Y[feat], discrete_features=True))
        local_vocab.update(res)
    self.vocab = {i[0] for i in sorted(local_vocab, key=lambda i: i[1], reverse=True)[:max_vocab]}
    self._analyzer = cv.build_analyzer()
    generator = self._make_iterator(X_unmapped, training=True)
    document_model = Doc2Vec(vector_size=dims, workers=cpu_count(), window=window)
    document_model.build_vocab(generator)
    document_model.train(generator, total_examples=len(X_unmapped), epochs=36)
    self.document_model = document_model
def iter_tool(title, n):
    bigram_vectorizer = CountVectorizer(ngram_range=(1, n), token_pattern=r'\b\w+\b', min_df=1)
    analyze = bigram_vectorizer.build_analyzer()
    w_list = analyze(title)
    return w_list
def wnl_nonum(doc):
    wnl = WordNetLemmatizer()
    cv = CountVectorizer()
    ana = cv.build_analyzer()
    doc = re.sub('[0-9]', '', doc)
    return (wnl.lemmatize(w) for w in ana(doc))
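# Hedged usage sketch: wnl_nonum has the shape of a custom analyzer (text in, tokens out),
# so it can be plugged straight into CountVectorizer; the corpus below is illustrative only.
def _demo_wnl_nonum():
    from sklearn.feature_extraction.text import CountVectorizer
    cv = CountVectorizer(analyzer=wnl_nonum)
    cv.fit(["3 dogs were running", "2 cats run fast"])
    print(sorted(cv.vocabulary_))  # digits stripped, plural nouns lemmatized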
def main(): vct = CountVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1), token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer()) vct_analizer = vct.build_analyzer() print("Start loading ...") # data fields: data, bow, file_names, target_names, target ########## NEWS GROUPS ############### # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf categories = [['alt.atheism', 'talk.religion.misc'], ['comp.graphics', 'comp.windows.x'], ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'], ['rec.sport.baseball', 'sci.crypt']] if "imdb" in args.train: ########## IMDB MOVIE REVIEWS ########### data = Bunch(load_imdb(args.train, shuffle=True, rnd=2356, vct=vct)) # should brind data as is elif "aviation" in args.train: raise Exception("We are not ready for that data yet") elif "20news" in args.train: ########## 20 news groups ###### data = Bunch(load_20newsgroups(categories=categories[0], vectorizer=vct, min_size=50)) # for testing purposes elif "dummy" in args.train: ########## DUMMY DATA########### data = Bunch(load_dummy("C:/Users/mramire8/Documents/code/python/data/dummy", shuffle=True,rnd=2356,vct=vct)) else: raise Exception("We do not know that dataset") print("Data %s" % args.train) total = len(data.train.data) print("Data size %s" % total) #print(data.train.data[0]) ## prepare pool for the sampling pool = Bunch() pool.data = data.train.bow.tocsr() # full words, for training pool.target = data.train.target pool.predicted = [] pool.remaining = set(range(pool.data.shape[0])) # indices of the pool bt = randomsampling.BootstrapFromEach(87654321) for i in range(7): query_index = bt.bootstrap(pool=pool, k=args.packsize) # get instances from each class filename = "{0}-P{1}.txt".format(args.train,i) f = codecs.open(filename, 'a+', 'utf-8') #print documents in file random.shuffle(query_index) for di in query_index: x = unicode(data.train.data[di].replace("\n","<br>")) #y = data.train.target[di] y = data.train.target_names[data.train.target[di]] #f.write(str(i)) #f.write("\t") #f.write(str(y)) #f.write("\t") #f.write(x) #f.write("\n") f.close() pool.remaining.difference_update(query_index) # remove the used ones
def encode_text(sentences, vectorizer=None, max_len=None, msg_prefix="\n", verbose=True): """Encode array_like of strings to ndarray of integers. :param sentences: (array_like of str). e.g., ["I like apples", "Me too"] :param vectorizer: (CountVectorizer, optional) :param max_len: (int) maximum length of encoded sentences. :param msg_prefix: :param verbose: :return: Tuple[CountVectorizer, int, ndarray] e.g., (CountVectorizer, 3, array([[1, 2, 3], [4, 5, 0]])) """ if verbose: print("{} Encode texts to integers".format(msg_prefix)) # Not recommend to modify below vectorizer/vocab lines. if vectorizer is None: vectorizer = CountVectorizer(stop_words="english") vectorizer.fit(sentences) # dictionary of (token, encoding) pair. # e.g., {"I": 0, "like": 1, "apples": 2, "Me": 3, "too": 4} vocab = vectorizer.vocabulary_ # Convert str to int. # - Use preprocess_and_tokenize the type of which is 'Callable[str, List[str]]' # - Do not use '0'. We will use '0' in zero padding. # e.g., sentences: ["I like apples", "Me too"] and # vocab: {"I": 0, "like": 1, "apples": 2, "Me": 3, "too": 4} # Then, encoded_sentences: [[0 + 1, 1 + 1, 2 + 1], [3 + 1, 4 + 1]] -> [[1, 2, 3], [4, 5]] preprocess_and_tokenize = vectorizer.build_analyzer() encoded_sentences = [] for s in sentences: tokens = preprocess_and_tokenize(s) # Hint: encoded_sentences.append(/* BLANK */) raise NotImplementedError assert len(encoded_sentences) == len(sentences) assert all([0 not in es for es in encoded_sentences]) # Get max_len (maximum length). # If max_len is given, use it. # e.g., [[1, 2, 3], [4, 5]] (from ["I like apples", "Me too"]) # -> 3 max_len = max_len or max(len(es) for es in encoded_sentences) # Add zero padding to make length of all sentences the same. # e.g., [[1, 2, 3], [4, 5]] # -> [[1, 2, 3], [4, 5, 0]] pad_encoded_sentences = np.zeros((len(sentences), max_len), dtype=np.int32) for idx, es in enumerate(encoded_sentences): length = len(es) if len(es) <= max_len else max_len # Hint: pad_encoded_sentences[idx, :length] = /* BLANK */ raise NotImplementedError return vectorizer, max_len, pad_encoded_sentences
def fitModel(examples, acoustic=None, vocab=None, frequent_ngram_col_idx=None):
    corpus = [x for x, y in examples]
    vectorizer = CountVectorizer(vocabulary=vocab, ngram_range=(1, 3),
                                 token_pattern=r'\b\w+\b', min_df=1)
    X = vectorizer.fit_transform(corpus)
    # UNCOMMENT TO ADD NGRAM FEATURES
    analyze = vectorizer.build_analyzer()
    fullfeature = X.toarray()
    # print 'VOCAB SHAPE', len(fullfeature), len(fullfeature[0])
    # The most time-expensive part (pruning so only frequent ngrams are used).
    if not frequent_ngram_col_idx:
        sums = np.sum(fullfeature, axis=0)
        frequent_ngram_col_idx = np.nonzero([x > 2 for x in sums])
    fullfeature = fullfeature[:, frequent_ngram_col_idx[0]]
    # Add features from grammatical context in the transcript.
    fullfeature = contextualFeatures(examples, fullfeature)
    # print 'CONTEXTUAL SHAPE', len(fullfeature), len(fullfeature[0])
    fullfeature = acousticFeatures(fullfeature, acoustic)
    # print 'FINAL SHAPE', len(fullfeature), len(fullfeature[0])
    # return vectorizer
    return fullfeature, vectorizer.vocabulary_, frequent_ngram_col_idx
def get_topic_sim(k):
    corpus = []
    input = open('LDA_result.txt')
    for line in input:
        corpus.append(line.strip('\r\n'))
    print len(corpus)
    # Convert the words in the corpus into a term-frequency matrix;
    # element a[i][j] is the frequency of word j in document i.
    vectorizer = CountVectorizer()
    print 'vectorizer', vectorizer
    X = vectorizer.fit_transform(corpus)
    analyze = vectorizer.build_analyzer()
    weight = X.toarray()
    # Run LDA.
    print 'LDA:'
    model = lda.LDA(n_topics=k, n_iter=300, random_state=1)
    model.fit(np.asarray(weight))  # model.fit_transform(X) is also available
    topic_word = model.topic_word_  # model.components_ also works
    # Document-topic distribution.
    doc_topic = model.doc_topic_
    ##print("type(doc_topic): {}".format(type(doc_topic)))
    ##print("shape: {}".format(doc_topic.shape))
    ##print doc_topic
    sim = sklearn.metrics.pairwise.cosine_similarity(doc_topic, dense_output=True)
    ##print sim
    return sim
def get_feature_by_opcode_bigram_word2vec(): global max_document_length global bigram_word2vec_bin with open('metrics.txt', 'a') as f: f.write("Get feature by opcode and bigram word2vec: \n") f.close() x = [] y = [] if os.path.exists(bigram_wv_data_pkl_file) and os.path.exists( label_pkl_file): f = open(bigram_wv_data_pkl_file, 'rb') x = pickle.load(f) f.close() f = open(label_pkl_file, 'rb') y = pickle.load(f) f.close() else: x, y = load_data_pkl_file() CV = CountVectorizer(ngram_range=(2, 2), decode_error="ignore", token_pattern=r'\b\w+\b', min_df=1, max_df=1.0) # 2-gram分词 analyze = CV.build_analyzer() courps = [] for text in x: text = analyze(text) text = str(text).replace('u\'', '\'') courps.append(str(text)) x = courps cores = multiprocessing.cpu_count() if os.path.exists(bigram_word2vec_bin): print "Find cache file %s" % bigram_word2vec_bin model = gensim.models.Word2Vec.load(bigram_word2vec_bin) else: model = gensim.models.Word2Vec(size=max_features, window=5, min_count=5, iter=10, workers=cores) model.build_vocab(x) model.train(x, total_examples=model.corpus_count, epochs=model.iter) model.save(bigram_word2vec_bin) x = getVecsByWord2Vec(model, x, max_features) f = open(bigram_wv_data_pkl_file, 'wb') pickle.dump(x, f) f.close() return x, y
def buildFeatureMatrixRepresentation(stopwords, corpusRepresentation, corpusList, outPath):
    # NOTE: the third argument is treated here as the list of abstract glob patterns
    # (as in the variant of this function above); the original body referenced an
    # undefined `corpusList` while taking a single `abstractPath` parameter.
    # featuresMatrix = []
    if (not corpusRepresentation.empty) and corpusList:
        fOutput = open(outPath, "w")
        vectorizer = CountVectorizer(
            lowercase=True,
            stop_words=stopwords,
            token_pattern='(?u)\\b[\\w+,-]+\\w+\\b|\\b\\w\\w+\\b')
        for abstractPath in corpusList:
            for counter, document in enumerate(glob.iglob(abstractPath)):
                if (counter < MAX_NUM_ABSTRACTS) and document:
                    try:
                        fp = open(document, "r")
                        content = fp.read()
                        fp.close()
                        if content:
                            vector = []
                            # Split each document into tokens.
                            analyser = vectorizer.build_analyzer()
                            tokens = analyser(content)
                            for word in corpusRepresentation.term:
                                if any(word in s for s in tokens):
                                    vector.append(1)
                                else:
                                    vector.append(0)
                            # featuresMatrix.append(vector)
                            fOutput.write(" ".join(str(x) for x in vector) + "\n")
                    except:
                        print "Error trying to build the representation for the document: " + document
        fOutput.close()
class NLTK_CountVectorizer(CountVectorizer):
    def __init__(self, lang, **kwargs):
        # Forward keyword arguments to the parent class; they must be unpacked,
        # not passed as a single positional dict.
        CountVectorizer.__init__(self, **kwargs)
        try:
            self.stemmer = SnowballStemmer(lang.lower()).stem
            self.vect = CountVectorizer()
            self.analyzer = self.analyzer_nltk
        except ValueError:
            pass

    def analyzer_nltk(self, x):
        return [self.stemmer(e) for e in self.vect.build_analyzer()(x)]

    def fit_transform(self, x, y):
        res = super().fit_transform(x, y)
        try:
            self.vect.fit(x, y)
            vocabs = dict()
            for v_it in self.vect.vocabulary_:
                expr = self.stemmer(v_it)
                if expr in vocabs:
                    vocabs[expr].append(v_it)
                else:
                    vocabs[expr] = [v_it]
            self.vocabulary_nltk = dict([(e[0], min(e[1])) for e in vocabs.items()])
        except AttributeError:
            self.vocabulary_nltk = dict([(e, e) for e in self.vocabulary_])
        return res
def generate_bow_doc(doc, feature_names):
    vectorizer = CountVectorizer(max_df=0.5, stop_words='english')
    tokeniser = vectorizer.build_analyzer()
    bow = [w for w in tokeniser(doc) if w in feature_names and w in model.vocab]
    return bow
def analyze_comment(comment):
    vectorizer = CountVectorizer(stop_words='english')
    analyzer = vectorizer.build_analyzer()
    comment = analyzer(clean_comment(comment))
    comment = list(filter(lambda s: '_' not in s, comment))
    comment = list(filter(lambda s: not any(c.isdigit() for c in s), comment))
    return comment
def analyze_body(body):
    vectorizer = CountVectorizer(stop_words='english')
    analyzer = vectorizer.build_analyzer()
    body = analyzer(clean_body(body))
    body = list(filter(lambda s: '_' not in s, body))
    body = list(filter(lambda s: not any(c.isdigit() for c in s), body))
    return body
def analyze_title(title):
    vectorizer = CountVectorizer(stop_words='english')
    analyzer = vectorizer.build_analyzer()
    title = analyzer(clean_title(title))
    title = list(filter(lambda s: '_' not in s, title))
    title = list(filter(lambda s: not any(c.isdigit() for c in s), title))
    return title
def main():
    ### corpus = importasline('../data/shakespear.txt')
    corpus = importasline('../data/all_modified.txt')
    vectorizer = CountVectorizer(min_df=1)
    X = vectorizer.fit_transform(corpus)
    analyze = vectorizer.build_analyzer()
    Y = [[vectorizer.vocabulary_[x] for x in analyze(corpus[i])] for i in range(len(corpus))]
    print(len(Y), 'len(Y)')
def main_hack(self): input_train_file_ptr = "trainingandtestdata/training.1600000.processed.noemoticon.csv" input_test_file_ptr = "trainingandtestdata/testdata.manual.2009.06.14.csv" # read the csv file and return the pandas dataframe with two column as tweets and sentiment as columns. train_tweests_with_sentiments = self.pre_process_input_data(input_train_file_ptr) test_tweets_data = self.pre_process_input_data(input_test_file_ptr) bigram_vectorizer = CountVectorizer(ngram_range=(2,2),token_pattern=r'\b\w+\b', min_df=1,lowercase=True) # print tweests_array tweets_array, sentiments_array = self.get_tweest_and_sentiments(train_tweests_with_sentiments) print(("size of tweets array is %s and sentiment array is %s " % (tweets_array.size, sentiments_array.size))) test_tweets,test_sentiments = self.get_tweest_and_sentiments(test_tweets_data) test_sentiments = test_sentiments.flatten() print(("size of test tweets array is %s and test sentiment array is %s " % (test_tweets.size, test_sentiments.size))) parsed_train_tweets = self.clean_data_to_feed_classifier(tweets_array) parsed_test_tweets = self.clean_data_to_feed_classifier(test_tweets) # print parsed_tweests x = bigram_vectorizer.fit_transform(parsed_train_tweets) print (x.size) # print bigram_vectorizer.get_feature_names() bigram_vectorizer.build_analyzer() print ("done 1") # print bigram_vectorizer.get_feature_names() res = bigram_vectorizer.transform(parsed_test_tweets) print ("done 2") clf = LinearSVC() gnb = MultinomialNB() print ("done 2") trained_classifier = self.do_K_fold_cross_validation(clf,gnb,x,sentiments_array.flatten()) # trained_classifier.fit(x, sentiments_array.flatten()) print ("done 3") output = trained_classifier.predict(res) # print output print (accuracy_score(test_sentiments,output)) # bigram_vectorizer.get_feature_names() # analyze = bigram_vectorizer.build_analyzer() # analyze # if __name__ == '__main__': # main()
class BagOfWords:
    """
    Basic bag-of-words model implemented with scikit-learn's sparse counting vectorizers
    """

    def __init__(self, **kwargs):
        self.vectorizer_args = kwargs
        self.vectorizer = CountVectorizer(decode_error='ignore', **self.vectorizer_args)

    def __call__(self, *txt, **kwargs):
        """
        Use the scikit-learn vectorizer to transform txt into matrices of numbers
        representing a pure bag of words
        :return:
        """
        return self.vectorizer.fit_transform([str(i) for i in txt], **kwargs)

    def get_word_count(self, *txt, **kwargs):
        """
        First, fit to the vocab and get the word count.
        Count the occurrence of each word in the bag-of-words representation from the txt list, *txt.
        Returns an UNSORTED LIST.
        """
        self.vectorizer.fit(txt)
        analyze = self.vectorizer.build_analyzer()
        return TextUtil.stem(
            TextUtil.remove_stop_words(
                collections.Counter(itertools.chain(*(analyze(str(i)) for i in txt)))))

    def get_feature_names(self):
        """
        Get the feature names of the vectorized vector
        :return:
        """
        return self.vectorizer.get_feature_names()

    def reverse_transformation(self, bow_dict):
        """
        Reverse the transformation of a dictionary representation of BOW into numpy vectors
        :return:
        """
        assert isinstance(bow_dict, BaseDict) or isinstance(bow_dict, dict)
        vec = DictVectorizer()
        vec.fit_transform(bow_dict)
        return vec
def main(argv):
    vlen = int(argv[4])
    abstractDict = loadAbstracts(argv[2])
    catTitleDict = loadCategoryTitles(argv[1])
    vectorizer = CountVectorizer(stop_words='english')
    analyzer = vectorizer.build_analyzer()
    # catVectors = getVectorsFromTitles(catTitleDict, analyzer, vlen)
    catVectors = getVectorsFromAbstracts(catTitleDict, abstractDict, analyzer, vlen)
    print(len(catVectors))
    indexVectors(catVectors, argv[3])
def bagofwords(df, item2emomapping):
    '''creates simple bag of words feature space'''
    print "creating bag-of-words feature space"
    from sklearn.feature_extraction.text import CountVectorizer
    listofstrings = list(df['cause'].values)
    itemlabels = ['q%0.f' % qnum for qnum in df['qnum'].values]
    vectorizer = CountVectorizer(min_df=1)
    analyzer = vectorizer.build_analyzer()
    bagofwords = vectorizer.fit_transform(listofstrings)
    features = vectorizer.get_feature_names()
    bagofwords = bagofwords.toarray()
    itemavgs = [list(line) for line in bagofwords]
    ndf = makedataframe(itemavgs, itemlabels, item2emomapping)
    ndimf.quicksave(ndf, os.path.join(rootdir, 'data/stimdfs', 'bagofwordsdf.pkl'))
    return ndf
def wtf(self):
    from rutez.rutez import Rutez
    from sklearn.feature_extraction.text import CountVectorizer
    tez = Rutez()
    with open('data/first_sentences.html') as f:
        full_text = f.read()
    vectorizer = CountVectorizer(ngram_range=(1, 2))
    analyzer = vectorizer.build_analyzer()
    data = analyzer(full_text)
    # print(type(data), len(data))
    sinsets = set()
    for item in data:
        word = item.upper()
        if word in tez.word2sinsets:
            for sinset in tez.word2sinsets[word]:
                print(word, '|', sinset, '|', tez.upper_sinsets(sinset))
def __init__(self):
    # BOWs preparation
    filenames = ['bow/1StarsSamples.json', 'bow/2StarsSamples.json', 'bow/3StarsSamples.json',
                 'bow/4StarsSamples.json', 'bow/5StarsSamples.json']
    self.vectorizer = CountVectorizer(input='filename', ngram_range=(1, 3), stop_words='english',
                                      strip_accents='unicode', token_pattern=ur'\b\w+\b')
    dtm = self.vectorizer.fit_transform(filenames).toarray()
    self.dtm = scale(dtm)
    vocab = np.array(self.vectorizer.get_feature_names())
    _vectorizer = CountVectorizer(input='content', ngram_range=(1, 3), stop_words='english',
                                  strip_accents='unicode', token_pattern=ur'\b\w+\b')
    self.analyze = _vectorizer.build_analyzer()
    # Load dictionaries and model.
    with open("dict/dict2bins.p", "rb") as f1, open("dict/dict3bins.p", "rb") as f2, \
            open("dict/dict6bins.p", "rb") as f3, open("model/clf.pkl", "rb") as fm:
        self.dict2bins = pickle.load(f1)
        self.dict3bins = pickle.load(f2)
        self.dict6bins = pickle.load(f3)
        self.model = pickle.load(fm)  # load model
def count_letters(filenames):
    from glob import glob
    # With spaces: CountVectorizer(token_pattern=r'[A-Za-z ]', min_df=1)
    # Just letters, no spaces:
    filenames = glob(filenames)
    text = []
    vectorizer = CountVectorizer(token_pattern=r'[A-Za-z]', min_df=1)
    analyze = vectorizer.build_analyzer()
    for filename in filenames:
        with open(filename) as fid:
            mytext = fid.read()
            # mytext = mytext.decode('utf8', 'ignore')
            text.append(mytext)
    X = vectorizer.fit_transform(text).toarray()
    return X, [str(_) for _ in vectorizer.get_feature_names()]
def main():
    corpus = importasline('../data/grouping1/groupA.txt', ignorehyphen=True)
    vectorizer = CountVectorizer(min_df=1)
    X = vectorizer.fit_transform(corpus)
    analyze = vectorizer.build_analyzer()
    Y = [[vectorizer.vocabulary_[x] for x in analyze(corpus[i])] for i in range(len(corpus))]
    print(Y)
    words = vectorizer.get_feature_names()
    num_of_hidden_states = 5
    print(len(words))
    print(Y)
    hmm = modelhmm(num_of_hidden_states, len(words), Y, 'modelnhidden5groupA')
    hmm.syllable_analysis()
    exit()
    if False:
        for i in range(5000):
            print(i)
            print(hmm.update_state_corpus(Y))
        hmm.savemodel()
    # print(hmm.obs_[:, Y[0]])
    print(hmm.trans_)
    hmm.loadmodel()
    print('transloaded', hmm.trans_.shape)
    print('obsloaded', hmm.obs_.shape)
    for i in range(20):
        robotpoem = ''
        line, linew = hmm.generating_random_line()
        for j in linew:
            robotpoem += ' ' + words[j] + ' '
        print(robotpoem)
    hmm.analyzing_word(words)
    hmm.analysing_obs(words)
    wordtag = nltk.pos_tag(words, tagset='universal')
    pos = [x[1] for x in wordtag]
    stat = nltk.FreqDist(pos)
    print(stat.most_common())
def main():
    corpus = importasline('../data/shakespear_modified.txt', ignorehyphen=True)
    vectorizer = CountVectorizer(min_df=1)
    X = vectorizer.fit_transform(corpus)
    analyze = vectorizer.build_analyzer()
    Y = [[vectorizer.vocabulary_[x] for x in analyze(corpus[i])] for i in range(len(corpus))]
    words = vectorizer.get_feature_names()
    print(len(words))
    mm = Markov(len(words), Y, 'modelnhidden1000groupA')
    print(len(mm.inversetable), 'len(mm.inversetable)')
    print(mm.inversetable[0:4], 'mm.inversetable[0: 4]')
    for i in range(20):
        [line, linew] = mm.generating_random_line()
        ### print(linew, ': linew')
        robotpoem = ''
        for j in linew[:-1]:
            robotpoem += ' ' + words[j] + ' '
batch_size = 20

if __name__ == "__main__":
    if len(sys.argv) < 4:
        print("Usage: {} [input file] [model json file] [weights file]".format(sys.argv[0]))
        quit(1)
    path = sys.argv[1]
    model_json_file = sys.argv[2]
    weights = sys.argv[3]
    print("Reading input...")
    token_regex = r"(?u)([\(\)\[\]]|\b\w+\b)"
    # cv = CountVectorizer(ngram_range=(1, ngrams), token_pattern=token_regex)
    cv = CountVectorizer(token_pattern=token_regex, min_df=2)
    an = cv.build_analyzer()
    corpus = []
    with open(path) as f:
        for line in f:
            corpus.append(line.strip())
    # Vectorize and n-gram-ize the corpus.
    X = cv.fit_transform(corpus)
    print("Building vectors...")
    # Vocabulary size, including the padding element.
    vocabulary_size = len(cv.vocabulary_) + 1
    print("Vocabulary size: {} Corpus size: {}".format(vocabulary_size, len(corpus)))
# seg_list = jieba.cut(u'它来自山东省的一个小村子', cut_all=True)
# print 'Full mode:', '/ '.join(seg_list)
#
# seg_list = jieba.cut(u'This is the first document.', cut_all=True)
# print 'Full mode:', '/ '.join(seg_list)

def tokenize(text):
    tokens = jieba.cut(text, cut_all=False)
    return list(tokens)

vectorizer = CountVectorizer(min_df=1, tokenizer=tokenize)
analyzer = vectorizer.build_analyzer()
print(analyzer('This is a text document to analyze.'))
print(analyzer(u'它来自山东省的一个小村子'))

corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And a third one.',
    'Is this the first document?',
    u'他来自山东省的一个小村子',
]
# Use the jieba tokenizer.
X = vectorizer.fit_transform(corpus)
print(X)
def poem_generate(num_pairs): print "We are doing the 2rd order Markov model!" print "Number of poems to generate:", num_pairs # how many pairs to generate ending_words_dict = sample_ending_word(num_pairs) poems_dict = dict() h_en = Hyphenator('en_US') prondict = nltk.corpus.cmudict.dict() for ind in ['A','B','C','D','E','F','G']: print "Group:", ind # get ending words ending_words = ending_words_dict[ind] # preprocess data corpusname = '../data/grouping2/group' + ind + '.txt' corpus = importasline(corpusname, ignorehyphen=False) vectorizer = CountVectorizer(min_df=1) X = vectorizer.fit_transform(corpus) analyze = vectorizer.build_analyzer() Y = [[vectorizer.vocabulary_[x] for x in analyze(corpus[i])] for i in range(len(corpus))] ending_tokens = [[vectorizer.vocabulary_[x] for x in ending_words[i]] for i in range(len(ending_words))] # print(Y) words = vectorizer.get_feature_names() print "Number of words:", len(words) # train in a reverse direction for i, line in enumerate(Y): Y[i] = line[::-1] # print(Y) # generate number of syllables for every word words_num_syllables = np.zeros(len(words), dtype=int) for wordid, word in enumerate(words): try: phon = prondict[word][0] words_num_syllables[wordid] = sum(map(hasNumbers, phon)) except: words_num_syllables[wordid] = len(h_en.syllables(unicode(word))) if not words_num_syllables[wordid]: words_num_syllables[wordid] = count_syllables(word) # train model modelname = 'model2rdMMgroup' + ind hmm = Markov( len(words), Y, words_num_syllables, modelname) print(len(hmm.inversetable)) # generate poems subpoems = [None]*num_pairs for pairid in range(num_pairs): start_token = ending_tokens[pairid] robotpoem0 = '' line0,linew0 = hmm.generating_random_line_end(start_token[0]) for j in linew0[-2::-1]: robotpoem0+=' '+words[j]+' ' print(robotpoem0) robotpoem1 = '' line1,linew1 = hmm.generating_random_line_end(start_token[1]) for j in linew1[-2::-1]: robotpoem1+=' '+words[j]+' ' print(robotpoem1) subpoems[pairid] = (robotpoem0, robotpoem1) # add the best subpoem to poems_dict poems_dict[ind] = subpoems # write down the poems poem_file_name = '../poems2rdMM/reverse_with_punctuations.txt' fwrite = open(poem_file_name, 'w') for poemid in range(num_pairs): # construct poems robotpoem = [None]*14 robotpoem[0] = poems_dict['A'][poemid][0] robotpoem[2] = poems_dict['A'][poemid][1] robotpoem[1] = poems_dict['B'][poemid][0] robotpoem[3] = poems_dict['B'][poemid][1] robotpoem[4] = poems_dict['C'][poemid][0] robotpoem[6] = poems_dict['C'][poemid][1] robotpoem[5] = poems_dict['D'][poemid][0] robotpoem[7] = poems_dict['D'][poemid][1] robotpoem[8] = poems_dict['E'][poemid][0] robotpoem[10] = poems_dict['E'][poemid][1] robotpoem[9] = poems_dict['F'][poemid][0] robotpoem[11] = poems_dict['F'][poemid][1] robotpoem[12] = poems_dict['G'][poemid][0] robotpoem[13] = poems_dict['G'][poemid][1] robotpoem = Format(robotpoem) # write into file print>>fwrite, str(poemid) for lineid in range(14): print>>fwrite, robotpoem[lineid] fwrite.close()
def find_all_ngrams(input_string, max_n):
    vectorizer = CountVectorizer(ngram_range=(1, max_n))
    analyzer = vectorizer.build_analyzer()
    return analyzer(input_string)
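# Hedged usage sketch for find_all_ngrams (invented string): the default analyzer lowercases
# the text and returns unigrams first, then the higher-order n-grams up to max_n.
def _demo_find_all_ngrams():
    print(find_all_ngrams("The quick brown fox", 2))
    # e.g. ['the', 'quick', 'brown', 'fox', 'the quick', 'quick brown', 'brown fox']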
def __init__(self, callback_func, relevant_kw):
    self.callback_func = callback_func
    self.relevant_kw = relevant_kw
    self.stop_words = create_stop_words()
    vect_kw = CountVectorizer(tokenizer=MyTokenizer(), ngram_range=(1, 3),
                              stop_words=self.stop_words)
    self.analyse_kw = vect_kw.build_analyzer()
clf_7 = Pipeline([ ('vect', TfidfVectorizer( stop_words=stop_words, token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b", )), ('clf', MultinomialNB(alpha=0.01)), ]) evaluate_cross_validation(clf_7, news.data, news.target, 5) ''' from sklearn.feature_extraction.text import TfidfTransformer transformer = TfidfTransformer() def my_tokenizer(s): return s.split() vectorizer = CountVectorizer(tokenizer=my_tokenizer) str = 'I am sure some bashers of Pens fans are pretty confused about the lack' print vectorizer.build_analyzer()(str) print vectorizer.build_tokenizer()(str) print vectorizer.build_preprocessor()(str) s1 = 'rạng sáng nay theo giờ hà_nội danh_hiệu cầu_thủ giá_trị mvp giải mls năm được công_bố tiền_đạo gốc việt_lee_nguyễn ứng_viên sáng_giá không kém đôi ngôi_sao đá giải ngoại_hạng robbie_keane los_angeles_galaxy obafemi_martins seattle_sounders bình_chọn dựa số phiếu clb dự mls giới truyền_thông cầu_thủ robbie_keane người số phiếu trận chung_kết mls cup robbie_keane los_angeles_galaxy giành danh_hiệu cầu_thủ giá_trị mls lee_nguyễn được đánh_giá cao bình_chọn ảnh espn lee_nguyễn xếp thứ_ba bình_chọn đạt tổng_số phiếu mùa lee_nguyễn ghi bàn năm pha kiến_tạo cuối giải thi_đấu ấn_tượng vai_trò cầm_trịch lối chơi ghi_bàn cho new_england_revolution vòng play off mls cup tiền vệ_sinh năm ghi thêm hai bàn ba pha kiến_tạo đưa revolution đoạt vô_địch mls khu_vực miền đông giành vé dự chung_kết mls cup đối_đầu đội bóng keane la galaxy tháng lee_nguyễn được hlv jurgen_klinsmann triệu_tập trở_lại tuyển mỹ nhờ phong_độ ấn_tượng mls cựu inter_milan newcastle_utd obafemi_martins đứng thứ_hai số phiếu bầu cầu_thủ clb phiếu bầu clb phiếu bầu truyền thông phiếu bầu cầu thủ tổng robbie_keane la galaxy obafemi_martins seattle_sounders lee_nguyễn new england rev bradley_wright phillips ny red_bulls tuấn' s2 = 'lee_nguyễn trải một năm thi_đấu hoàn_hảo ảnh usa today kết_quả được công_bố trang thông_tin chính_thức ban tổ_chức giải mls phần bình_luận tiền_vệ công lee_nguyễn đoạn lọt danh_sách bầu_chọn cuối_cùng cho danh_hiệu cầu_thủ giá_trị mls cho thấy lee_nguyễn một bước đột_phá sự_nghiệp nơi đanh ghi bàn đứng thứ_tư danh_sách vua_phá_lưới mùa vừa_qua tiền_vệ ghi_bàn cao lịch_sử mls chân chuyền đứng thứ_hai new_england năm pha kiến_tạo thành_công lee_nguyễn hoàn_toàn xứng_đáng lần đầu_tiên được lọt vào đội_hình tiêu_biểu mùa pha lập_công kiến_tạo lối chơi sáng_tạo ổn_định lee_nguyễn góp_phần quan_trọng làm_nên mùa giải thành_công rực_rỡ new_england_revolution họ nhì mls miền đông khi đăng_quang mls cup khu_vực đồng_nghĩa một suất vào chung_kết mls cup toàn_quốc nhờ lọt vào danh_sách rút_gọn cuối_cùng cho đua cầu_thủ giá_trị mvp robbie_keane los_angeles_galaxy obafemi_martins seattle_sounders bàn thắng gỡ hòa 1-1 vào lưới houston_dynamo tuần ngôi_sao sinh năm lọt danh_sách bốn bàn thắng đẹp mls sau bảy năm được gọi trở_lại đội_tuyển mỹ đội_hình tiêu_biểu mùa vừa_qua los_angles_galaxy đóng_góp nhiều ba cá_nhân chia đều hàng thủ đến hàng công đội bóng đối_thủ cạnh_tranh vô_địch mls cup lee_nguyễn revolution sân stubhub_center california ngày tới đội_hình tiêu_biểu mls mùa thủ_môn bill_hamid dc united hậu_vệ bobby_boswell dc united omar_gonzalez los_angeles_galaxy chad_marshall seattle_sounders tiền_vệ landon_donovan los_angeles_galaxy thierry_henry new_york_red_bulls lee_nguyễn new_england_revolution diego_valeri portland_timbers tiền_đạo robbie_keane los_angeles_galaxy obafemi_martins seattle_sounders fc bradley_wright phillips new_york_red_bulls đông_anh' s3 = 'thành_lương đỏ làm_nên tuyệt_phẩm trận đấu 
cuối_cùng bảng philippines ảnh giang_huy malaysia tập_trung hôm_qua để chuẩn_bị cho trận đấu tuyển việt_nam ngày sân_nhà shah_alam sau khi lách khe cửa hẹp để giành vị_trí thứ_hai bảng tay đội singapore thầy_trò salleh háo_hức muốn được kết_quả thật tốt một lời xin_lỗi để cđv nhà thất_vọng thời_gian gì phát_biểu có_thể thấy salleh nghiên_cứu kỹ báo_cáo hlv_u2 ong_kim_swee người được liên_đoàn bóng_đá malaysia fam cử sang hà_nội theo_dõi đối_thủ bảng trọng_tâm tuyển việt_nam đá giao_hữu tuyển việt_nam giải đấu nên phần_nào biết làm gì để kiềm_chế sức_mạnh họ salleh tiết_lộ báo_giới malaysia chúng tô đặc_biệt cẩn_trọng số nguyễn_văn_quyết số phạm_thành_lương cầu_thủ nguy_hiểm ong_kim_swee cho biết như_thế cầu_thủ văn_quyết đỏ chưa ghi_bàn được đối_thủ đánh_giá cao lối chơi ảnh giang_huy cá_nhân ong_kim_swee đưa nhận_xét tuyển việt_nam sau một thời_gian do_thám đội bóng xây_dựng được một phong_cách hoàn_toàn khác_biệt thời hlv người nhật_bản_toshiya_miura họ cầm bóng tốt không_bao_giờ chuyền bóng ngược sau luôn hướng lên phía miura sở_hữu cầu_thủ kỹ_thuật cá_nhân tốt malaysia cảnh_giác mỗi khi đối_phương bóng sát vòng cấm_địa việt_nam ghi hai bàn vào lưới philippines cú sút xa khi được hỏi điểm yếu tuyển việt_nam ong_kim_swee người giúp u23 malaysia vô_địch sea games tỏ bí_hiểm gì thấy một tập_thể gắn_kết mỗi vị_trí đều điểm yếu họ để thủng lưới ba lần điểm yếu có_thể tận_dụng khai_thác hlv salleh đen âm_thầm chuẩn_bị kế_hoạch gây bất_ngờ tuyển việt_nam sân_nhà ảnh ts bên_cạnh việc tìm cách phong_tỏa hai ngòi_nổ tuyển việt_nam salleh cố_gắng giải_quyết khoảng_trống shukor_adan mohd_amri_yahya để hai cầu_thủ trụ_cột đều vắng_mặt trận lượt_đi án treo_giò indra_putra_mahyuddin kunanlan manaf_mamat đều có_thể được tung vào sân_sau khi minh_chứng được khả_năng buổi tập safiq_rahim mohd_muslim có_thể đá vị_trí tiền_vệ trụ thay_thế cho shukor_adan salleh tiết_lộ ít_nhiều khung đội_hình thi_đấu cuối tuần người thay_thế amri_yahya trận đấu kulanan hoặc manaf_mamat tuấn' corpus = [s1, s2, s3] print 'DOne'
def get_bigrams_trigrams(text=[], termCount=20, w2v=None, es=None): bigram_vectorizer = CountVectorizer(ngram_range=(2,2)) bigram_analyze = bigram_vectorizer.build_analyzer() trigram_vectorizer = CountVectorizer(ngram_range=(3,3)) trigram_analyze = trigram_vectorizer.build_analyzer() bi_results= map(lambda t: bigram_analyze(t), text) tri_results= map(lambda t: trigram_analyze(t), text) bigrams = [] bi_dict_corpus = {} for doc in bi_results: bi_dict={} for bi in doc: bi=bi.replace(' ','_') if bi in bi_dict: bi_dict[bi] = bi_dict[bi] + 1 else: bi_dict[bi] = 1 if bi_dict: # Yamuna: Removing for now as it is slow #phrases = remove_stopword_phrases(bi_dict.keys()) phrases = bi_dict.keys() if w2v.word_vec is None: results = get_documents(phrases, "term", ["term"], "word_phrase_to_vec", "terms", es) phrases = [res.lower() for res in results.keys()] else: phrases = [term for term in phrases if not w2v.get(term) is None] bi_dict_subset = {phrase: bi_dict[phrase] for phrase in phrases} if bi_dict_subset: bigrams.append(bi_dict_subset) for phrase in bi_dict_subset.keys(): if phrase in bi_dict_corpus: bi_dict_corpus[phrase] = bi_dict_corpus[phrase] + bi_dict_subset[phrase] else: bi_dict_corpus[phrase] = bi_dict_subset[phrase] trigrams = [] tri_dict_corpus = {} for doc in tri_results: tri_dict={} for tri in doc: tri=tri.replace(' ','_') if tri in tri_dict: tri_dict[tri] = tri_dict[tri] + 1 else: tri_dict[tri] = 1 if tri_dict: # Yamuna: Removing for now as it is slow #phrases = remove_stopword_phrases(tri_dict.keys()) phrases = tri_dict.keys() if w2v.word_vec is None: results = get_documents(phrases, "term", ["term"], "word_phrase_to_vec", "terms", es) phrases = [res for res in results.keys()] else: phrases = [term for term in phrases if not w2v.get(term) is None] tri_dict_subset = {phrase: tri_dict[phrase] for phrase in phrases} if tri_dict_subset: trigrams.append(tri_dict_subset) for phrase in tri_dict_subset.keys(): if phrase in tri_dict_corpus: tri_dict_corpus[phrase] = tri_dict_corpus[phrase] + tri_dict_subset[phrase] else: tri_dict_corpus[phrase] = tri_dict_subset[phrase] return bigrams, trigrams, sorted(bi_dict_corpus.items(), key=operator.itemgetter(1), reverse=True)[0:termCount], sorted(tri_dict_corpus.items(), key=operator.itemgetter(1), reverse=True)[0:termCount]
def split_into_lemmas(tweet):
    bigram_vectorizer = CountVectorizer(ngram_range=(1, 3), token_pattern=r'\b\w+\b', min_df=1)
    analyze = bigram_vectorizer.build_analyzer()
    return analyze(tweet)
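# Hedged usage sketch for split_into_lemmas (placeholder tweet): with ngram_range=(1, 3) and
# the \b\w+\b token pattern, single-character words are kept and uni-, bi-, and tri-grams
# are all returned.
def _demo_split_into_lemmas():
    print(split_into_lemmas("I love this phone"))
    # e.g. ['i', 'love', 'this', 'phone', 'i love', 'love this', 'this phone',
    #       'i love this', 'love this phone']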