def test_model_count_vectorizer_custom_tokenizer(self):
    corpus = numpy.array([
        '9999',
        '999 99',
        '1234',
        '1 2 3 4',
        '1 2 3 4+',
    ]).reshape((5, 1))
    vect = CountVectorizer(ngram_range=(1, 1),
                           tokenizer=lambda s: [s])
    vect.fit(corpus.ravel())

    extra = {CountVectorizer: {"separators": ["ZZZZ"]}}
    prev = vect.tokenizer
    # The converter cannot serialize an arbitrary Python callable, so the
    # custom tokenizer is removed for conversion and the "separators"
    # option drives the ONNX Tokenizer operator instead.
    vect.tokenizer = None
    model_onnx = convert_sklearn(
        vect, 'CountVectorizer',
        [('input', StringTensorType([1]))],
        options=extra, target_opset=TARGET_OPSET)
    vect.tokenizer = prev

    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        corpus, vect, model_onnx,
        basename="SklearnTfidfVectorizer11CustomTokenizer-OneOff-SklCol",
        allow_failure="StrictVersion(onnxruntime.__version__) <= "
                      "StrictVersion('0.4.0')")
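# --- usage sketch (not part of the original test): checking the converted
# model with onnxruntime. Assumes a recent onnxruntime build; 'input' matches
# the ('input', StringTensorType([1])) declaration above, which expects one
# string per run, so a single row of the corpus is fed in.
import onnxruntime

sess = onnxruntime.InferenceSession(model_onnx.SerializeToString(),
                                    providers=["CPUExecutionProvider"])
print(sess.run(None, {'input': corpus[0]}))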
def pipeline_process_data(paths, haveTarget=False, dataNum=500):
    '''
    input: data sources -- file paths (haveTarget=True) or news-site URLs
    output: TF-IDF feature matrix and targets (haveTarget=True),
            or the feature matrix and the raw URL data
    '''
    if not haveTarget:
        for path in paths:
            # Only reuse newspaper's article cache if a local copy exists.
            memoize = os.path.exists(encode_url(path))
            print("\nloading news from ", path)
            paper = newspaper.build(path, memoize_articles=memoize)
            urls = []
            with open(encode_url(path, "URLData/"), 'w') as f:
                for article in paper.articles:
                    urls.append(article.url)
                    f.write(article.url + "\n")

    if haveTarget:
        feature, target = _getData_pos_neg(_parse(paths), dataNum,
                                           haveTarget=True)
    else:
        urls = []
        for path in paths:
            with open(encode_url(path, "URLData/"), 'r') as f:
                urls.extend(f.readlines())
        feature, data = _getData_pos_neg(urls, dataNum, haveTarget=False)

    # Vectorizer parameters assigned after construction are picked up at
    # fit time, when build_analyzer() reads the attributes.
    vectorizer = CountVectorizer(min_df=1)
    vectorizer.stop_words = stopwords.words('english')
    vectorizer.tokenizer = bigrams_Tokenizer()
    # vectorizer.tokenizer = RegexpTokenizer(r'\w+')
    vectorizer.max_features = 2000
    feature_matrix = vectorizer.fit_transform(feature)

    transformer = TfidfTransformer().fit(feature_matrix)
    feature_matrix = transformer.transform(feature_matrix)

    if haveTarget:
        return feature_matrix, target
    return feature_matrix, data


# ## test code
# if __name__ == "__main__":
#     X, Y = pipeline_process_data('/Users/gaoqin/Downloads/reviews_Video_Games.json.gz')
#     X = pipeline_process_data('https://www.huffingtonpost.com/')
#     count = [0, 0, 0, 0, 0]
#     for i in Y[400:]:
#         count[int(i - 1)] += 1
#     print(count)
def __init__(self, seed=None, **kwargs):
    super().__init__(seed)
    vectorizer = CountVectorizer(lowercase=False)
    if not ENABLE_PRE_PROCESSING:
        # Without the external pre-processing step, fall back to plain
        # whitespace splitting instead of sklearn's default regexp tokenizer.
        vectorizer.tokenizer = str.split
    self.text_clf = Pipeline([('vect', vectorizer),
                              ('tfidf', TfidfTransformer()),
                              ('clf', self._init_classifier(**kwargs))],
                             verbose=True)
    self._is_trained = False
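# --- minimal sketch (toy data, not from the original class): the assignment
# above works because sklearn reads the tokenizer attribute only when
# build_analyzer() runs at fit/transform time, not at construction.
from sklearn.feature_extraction.text import CountVectorizer

docs = ["a-b c", "c d-e"]
vect = CountVectorizer(lowercase=False)
vect.tokenizer = str.split           # swapped in after construction
vect.fit(docs)
print(vect.get_feature_names_out())  # get_feature_names() on older sklearn
# -> ['a-b' 'c' 'd-e']: hyphenated tokens survive because str.split has
#    replaced the default regexp tokenizer before fit() was called.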
def make_vectorizer(texts, null_set=(0, ""), unknown_set=(1, "###"),
                    option=None):
    """
    Fit a CountVectorizer on `texts`, then rewrite its vocabulary so the
    null and unknown tokens are pinned to fixed ids (0 and 1 by default);
    the features that originally held those ids are moved to the end.
    """
    print("Making vectorizer.")
    vectorizer = CountVectorizer(max_df=1.0, min_df=10, max_features=10000,
                                 stop_words=[null_set[1], unknown_set[1]])
    vectorizer.tokenizer = JapaneseMecabWordExtractor(split_mode="unigram",
                                                      use_all=True,
                                                      tagger_option=option)
    vectorizer.fit(texts)

    max_id = max(vectorizer.vocabulary_.values())
    # Swap the feature currently holding the null id out of the way, then
    # pin the null token to that id.
    prev_char = vectorizer.get_feature_names()[null_set[0]]
    vectorizer.vocabulary_[null_set[1]] = null_set[0]
    vectorizer.vocabulary_[prev_char] = max_id + 1
    # Same swap for the unknown token.
    prev_char = vectorizer.get_feature_names()[unknown_set[0]]
    vectorizer.vocabulary_[unknown_set[1]] = unknown_set[0]
    vectorizer.vocabulary_[prev_char] = max_id + 2
    return vectorizer
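# --- self-contained sketch of the id-swapping pattern above (toy corpus,
# no MeCab dependency): pin a reserved token to id 0 and move the feature
# that previously held id 0 to the end of the vocabulary.
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer()
vect.fit(["apple banana", "banana cherry", "apple cherry"])
# fitted vocabulary: {'apple': 0, 'banana': 1, 'cherry': 2}

max_id = max(vect.vocabulary_.values())
displaced = vect.get_feature_names_out()[0]  # 'apple'
vect.vocabulary_[""] = 0                     # reserve column 0 for the null token
vect.vocabulary_[displaced] = max_id + 1     # 'apple' now lives in column 3
# transform() honours the edited mapping: 'apple' counts appear in the
# last column, and column 0 stays empty since "" is never tokenized.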
def getKeywords(model, data):
    print("generating keywords")
    vectorizer = CountVectorizer(min_df=1)
    vectorizer.stop_words = stopwords.words('english')
    vectorizer.tokenizer = keyword_generator()
    # vectorizer.tokenizer = RegexpTokenizer(r'\w+')
    feature_matrix = vectorizer.fit_transform(data)
    # transformer = TfidfTransformer().fit(feature_matrix)
    # feature_matrix = transformer.transform(feature_matrix)

    vocab = list(vectorizer.get_feature_names())
    counts = normalize(feature_matrix.sum(axis=0).A1)

    from collections import Counter
    freq_distribution = Counter(dict(zip(vocab, counts)))
    res = dict(freq_distribution.most_common(500))

    with open("bad-words.txt", "r") as f:
        badwords = set(word.strip() for word in f.readlines())

    # Keep only the keywords that contain at least one flagged word.
    nres = dict()
    for words in res:
        for word in words.split():
            if word in badwords:
                nres[words] = res[words]
                print(words, nres[words])
                break

    import json
    json_str = json.dumps(nres)
    # print(json_str)
    with open(model + "_keywords.json", "w") as f:
        f.write(json_str)
    return freq_distribution
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

# Assumption: the original module defines its stop-word list elsewhere;
# nltk's English list is a stand-in.
stopwordslist = set(stopwords.words('english'))


def remove_stop_word_tokenizer(s):
    """ Custom tokenizer: sklearn's default tokenization minus stop words """
    count_vect = CountVectorizer()
    default_tokenizer_function = count_vect.build_tokenizer()
    words = default_tokenizer_function(s)
    words = list(w for w in words if w.lower() not in stopwordslist)
    return words


count_vect = CountVectorizer()
count_vect.tokenizer = remove_stop_word_tokenizer
tfidf_transformer = TfidfTransformer(norm='l2', smooth_idf=True,
                                     sublinear_tf=False, use_idf=True)


def trainNaiveBayes(data: list, targets: list):
    """
    Trains a Naive Bayes classifier with the SciKitLearn modules
    :param data: list of tweets
    :param targets: list of associated targets for each tweet
    :return: Predictor
    """
    X_tweet_counts = count_vect.fit_transform(data)
    # Compute tfidf feature values and store in X_train_tfidf
    X_train_tfidf = tfidf_transformer.fit_transform(X_tweet_counts)
    # MultinomialNB is assumed here; the original snippet ended before the
    # classifier was fitted, but the docstring promises a predictor.
    return MultinomialNB().fit(X_train_tfidf, targets)
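# --- hypothetical call (toy tweets and labels, not from the original module):
tweets = ["I love this phone", "worst service ever", "great battery life"]
labels = ["pos", "neg", "pos"]

clf = trainNaiveBayes(tweets, labels)

# Score a new tweet; the fitted module-level vectorizer and transformer
# must be reused so the feature space matches training.
x_new = tfidf_transformer.transform(count_vect.transform(["love the battery"]))
print(clf.predict(x_new))  # expected: ['pos'] on this toy data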
import re

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Assumption: `content` (the raw documents) and `stop` are defined earlier
# in the original script; a stop-word list is reconstructed here.
stop = set(stopwords.words('english'))

stemmer = PorterStemmer()
words1 = [word_tokenize(i) for i in content]
words2 = [[i.lower() for i in doc] for doc in words1]
words3 = [[i for i in doc if re.search('^[a-z]+$', i)] for doc in words2]
words4 = [[i for i in doc if i not in stop] for doc in words3]
words5 = [[stemmer.stem(i) for i in doc] for doc in words4]

# Flatten the per-document token lists into one list of words.
text = []
for doc in words5:
    for token in doc:
        text.append(token)

from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
vec.fit(text)
vec.vocabulary_
vec.vocabulary_.get(u'algorithm')
vec.get_feature_names()
vec.build_analyzer()
vec.build_tokenizer()
vec.tokenizer  # None: no custom tokenizer was passed or assigned here
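# --- alternative sketch (same nltk assumptions as above): the whole
# tokenize/lowercase/filter/stem chain can be folded into one callable and
# handed to CountVectorizer directly, so fit() consumes raw documents.
def stem_tokenizer(doc):
    tokens = [t.lower() for t in word_tokenize(doc)]
    tokens = [t for t in tokens if re.search('^[a-z]+$', t) and t not in stop]
    return [stemmer.stem(t) for t in tokens]

vec2 = CountVectorizer(tokenizer=stem_tokenizer)
vec2.fit(content)  # raw documents in, stemmed vocabulary out
vec2.tokenizer     # the custom callable, instead of None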