def test_model_count_vectorizer_custom_tokenizer(self):
        corpus = numpy.array([
            '9999',
            '999 99',
            '1234',
            '1 2 3 4',
            '1 2 3 4+',
        ]).reshape((5, 1))
        vect = CountVectorizer(ngram_range=(1, 1), tokenizer=lambda s: [s])
        vect.fit(corpus.ravel())

        # a custom Python tokenizer cannot be exported to ONNX, so the
        # conversion relies on the "separators" option instead
        extra = {CountVectorizer: {"separators": ["ZZZZ"]}}

        # drop the tokenizer for the conversion and restore it afterwards
        prev = vect.tokenizer
        vect.tokenizer = None
        model_onnx = convert_sklearn(vect,
                                     'CountVectorizer',
                                     [('input', StringTensorType([1]))],
                                     options=extra,
                                     target_opset=TARGET_OPSET)
        vect.tokenizer = prev

        self.assertTrue(model_onnx is not None)
        dump_data_and_model(
            corpus,
            vect,
            model_onnx,
            basename="SklearnTfidfVectorizer11CustomTokenizer-OneOff-SklCol",
            allow_failure="StrictVersion(onnxruntime.__version__) <= "
            "StrictVersion('0.4.0')")
Example #2
def pipeline_process_data(paths, haveTarget=False, dataNum=500):
    '''
        input:  paths      - news-site URLs to crawl, or the labelled data source when haveTarget is True
                haveTarget - whether labels are available for the data
                dataNum    - number of samples requested from _getData_pos_neg
        output: (feature_matrix, target) when haveTarget is True, otherwise (feature_matrix, data)
    '''
    if not haveTarget:
        for path in paths:
            # reuse newspaper's article cache only if this source was scraped before
            memoize = os.path.exists(encode_url(path))
            print("\nloading news from ", path)
            paper = newspaper.build(path, memoize_articles=memoize)
            urls = []
            with open(encode_url(path, "URLData/"), 'w') as f:
                for article in paper.articles:
                    urls.append(article.url)
                    f.write(article.url + "\n")


    if haveTarget:
        feature, target = _getData_pos_neg(_parse(paths), dataNum, haveTarget=True)
    else:
        urls = []
        for path in paths:
            with open(encode_url(path, "URLData/"), 'r') as f:
                urls.extend(f.readlines())
        feature, data = _getData_pos_neg(urls, dataNum, haveTarget=False)

    vectorizer = CountVectorizer(min_df=1)
    vectorizer.stop_words = stopwords.words('english')
    vectorizer.tokenizer = bigrams_Tokenizer()
    # vectorizer.tokenizer = RegexpTokenizer(r'\w+')
    vectorizer.max_features = 2000
    feature_matrix = vectorizer.fit_transform(feature)

    transformer = TfidfTransformer().fit(feature_matrix)
    feature_matrix = transformer.transform(feature_matrix)

    if haveTarget:
        return feature_matrix, target
    else:
        return feature_matrix, data

# ##test code
# if __name__ == "__main__":
    # X,Y = pipeline_process_data('/Users/gaoqin/Downloads/reviews_Video_Games.json.gz')
    # X = pipeline_process_data('https://www.huffingtonpost.com/')
    # count = [0,0,0,0,0]
    # for i in Y[400:]:
    #   count[int(i - 1)] += 1
    # print (count)
Example #3
    def __init__(self, seed=None, **kwargs):
        super().__init__(seed)

        vectorizer = CountVectorizer(lowercase=False)
        if not ENABLE_PRE_PROCESSING:
            # without the project's pre-processing step, fall back to plain whitespace splitting
            vectorizer.tokenizer = str.split

        self.text_clf = Pipeline([('vect', vectorizer),
                                  ('tfidf', TfidfTransformer()),
                                  ('clf', self._init_classifier(**kwargs))],
                                 verbose=True)
        self._is_trained = False
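Outside the class above, the same str.split trick looks like this; the toy corpus, labels, and MultinomialNB classifier are illustrative assumptions, not taken from the original project.

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# illustrative data, not from the original project
texts = ["good game great graphics", "bad story bad pacing"]
labels = [1, 0]

vectorizer = CountVectorizer(lowercase=False)
vectorizer.tokenizer = str.split   # text assumed pre-processed; just split on whitespace

text_clf = Pipeline([('vect', vectorizer),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())],
                    verbose=True)
text_clf.fit(texts, labels)
print(text_clf.predict(["great game"]))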
Example #4
def make_vectorizer(texts,
                    null_set=(0, ""),
                    unknown_set=(1, "###"),
                    option=None):
    """
    """
    print("Making vectorizer.")
    vectorizer = CountVectorizer(max_df=1.0,
                                 min_df=10,
                                 max_features=10000,
                                 stop_words=[null_set[1], unknown_set[1]])
    vectorizer.tokenizer = JapaneseMecabWordExtractor(split_mode="unigram",
                                                      use_all=True,
                                                      tagger_option=option)
    vectorizer.fit(texts)
    max_id = max(vectorizer.vocabulary_.values())
    # put the null token at its reserved index and move the displaced term to a new id
    prev_char = vectorizer.get_feature_names()[null_set[0]]
    vectorizer.vocabulary_[null_set[1]] = null_set[0]
    vectorizer.vocabulary_[prev_char] = max_id + 1
    # do the same for the unknown token
    prev_char = vectorizer.get_feature_names()[unknown_set[0]]
    vectorizer.vocabulary_[unknown_set[1]] = unknown_set[0]
    vectorizer.vocabulary_[prev_char] = max_id + 2
    return vectorizer
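The vocabulary surgery above is the interesting part; a stripped-down sketch with a plain English vectorizer (no MeCab) shows the same index-reservation trick. The corpus and the empty-string null token below are made up for illustration.

from sklearn.feature_extraction.text import CountVectorizer

vec = CountVectorizer()
vec.fit(["red green blue", "green blue", "blue"])

max_id = max(vec.vocabulary_.values())
# term currently sitting at index 0 (feature names are ordered by index)
displaced = sorted(vec.vocabulary_, key=vec.vocabulary_.get)[0]
vec.vocabulary_[""] = 0                    # reserve index 0 for the null token
vec.vocabulary_[displaced] = max_id + 1    # move the displaced term to a fresh index
print(vec.vocabulary_)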
Example #5
def getKeywords(model, data):
    print("generating keywords")

    vectorizer = CountVectorizer(min_df=1)

    vectorizer.stop_words = stopwords.words('english')

    vectorizer.tokenizer = keyword_generator()
    # vectorizer.tokenizer = RegexpTokenizer(r'\w+')
    feature_matrix = vectorizer.fit_transform(data)

    # transformer = TfidfTransformer().fit(feature_matrix)
    # feature_matrix = transformer.transform(feature_matrix)

    vocab = list(vectorizer.get_feature_names())

    counts = normalize(feature_matrix.sum(axis=0).A1)
    from collections import Counter
    freq_distribution = Counter(dict(zip(vocab, counts)))
    res = dict(freq_distribution.most_common(500))
    with open("bad-words.txt", "r") as fh:
        badwords = set(word.strip() for word in fh)
    nres = dict()
    for words in res:
        # keep a term only if one of its words appears in the bad-word list
        for word in words.split():
            if word in badwords:
                nres[words] = res[words]
                print(words, nres[words])
                break
    import json
    json_str = json.dumps(nres)
    # print(json_str)
    with open(model + "_keywords.json", "w") as f:
        f.write(json_str)
    return freq_distribution
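For reference, the frequency ranking at the heart of getKeywords can be reproduced without the project-specific tokenizer, stop words, or bad-words filter; the toy documents below are assumptions.

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

docs = ["great game great fun", "great story", "fun to play"]
vec = CountVectorizer(min_df=1)
counts = vec.fit_transform(docs)

totals = counts.sum(axis=0).A1            # total count of each vocabulary term
terms = vec.get_feature_names_out()       # get_feature_names() on older scikit-learn
freq = Counter(dict(zip(terms, totals)))
print(freq.most_common(3))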
Example #6
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


def remove_stop_word_tokenizer(s):
    """
    Custom tokenizer: apply CountVectorizer's default tokenization, then drop
    stop words (`stopwordslist` is defined elsewhere in this module).
    """
    count_vect = CountVectorizer()
    default_tokenizer_function = count_vect.build_tokenizer()
    words = default_tokenizer_function(s)
    words = [w for w in words if w.lower() not in stopwordslist]
    return words


count_vect = CountVectorizer()
count_vect.tokenizer = remove_stop_word_tokenizer
tfidf_transformer = TfidfTransformer(norm='l2', smooth_idf=True, sublinear_tf=False, use_idf=True)


def trainNaiveBayes(data: list, targets: list):
    """
    Trains a Naive Bayes classifier with the SciKitLearn modules
    :param data: list of tweets
    :param targets: list of associated targets for each tweet
    :return: Predictor
    """
    X_tweet_counts = count_vect.fit_transform(data)

    # Compute tf-idf feature values from the raw term counts
    X_train_tfidf = tfidf_transformer.fit_transform(X_tweet_counts)
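A hedged usage sketch for the module-level objects above. The stop-word list, toy tweets, and MultinomialNB step are assumptions; the example is truncated before trainNaiveBayes returns its predictor.

from sklearn.naive_bayes import MultinomialNB

# stand-in for whatever stop-word list the original module defines
stopwordslist = {"i", "the", "this", "a"}

tweets = ["I love this phone", "worst service ever", "love the new update"]
labels = [1, 0, 1]

X_counts = count_vect.fit_transform(tweets)
X_tfidf = tfidf_transformer.fit_transform(X_counts)
clf = MultinomialNB().fit(X_tfidf, labels)
print(clf.predict(tfidf_transformer.transform(count_vect.transform(["love this phone"]))))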
Example #7
File: Tweepy.py Project: jacklxf/ML
import re
from nltk.tokenize import word_tokenize
from nltk.stem.porter import *

stemmer = PorterStemmer()

# `content` (raw tweet texts) and `stop` (a stop-word list) are built earlier in Tweepy.py
words1 = [word_tokenize(i) for i in content]
words2 = [[i.lower() for i in doc] for doc in words1]
words3 = [[i for i in doc if re.search('^[a-z]+$', i)] for doc in words2]
words4 = [[i for i in doc if i not in stop] for doc in words3]
words5 = [[stemmer.stem(i) for i in doc] for doc in words4]

# flatten the stemmed documents into a single list of tokens
text = [token for doc in words5 for token in doc]


from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer()
vec.fit(text)
vec.vocabulary_
vec.vocabulary_.get(u'algorithm')
vec.get_feature_names()
vec.build_analyzer()
# vec.tokenizer is None here (no custom tokenizer was passed), so calling it
# would fail; build_tokenizer() returns the callable the vectorizer actually uses
tokenizer = vec.build_tokenizer()