Example #1
def get_topic_term_frequency(topic_texts, min_df=1):
    vector = CountVectorizer(ngram_range=(1, 1),
                             stop_words='english',
                             min_df=min_df)
    vector.build_analyzer()
    tf = vector.fit_transform(topic_texts)
    return tf.toarray().sum(axis=0), vector
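For reference, a minimal check of the helper above on a toy two-document corpus (the strings and the printed result are illustrative, not from the original project):

from sklearn.feature_extraction.text import CountVectorizer

counts, vec = get_topic_term_frequency(["the cat sat", "the cat ran fast"])
# counts are summed over documents; 'the' is removed by the English stop-word list
print(dict(zip(vec.get_feature_names(), counts)))
# -> roughly {'cat': 2, 'fast': 1, 'ran': 1, 'sat': 1}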
Example #2
    def __init__(self, ngram=False, use_idf=False):
        self.ngram = ngram
        self.use_idf = use_idf

        # Load WordNet synsets and download data if necessary
        try:
            wordnet_path = nltk.data.find("corpora/wordnet")
        except LookupError:
            nltk.download("wordnet")
            wordnet_path = nltk.data.find("corpora/wordnet")
        self.wn = wordnet.WordNetCorpusReader(wordnet_path)

        # Initialize the two types of n-gram generators
        pentagram_vectorizer = CountVectorizer(
            ngram_range=(1, 5), token_pattern=r"\b[A-Za-z]+\b", min_df=1, stop_words=stop_list
        )
        unigram_vectorizer = CountVectorizer(
            ngram_range=(1, 1), token_pattern=r"\b[A-Za-z]+\b", min_df=1, stop_words=stop_list
        )

        # Function for generating five-grams through unigrams
        self.pent_analyze = pentagram_vectorizer.build_analyzer()

        # Function for generating just unigrams
        self.uni_analyze = unigram_vectorizer.build_analyzer()

        # Load IDF scores
        self.IDF = self.get_idf_scores()
        self.counts = self.get_counts()
Example #3
def main():
	input_train_file_ptr = sys.argv[1]
	input_test_file_ptr = sys.argv[2]
	# read the csv file and return a pandas dataframe with tweets and sentiment columns
	train_tweests_with_sentiments = pre_process_input_data(input_train_file_ptr)
	test_tweets_data = pre_process_input_data(input_test_file_ptr)
	bigram_vectorizer = CountVectorizer(ngram_range=(2,2),token_pattern=r'\b\w+\b', min_df=1,lowercase=True)
	# print tweests_array
	tweets_array, sentiments_array = get_tweest_and_sentiments(train_tweests_with_sentiments)
	print("size of tweets array is %s and sentiment array is %s  " % (tweets_array.size, sentiments_array.size))
	test_tweets,test_sentiments = get_tweest_and_sentiments(test_tweets_data)
	test_sentiments =  test_sentiments.flatten()
	print("size of test tweets array is %s and test sentiment array is %s  " % (test_tweets.size, test_sentiments.size))
	parsed_train_tweets = clean_data_to_feed_classifier(tweets_array)
	parsed_test_tweets = clean_data_to_feed_classifier(test_tweets)
	# print parsed_tweests
	x = bigram_vectorizer.fit_transform(parsed_train_tweets)
	print x.size
	# print bigram_vectorizer.get_feature_names()
	bigram_vectorizer.build_analyzer()
	print "done 1"
	# print bigram_vectorizer.get_feature_names()
	res = bigram_vectorizer.transform(parsed_test_tweets)
	print "done 2"
	clf = LinearSVC()
	gnb = MultinomialNB()
	print "done 2"
	trained_classifier = do_K_fold_cross_validation(clf,gnb,x,sentiments_array.flatten())
	# trained_classifier.fit(x, sentiments_array.flatten())
	print "done 3"
	output =  trained_classifier.predict(res)
	# print output
	print accuracy_score(test_sentiments,output)
Example #4
def get_count_vectorizer(sentences):
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer()
    x = vectorizer.fit_transform(sentences)
    vectorizer.build_analyzer()

    return pd.DataFrame(x.todense(), columns=vectorizer.get_feature_names())
Example #5
    def Common_Vectorizer_usage():
        from sklearn.feature_extraction.text import CountVectorizer
        vectorizer = CountVectorizer(min_df=1)
        corpus = [
            'This is the first document.',
            'This is the second second document.',
            'And the third one.',
            'Is this the first document?',
        ]

        analyze = vectorizer.build_analyzer()
        print analyze("This is a text document to analyze.")
        print analyze("This is a text document to analyze.") == ['this', 'is', 'text', 'document', 'to', 'analyze']
        
        X=vectorizer.fit_transform(corpus)
        print vectorizer.get_feature_names()
        print vectorizer.vocabulary_    #.get('document')
        print vectorizer.transform(['Something completely new.']).toarray()
        print list(X) 
        
        #bigram========================================================
        bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),token_pattern=r'\b\w+\b', min_df=1)
        analyze = bigram_vectorizer.build_analyzer()
        print analyze('Bi-grams are cool!')
        X_2 = bigram_vectorizer.fit_transform(corpus).toarray()
        print X_2

        feature_index = bigram_vectorizer.vocabulary_.get('is this')
        print X_2[:, feature_index] 
        
        #marui test
        print '\n\nmarui test====================='
        def t_preprocessor(s):
            return ','.join([x.lower() for x in s.split(' ')])

        stop_words1=['is','a','this']           #is ok: frozenset(['a', 'this', 'is'])
        stop_words2={'is':0,'a':1,'this':2}     #is ok: convert to frozenset(['a', 'this', 'is'])    
            
        cv = CountVectorizer(preprocessor=t_preprocessor,stop_words=stop_words2)
        params=cv.get_params()
        print 'get_params()',type(params),'---------------'
        for k in params:
            print k,'\t',params[k]
        print 'get_params end--------------'
        print '\nget_stop_words=',cv.get_stop_words()
        
        cv.fit(corpus)
        print cv.get_feature_names()
        print cv.transform(corpus).toarray()
        print '\ntest preprocessor, result:\t',cv.build_preprocessor()('this is a document')
        print '\ntest tokenizer, result',cv.build_tokenizer()('this is a document')
        print '\ntest tokenizer 2, result',cv.build_tokenizer()('th-is is a document')
        print '\ntest tokenizer 2, result',cv.build_tokenizer()('th_is is a document')
        print '\ntest tokenizer 2, result',cv.build_tokenizer()('th&is is a document')

        """
def get_count(x):
    x = ' '.join(x)
    s_vect1 = CountVectorizer(ngram_range=(0, 1), stop_words='english')
    s_analyzer1 = s_vect1.build_analyzer()
    s_listNgramQuery1 = s_analyzer1(x)
    print(s_listNgramQuery1)
    s_vect2 = CountVectorizer(ngram_range=(2, 4))
    s_analyzer2 = s_vect2.build_analyzer()
    s_listNgramQuery2 = s_analyzer2(x)
    print(s_listNgramQuery2)
    result = s_listNgramQuery1 + s_listNgramQuery2
    print(result)
    #get the main freqdist top 20 words
    return result
Example #7
def test_classfier_ngram(test, vocabulary, classifiers):
    vectorizer = CountVectorizer(ngram_range=(1, 3))
    correct = 0
    count = [0, 0, 0, 0, 0]
    for phrase in test:
        f = []
        words = phrase[0]
        vector = [0] * (len(classifiers[0]) - 1)
        tokens = vectorizer.build_analyzer()(words)
        for token in tokens:
            if token in vocabulary:
                vector[vocabulary[token]] += 1
        x = np.array([1] + vector)
        for classifier in classifiers:
            f.append(x.dot(classifier))

        estimate_phrase_class = 0
        for i in range(len(f)):
            if f[i] > 0:
                estimate_phrase_class = i + 1

        count[estimate_phrase_class] += 1

        true_phrase_class = int(phrase[1])

        if (estimate_phrase_class == true_phrase_class):
            correct += 1

    print("Correct: " + str(correct) + "/" + str(len(test)))
    print(correct / len(test))
    print(count)
Example #8
    def bulidModel(self, filename, topicwordnum):
        corpus = []
        vocab = []
        for line in open(filename, 'r').readlines():
            print line
            corpus.append(line.strip())
            vocab += line.split(" ")
            #print corpus
            # convert the words in the text into a term-frequency matrix: a[i][j] is the frequency of word j in document i
        vectorizer = CountVectorizer()
        print vocab
        print vectorizer
        X = vectorizer.fit_transform(corpus)
        analyze = vectorizer.build_analyzer()
        weight = X.toarray()
        print len(weight)
        print(weight[:5, :5])
        model = lda.LDA(n_topics=5, n_iter=500, random_state=1)
        model.fit(np.asarray(weight))
        topic_word = model.topic_word_
        n_top_words = topicwordnum

        for i, topic_dist in enumerate(topic_word):
            topic_words = np.array(vocab)[np.argsort(
                topic_dist)][:-(n_top_words + 1):-1]
            print('Topic {}: {}'.format(i, ' '.join(topic_words)))
        doc_topic = model.doc_topic_
        print("type(doc_topic): {}".format(type(doc_topic)))
        print("shape: {}".format(doc_topic.shape))
        # output the most likely Topic for the first 10 documents
        label = []
        for n in range(20):
            topic_most_pr = doc_topic[n].argmax()
            label.append(topic_most_pr)
            print("doc: {} topic: {}".format(n, topic_most_pr))
Example #9
def ida(articles):
	stopwords = []
	doc_terms = []
	with open('ch_stopwords.txt', 'r') as f:
		stopwords = set(f.read().lower().split('\n'))

	#print('stopwords', stopwords[:10])

	vocab = joblib.load(open('lda-vocab.pkl', 'rb'))

	pkl_file = open('lda-n8-2.pkl', 'rb')
	lda = joblib.load(pkl_file)

	trigram_vectorizer = CountVectorizer(ngram_range=(2,3),token_pattern=r'([\u4e00-\u9fa5]{1}|)', vocabulary=vocab, stop_words=stopwords, analyzer='word')
	analyzer = trigram_vectorizer.build_analyzer()
	

	'''
	for article in articles:
		terms = analyzer(article.Content)
		score = lda.score(terms)
		print(score)
	'''
	article_contents = map(lambda x: x.Content, articles)
	doc_terms = trigram_vectorizer.fit_transform(article_contents)
	test = lda.transform(doc_terms)
	for i, scores in enumerate(test):
		if(i%500==0):print('update article %i' %i)
		article = articles[i]
		article.update_scores(list(scores))
Example #10
class Featurizer(object):
    def __init__(self):
        self.sentiment_analyzer = Sentiment('data/AFINN-111.txt')
        self.bow_vectorizer = None
        self.bow_analyzer = None

    def bag_of_words(self, body):
        return self.bow_vectorizer.transform([body]).toarray()

    def text_features(self, comment):
        num_chars = len(comment.get("body"))
        num_links = count_links(comment.get("body"))

        simple_tokens = comment.get("body").split(' ')
        num_words = 0
        avg_word_length = 0
        for token in simple_tokens:
            num_words += 1
            avg_word_length += len(token)
        avg_word_length = float(avg_word_length) / float(num_words)

        sentiment = self.sentiment_analyzer.analyze(
            self.bow_analyzer(comment.get("body")))

        score = comment.get("score")

        return [num_chars, num_links, num_words, num_words, 
                avg_word_length, sentiment]

    def transform_comment(self, comment):
        return numpy.hstack((
            numpy.array([self.text_features(comment)], 
                        dtype='float_'),
            self.bag_of_words(comment.get("body"))))

    def score_comment(self, comment):
        return comment.get("score")

    def transform(self, comments):
        """ Returns a Nx(D+1) numpy matrix of features. The first D columns
        correspond to features, where the final column corresponds to the
        scores of each comment"""

        # if it's a single instance, return an array
        if isinstance(comments, dict):
            return self.transform_comment(comments)

        # http://scikit-learn.org/stable/modules/feature_extraction.html
        self.bow_vectorizer = CountVectorizer(min_df=1)
        self.bow_vectorizer.fit([c.get("body") for c in comments])
        self.bow_analyzer = self.bow_vectorizer.build_analyzer()

        def features_and_label(comment):
            return numpy.hstack((
                self.transform_comment(comment),
                numpy.array([[self.score_comment(comment)]], 
                            dtype='float_')))

        return numpy.vstack([features_and_label(c) 
                             for c in comments])
Example #11
def keyword_frequency(keyword, directory):
    freq_table = {}

    for source in glob.glob(os.path.join(directory, '*')):
        words = ''
        vect = CountVectorizer(ngram_range=(1, 3))
        analyzer = vect.build_analyzer()

        for f in glob.glob(os.path.join(source, '*.json')):
            j = json.load(open(f))
            if j['Language'] == 'chinese':
                words += ' '.join(jieba.cut(j['Title']))
                words += ' '.join(jieba.cut(j['Content']))
            elif j['Language'] == 'english':
                words += j['Title']
                words += j['Content']
        ngram_query = analyzer(words)
        fdist = nltk.FreqDist(ngram_query)
        freq = fdist.freq(keyword.lower())
        freq_table[os.path.basename(source)] = freq

    pprint.pprint(freq_table)

    sorted_list = sorted(freq_table, key=freq_table.get, reverse=True)
    print('=================')
    print("%s loves %s most." % (sorted_list[0], keyword))

    plt.bar(range(len(freq_table)), freq_table.values(), align="center")
    plt.xticks(range(len(freq_table)), list(freq_table.keys()))
    plt.show()
Example #12
def msg2list(msg, ngram_range=(1, 2)):
    from sklearn.feature_extraction.text import CountVectorizer

    vectorizer = CountVectorizer(ngram_range=ngram_range)
    analyze = vectorizer.build_analyzer()

    return analyze(msg)
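A quick illustration of what msg2list returns with the default ngram_range=(1, 2); the expected output is added here as a comment:

print(msg2list("free prize inside"))
# -> ['free', 'prize', 'inside', 'free prize', 'prize inside']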
Example #13
def extractFeatures(examples, vocab=None, frequent_ngram_col_idx=None):
    corpus = []  # get bags of words for each trainingexample
    for x, y in examples:
        corpus.append(x)
    # corpus = np.array(examples[:,0])
    vectorizer = CountVectorizer(vocabulary=vocab,
                                 ngram_range=(1, 3),
                                 token_pattern=r'\b\w+\b',
                                 min_df=1)
    X = vectorizer.fit_transform(corpus)
    analyze = vectorizer.build_analyzer()
    fullfeature = X.toarray()
    print('SHAPE in Fit Model', len(fullfeature), len(fullfeature[0]))
    if not frequent_ngram_col_idx:
        sums = np.sum(fullfeature, axis=0)
        frequent_ngram_col_idx = np.nonzero(
            [x > freq_threshold
             for x in sums])  # specify frequency threshold to include in vocab

    # consider passing in pruned vocab to not need next line for dev
    fullfeature = fullfeature[:, frequent_ngram_col_idx[0]]
    print('NEW SHAPE', len(fullfeature), len(fullfeature[0]))

    # TODO: append new features here especially separating out genre, rating

    return fullfeature, vectorizer.vocabulary_, frequent_ngram_col_idx
Example #14
def preprocess_tokenize(data, language, ngram=(1, 1), min_df=0.01, max_df=0.9):
    """
    Read a list of strings. return a list of list of words without stopwords, tokenized
    :param data: list (iterable) of text items
    :param language: 'en' or 'fr'
    :param ngram: range of the n-gram to use (1,1) or (1,2) mostly
    :param min_df: cutoff for unfrequent words, between 0 and max_df
    :param max_df: cutoff for too frequent words or context specific stop words, between min_df and 1
    :return: processed data
    """

    if language == 'en':
        stopwords = stop_words_en
    elif language == 'fr':
        stopwords = stop_words_fr
    else:
        raise ValueError('Wrong language ! ')

    vectorizer = CountVectorizer(input='content',
                                 analyzer='word',
                                 ngram_range=ngram,
                                 stop_words=stopwords,
                                 min_df=min_df,
                                 max_df=max_df)

    analyzer = vectorizer.build_analyzer()
    processed = [
        analyzer(doc) if (doc not in [np.NaN, np.nan]) else [] for doc in data
    ]

    return processed
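A minimal usage sketch, assuming stop_words_en is the module-level English stop-word list the function expects; NaN entries come back as empty token lists:

import numpy as np

docs = ["The quick brown fox jumps over the lazy dog", np.nan]
print(preprocess_tokenize(docs, language='en'))
# -> e.g. [['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog'], []]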
Example #15
class Corpus:
    def __init__(self, tweets):
        self.tweets = tweets
        self.vocab_size = -1
        self.cv = CountVectorizer()
        self.tokenizer = None
        self.build_vocab()

    def build_vocab(self):
        strings = map(lambda tweet: tweet.raw_text, self.tweets)
        self.cv.fit_transform(strings)
        self.tokenizer = self.cv.build_analyzer()
        self.vocab_size = len(self.cv.vocabulary_.keys())
        print("vocabulary size: %d" % self.vocab_size)

    def vocab(self):
        return self.cv.vocabulary_

    def tweet2array(self, tweet):
        assert self.tokenizer is not None
        tokens = self.tokenizer(tweet.raw_text)
        V = self.vocab()
        return map(lambda t: V.get(t), tokens)

    def tokenize(self, tweet):
        return self.tokenizer(tweet.raw_text)
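A toy check of the Corpus class above; Tweet is a stand-in namedtuple for whatever tweet type the project actually uses:

from collections import namedtuple

Tweet = namedtuple("Tweet", "raw_text")
corpus = Corpus([Tweet("hello world"), Tweet("hello again world")])
print(corpus.vocab())                                  # token -> column index
print(list(corpus.tweet2array(Tweet("hello world"))))  # vocabulary indices for the tokens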
Example #16
def common_theme_in_article_sections(article, add_more_stopwords):
    '''This function returns the common theme of the article using the bigram method'''

    #preprocess the text
    preprocessed = preprocess_articles(article, add_more_stopwords)

    # to find the bigrams in the articles
    vectorizer = CountVectorizer(ngram_range=(2, 2))
    analyzer = vectorizer.build_analyzer()

    bigrams = []

    for arti in preprocessed:
        bigrams.append(analyzer(arti))

    theme_of_article = []

    #finding out the common theme of every article by finding the top 10 bigrams
    for articles in bigrams:
        theme_of_article.append(Counter(articles).most_common(10))

    #creating a list of potential themes of individual article
    potential_theme_in_article_section = []
    for themes in theme_of_article:
        for val in themes:
            potential_theme_in_article_section.append(val[0])

    x = Counter(potential_theme_in_article_section).most_common(5)

    return x
Example #17
    def __init__(self, data, target, target_names, preprocess=False):
        """
        Initializes the classifier by training it on the given data.
        :param data: text documents to train the classifier on
        :type data: list
        :param target: category indexes for each text document
        :type target: list
        :param target_names: category names
        :type target_names: list
        """

        self.__clf_data = ClassifierData(data, target, target_names)
        if preprocess:
            analyzer = CountVectorizer().build_analyzer()
            ipp = InputPreprocessor(None)

            def preprocess(doc):
                return [ipp.normalise(word) for word in analyzer(doc)]

            vectorizer = CountVectorizer(analyzer=preprocess)

        else:
            vectorizer = CountVectorizer()
        self.__text_clf = Pipeline([
            ('vect', vectorizer),
            ('tfidf', TfidfTransformer()),
            ('clf', DecisionTreeClassifier(criterion='entropy')),
        ])
Example #18
    def bulidModel(self,filename,topicwordnum):
        corpus = []
        vocab = []
        for line in open(filename, 'r').readlines():
            print line
            corpus.append(line.strip())
            vocab +=line.split(" ")
            #print corpus
            # convert the words in the text into a term-frequency matrix: a[i][j] is the frequency of word j in document i
        vectorizer = CountVectorizer()
        print vocab
        print vectorizer
        X = vectorizer.fit_transform(corpus)
        analyze = vectorizer.build_analyzer()
        weight = X.toarray()
        print len(weight)
        print (weight[:5, :5])
        model = lda.LDA(n_topics=5, n_iter=500, random_state=1)
        model.fit(np.asarray(weight))
        topic_word = model.topic_word_
        n_top_words = topicwordnum

        for i, topic_dist in enumerate(topic_word):
            topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
            print('Topic {}: {}'.format(i, ' '.join(topic_words)))
        doc_topic = model.doc_topic_
        print("type(doc_topic): {}".format(type(doc_topic)))
        print("shape: {}".format(doc_topic.shape))
        # output the most likely Topic for the first 10 documents
        label = []
        for n in range(20):
            topic_most_pr = doc_topic[n].argmax()
            label.append(topic_most_pr)
            print("doc: {} topic: {}".format(n, topic_most_pr))
Example #19
    def build_vectorizer(self, sequences_lists, stop_w='english', min_df=0):
        vectorizer = CountVectorizer(stop_words=stop_w, min_df=min_df)
        vectorizer.fit(sequences_lists)
        word2index = vectorizer.vocabulary_
        word2index['<PAD>'] = max(word2index.values()) + 1
        tokenizer = vectorizer.build_analyzer()
        return word2index, tokenizer
Example #20
    def constructNGramsBOW(self):

        text = self.docName
        stopsWords = set(stopwords.words('english'))
        text = re.sub("[^a-zA-Z]", " ", text.lower())
        text = re.sub("\s\s+", " ", text)
        #bigram = Phraser.load('mymodel/bigram_phraser_wikien2017')
        #trigram = Phraser.load('mymodel/trigram_phraser_wikien2017')
        #sent_tokenize_list = sent_tokenize(text)
        with open("data/embed.vocab") as f:
            vocab_list = map(str.strip, f.readlines())
        #for line in sent_tokenize_list:
        #    sent = word_tokenize(line)
        #    line = trigram[bigram[sent]]
        #    line = [w for w in line if not w in stopsWords ]
        with open("data/embed.vocab") as f:
            vocab_list = map(str.strip, f.readlines())
            vocab_dict = {w: k for k, w in enumerate(vocab_list)}

        vectorizer = CountVectorizer(ngram_range=(1, 3))
        analyzer = vectorizer.build_analyzer()
        sett = analyzer(text)

        sett = [token.replace(" ", "_") for token in sett]
        BOWNgram = [token for token in sett if token in vocab_dict.keys()]
        BOWNgram = Counter(BOWNgram)
        return OrderedDict(BOWNgram)
Example #21
def create_topic():
    # load the corpus: one line is one document
    corpus = []
    for line in open(documentfile, 'r').readlines():
        corpus.append(line.strip())
    print(corpus)
    # convert the words in the text into a term-frequency matrix: a[i][j] is the frequency of word j in document i
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)
    analyze = vectorizer.build_analyzer()
    weight = X.toarray()
    print(X.shape)
    # LDA algorithm
    model = lda.LDA(n_topics=10, n_iter=500, random_state=1)
    model.fit(np.asanyarray(weight))
    topic_word = model.topic_word_

    print(topic_word)
    n_top_words = 8
    for i, topic_dist in enumerate(topic_word):
        topic_words = np.array(vectorizer.get_feature_names())[np.argsort(topic_dist)][:-(n_top_words + 1):-1]  # top words for this topic
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))

    # document-topic distribution
    doc_topic = model.doc_topic_
    print("type(doc_topic): {}".format(type(doc_topic)))
    print("shape: {}".format(doc_topic.shape))

    # output the most likely Topic for the first 10 documents
    label = []
    for n in range(10):
        topic_most_pr = doc_topic[n].argmax()
        label.append(topic_most_pr)
        print("doc: {} topic: {}".format(n, topic_most_pr))
Example #22
class GRUDataSet(Dataset):
    def __init__(self, path, max_seq_len):
        self.max_seq_len = max_seq_len
        df = pd.read_csv(path)

        self.vectorizer = CountVectorizer(stop_words='english',
                                          max_df=0.99,
                                          min_df=2)
        self.vectorizer.fit(df.text.tolist())

        self.token2idx = self.vectorizer.vocabulary_
        self.token2idx['<PAD>'] = max(self.token2idx.values()) + 1

        self.tokenizer = self.vectorizer.build_analyzer()
        self.text_encoding_fun = lambda x: [
            self.token2idx[token] for token in self.tokenizer(x)
            if token in self.token2idx
        ]
        self.padding_fun = lambda x: x + (max_seq_len - len(x)
                                          ) * [self.token2idx['<PAD>']]

        sequences = [
            self.text_encoding_fun(x)[:max_seq_len] for x in df.text.tolist()
        ]
        self.sequences = [self.padding_fun(sequence) for sequence in sequences]
        self.labels = df.target.tolist()

    def __getitem__(self, i):
        return self.sequences[i], self.labels[i]

    def __len__(self):
        return len(self.sequences)
Example #23
def buildFeatureMatrixRepresentation(stopwords,corpusRepresentation,corpusList,outPath):
    #featuresMatrix =[]
    if ((not corpusRepresentation.empty) and (corpusList)):
        fOutput = open(outPath,"w")
        vectorizer = CountVectorizer(lowercase=True,stop_words=stopwords,token_pattern='(?u)\\b[\\w+,-]+\\w+\\b|\\b\\w\\w+\\b')
        for abstractPath in corpusList:
            for counter,document in enumerate(glob.iglob(abstractPath)):
                if ((counter<MAX_NUM_ABSTRACTS) and (document)):
                    try:
                        fp = open(document,"r");
                        content = fp.read();
                        fp.close()
                        if content:
                            vector = [];
                            #we split each document into tokens.
                            analyser = vectorizer.build_analyzer()
                            tokens = analyser(content);
                            for word in corpusRepresentation.term:
                                if any(word in s for s in tokens):
                                    vector.append(1)
                                else:
                                    vector.append(0)
                            #featuresMatrix.append(vector)
                            fOutput.write(" ".join(str(x) for x in vector)+"\n")
                    except:
                        print "Error trying to build the representation for the document: "+document
        fOutput.close();
Example #24
    def _fit_language(self, X_unmapped: Sequence[str], X: Sequence[str],
                      Y: np.ndarray):
        cv = CountVectorizer(
            max_df=0.95,
            min_df=2,
            lowercase=False,
            ngram_range=(1, self.hyperparams.max_ngram),
            max_features=(self.hyperparams.max_vocab * 18),
            token_pattern='[a-zA-Z0-9$&+,:;=?@_/~#\\[\\]|<>.^*()%!-]+')

        X_vec = cv.fit_transform(trivial_generator(X))

        local_vocab = set()
        for feat in Y.columns:
            res = zip(
                cv.get_feature_names(),
                mutual_info_classif(X_vec, Y[feat], discrete_features=True))
            local_vocab.update(res)
        self.vocab = {
            i[0]
            for i in sorted(local_vocab, key=lambda i: i[1], reverse=True)
            [:self.hyperparams.max_vocab]
        }

        self._analyzer = cv.build_analyzer()
Example #25
def feature_extractor(data):
    data = data.decode('utf-8')
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')

    tokens = word_tokenize(data)
    tokens_mod = []
    i = 0
    while i < len(tokens):
        curr = tokens[i]
        if curr == 'no' or curr == 'not':
            if i - 1 >= 0:
                tokens_mod[-1] = tokens_mod[-1] + '+' + curr
            if i + 1 <= len(tokens) - 1:
                tokens_mod.append(curr + '+' + tokens[i + 1])
                i += 1
        else:
            tokens_mod.append(curr)
        i += 1

    data_mod = ''
    for token in tokens_mod:
        data_mod += ' ' + token

    bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                        token_pattern=r'\b\w+\b',
                                        min_df=1)
    analyze = bigram_vectorizer.build_analyzer()
    bigrams = analyze(data_mod)
    features = {bigram: 1 for bigram in bigrams}
    return features
Example #26
class AssociationRuleBased(AbstractFindTopicList):
    def __init__(self, analyzer=None, n_gram_boost_map={}):
        self.analyzer = analyzer
        self.n_gram_boost_map = n_gram_boost_map

    def getTopicList(self, doc_list, parm):
        from sklearn.feature_extraction.text import CountVectorizer
        from analysislib.datamining import AssociationRuleMining

        self.rule_miner = AssociationRuleMining.getAssociationRuleMiner(
            "aprior")
        if self.analyzer is None:
            print("No custom Analyser given")
            self.vectorizer = CountVectorizer(lowercase=True,
                                              stop_words='english')
            self.analyzer = self.vectorizer.build_analyzer()
        tokenize_doc_list = []
        for doc in doc_list:
            token_list = self.analyzer(doc)
            tokenize_doc_list.append(token_list)
        associatoin_rules = self.rule_miner.getRule(tokenize_doc_list)
        return self.__get_filter_topic_from_rule(associatoin_rules)

    def __get_filter_topic_from_rule(self, association_rules):
        topic_list = []
        for item in association_rules:
            pair = item[0]
            topic = ""
            for x in pair:
                topic += x + " "
            topic_list.append(topic)
        return topic_list
Example #27
    def fit(self, X_unmapped, X, Y, max_vocab=18000,
            max_features_to_test=180000, window=8, dims=32, max_ngram=5):
        cv = CountVectorizer(
            max_df=0.95, min_df=2, lowercase=False, ngram_range=(1, max_ngram),
            max_features=max_features_to_test,
            token_pattern='[a-zA-Z0-9$&+,:;=?@_/~#\\[\\]|<>.^*()%!-]+')

        X_vec = cv.fit_transform(self._smiles_to_trivial_lang(X))

        local_vocab = set()
        for feat in Y.columns:
            res = zip(cv.get_feature_names(),
                      mutual_info_classif(
                          X_vec, Y[feat], discrete_features=True)
                      )
            local_vocab.update(res)
        self.vocab = {i[0] for i in sorted(
            local_vocab, key=lambda i: i[1], reverse=True)[:max_vocab]}

        self._analyzer = cv.build_analyzer()

        generator = self._make_iterator(X_unmapped, training=True)

        document_model = Doc2Vec(
            vector_size=dims, workers=cpu_count(), window=window)
        document_model.build_vocab(generator)
        document_model.train(
            generator, total_examples=len(X_unmapped), epochs=36)

        self.document_model = document_model
Example #28
def iter_tool(title, n):
    bigram_vectorizer = CountVectorizer(ngram_range=(1, n),
                                        token_pattern=r'\b\w+\b',
                                        min_df=1)
    analyze = bigram_vectorizer.build_analyzer()
    w_list = analyze(title)
    return w_list
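For reference, the kind of list iter_tool produces for n=2 (output added here as a comment):

print(iter_tool("Quick brown fox", 2))
# -> ['quick', 'brown', 'fox', 'quick brown', 'brown fox']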
Example #29
def wnl_nonum(doc):
    wnl = WordNetLemmatizer()
    cv = CountVectorizer()
    ana = cv.build_analyzer()

    doc = re.sub('[0-9]', '', doc)
    return (wnl.lemmatize(w) for w in ana(doc))
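A small check of the generator above, assuming the NLTK WordNet data is installed; digits are stripped before tokenizing and every token is lemmatized:

print(list(wnl_nonum("3 cats and 2 dogs")))
# -> ['cat', 'and', 'dog']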
Example #30
def main():
    vct = CountVectorizer(encoding='ISO-8859-1', min_df=5, max_df=1.0, binary=True, ngram_range=(1, 1),
                          token_pattern='\\b\\w+\\b', tokenizer=StemTokenizer())
    vct_analizer = vct.build_analyzer()
    print("Start loading ...")
    # data fields: data, bow, file_names, target_names, target

    ########## NEWS GROUPS ###############
    # easy to hard. see "Less is More" paper: http://axon.cs.byu.edu/~martinez/classes/678/Presentations/Clawson.pdf
    categories = [['alt.atheism', 'talk.religion.misc'],
                  ['comp.graphics', 'comp.windows.x'],
                  ['comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware'],
                  ['rec.sport.baseball', 'sci.crypt']]

    if "imdb" in args.train:
        ########## IMDB MOVIE REVIEWS ###########
        data = Bunch(load_imdb(args.train, shuffle=True, rnd=2356, vct=vct))  # should brind data as is
    elif "aviation" in args.train:
        raise Exception("We are not ready for that data yet")
    elif "20news" in args.train:
        ########## 20 news groups ######
        data = Bunch(load_20newsgroups(categories=categories[0], vectorizer=vct, min_size=50))  # for testing purposes
    elif "dummy" in args.train:
        ########## DUMMY DATA###########
        data = Bunch(load_dummy("C:/Users/mramire8/Documents/code/python/data/dummy", shuffle=True,rnd=2356,vct=vct))
    else:
        raise Exception("We do not know that dataset")

    print("Data %s" % args.train)
    total = len(data.train.data)
    print("Data size %s" % total)
    #print(data.train.data[0])

    ## prepare pool for the sampling
    pool = Bunch()
    pool.data = data.train.bow.tocsr()   # full words, for training
    pool.target = data.train.target
    pool.predicted = []
    pool.remaining = set(range(pool.data.shape[0]))  # indices of the pool

    bt = randomsampling.BootstrapFromEach(87654321)
    for i in range(7):
        query_index = bt.bootstrap(pool=pool, k=args.packsize)  # get instances from each class
        filename = "{0}-P{1}.txt".format(args.train,i)
        f = codecs.open(filename, 'a+', 'utf-8')
        #print documents in file
        random.shuffle(query_index)
        for di in query_index:
            x = unicode(data.train.data[di].replace("\n","<br>"))
            #y = data.train.target[di]
            y = data.train.target_names[data.train.target[di]]
            #f.write(str(i))
            #f.write("\t")
            #f.write(str(y))
            #f.write("\t")
            #f.write(x)
            #f.write("\n")

        f.close()
        pool.remaining.difference_update(query_index) # remove the used ones
Example #31
def encode_text(sentences,
                vectorizer=None,
                max_len=None,
                msg_prefix="\n",
                verbose=True):
    """Encode array_like of strings to ndarray of integers.

    :param sentences: (array_like of str).
        e.g., ["I like apples", "Me too"]
    :param vectorizer: (CountVectorizer, optional)
    :param max_len: (int) maximum length of encoded sentences.
    :param msg_prefix:
    :param verbose:
    :return: Tuple[CountVectorizer, int, ndarray]
        e.g., (CountVectorizer,
                3,
                array([[1, 2, 3], [4, 5, 0]]))
    """
    if verbose:
        print("{} Encode texts to integers".format(msg_prefix))

    # Not recommend to modify below vectorizer/vocab lines.
    if vectorizer is None:
        vectorizer = CountVectorizer(stop_words="english")
        vectorizer.fit(sentences)
    # dictionary of (token, encoding) pair.
    #    e.g., {"I": 0, "like": 1, "apples": 2, "Me": 3, "too": 4}
    vocab = vectorizer.vocabulary_

    # Convert str to int.
    # - Use preprocess_and_tokenize the type of which is 'Callable[str, List[str]]'
    # - Do not use '0'. We will use '0' in zero padding.
    # e.g., sentences: ["I like apples", "Me too"] and
    #       vocab: {"I": 0, "like": 1, "apples": 2, "Me": 3, "too": 4}
    #       Then, encoded_sentences: [[0 + 1, 1 + 1, 2 + 1], [3 + 1, 4 + 1]] -> [[1, 2, 3], [4, 5]]
    preprocess_and_tokenize = vectorizer.build_analyzer()
    encoded_sentences = []
    for s in sentences:
        tokens = preprocess_and_tokenize(s)
        # shift vocabulary indices by +1 so that 0 is reserved for padding
        encoded_sentences.append([vocab[token] + 1 for token in tokens if token in vocab])

    assert len(encoded_sentences) == len(sentences)
    assert all([0 not in es for es in encoded_sentences])
    # Get max_len (maximum length).
    # If max_len is given, use it.
    # e.g., [[1, 2, 3], [4, 5]] (from ["I like apples", "Me too"])
    #       -> 3
    max_len = max_len or max(len(es) for es in encoded_sentences)

    # Add zero padding to make length of all sentences the same.
    # e.g., [[1, 2, 3], [4, 5]]
    #       -> [[1, 2, 3], [4, 5, 0]]
    pad_encoded_sentences = np.zeros((len(sentences), max_len), dtype=np.int32)
    for idx, es in enumerate(encoded_sentences):
        length = len(es) if len(es) <= max_len else max_len
        # copy the (possibly truncated) encoding; remaining positions stay 0 (padding)
        pad_encoded_sentences[idx, :length] = es[:length]

    return vectorizer, max_len, pad_encoded_sentences
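A hedged usage sketch for the completed function; the exact indices depend on the fitted vocabulary and on scikit-learn's English stop-word list:

vec, max_len, encoded = encode_text(["I like green apples", "You like oranges"])
print(max_len)   # length of the longest encoded sentence
print(encoded)   # zero-padded matrix of 1-based vocabulary indices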
Example #32
def feature_extractor(data):
    data = data.decode('utf-8')
    lemmatizer = WordNetLemmatizer()
    stop_words = stopwords.words('english')

    tokens = word_tokenize(data)
    tokens_mod = []
    i = 0
    while i < len(tokens):
        curr = tokens[i]
        if curr == 'no' or curr == 'not':
            if i - 1 >= 0:
                tokens_mod[-1] = tokens_mod[-1] + '+' + curr
            if i + 1 <= len(tokens) - 1:
                tokens_mod.append(curr + '+' + tokens[i+1])
                i += 1
        else:
            tokens_mod.append(curr)
        i += 1

    data_mod = ''
    for token in tokens_mod:
        data_mod += ' ' + token

    bigram_vectorizer = CountVectorizer(ngram_range=(1, 2),token_pattern=r'\b\w+\b', min_df=1)
    analyze = bigram_vectorizer.build_analyzer()
    bigrams = analyze(data_mod)
    features = { bigram:1 for bigram in bigrams }
    return features
Example #33
def fitModel(examples, acoustic=None, vocab=None, frequent_ngram_col_idx=None):
    corpus = [x for x, y in examples]
    vectorizer = CountVectorizer(vocabulary=vocab,
                                 ngram_range=(1, 3),
                                 token_pattern=r'\b\w+\b',
                                 min_df=1)
    X = vectorizer.fit_transform(corpus)

    # UNCOMMENT TO ADD NGRAM FEATURES
    analyze = vectorizer.build_analyzer()
    fullfeature = X.toarray()
    # print 'VOCAB SHAPE', len(fullfeature), len(fullfeature[0])

    # The most time expensive part (pruning so only frequent ngrams used)
    if not frequent_ngram_col_idx:
        sums = np.sum(fullfeature, axis=0)
        frequent_ngram_col_idx = np.nonzero([x > 2 for x in sums])
    fullfeature = fullfeature[:, frequent_ngram_col_idx[0]]

    # Add features from grammatical context in transcript
    fullfeature = contextualFeatures(examples, fullfeature)
    # print 'CONTEXTUAL SHAPE', len(fullfeature), len(fullfeature[0])

    fullfeature = acousticFeatures(fullfeature, acoustic)
    # print 'FINAL SHAPE', len(fullfeature), len(fullfeature[0])

    # return vectorizer
    return fullfeature, vectorizer.vocabulary_, frequent_ngram_col_idx
Example #34
def get_topic_sim(k):
    corpus = []
    input = open('LDA_result.txt')
    for line in input:
        corpus.append(line.strip('\r\n'))
    print len(corpus)

    # convert the words in the text into a term-frequency matrix: a[i][j] is the frequency of word j in document i

    vectorizer = CountVectorizer()
    print 'vectorizer',vectorizer

    X = vectorizer.fit_transform(corpus)
    analyze = vectorizer.build_analyzer()
    weight = X.toarray()
    # LDA algorithm
    print 'LDA:'


    model = lda.LDA(n_topics=k, n_iter=300, random_state=1)
    model.fit(np.asarray(weight))  # model.fit_transform(X) is also available
    topic_word = model.topic_word_  # model.components_ also works

    # document-topic distribution
    doc_topic = model.doc_topic_
    ##print("type(doc_topic): {}".format(type(doc_topic)))
    ##print("shape: {}".format(doc_topic.shape))
    ##print doc_topic
    sim = sklearn.metrics.pairwise.cosine_similarity(doc_topic, dense_output=True)
    ##print sim
    return sim
Example #35
def get_feature_by_opcode_bigram_word2vec():
    global max_document_length
    global bigram_word2vec_bin

    with open('metrics.txt', 'a') as f:
        f.write("Get feature by opcode and bigram word2vec: \n")
        f.close()

    x = []
    y = []

    if os.path.exists(bigram_wv_data_pkl_file) and os.path.exists(
            label_pkl_file):
        f = open(bigram_wv_data_pkl_file, 'rb')
        x = pickle.load(f)
        f.close()
        f = open(label_pkl_file, 'rb')
        y = pickle.load(f)
        f.close()
    else:
        x, y = load_data_pkl_file()

        CV = CountVectorizer(ngram_range=(2, 2),
                             decode_error="ignore",
                             token_pattern=r'\b\w+\b',
                             min_df=1,
                             max_df=1.0)
        # 2-gram tokenization
        analyze = CV.build_analyzer()
        courps = []
        for text in x:
            text = analyze(text)
            text = str(text).replace('u\'', '\'')
            courps.append(str(text))
        x = courps

        cores = multiprocessing.cpu_count()

        if os.path.exists(bigram_word2vec_bin):
            print "Find cache file %s" % bigram_word2vec_bin
            model = gensim.models.Word2Vec.load(bigram_word2vec_bin)
        else:
            model = gensim.models.Word2Vec(size=max_features,
                                           window=5,
                                           min_count=5,
                                           iter=10,
                                           workers=cores)
            model.build_vocab(x)
            model.train(x,
                        total_examples=model.corpus_count,
                        epochs=model.iter)
            model.save(bigram_word2vec_bin)

        x = getVecsByWord2Vec(model, x, max_features)

        f = open(bigram_wv_data_pkl_file, 'wb')
        pickle.dump(x, f)
        f.close()

    return x, y
Example #36
def buildFeatureMatrixRepresentation(stopwords, corpusRepresentation,
                                     abstractPath, outPath):
    #featuresMatrix =[]
    if ((not corpusRepresentation.empty) and (abstractPath)):
        fOutput = open(outPath, "w")
        vectorizer = CountVectorizer(
            lowercase=True,
            stop_words=stopwords,
            token_pattern='(?u)\\b[\\w+,-]+\\w+\\b|\\b\\w\\w+\\b')
        for abstractPath in corpusList:
            for counter, document in enumerate(glob.iglob(abstractPath)):
                if ((counter < MAX_NUM_ABSTRACTS) and (document)):
                    try:
                        fp = open(document, "r")
                        content = fp.read()
                        fp.close()
                        if content:
                            vector = []
                            #we split each document into tokens.
                            analyser = vectorizer.build_analyzer()
                            tokens = analyser(content)
                            for word in corpusRepresentation.term:
                                if any(word in s for s in tokens):
                                    vector.append(1)
                                else:
                                    vector.append(0)
                            #featuresMatrix.append(vector)
                            fOutput.write(" ".join(str(x)
                                                   for x in vector) + "\n")
                    except:
                        print "Error trying to build the representation for the document: " + document
        fOutput.close()
Example #37
class NLTK_CountVectorizer(CountVectorizer):
    def __init__(self, lang, **kwargs):
        CountVectorizer.__init__(self, **kwargs)

        try:
            self.stemmer = SnowballStemmer(lang.lower()).stem
            self.vect = CountVectorizer()
            self.analyzer = self.analyzer_nltk
        except ValueError:
            pass

    def analyzer_nltk(self, x):
        return [self.stemmer(e) for e in self.vect.build_analyzer()(x)]

    def fit_transform(self, x, y):
        res = super().fit_transform(x, y)

        try:
            self.vect.fit(x, y)

            vocabs = dict()
            for v_it in self.vect.vocabulary_:
                expr = self.stemmer(v_it)
                if expr in vocabs:
                    vocabs[expr].append(v_it)
                else:
                    vocabs[expr] = [v_it]

            self.vocabulary_nltk = dict([(e[0], min(e[1]))
                                         for e in vocabs.items()])
        except AttributeError:
            self.vocabulary_nltk = dict([(e, e) for e in self.vocabulary_])

        return res
Example #38
def generate_bow_doc(doc, feature_names):
    vectorizer = CountVectorizer(max_df=0.5, stop_words='english')
    tokeniser = vectorizer.build_analyzer()
    bow = [
        w for w in tokeniser(doc) if w in feature_names and w in model.vocab
    ]
    return bow
Example #39
def analyze_comment(comment):
    vectorizer = CountVectorizer(stop_words='english')
    analyzer = vectorizer.build_analyzer()

    comment = analyzer(clean_comment(comment))

    comment = list(filter(lambda s: not '_' in s, comment))
    comment = list(filter(lambda s: not any(c.isdigit() for c in s), comment))

    return comment
Example #40
def analyze_body(body):
    vectorizer = CountVectorizer(stop_words='english')
    analyzer = vectorizer.build_analyzer()

    body = analyzer(clean_body(body))

    body = list(filter(lambda s: not '_' in s, body))
    body = list(filter(lambda s: not any(c.isdigit() for c in s), body))

    return body
Example #41
def analyze_title(title):
    vectorizer = CountVectorizer(stop_words='english')
    analyzer = vectorizer.build_analyzer()

    title = analyzer(clean_title(title))

    title = list(filter(lambda s: not '_' in s, title))
    title = list(filter(lambda s: not any(c.isdigit() for c in s), title))

    return title
Example #42
def main():

    ###corpus = importasline('../data/shakespear.txt')
    corpus = importasline('../data/all_modified.txt')
    vectorizer = CountVectorizer(min_df=1)
    X = vectorizer.fit_transform(corpus)
    analyze = vectorizer.build_analyzer()
    Y = [[vectorizer.vocabulary_[x] for x in analyze(corpus[i])] for i in range(len(corpus))]

    print(len(Y), 'len(Y)')
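The vocabulary-index encoding idiom used above, shown on an inline corpus for clarity (toy string, not from the original data files):

from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(min_df=1)
vectorizer.fit(["a rose is a rose"])
analyze = vectorizer.build_analyzer()
# single-character tokens such as 'a' are dropped by the default token pattern
print([vectorizer.vocabulary_[x] for x in analyze("a rose is a rose")])  # -> [1, 0, 1]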
Example #43
	def main_hack(self):
		input_train_file_ptr = "trainingandtestdata/training.1600000.processed.noemoticon.csv"
		input_test_file_ptr = "trainingandtestdata/testdata.manual.2009.06.14.csv"
		# read the csv file and return a pandas dataframe with tweets and sentiment columns
		train_tweests_with_sentiments = self.pre_process_input_data(input_train_file_ptr)
		test_tweets_data = self.pre_process_input_data(input_test_file_ptr)
		bigram_vectorizer = CountVectorizer(ngram_range=(2,2),token_pattern=r'\b\w+\b', min_df=1,lowercase=True)
		# print tweests_array
		tweets_array, sentiments_array = self.get_tweest_and_sentiments(train_tweests_with_sentiments)
		print(("size of tweets array is %s and sentiment array is %s  " % (tweets_array.size, sentiments_array.size)))
		test_tweets,test_sentiments = self.get_tweest_and_sentiments(test_tweets_data)
		test_sentiments =  test_sentiments.flatten()
		print(("size of test tweets array is %s and test sentiment array is %s  " % (test_tweets.size, test_sentiments.size)))
		parsed_train_tweets = self.clean_data_to_feed_classifier(tweets_array)
		parsed_test_tweets = self.clean_data_to_feed_classifier(test_tweets)
		# print parsed_tweests
		x = bigram_vectorizer.fit_transform(parsed_train_tweets)
		print (x.size)
		# print bigram_vectorizer.get_feature_names()
		bigram_vectorizer.build_analyzer()
		print ("done 1")
		# print bigram_vectorizer.get_feature_names()
		res = bigram_vectorizer.transform(parsed_test_tweets)
		print ("done 2")
		clf = LinearSVC()
		gnb = MultinomialNB()
		print ("done 2")
		trained_classifier = self.do_K_fold_cross_validation(clf,gnb,x,sentiments_array.flatten())
		# trained_classifier.fit(x, sentiments_array.flatten())
		print ("done 3")
		output =  trained_classifier.predict(res)
		# print output
		print (accuracy_score(test_sentiments,output))
	    # bigram_vectorizer.get_feature_names()
	    # analyze = bigram_vectorizer.build_analyzer()
	    # analyze



	# if __name__ == '__main__':
	#     main()
Example #44
class BagOfWords:
    """
    Basic bag-of-words model implemented with scikit-learn's sparse count vectorizers
    """
    def __init__(self,**kwargs):
        self.vectorizer_args=kwargs
        self.vectorizer=CountVectorizer(decode_error='ignore',**self.vectorizer_args)

    def __call__(self,*txt,**kwargs):
        """
        Use the scikit-learn vectorizer to transform the txt into a matrix of counts representing a pure bag of words
        :return:
        """
        return self.vectorizer.fit_transform([str(i) for i in txt],**kwargs)

    def get_word_count(self, *txt,**kwargs):
        """
        First, fit to vocab and get word count

        Count the occurrence of each word in the bag-of-words representation of the txt list, *txt

        Returns UNSORTED LIST
        """
        self.vectorizer.fit(txt)
        analyze = self.vectorizer.build_analyzer()


        return TextUtil.stem(
            TextUtil.remove_stop_words(collections.Counter(itertools.chain(*(analyze(str(i)) for i in txt))))
        )



    def get_feature_names(self):
        """
        Get the feature names of the vectorized vector

        :return:
        """
        return self.vectorizer.get_feature_names()

    def reverse_transformation(self,bow_dict):
        """
        Reverse the transformation of a dictionary representation of BOW into numpy vectors

        :return:
        """
        assert isinstance(bow_dict,BaseDict) or isinstance(bow_dict,dict)

        vec=DictVectorizer()
        vec.fit_transform(bow_dict)

        return vec
Example #45
def main(argv):
  vlen = int(argv[4])
  abstractDict = loadAbstracts(argv[2])
  catTitleDict = loadCategoryTitles(argv[1])

  vectorizer = CountVectorizer(stop_words='english')
  analyzer = vectorizer.build_analyzer()

  #catVectors = getVectorsFromTitles(catTitleDict, analyzer, vlen)
  catVectors = getVectorsFromAbstracts(catTitleDict, abstractDict, analyzer,
                                       vlen)
  print len(catVectors)
  indexVectors(catVectors, argv[3])
Example #46
def bagofwords(df, item2emomapping):
    '''creates simple bag of words feature space'''
    print "creating bag-of-words feature space"
    from sklearn.feature_extraction.text import CountVectorizer

    listofstrings = list(df['cause'].values)
    itemlabels = ['q%0.f' % qnum for qnum in df['qnum'].values]
    vectorizer = CountVectorizer(min_df=1)
    analyzer = vectorizer.build_analyzer()
    bagofwords = vectorizer.fit_transform(listofstrings)
    features = vectorizer.get_feature_names()
    bagofwords = bagofwords.toarray()
    itemavgs = [list(line) for line in bagofwords]
    ndf = makedataframe(itemavgs, itemlabels, item2emomapping)
    ndimf.quicksave(ndf, os.path.join(rootdir,'data/stimdfs','bagofwordsdf.pkl'))
    return ndf
Example #47
    def wtf(self):
        from rutez.rutez import Rutez
        from sklearn.feature_extraction.text import CountVectorizer
        tez = Rutez()
        with open('data/first_sentences.html') as f:
            full_text = f.read()
        vectorizer = CountVectorizer(ngram_range=(1,2))
        analyzer = vectorizer.build_analyzer()
        data = (analyzer(full_text))
        # print(type(data), len(data))

        sinsets = set()
        for item in data:
            word = item.upper()
            if word in tez.word2sinsets:
                for sinset in tez.word2sinsets[word]:
                    print(word, '|', sinset, '|', tez.upper_sinsets(sinset))
Example #48
    def __init__(self):
               
        #BOWs preparation
        filenames = ['bow/1StarsSamples.json', 'bow/2StarsSamples.json', 'bow/3StarsSamples.json', 'bow/4StarsSamples.json', 'bow/5StarsSamples.json']
        
        self.vectorizer = CountVectorizer(input='filename', ngram_range=(1,3), stop_words='english', strip_accents='unicode', token_pattern=ur'\b\w+\b')        
        dtm = self.vectorizer.fit_transform(filenames).toarray()
        self.dtm = scale(dtm)
        vocab = np.array(self.vectorizer.get_feature_names())        
        _vectorizer = CountVectorizer(input='content', ngram_range=(1,3), stop_words='english', strip_accents='unicode', token_pattern=ur'\b\w+\b')
        self.analyze = _vectorizer.build_analyzer()

        #Load dictionaries and model
        with open("dict/dict2bins.p", "rb") as f1, open("dict/dict3bins.p", "rb") as f2, open("dict/dict6bins.p", "rb") as f3, open("model/clf.pkl", "rb") as fm:
            self.dict2bins = pickle.load(f1)
            self.dict3bins = pickle.load(f2)
            self.dict6bins = pickle.load(f3)
            self.model = pickle.load(fm)     #load model
Example #49
def count_letters(filenames):
    from glob import glob
    # with spaces
    # CountVectorizer(token_pattern=r'[A-Za-z ]',min_df=1)
    #Just letters, no spaces
    filenames=glob(filenames)
    
    text=[]
    vectorizer=CountVectorizer(token_pattern=r'[A-Za-z]',min_df=1)
    analyze = vectorizer.build_analyzer()    
    for filename in filenames:
        with open(filename) as fid:
            mytext=fid.read()
            #mytext=mytext.decode('utf8','ignore')            
            text.append(mytext)
        
    X=vectorizer.fit_transform(text).toarray()
    return X,[str(_) for _ in vectorizer.get_feature_names()]     
Example #50
def main():


    corpus = importasline('../data/grouping1/groupA.txt',ignorehyphen = True)

    vectorizer = CountVectorizer(min_df=1)
    X = vectorizer.fit_transform(corpus)
    analyze = vectorizer.build_analyzer()
    Y = [[vectorizer.vocabulary_[x] for x in analyze(corpus[i])] for i in range(len(corpus))]
    print(Y)
    words = vectorizer.get_feature_names()
    num_of_hidden_states = 5
    print(len(words))
    print(Y)
    hmm = modelhmm(num_of_hidden_states, len(words), Y, 'modelnhidden5groupA')

    hmm.syllable_analysis()
    exit()
    if(False):
        for i in range(5000):
            print(i)
            print(hmm.update_state_corpus(Y))
        hmm.savemodel()
    #print(hmm.obs_[:,Y[0]])
        print(hmm.trans_)
    hmm.loadmodel()
    print('transloaded',hmm.trans_.shape)

    print('obsloaded',hmm.obs_.shape)
    for i in range(20):
        robotpoem = ''
        line,linew = hmm.generating_random_line()
        for j in linew:
            robotpoem+=' '+words[j]+' '
        print(robotpoem)


    hmm.analyzing_word(words)
    hmm.analysing_obs(words)
    wordtag=nltk.pos_tag(words,tagset='universal')
    pos = [x[1] for x in wordtag]
    stat = nltk.FreqDist(pos)
    print stat.most_common()
Example #51
def main():

    corpus = importasline('../data/shakespear_modified.txt', ignorehyphen = True)
   

    vectorizer = CountVectorizer(min_df=1)
    X = vectorizer.fit_transform(corpus)
    analyze = vectorizer.build_analyzer()
    Y = [[vectorizer.vocabulary_[x] for x in analyze(corpus[i])] for i in range(len(corpus))]
    words = vectorizer.get_feature_names()
    print(len(words))
    mm = Markov(len(words), Y, 'modelnhidden1000groupA')
    print len(mm.inversetable), 'len(mm.inversetable)'
    print mm.inversetable[0: 4], 'mm.inversetable[0: 4]'
    for i in range(20):
        [line,linew] = mm.generating_random_line()
        ###print linew, ': linew' 
        robotpoem = ''
        for j in linew[:-1]:
            robotpoem+=' '+words[j]+' '
Example #52
class Corpus:
	def __init__(self, tweets):
		self.tweets = tweets
		self.vocab_size = -1
		self.cv = CountVectorizer()
		self.tokenizer = None
		self.build_vocab()
	def build_vocab(self):
		strings = map(lambda tweet: tweet.raw_text, self.tweets)
		self.cv.fit_transform(strings)
		self.tokenizer = self.cv.build_analyzer()		
		self.vocab_size = len(self.cv.vocabulary_.keys())
		print("vocabulary size: %d" % self.vocab_size)
	def vocab(self):
		return self.cv.vocabulary_
	def tweet2array(self, tweet):
		assert self.tokenizer is not None
		tokens = self.tokenizer(tweet.raw_text)
		V = self.vocab()
		return map(lambda t: V.get(t), tokens)
	def tokenize(self, tweet):
		return self.tokenizer(tweet.raw_text)
Example #53
batch_size = 20

if __name__ == "__main__":
    if len(sys.argv) < 4:
        print("Usage: {} [input file] [model json file] [weights file]".format(sys.argv[0]))
        quit(1)

    path = sys.argv[1]
    model_json_file = sys.argv[2]
    weights = sys.argv[3]

    print("Reading input...")
    token_regex = r"(?u)([\(\)\[\]]|\b\w+\b)"
    # cv = CountVectorizer(ngram_range=(1,ngrams), token_pattern=token_regex)
    cv = CountVectorizer(token_pattern=token_regex, min_df=2)
    an = cv.build_analyzer()

    corpus = []
    with open(path) as f:
        for line in f:
            corpus.append(line.strip())

    # vectorize and n-gram-ize the corpus
    X = cv.fit_transform(corpus)

    print("Building vectors...")
    # vocabulary size, including padding element
    vocabulary_size = len(cv.vocabulary_) + 1

    print("Vocabulary size: {}  Corpus size: {}".format(vocabulary_size, len(corpus)))
Example #54
# seg_list = jieba.cut(u'它来自山东省的一个小村子', cut_all=True)
# print 'Full mode:', '/ '.join(seg_list)
#
# seg_list = jieba.cut(u'This is the first document.', cut_all=True)
# print 'Full mode:', '/ '.join(seg_list)


def tokenize(text):
    tokens = jieba.cut(text, cut_all=False)
    return list(tokens)


vectorizer = CountVectorizer(min_df=1, tokenizer=tokenize)

analyzer = vectorizer.build_analyzer()
print(analyzer('This is a text document to analyze.'))
print(analyzer(u'它来自山东省的一个小村子'))

# ##
corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And a third one.',
    'Is this the first document?',
    u'他来自山东省的一个小村子',
]

# use jieba tokenizer
X = vectorizer.fit_transform(corpus)
print(X)
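Continuing the snippet above, a quick check (not part of the original) that the jieba-produced tokens actually ended up in the fitted vocabulary:

# both English and Chinese tokens should appear after fitting
print(sorted(vectorizer.vocabulary_.keys()))
print(vectorizer.transform([u'它来自山东省的一个小村子']).toarray())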
Ejemplo n.º 55
0
def poem_generate(num_pairs):
    print("We are doing the 2nd order Markov model!")
    print("Number of poems to generate:", num_pairs)
    # how many pairs to generate
    ending_words_dict = sample_ending_word(num_pairs)
    poems_dict = dict()

    h_en = Hyphenator('en_US')
    prondict = nltk.corpus.cmudict.dict()

    for ind in ['A','B','C','D','E','F','G']:
        print("Group:", ind)
        # get ending words
        ending_words = ending_words_dict[ind]

        # preprocess data
        corpusname = '../data/grouping2/group' + ind + '.txt'
        corpus = importasline(corpusname, ignorehyphen=False)

        vectorizer = CountVectorizer(min_df=1)
        X = vectorizer.fit_transform(corpus)
        analyze = vectorizer.build_analyzer()
        Y = [[vectorizer.vocabulary_[x] for x in analyze(corpus[i])] for i in range(len(corpus))]
        ending_tokens = [[vectorizer.vocabulary_[x] for x in ending_words[i]] for i in range(len(ending_words))]
        # print(Y)
        words = vectorizer.get_feature_names()
        print("Number of words:", len(words))
        # train in a reverse direction
        for i, line in enumerate(Y):
            Y[i] = line[::-1]
        # print(Y)

        # generate number of syllables for every word
        words_num_syllables = np.zeros(len(words), dtype=int)
        for wordid, word in enumerate(words):
            try:
                phon = prondict[word][0]
                words_num_syllables[wordid] = sum(map(hasNumbers, phon))
            except:
                words_num_syllables[wordid] = len(h_en.syllables(word))
            if not words_num_syllables[wordid]:
                words_num_syllables[wordid] = count_syllables(word)

        # train model
        modelname = 'model2rdMMgroup' + ind
        hmm = Markov( len(words), Y, words_num_syllables, modelname)
        print(len(hmm.inversetable))

        # generate poems
        subpoems = [None]*num_pairs
        for pairid in range(num_pairs):
            start_token = ending_tokens[pairid]
            robotpoem0 = ''
            line0,linew0 = hmm.generating_random_line_end(start_token[0])
            for j in linew0[-2::-1]:
                robotpoem0+=' '+words[j]+' '
            print(robotpoem0)
            robotpoem1 = ''
            line1,linew1 = hmm.generating_random_line_end(start_token[1])
            for j in linew1[-2::-1]:
                robotpoem1+=' '+words[j]+' '
            print(robotpoem1)
            subpoems[pairid] = (robotpoem0, robotpoem1)

        # add the best subpoem to poems_dict
        poems_dict[ind] = subpoems

    # write down the poems
    poem_file_name = '../poems2rdMM/reverse_with_punctuations.txt'
    fwrite = open(poem_file_name, 'w')
    for poemid in range(num_pairs):
        # construct poems
        robotpoem = [None]*14
        robotpoem[0] = poems_dict['A'][poemid][0]
        robotpoem[2] = poems_dict['A'][poemid][1]
        robotpoem[1] = poems_dict['B'][poemid][0]
        robotpoem[3] = poems_dict['B'][poemid][1]
        robotpoem[4] = poems_dict['C'][poemid][0]
        robotpoem[6] = poems_dict['C'][poemid][1]
        robotpoem[5] = poems_dict['D'][poemid][0]
        robotpoem[7] = poems_dict['D'][poemid][1]
        robotpoem[8] = poems_dict['E'][poemid][0]
        robotpoem[10] = poems_dict['E'][poemid][1]
        robotpoem[9] = poems_dict['F'][poemid][0]
        robotpoem[11] = poems_dict['F'][poemid][1]
        robotpoem[12] = poems_dict['G'][poemid][0]
        robotpoem[13] = poems_dict['G'][poemid][1]

        robotpoem = Format(robotpoem)

        # write into file
        print(poemid, file=fwrite)
        for lineid in range(14):
            print(robotpoem[lineid], file=fwrite)
    fwrite.close()
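The syllable counts above fall back from cmudict to a hyphenator to a custom counter (hasNumbers and count_syllables are project helpers not shown here); the cmudict part on its own can be checked like this:

import nltk

nltk.download('cmudict', quiet=True)   # only needed if the corpus is missing
prondict = nltk.corpus.cmudict.dict()

def cmudict_syllables(word):
    # in cmudict, vowel phones carry a stress digit (e.g. 'AH0'), so counting
    # digit-terminated phones counts syllables
    phones = prondict[word][0]
    return sum(ph[-1].isdigit() for ph in phones)

print(cmudict_syllables('compare'))    # -> 2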
Ejemplo n.º 56
0
def find_all_ngrams(input_string, max_n):
    vectorizer = CountVectorizer(ngram_range = (1, max_n))
    analyzer = vectorizer.build_analyzer()
    return(analyzer(input_string))
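For example, with the scikit-learn defaults (lowercasing, single-character tokens dropped) the call behaves roughly like this:

print(find_all_ngrams('The quick brown fox', 2))
# expected along the lines of:
# ['the', 'quick', 'brown', 'fox', 'the quick', 'quick brown', 'brown fox']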
Ejemplo n.º 57
0
 def __init__(self, callback_func, relevant_kw):
     self.callback_func = callback_func
     self.relevant_kw = relevant_kw
     self.stop_words = create_stop_words()
     vect_kw = CountVectorizer(tokenizer=MyTokenizer(), ngram_range=(1, 3), stop_words=self.stop_words)
     self.analyse_kw = vect_kw.build_analyzer()
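MyTokenizer and create_stop_words are defined elsewhere in the original project; with simple stand-ins, the resulting keyword analyzer behaves like this (a sketch, not the real helpers):

from sklearn.feature_extraction.text import CountVectorizer

def simple_tokenizer(text):            # stand-in for MyTokenizer()
    return text.split()

stop_words = ['the', 'a', 'of']        # stand-in for create_stop_words()
vect_kw = CountVectorizer(tokenizer=simple_tokenizer, ngram_range=(1, 3),
                          stop_words=stop_words)
analyse_kw = vect_kw.build_analyzer()
print(analyse_kw('the price of crude oil'))
# roughly: ['price', 'crude', 'oil', 'price crude', 'crude oil', 'price crude oil']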
Ejemplo n.º 58
0
 clf_7 = Pipeline([
     ('vect', TfidfVectorizer(
                 stop_words=stop_words,
                 token_pattern=ur"\b[a-z0-9_\-\.]+[a-z][a-z0-9_\-\.]+\b",         
     )),
     ('clf', MultinomialNB(alpha=0.01)),
 ]) 
 
 evaluate_cross_validation(clf_7, news.data, news.target, 5)
 '''
 
 
 
 from sklearn.feature_extraction.text import TfidfTransformer
 transformer = TfidfTransformer()
 
 def my_tokenizer(s):
     return s.split()
 vectorizer = CountVectorizer(tokenizer=my_tokenizer)
 text = 'I am sure some bashers of Pens fans are pretty confused about the lack'
 print(vectorizer.build_analyzer()(text))
 print(vectorizer.build_tokenizer()(text))
 print(vectorizer.build_preprocessor()(text))
 
 s1 = 'rạng sáng nay theo giờ hà_nội danh_hiệu cầu_thủ giá_trị mvp giải mls năm được công_bố tiền_đạo gốc việt_lee_nguyễn ứng_viên sáng_giá không kém đôi ngôi_sao đá giải ngoại_hạng robbie_keane los_angeles_galaxy obafemi_martins seattle_sounders bình_chọn dựa số phiếu clb dự mls giới truyền_thông cầu_thủ robbie_keane người số phiếu trận chung_kết mls cup robbie_keane los_angeles_galaxy giành danh_hiệu cầu_thủ giá_trị mls lee_nguyễn được đánh_giá cao bình_chọn ảnh espn lee_nguyễn xếp thứ_ba bình_chọn đạt tổng_số phiếu mùa lee_nguyễn ghi bàn năm pha kiến_tạo cuối giải thi_đấu ấn_tượng vai_trò cầm_trịch lối chơi ghi_bàn cho new_england_revolution vòng play off mls cup tiền vệ_sinh năm ghi thêm hai bàn ba pha kiến_tạo đưa revolution đoạt vô_địch mls khu_vực miền đông giành vé dự chung_kết mls cup đối_đầu đội bóng keane la galaxy tháng lee_nguyễn được hlv jurgen_klinsmann triệu_tập trở_lại tuyển mỹ nhờ phong_độ ấn_tượng mls cựu inter_milan newcastle_utd obafemi_martins đứng thứ_hai số phiếu bầu cầu_thủ clb phiếu bầu clb phiếu bầu truyền thông phiếu bầu cầu thủ tổng robbie_keane la galaxy obafemi_martins seattle_sounders lee_nguyễn new england rev bradley_wright phillips ny  red_bulls tuấn'
 s2 = 'lee_nguyễn trải một năm thi_đấu hoàn_hảo ảnh usa today kết_quả được công_bố trang thông_tin chính_thức ban tổ_chức giải mls phần bình_luận tiền_vệ công lee_nguyễn đoạn lọt danh_sách bầu_chọn cuối_cùng cho danh_hiệu cầu_thủ giá_trị mls cho thấy lee_nguyễn một bước đột_phá sự_nghiệp nơi đanh ghi bàn đứng thứ_tư danh_sách vua_phá_lưới mùa vừa_qua tiền_vệ ghi_bàn cao lịch_sử mls chân chuyền đứng thứ_hai new_england năm pha kiến_tạo thành_công lee_nguyễn hoàn_toàn xứng_đáng lần đầu_tiên được lọt vào đội_hình tiêu_biểu mùa pha lập_công kiến_tạo lối chơi sáng_tạo ổn_định lee_nguyễn góp_phần quan_trọng làm_nên mùa giải thành_công rực_rỡ new_england_revolution họ nhì mls miền đông khi đăng_quang mls cup khu_vực đồng_nghĩa một suất vào chung_kết mls cup toàn_quốc nhờ lọt vào danh_sách rút_gọn cuối_cùng cho đua cầu_thủ giá_trị mvp robbie_keane los_angeles_galaxy obafemi_martins seattle_sounders bàn thắng gỡ hòa 1-1 vào lưới houston_dynamo tuần ngôi_sao sinh năm lọt danh_sách bốn bàn thắng đẹp mls sau bảy năm được gọi trở_lại đội_tuyển mỹ đội_hình tiêu_biểu mùa vừa_qua los_angles_galaxy đóng_góp nhiều ba cá_nhân chia đều hàng thủ đến hàng công đội bóng đối_thủ cạnh_tranh vô_địch mls cup lee_nguyễn revolution sân stubhub_center california ngày tới đội_hình tiêu_biểu mls mùa thủ_môn bill_hamid dc united hậu_vệ bobby_boswell dc united omar_gonzalez los_angeles_galaxy chad_marshall seattle_sounders tiền_vệ landon_donovan los_angeles_galaxy thierry_henry new_york_red_bulls lee_nguyễn new_england_revolution diego_valeri portland_timbers tiền_đạo robbie_keane los_angeles_galaxy obafemi_martins seattle_sounders fc bradley_wright phillips new_york_red_bulls đông_anh'
 s3 = 'thành_lương đỏ làm_nên tuyệt_phẩm trận đấu cuối_cùng bảng philippines ảnh giang_huy malaysia tập_trung hôm_qua để chuẩn_bị cho trận đấu tuyển việt_nam ngày sân_nhà shah_alam sau khi lách khe cửa hẹp để giành vị_trí thứ_hai bảng tay đội singapore thầy_trò salleh háo_hức muốn được kết_quả thật tốt một lời xin_lỗi để cđv nhà thất_vọng thời_gian gì phát_biểu có_thể thấy salleh nghiên_cứu kỹ báo_cáo hlv_u2 ong_kim_swee người được liên_đoàn bóng_đá malaysia fam cử sang hà_nội theo_dõi đối_thủ bảng trọng_tâm tuyển việt_nam đá giao_hữu tuyển việt_nam giải đấu nên phần_nào biết làm gì để kiềm_chế sức_mạnh họ salleh tiết_lộ báo_giới malaysia chúng tô đặc_biệt cẩn_trọng số nguyễn_văn_quyết số phạm_thành_lương cầu_thủ nguy_hiểm ong_kim_swee cho biết như_thế cầu_thủ văn_quyết đỏ chưa ghi_bàn được đối_thủ đánh_giá cao lối chơi ảnh giang_huy cá_nhân ong_kim_swee đưa nhận_xét tuyển việt_nam sau một thời_gian do_thám đội bóng xây_dựng được một phong_cách hoàn_toàn khác_biệt thời hlv người nhật_bản_toshiya_miura họ cầm bóng tốt không_bao_giờ chuyền bóng ngược sau luôn hướng lên phía miura sở_hữu cầu_thủ kỹ_thuật cá_nhân tốt malaysia cảnh_giác mỗi khi đối_phương bóng sát vòng cấm_địa việt_nam ghi hai bàn vào lưới philippines cú sút xa khi được hỏi điểm yếu tuyển việt_nam ong_kim_swee người giúp u23 malaysia vô_địch sea games tỏ bí_hiểm gì thấy một tập_thể gắn_kết mỗi vị_trí đều điểm yếu họ để thủng lưới ba lần điểm yếu có_thể tận_dụng khai_thác hlv salleh đen âm_thầm chuẩn_bị kế_hoạch gây bất_ngờ tuyển việt_nam sân_nhà ảnh ts bên_cạnh việc tìm cách phong_tỏa hai ngòi_nổ tuyển việt_nam salleh cố_gắng giải_quyết khoảng_trống shukor_adan mohd_amri_yahya để hai cầu_thủ trụ_cột đều vắng_mặt trận lượt_đi án treo_giò indra_putra_mahyuddin kunanlan manaf_mamat đều có_thể được tung vào sân_sau khi minh_chứng được khả_năng buổi tập safiq_rahim mohd_muslim có_thể đá vị_trí tiền_vệ trụ thay_thế cho shukor_adan salleh tiết_lộ ít_nhiều khung đội_hình thi_đấu cuối tuần người thay_thế amri_yahya trận đấu kulanan hoặc manaf_mamat tuấn'
 corpus = [s1, s2, s3]
 
 
 print('Done')
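The TfidfTransformer created earlier in this snippet is never applied; wiring it up to the vectorizer and corpus defined above would look roughly like this:

counts = vectorizer.fit_transform(corpus)   # term counts from the whitespace my_tokenizer
tfidf = transformer.fit_transform(counts)   # reweight counts by inverse document frequency
print(tfidf.shape)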
def get_bigrams_trigrams(text=[], termCount=20, w2v=None, es=None):
        
        bigram_vectorizer = CountVectorizer(ngram_range=(2,2))
        bigram_analyze = bigram_vectorizer.build_analyzer()
        trigram_vectorizer = CountVectorizer(ngram_range=(3,3))
        trigram_analyze = trigram_vectorizer.build_analyzer()
        
        bi_results= map(lambda t: bigram_analyze(t), text)
        tri_results= map(lambda t: trigram_analyze(t), text)
        
        bigrams = []
        bi_dict_corpus = {}
        for doc in bi_results:
                bi_dict={}
                for bi in doc:
                        bi=bi.replace(' ','_')
                        if bi in bi_dict:
                                bi_dict[bi] = bi_dict[bi] + 1
                        else:
                                bi_dict[bi] = 1 
                                
                if bi_dict:
                        # Yamuna: Removing for now as it is slow
                        #phrases = remove_stopword_phrases(bi_dict.keys())        
                        phrases = bi_dict.keys()
                        if w2v.word_vec is None:
                                results = get_documents(phrases, "term", ["term"], "word_phrase_to_vec", "terms", es)
                                phrases = [res.lower() for res in results.keys()]
                        else:
                                phrases = [term for term in phrases if not w2v.get(term) is None]
                        
        
                        bi_dict_subset = {phrase: bi_dict[phrase] for phrase in phrases}
                        if bi_dict_subset:
                                bigrams.append(bi_dict_subset)
                                for phrase in bi_dict_subset.keys():
                                        if phrase in bi_dict_corpus:
                                                bi_dict_corpus[phrase] = bi_dict_corpus[phrase] + bi_dict_subset[phrase]
                                        else:
                                                bi_dict_corpus[phrase] = bi_dict_subset[phrase]
                                                
                        
        trigrams = []
        tri_dict_corpus = {}
        for doc in tri_results:
                tri_dict={}
                for tri in doc:
                        tri=tri.replace(' ','_')
                        if tri in tri_dict:
                                tri_dict[tri] = tri_dict[tri] + 1
                        else:
                                tri_dict[tri] = 1
                if tri_dict:
                        # Yamuna: Removing for now as it is slow
                        #phrases = remove_stopword_phrases(tri_dict.keys())        
                        phrases = tri_dict.keys()
                        if w2v.word_vec is None:
                                results = get_documents(phrases, "term", ["term"], "word_phrase_to_vec", "terms", es)
                                phrases = [res for res in results.keys()]
                        else:
                                phrases = [term for term in phrases if not w2v.get(term) is None]

                        tri_dict_subset = {phrase: tri_dict[phrase] for phrase in phrases}
                        if tri_dict_subset:
                                trigrams.append(tri_dict_subset)
                                for phrase in tri_dict_subset.keys():
                                        if phrase in tri_dict_corpus:
                                                tri_dict_corpus[phrase] = tri_dict_corpus[phrase] + tri_dict_subset[phrase]
                                        else:
                                                tri_dict_corpus[phrase] = tri_dict_subset[phrase]
                                                
        return bigrams, trigrams, sorted(bi_dict_corpus.items(), key=operator.itemgetter(1), reverse=True)[0:termCount], sorted(tri_dict_corpus.items(), key=operator.itemgetter(1), reverse=True)[0:termCount]
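The word2vec / Elasticsearch filtering above is project-specific; the underlying corpus-level bigram counting can be exercised on its own with a simplified sketch that skips the phrase filtering:

from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

docs = ['the quick brown fox jumps', 'the quick red fox sleeps']
bigram_analyze = CountVectorizer(ngram_range=(2, 2)).build_analyzer()

bi_dict_corpus = Counter()
for doc in docs:
    bi_dict_corpus.update(bg.replace(' ', '_') for bg in bigram_analyze(doc))
print(bi_dict_corpus.most_common(5))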
def split_into_lemmas(tweet):
    bigram_vectorizer = CountVectorizer(ngram_range=(1, 3), token_pattern=r'\b\w+\b', min_df=1)
    analyze = bigram_vectorizer.build_analyzer()
    return analyze(tweet)
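For example (output order follows scikit-learn's n-gram generation: all unigrams, then bigrams, then trigrams):

print(split_into_lemmas('I love this phone'))
# roughly: ['i', 'love', 'this', 'phone', 'i love', 'love this', 'this phone',
#           'i love this', 'love this phone']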