Example n. 1
    def printLSA(self):
        corpus = []
        for message in self.message_list:
            corpus += message.text
#         for message in self.message_list:
#             for text in message.text:
#                 corpus.append(text)
        #tfidf stuff
        vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
        X = vectorizer.fit_transform(corpus)
        idf = vectorizer.idf_
        #lsa stuff
        lsa = TruncatedSVD(n_components=27, n_iter=100)
        lsa.fit(X)
    
        print dict(zip(vectorizer.get_feature_names(), idf))
        print ""
        
        #print related concepts
        terms = vectorizer.get_feature_names()
        for i, comp in enumerate(lsa.components_): 
            termsInComp = zip (terms,comp)
            sortedTerms =  sorted(termsInComp, key=lambda x: x[1], reverse=True) [:10]
            print "Concept %d:" % i
            for term in sortedTerms:
                print term[0]
            print " "
        
        #print sorted stuff to see    
        v = sorted(zip(vectorizer.get_feature_names(), idf), key=lambda x:x[1])
        print v
        print "\n\n"
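A minimal standalone sketch of the same tf-idf → TruncatedSVD ("LSA") pipeline, using a small made-up corpus in place of self.message_list; the texts and n_components below are illustrative assumptions only.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

corpus = [
    "the game was a great game of football",
    "the election results were announced on television",
    "the football team won the championship game",
    "voters followed the election results on television",
]

vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
X = vectorizer.fit_transform(corpus)

lsa = TruncatedSVD(n_components=2, n_iter=100)  # n_components must stay below the number of features
lsa.fit(X)

terms = vectorizer.get_feature_names()  # get_feature_names_out() in newer scikit-learn
for i, comp in enumerate(lsa.components_):
    top = sorted(zip(terms, comp), key=lambda x: x[1], reverse=True)[:5]
    print("Concept %d: %s" % (i, ", ".join(t for t, _ in top)))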
Example n. 2
def getFeatures(tweets, vocabularyWords):
	"""
		Gets the features (word count, represented as a sparse matrix), 
		where we can recover the particular feature labels.

		We then weight features via Tf-idf terms. (http://en.wikipedia.org/wiki/Tf%E2%80%93idf)

		See: http://scikit-learn.org/dev/modules/feature_extraction.html#text-feature-extraction
	"""
	from sklearn.feature_extraction.text import TfidfVectorizer

	vectorizer = TfidfVectorizer(vocabulary = vocabularyWords, ngram_range = (1, 3))
	features = vectorizer.fit_transform(tweets)

	# print "features are: "
	# print features.toarray()
	print "features length is: "
	print len(features.toarray()[0])

	# print "feature names are: "
	# print vectorizer.get_feature_names()
	print "feature name lengths are: "
	print len(vectorizer.get_feature_names())

	return (features.toarray(), vectorizer.get_feature_names())
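A small usage sketch of the same idea, restricting tf-idf to a fixed vocabulary with 1- to 3-grams; the tweets and vocabulary below are made up for illustration.

from sklearn.feature_extraction.text import TfidfVectorizer

tweets = [
    "new phone battery lasts all day",
    "battery life on this phone is terrible",
    "all day battery life is a big deal",
]
vocabularyWords = ["phone", "battery", "battery life", "all day"]

vectorizer = TfidfVectorizer(vocabulary=vocabularyWords, ngram_range=(1, 3))
features = vectorizer.fit_transform(tweets)

print(features.toarray().shape)         # (3, 4): one row per tweet, one column per vocabulary entry
print(vectorizer.get_feature_names())   # the fixed vocabulary, in column order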
Example n. 3
def test_text_vectorization():
    mongo_dataset = MongoHC("hc", "re0")
    data = [d for d in mongo_dataset.get_all(order_by="id_doc")]
    text = [d["text"] for d in data[1:2]]
    tfidf_vectorizer = TfidfVectorizer(
        max_df=1,
        max_features=200000,
        min_df=1,
        stop_words="english",
        strip_accents="unicode",
        use_idf=True,
        ngram_range=(1, 1),
        norm="l2",
    )
    tfidf_matrix = tfidf_vectorizer.fit_transform(text)
    print tfidf_vectorizer.get_feature_names()
    print tfidf_matrix.data

    indices = np.argsort(tfidf_vectorizer.idf_)[::-1]
    print indices
    features = tfidf_vectorizer.get_feature_names()
    top_n = 5
    top_features = [features[i] for i in indices[:top_n]]

    print len(features)
    print tfidf_matrix.shape
    print top_features
Example n. 4
def text_to_vectors(dirname_or_textdata,test_dirname_or_textdata=None,ngram_range=(1, 1),verbose=False):
    if isinstance(dirname_or_textdata,str):
        textdata=load_files(dirname_or_textdata,verbose)
    else:
        textdata=dirname_or_textdata

    from sklearn.feature_extraction.text import TfidfVectorizer
    vectorizer = TfidfVectorizer(ngram_range=ngram_range)
    vectors = vectorizer.fit_transform(textdata.data)
    
    data=Struct()
    data.vectorizer=vectorizer
    data.vectors=vectors
    data.targets=textdata.targets
    data.target_names=textdata.target_names
    data.feature_names=vectorizer.get_feature_names()
    
    if not test_dirname_or_textdata is None:
        if isinstance(test_dirname_or_textdata,str):
            textdata=load_files(test_dirname_or_textdata,verbose)
        else:
            textdata=test_dirname_or_textdata

        test_vectors = vectorizer.transform(textdata.data)
        test_data=Struct()
        test_data.vectorizer=vectorizer
        test_data.vectors=test_vectors
        test_data.targets=textdata.targets
        test_data.target_names=textdata.target_names
        test_data.feature_names=vectorizer.get_feature_names()
        
        return data,test_data
    else:
        return data
Example n. 5
def test2():
    with codecs.open('/home/zhangwj/Applications/Scrapy/baike/files/data_fenci.txt', 'rb', encoding='utf-8') as f:
        data_samples = f.read().splitlines()  # fit_transform expects an iterable of documents, one per line
    n_features = 1000
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,  # CountVectorizer turns the words of each text into a term-frequency matrix via fit_transform
                                       max_features=n_features, stop_words=[u"应该"]  # stop_words must be 'english', a list of words, or None
                                       )  # TfidfTransformer then computes the tf-idf weight of every term
    tfidf = tfidf_vectorizer.fit_transform(data_samples)  # returns a sparse [n_samples, n_features] tf-idf-weighted document-term matrix
    tfidf_vectorizer.get_feature_names()  # feature names for the columns of the tf-idf matrix; each sample is one document
def main(K, numfeatures, sample_file, num_display_words, outputfile):
    K_clusters = K
    stop_words = set(stopwords.words('spanish')).union(set(['http','www','san', '099','098','096','097']))
    #stop_words = [word.decode('utf-8') for word in stopwords.words('spanish')]#stopwords.words("spanish")
    vectorizer = TfidfVectorizer(max_df=0.5, max_features=numfeatures,
                                     min_df=2, stop_words=set(stop_words),
                                     use_idf=True)

    text = []

    with open(sample_file, 'rb') as csvfile:
         reader = csv.reader(csvfile)
         for row in reader:
             text.append(row[1])

    t0 = time()
    print("Extracting features from the training dataset using a sparse vectorizer")
    X = vectorizer.fit_transform(text)
    print("done in %fs" % (time() - t0))
    print("n_samples: %d, n_features: %d" % X.shape)

    idf = vectorizer.idf_
    words = dict(zip(vectorizer.get_feature_names(), idf))

    terms = sorted(words, key=words.__getitem__)[0:10]

    # mapping from feature id to actual word
    id2words ={}
    for i,word in enumerate(vectorizer.get_feature_names()):
        id2words[i] = word

    t0 = time()
    print("Applying topic modeling, using LDA")
    print(str(K_clusters) + " topics")
    corpus = matutils.Sparse2Corpus(X,  documents_columns=False)
    lda = models.ldamodel.LdaModel(corpus, num_topics=K_clusters, id2word=id2words)
    print("done in %fs" % (time() - t0))

    #write json version
    json_data = {"terms":terms,"topics":None}
    json_topics = []
    for i, item in enumerate(lda.show_topics(num_topics=K_clusters, num_words=num_display_words, formatted=False)):
        topic = {}
        topic['name']= "topic" + str(i)
        topic['children']= []
        for weight,term in item:
            child = {}
            child['name'] = term
            child['weight'] = weight
            topic['children'].append(child)
            #output_text.append( term + " : " + str(weight) )
        json_topics.append(topic)
    json_data['topics'] = json_topics

    with open(outputfile + ".json", 'w') as outfile:
        json.dump(json_data, outfile)
def tfidf_vectorizer(codex,\
                     max_df=1,\
                     min_df=0,\
                     stop_words='english',\
                     train_split=False
                     ):
    """
        Calculate term frequency for words in all comments 

        Input:  text string (nouns only from noun_tokenizer)
        Output: transformed input, term list from tfidf, model
    """

    #Select english stopwords
    cachedStopWords = set(stopwords.words("english"))

    #Add words to stopwords list
    cachedStopWords.update(('and','I','A','And','So','arnt','This','When','It',\
                            'many','Many','so','cant','Yes','yes','No','no',\
                            'These','these','',' ','ok','na', 'edit','idk',\
                            'gon','wasnt','yt','sure','watch','whats','youre',\
                            'theyll','anyone'
                            ))
    if train_split:
        #Initialize model
        vectorizer = TfidfVectorizer(max_df=max_df,\
                                     min_df=min_df,\
                                     stop_words=cachedStopWords\
                                     )
        x_train, x_test = train_test_split(codex)

        #Transform codex to vectors and calculate TFIDFs
        X = vectorizer.fit_transform(x_train)

        #Get all word tokens
        terms = vectorizer.get_feature_names()
        return X, terms, vectorizer
    else:
        #Initialize model
        vectorizer = TfidfVectorizer(max_df=max_df,\
                                     min_df=min_df,\
                                     stop_words=cachedStopWords
                                     )
        
        #Transform codex to vectors and calculate TFIDFs
        X = vectorizer.fit_transform(codex)

        #Get all word tokens
        terms = vectorizer.get_feature_names()
        return X, terms, vectorizer
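A condensed sketch of the same pattern: extend NLTK's English stop words and hand the combined set to TfidfVectorizer. It assumes the NLTK stopwords corpus is downloaded; the comments are invented.

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

codex = [
    "idk this video is ok I guess",
    "the tutorial was really helpful",
    "gonna watch the next tutorial for sure",
]

cachedStopWords = set(stopwords.words("english"))
cachedStopWords.update(("ok", "idk", "gon", "na", "sure", "watch"))  # chat-style fillers

vectorizer = TfidfVectorizer(max_df=1.0, min_df=1, stop_words=cachedStopWords)
X = vectorizer.fit_transform(codex)
print(vectorizer.get_feature_names())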
Example n. 8
def LoadDocuments(fname, collect_links):
    crawl_data, urls, titles, relationships = pages_to_mem(fname, collect_links)
    tfidfVect = TfidfVectorizer(strip_accents='unicode', stop_words='english', ngram_range=(1,2), sublinear_tf=True)
    term_tfidf = tfidfVect.fit_transform(crawl_data)
    dict_values = tfidfVect.get_feature_names()
    i = iter(dict_values)
    term_b = dict(izip(i, xrange(len(dict_values))))    # dictionary of words and indicies

    tfidfVect = TfidfVectorizer(strip_accents='unicode', stop_words='english', ngram_range=(1,2))
    title_tfidf = tfidfVect.fit_transform(titles)
    dict_values = tfidfVect.get_feature_names()
    i = iter(dict_values)
    title_b = dict(izip(i, xrange(len(dict_values))))    # dictionary of words and indicies
    return title_tfidf, title_b, term_tfidf, term_b, urls, relationships
Example n. 9
def test1():
    n_samples = 2000
    n_features = 1000
    print("Loading dataset...")
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))
    data_samples = dataset.data[:n_samples]

    # Use tf-idf features for NMF.
    print("Extracting tf-idf features for NMF...")
    tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2,
                                       max_features=n_features,
                                       stop_words='english')
    tfidf = tfidf_vectorizer.fit_transform(data_samples)  #sparse matrix, [n_samples, n_features],Tf-idf-weighted document-term matrix.
    tfidf_vectorizer.get_feature_names() # feature names for the columns of the tf-idf (sample x feature) weight matrix; each sample is one document
Example n. 10
def rocchio(request):
	from sklearn.feature_extraction.text import CountVectorizer
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.feature_extraction.text import TfidfTransformer
	from sklearn.utils.extmath import randomized_svd
	from sklearn import feature_selection
	import pandas as pd
	document_index = []
	s = SessionStore()
	sessionData = db.sessionHistory.find_one({"session_key":s.session_key})
	urls_visited = sessionData['url_visited']
	urls = []
	for url in urls_visited:
		urls.append(url[0])
	bodyContentList = db.crawledCollection.find({'url':{"$in":urls}}, {'body':1})
	body = []
	terms = []
	for x in bodyContentList:
		body.append(re.sub(r'[!@#$%^&*()\[\]./<>?\\|`~=_+0-9-]', '', x['body']))  # strip punctuation and digits

	# Turning the body content into a bag of words
	top_features=[]
	
	vectorizer = TfidfVectorizer(stop_words = 'english')
	X = vectorizer.fit_transform(body)
	indices = np.argsort(vectorizer.idf_)[::-1]
	features = vectorizer.get_feature_names()
	top_n = 10
	top_features.append([features[i] for i in indices[:top_n]])

	print top_features
	
	vectorizer = CountVectorizer(min_df = 1, stop_words = 'english')
	dtm = vectorizer.fit_transform(body)

	index=pd.DataFrame(dtm.toarray(),index=body,columns=vectorizer.get_feature_names())
	indexterms=vectorizer.get_feature_names()
	
	transform=TfidfTransformer()
	tfidf=transform.fit_transform(dtm)
	
	U, Sigma, V = randomized_svd(tfidf, n_components=5,
                                      n_iter=5, transpose=True,
                                      random_state=None)
	

	#getting the highest count of words and adding it into the query
	return HttpResponse(top_features)
Example n. 11
def tfidf(synopses):
    tfidf_vectorizer=TfidfVectorizer(max_df=0.8, max_features=200000, min_df=0.2, stop_words='english', use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))
    tfidf_matrix = tfidf_vectorizer.fit_transform(synopses)
    terms=tfidf_vectorizer.get_feature_names()
    print("terms:",terms)
    print(tfidf_matrix.shape)
    return terms, tfidf_matrix  # return the tf-idf matrix
Example n. 12
  def get_top_terms(self, stops=STOPS):

    # vectorize using 1- to 3-grams
    vectorizer = TfidfVectorizer(stop_words=stops, ngram_range=(1,3))
    tfidf = vectorizer.fit_transform(self.docs)

    # enumerate feature names, ie. the actual words
    self.feature_names = vectorizer.get_feature_names()

    # convert to dense array
    dense = tfidf.todense()

    # container for top terms per doc
    self.features = []

    for doc in dense:
      doc = doc.tolist()[0]

      # creates a list of tuples, (term_id, score)
      phrase_scores = [pair for pair in zip(range(0, len(doc)), doc) if pair[1] > 0]
      # feature_ids = sorted(phrase_scores, key=lambda t: t[1] * -1)
      doc_features = []

      for f_ in phrase_scores:
        fname = self.feature_names[f_[0]]
        fscore = f_[1]
        doc_features.append((fscore, fname))

      top_terms = sorted(doc_features, reverse=True) #[:n_terms]
      # top_terms = ",".join([ x[1] for x in top_terms ])
      self.features.append(top_terms)
def preprocess(word_data, targets):
    print("\n### PREPROCESSING DATA ###")

    # vectorize
    print("-- Vectorization")
    vectorizer = TfidfVectorizer(sublinear_tf=True)  # , stop_words='english'
    data_transformed = vectorizer.fit_transform(word_data)

    # feature selection
    print("-- Feature Selection")
    selector = SelectPercentile(percentile=5)
    data_selected = selector.fit_transform(data_transformed, targets)
    if data_selected.shape[1] == 0:
        data_selected = data_transformed
    else:
        print("Top {} features were selected".format(data_selected.shape[1]))

        # print top features
        nr_features = 30
        i = selector.scores_.argsort()[::-1][:nr_features]
        top_features = np.column_stack((np.asarray(vectorizer.get_feature_names())[i],
                                        selector.scores_[i],
                                        selector.pvalues_[i]))
        print("\nTop %i Features:" % nr_features)
        print(pd.DataFrame(top_features, columns=["token", "score", "p-val"]), "\n")

    features_train, features_test, labels_train, labels_test = \
        train_test_split(data_selected, targets, test_size=0.2, stratify=targets)

    return features_train, features_test, labels_train, labels_test
Example n. 14
def get_peronalpreference_vectors(vocab, user_pref_values):
    vectorizer = TfidfVectorizer(vocabulary=vocab, lowercase=False)
    vectors = vectorizer.fit_transform(user_pref_values).toarray()
    words = vectorizer.get_feature_names()
    # idf = vectorizer.idf_
    # print dict(zip(vectorizer.get_feature_names(), idf))
    return words, vectors
Example n. 15
def cluster(data, k):

    vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words=['nfl','game','team'])

    td_matrix = vectorizer.fit_transform(data)
    km = KMeans(n_clusters=k, init='k-means++', max_iter=300, n_jobs=-1)
    km.fit(td_matrix)
    order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    terms = vectorizer.get_feature_names()

    def count(acc,value):
        acc[value] += 1
        return acc

    cluster_counts = reduce(count, km.labels_, [0]*k)

    #_max = (0,0)
    #for i in range(0,len(cluster_counts)):
    #    if _max[1] < cluster_counts[i]:
    #        _max = (i,cluster_counts[i])

    #print _max[0], _max[1], float(_max[1]) / len(data)
    # print counts

    result = []

    for i in reversed(numpy.array(cluster_counts).argsort()):
        x = [float(cluster_counts[i])/len(data)]
        for ind in order_centroids[i, :10]:
            x.append(terms[ind])
        result.append(x)

    return result
def classify(clf, chapter_contents_train, y_train, chapter_contents_test,k=20):
    # convert the training data text to features using TF-IDF vectorization
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words='english')
    X_train = vectorizer.fit_transform(chapter_contents_train)
    # X_train_array = X_train.toarray()
    # print "tfidf vector length: ", len(X_train_array) #dbg
    # print "X_train_array[0] length: ", len(X_train_array[0]) #dbg

    # use only the best k features according to chi-sq selection
    ch2 = SelectKBest(chi2, k=k)
    X_train = ch2.fit_transform(X_train, y_train)

    # determine the actual features used after best-k selection
    feature_names = np.asarray(vectorizer.get_feature_names())
    chisq_mask = ch2.get_support()
    features_masks = zip(feature_names,chisq_mask)
    selected_features = [z[0] for z in features_masks if z[1]]

    # train the classifier
    clf.fit(X_train, y_train)

    # convert the test data text into features using the same vectorizer as for training
    X_test = vectorizer.transform(chapter_contents_test)
    X_test = ch2.transform(X_test)

    # obtain binary class predictions for the test set
    preds = clf.predict(X_test)
    return preds, selected_features, clf
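A self-contained sketch of the tf-idf → chi-squared SelectKBest step above, recovering the names of the surviving features; the toy chapters, labels, and k=3 are assumptions for illustration.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

chapters = [
    "the dragon burned the village at night",
    "the knight rode out to face the dragon",
    "the merchants argued about the price of grain",
    "grain prices rose again at the market",
]
y = [1, 1, 0, 0]  # 1 = fantasy chapter, 0 = trade chapter

vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english')
X_train = vectorizer.fit_transform(chapters)

ch2 = SelectKBest(chi2, k=3)
X_train = ch2.fit_transform(X_train, y)

feature_names = np.asarray(vectorizer.get_feature_names())
selected_features = feature_names[ch2.get_support()]  # boolean mask of the k kept columns
print(selected_features)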
Example n. 17
def tfidf_word_match_share(question1, question2):
    qs = question1 + question2
    tfidf_vectorizer = TfidfVectorizer(stop_words='english', min_df=3)
    tfidf_matrix = tfidf_vectorizer.fit_transform(qs)
    feature_names = tfidf_vectorizer.get_feature_names()
    # dense = tfidf_matrix.todense()
    # word_index_dict = dict((j, i) for i, j in enumerate(feature_names))

    tf_idf = []
    for q1, q2 in zip(question1, question2):
        q1words = {}
        q2words = {}
        for word in str(q1).lower().split():
            if word not in stops:
                q1words[word] = 1
        for word in str(q2).lower().split():
            if word not in stops:
                q2words[word] = 1
        if len(q1words) == 0 or len(q2words) == 0:
            tf_idf.append([0])
        else:
            q1_tfidf = tfidf_vectorizer.transform([" ".join(q1words.keys())])
            q2_tfidf = tfidf_vectorizer.transform([" ".join(q2words.keys())])
            inter = np.intersect1d(q1_tfidf.indices, q2_tfidf.indices)
            shared_weights = 0
            for word_index in inter:
                shared_weights += (q1_tfidf[0, word_index] + q2_tfidf[0, word_index])
            total_weights = q1_tfidf.sum() + q2_tfidf.sum()
            if np.sum(total_weights) == 0:
                tf_idf.append([0])
            else:
                score = np.sum(shared_weights) / np.sum(total_weights)
                tf_idf.append([round(score, 2)])
    print("Created tf_idf features feature")
    return np.array(tf_idf)
Example n. 18
	def get_tfidf_model(self, dirname):
		data = Sentences(dirname)
		tfidf_vectorizer = TfidfVectorizer(stop_words='english')
		tfidf_matrix = tfidf_vectorizer.fit_transform(data)
		mat_array = tfidf_matrix.toarray()
		fn = tfidf_vectorizer.get_feature_names()
		return tfidf_vectorizer
Example n. 19
class Train:
    """Using non-negative matrix factorization to learn the vector of a document"""
    def __init__(self,filename_in):
        self.text = []
        for line in open(filename_in,'rb'):
            self.text.append(line.strip().decode('utf-8'))

    def train(self,n_topics=10):
        self.vectorizer = TfidfVectorizer(min_df=0.001,max_df=0.6)
        tfidf = self.vectorizer.fit_transform(self.text)
        n_samples = len(self.text)
        print("Fitting the NMF model with n_samples=%d and n_features=%d..."
            % (n_samples,n_topics))
        self.nmf = NMF(n_components = n_topics, random_state = 1).fit(tfidf)

    def show_result(self,n_top_words=10):
        feature_names = self.vectorizer.get_feature_names()
        for topic_idx, topic in enumerate(self.nmf.components_):
            print("Topic #%d:" % topic_idx)
            print(" ".join([feature_names[i]
                    for i in topic.argsort()[:-n_top_words - 1:-1]]))
            print()

    def __str__(self):
        return '{} topics'.format(np.shape(self.nmf.components_)[1])
Example n. 20
def get_salience_matrix(keys, salient_set):
    """ run test set on salient terms """
    salient_feats = []
    tfidf = TfidfVectorizer(stop_words="english")
    top_n = 100
    for key in keys:
        salience_test = []
        top_terms = []
        history = clean(tweets[str(key)]["audience"]["user"]["history"])[1:]
        # print len(history)
        try:
            teeeff = tfidf.fit_transform(history)
            indices = np.argsort(tfidf.idf_)[::-1]
            features = tfidf.get_feature_names()
            top_terms = [features[i] for i in indices[:top_n]]
        except:
            top_terms = []

        for term in salient_set:
            if term in top_terms:
                salience_test.append(1)
            else:
                salience_test.append(0)
        salient_feats.append(salience_test)
    return np.array(salient_feats)
Example n. 21
    def build_model(self):
        vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words = 'english')
        vector = vectorizer.fit_transform(self.df['Comment'].values).toarray()
        self.model = NMF(n_components=self.n_topics).fit(vector)
        self.features = vectorizer.get_feature_names()
        self.matrix = self.model.transform(vector)

    # From the matrix, retrieve the top example, topic words, and number of comments per topic
    def output_data(self):
        '''
        OUTPUT DataFrame
        '''
        self.examples = []
        self.comment = []
        self.topic_words = []

        # Retrieve the comment most relevant to each topic
        index = self.matrix.argmax(axis=0)
        self.df = self.df.reset_index()
        self.examples.append(self.df.ix[index]['Comment'].values)
        np.sort(self.matrix, axis=1)

        # Retrieve the number of comments relevant to each topic
        for i in range(self.n_topics):
            self.comment.append(len(self.matrix[:, i][self.matrix[:, i] > 0.05]))

        self.num_per_topics = self.comment

        # Retrieve the top 10 topic words
        for topic in self.model.components_:
            self.topic_words.append(" ".join([self.features[i]
                    for i in topic.argsort()[:-10 - 1:-1]]))

        return self.examples, self.topic_words, self.num_per_topics
def test_string_compare():
    # http://stackoverflow.com/questions/8897593/similarity-between-two-text-documents
    # ? http://stackoverflow.com/questions/2380394/simple-implementation-of-n-gram-tf-idf-and-cosine-similarity-in-python?noredirect=1&lq=1
    # "Equivalent to CountVectorizer followed by TfidfTransformer." http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
    # http://stackoverflow.com/questions/32128802/how-to-use-sklearns-countvectorizerand-to-get-ngrams-that-include-any-punctua
    # http://stackoverflow.com/questions/23850256/how-can-i-pass-a-preprocessor-to-tfidfvectorizer-sklearn-python

    from sklearn.feature_extraction.text import TfidfVectorizer
    from nltk.tokenize import TreebankWordTokenizer

    test_text_1 = ["I'd like an apple",
                   "An apple a day keeps the doctor away",
                   "Never compare an apple to an orange",
                   "I prefer scikit-learn to Orange",
                   "I'd like an apple",
                   "I'd like an orange"]
    test_text_2 = ["I'd like an apple",
                   "I'd like an apple orange"]
    
    #<> add a stemming step? could be useful for typos
                   
    vect = TfidfVectorizer(min_df=0,
                           stop_words="english",
                           #tokenizer=TreebankWordTokenizer().tokenize, #to get dashes,etc
                           lowercase=True) 
    tfidf = vect.fit_transform(test_text_2) #This is equivalent to fit followed by transform, but more efficiently implemented.
    sim_M = (tfidf * tfidf.T).A
    print "tfidf\n", tfidf       
    print      
    print "vect.get_feature_names()\n", vect.get_feature_names() 
    print                  
    print "similiarity matrix\n", sim_M
    #plt.imshow(sim_M, cmap='rainbow', interpolation='nearest')
    print "similarity between 2 sentences:", sim_M[0,1]
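The "Equivalent to CountVectorizer followed by TfidfTransformer" note cited above can be checked directly; a small sketch of that comparison, reusing the two sentences from test_text_2:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

docs = ["I'd like an apple", "I'd like an apple orange"]

direct = TfidfVectorizer().fit_transform(docs)
two_step = TfidfTransformer().fit_transform(CountVectorizer().fit_transform(docs))

# with default settings both routes give the same tf-idf matrix (up to floating point noise)
print((direct - two_step).toarray())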
Example n. 23
def get_voc_tfidfdoc_from_synant(antPath, synPath, gre):
    voc = get_voc_from_synant(antPath, synPath, gre)
    doc = defaultdict(list)
    with open(antPath, 'r') as f:
        for row in f:
            row = row.strip().split()
            target = row[0]
            if target in voc:
                for w in row[1:]:
                    if w in voc:
                        doc[target].append(w)
    with open(synPath, 'r') as f:
        for row in f:
            row = row.strip().split()
            target = row[0]
            if target in voc:
                for w in row[1:]:
                    if w in voc:
                        doc[target].append(w)
    entries = doc.keys()
    doc = doc.values()
    doc = [' '.join(x) for x in doc]
    td = TfidfVectorizer(token_pattern=u'(?u)\\b[^ ]+\\b')
    doc = td.fit_transform(doc)
    items = td.get_feature_names()
    items = [str(x) for x in items]
    return voc, doc, entries, items
Example n. 24
def create_d3_list(sym):
    tweets = []

    twitter_data = get_twitter_data(sym).items(100)
    for tweet in twitter_data:
        tweets.append(tweet.text)

    vectorizer = TfidfVectorizer(stop_words='english')
    vectors = vectorizer.fit_transform(tweets).toarray()
    words = vectorizer.get_feature_names()
    #words = [ word for word in words if word.isalpha() ]

    avg = np.sum(vectors, axis=0) #/ np.sum(vectors > 0, axis=0)

    print "top 10 by average tf-idf"
    d = enchant.Dict("en_US")
    #top_vals = [ str(word) for word in get_top_values(avg, 100, words) if d.check(word) ]
    words_avg = zip(words, avg)
    words_avg.sort(key=lambda tup: tup[1], reverse=True)

    d3_list = []
    sizing_d3 = (110 / words_avg[0][1])
    for cell in words_avg:
        if d.check(cell[0]):
            if cell[0] != 'rt':
                d3_list.append({"text": str(cell[0]), "size": cell[1] * sizing_d3})

    return d3_list
Example n. 25
	def vectorize_words(self, clean_profiles, max_features = 500) :
		# Vectorize the words in the cleaned profiles using 
		# term frequency/inverse document frequency (TF-IDF)
		print "Creating the bag of words...\n"

		# Initialize the "TfidfVectorizer" object, which is scikit-learn's
		# tf-idf weighted bag of words tool.
		vectorizer = TfidfVectorizer(min_df=1, max_features = max_features) 
		vectorizer._validate_vocabulary()

		# fit_transform() does two functions: First, it fits the model
		# and learns the vocabulary; second, it transforms our training data
		# into feature vectors. The input to fit_transform should be a list of 
		# strings.
		data_features = vectorizer.fit_transform(clean_profiles)

		# Numpy arrays are easy to work with, so convert the result to an 
		# array
		data_features = data_features.toarray()
		print data_features.shape

		vocab = vectorizer.get_feature_names()

		# Sum up the counts of each vocabulary word
		dist = np.sum(data_features, axis=0)

	
		return vectorizer, data_features, vocab, dist
Example n. 26
class Classifier(object):
	def __init__(self):

		self.classifier = LogisticRegression(intercept_scaling=100)
		self.vectorizer = TfidfVectorizer()
	
	def trainvectorizer(self,corpus):
		
		self.vectorizer.fit_transform(corpus)
		file1 = open("feature_names.txt","w")
		names = self.vectorizer.get_feature_names()
		print len(names)
		for name in names:
			file1.write(name.encode('utf8')+"\n")
		file1.close()
		print "vectrizer train is over...."


	def trainclassifier(self,train_X,train_Y):
		
		self.classifier.fit(train_X,train_Y)
		print "classifier train is over ...."

	def getfeature(self,text):#return a feature array
		matrx = self.vectorizer.transform([text]).toarray()
		array = matrx[0]
		return array
		
	def getresult(self,feature):#return true or false
		
		return self.classifier.predict(feature)
Example n. 27
    def getFeatures(self,tweets,query=None):

        # tfidf matrix
        rewroteText = [t.processedText for t in tweets]
        tfidf_vectorizer = TfidfVectorizer(max_df=0.9,
                                           max_features=10000,
                                            min_df=0.05,
                                            stop_words=STOPWORDS,
                                            use_idf=True,
                                            tokenizer=process,
                                            ngram_range=(1,2))

        tfidfMatrix =tfidf_vectorizer.fit_transform(rewroteText)
        print "Found {} meaningful words".format(tfidfMatrix.shape[1])

        self.tfidfDict = tfidf_vectorizer.get_feature_names()
        # print self.tfidfDict
        context = []
        for t in tweets:
            context.append([t.timezone,t.hasPhoto,t.sentimentScore])
        context = np.array(context)

        #print "tdidf {}".format(tfidfMatrix.shape)
        #print "norm {} {}".format(normContext.shape,type(normContext))
        features = np.hstack((tfidfMatrix.A,context))
        #print features.shape
        return features
Example n. 28
def tf_idf_features(train_data, test_data):
    # Bag-of-words representation
    tf_idf_vectorize = TfidfVectorizer()
    tf_idf_train = tf_idf_vectorize.fit_transform(train_data.data) #bag-of-word features for training data
    feature_names = tf_idf_vectorize.get_feature_names() #converts feature index to the word it represents.
    tf_idf_test = tf_idf_vectorize.transform(test_data.data)
    return tf_idf_train, tf_idf_test, feature_names
class MedicalKeywordTfIdf(BaseEstimator, TransformerMixin):
    MEDICAL_KEYWORDS = ["Medical_Keyword_" + str(i) for i in range(1, 49)]

    def __init__(self):
        self._vec = TfidfVectorizer(max_df=0.95, min_df=2)

    def get_feature_names(self):

        return [x + "_TFIDF" for x in self._vec.get_feature_names()]

    def get_data_array(self, df):

        return df[self.MEDICAL_KEYWORDS] \
            .apply(lambda x: " ".join(x[x == 1].index), axis=1).values

    def fit(self, df, y=None):
        data_arr = self.get_data_array(df)
        self._vec.fit(data_arr)

        return self

    def transform(self, df):
        data_arr = self.get_data_array(df)

        return self._vec.transform(data_arr).toarray()
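
# A hypothetical usage sketch for the transformer above: it assumes the class is
# in scope and feeds it a random 0/1 frame with the expected Medical_Keyword_1..48
# columns (the data is fabricated purely to exercise the code).
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
cols = ["Medical_Keyword_" + str(i) for i in range(1, 49)]
df = pd.DataFrame(rng.randint(0, 2, size=(30, 48)), columns=cols)

vec = MedicalKeywordTfIdf().fit(df)
X = vec.transform(df)
print(X.shape)                       # (30, number of keywords kept by min_df/max_df)
print(vec.get_feature_names()[:5])   # e.g. ['medical_keyword_1_TFIDF', ...]
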
def tfidf_scores(work_dir, output_file, ngram_range=(1, 1), num_results=5):
    tf = TfidfVectorizer(analyzer="word", ngram_range=ngram_range, min_df=0, stop_words="english")

    if work_dir[-1] != "/":
        work_dir += "/"
    files = [work_dir + f for f in os.listdir(work_dir) if "@" in f]

    corpus = []
    for f in files:
        corpus.append(word_bag(f))

    print "-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-"
    print "fitting the corpus to generate TfIdf matrix"
    tfidf_matrix = tf.fit_transform(corpus)
    print "finished!"
    print "-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-.-"
    feature_names = tf.get_feature_names()

    dense = tfidf_matrix.todense()

    g = open(output_file, "w")
    for (ii, f) in enumerate(files):
        chat = dense[ii].tolist()[0]
        phrase_scores = [pair for pair in enumerate(chat) if pair[1] > 0]

        sorted_phrase_scores = sorted(phrase_scores, key=lambda x: x[1] * -1)
        # print 'words most used in %s' %(f.split('/')[-1])
        g.write("words most used in %s\n" % (f.split("/")[-1]))
        for phrase, score in [(feature_names[word_id], score) for (word_id, score) in sorted_phrase_scores][
            :num_results
        ]:
            # print('{0: <20} {1}'.format(phrase, score))
            g.write("{0: <20} {1}\n".format(phrase, score))
Example n. 31
class LdaWithTfidf:
    def __init__(self,
                 optimising=True,
                 max_df=config.max_document_freq,
                 min_df=config.min_document_freq,
                 max_feat=config.max_features):
        self.optimising = optimising
        self.lemmatizer = WordNetLemmatizer()
        self.tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
        self.vectorizer = TfidfVectorizer(max_df=max_df,
                                          max_features=max_feat,
                                          min_df=min_df,
                                          stop_words='english',
                                          use_idf=True)

    def split_twitter_file(self):
        print("Dividing file into 70% training and 30% testing")
        allTweets = []
        columnnames = []
        with open(config.twitter_orig_file, "rt",
                  encoding="utf-8") as original:
            line_count = 0
            for line in original:
                if line_count == 0:
                    columnnames.append(line)
                    line_count += 1
                    continue
                allTweets.append(str(line))
        print(columnnames)
        random.shuffle(allTweets)
        with open(config.twitter_training_file, 'wt',
                  encoding="utf-8") as training:
            training.write(columnnames[0])
            for line in allTweets[0:int(len(allTweets) * 0.70)]:
                training.write(line)
        with open(config.twitter_test_file, 'wt', encoding="utf-8") as test:
            test.write(columnnames[0])
            for line in allTweets[int(len(allTweets) * 0.70):]:
                test.write(line)

    def read_twitter_training_dataset(self, limit=0):
        print("File directory found at " + config.twitter_training_file)
        lines = []
        tokenizer = RegexpTokenizer(r'\@\w+')
        with open(config.twitter_training_file, "rt",
                  encoding="utf-8") as training:
            reader = csv.reader(training, delimiter=',')
            line_count = 0
            for line in reader:
                if line_count == 0:
                    print(f'column names are {",".join(line)}')
                    line_count += 1
                    continue
                try:
                    timestamp = line[12]
                    user = line[7]
                    output = line[10]
                    sentence = output
                    mentions = []
                    tokenised = tokenizer.tokenize(output)
                    for token in tokenised:
                        if token.startswith('@'):
                            mentions.append(token[1:])

                except:
                    continue
                lines.append(self.apply_lemmatizer(sentence))
                line_count += 1
        if limit >= 1:
            return lines[0:limit]
        return lines

    def read_twitter_testing_dataset(self, limit=0):
        print("File directory found at " + config.twitter_training_file)
        lines = []
        tokenizer = RegexpTokenizer(r'\@\w+')
        with open(config.twitter_training_file, "rt",
                  encoding="utf-8") as training:
            reader = csv.reader(training, delimiter=',')
            line_count = 0
            for line in reader:
                if line_count == 0:
                    print(f'column names are {",".join(line)}')
                    line_count += 1
                    continue

                timestamp = line[12]
                user = line[7]
                output = line[10]
                sentence = output
                mentions = []
                tokenised = tokenizer.tokenize(output)
                for token in tokenised:
                    if token.startswith('@'):
                        mentions.append(token[1:])
                lines.append(self.apply_lemmatizer(sentence))
                line_count += 1
        if limit >= 1:
            return lines[0:limit]
        return lines

    def read_unity_dataset_training(self):
        print("File directory found at " + config.unity_training_file)
        timestamp_regex = "\[\d\d\:\d\d\]"
        timestamp_pattern = re.compile(timestamp_regex)
        user_regex = "<([a-zA-Z0-9_ ]+)>"
        user_pattern = re.compile(user_regex)
        bag_of_words_output = []
        with open(config.unity_training_file, "rt",
                  encoding="utf-8") as training:
            for line in training:
                output = line
                timestamp_match = timestamp_pattern.search(line)
                if timestamp_match:
                    output = re.sub(timestamp_regex, "", output)
                user_match = user_pattern.search(line)
                if user_match:
                    output = re.sub(user_regex, "", output)
                bag_of_words_output.append(self.apply_lemmatizer(output))
        print("Training Data Entries: {}".format(len(bag_of_words_output)))
        return bag_of_words_output

    def read_unity_dataset_testing(self):
        print("File directory found at " + config.unity_test_file)
        timestamp_regex = "\[\d\d\:\d\d\]"
        timestamp_pattern = re.compile(timestamp_regex)
        user_regex = "<([a-zA-Z0-9_ ]+)>"
        user_pattern = re.compile(user_regex)
        bag_of_words_output = []
        with open(config.unity_test_file, "rt", encoding="utf-8") as training:
            for line in training:
                print("line is: ", line)
                output = line
                timestamp_match = timestamp_pattern.search(line)
                if timestamp_match:
                    output = re.sub(timestamp_regex, "", output)
                user_match = user_pattern.search(line)
                if user_match:
                    output = re.sub(user_regex, "", output)
                bag_of_words_output.append(self.apply_lemmatizer(output))
        print("Training Data Entries: {}".format(len(bag_of_words_output)))
        return bag_of_words_output

    def apply_lemmatizer(self, sentence):
        temp = [
            self.lemmatizer.lemmatize(t)
            for t in self.tokenizer.tokenize(sentence)
        ]
        # join tokens as vectorizer will split them again
        lemmas = " ".join(temp)
        return lemmas

    def build_topic_model(self,
                          num_of_topics=config.num_of_topics,
                          max_iterations=config.max_iterations):
        corpus = self.read_twitter_training_dataset()
        test = self.read_twitter_testing_dataset()

        if self.optimising:
            corpus = self.read_twitter_testing_dataset()
        training_tfidf = self.vectorizer.fit_transform(corpus)
        print("number of tfidf features: %d" % training_tfidf.get_shape()[1])
        feat_names = self.vectorizer.get_feature_names()
        # Run LDA
        lda = LatentDirichletAllocation(n_components=num_of_topics,
                                        max_iter=max_iterations,
                                        learning_method='online',
                                        learning_offset=50.,
                                        random_state=0).fit(training_tfidf)

        ldaModel = lda.fit_transform(training_tfidf)
        model = (feat_names, lda.components_, lda.exp_dirichlet_component_,
                 lda.doc_topic_prior_)

        if not self.optimising:
            # if used in production save the model
            print(f"Saving model to file: {config.lda_model_file_name}")
            with open(config.lda_model_file_name, 'wb') as fp:
                joblib.dump(model, fp)
        else:
            self.evaluate_model(model, training_tfidf)

    def load_model_for_eval(self):
        model = (features, components_, exp_dirichlet_component_,
                 doc_topic_prior_) = joblib.load(config.lda_model_file_name)
        # self.tf_vectorizer = CountVectorizer(vocabulary=self.features, stop_words='english')
        self.evaluate_model(model)

    def evaluate_model(self, model, data=None):
        (features, components_, exp_dirichlet_component_,
         doc_topic_prior_) = model
        print(f"First 10 words in the vocabulary: {features[0:10]}")

        # Print words associated with each topic
        for topic_idx, topic in enumerate(components_):
            print("Topic %d:" % (topic_idx))
            print(" ".join([
                features[i]
                for i in topic.argsort()[:-config.num_of_topics - 1:-1]
            ]))

        if not data is None:
            # extract from vectrorised data
            termFreq = data.sum(axis=0).getA1()
            docLength = data.sum(axis=1).getA1()
            termDists = components_ / components_.sum(axis=1)[:, None]
            print("Data present, ", termFreq, docLength)

        # when optimising output graphs
        for compNum in range(0, config.num_of_topics):
            comp = components_[compNum]
            indeces = np.argsort(comp).tolist()
            indeces.reverse()
            terms = [features[weightIndex] for weightIndex in indeces[0:10]]
            weights = [comp[weightIndex] for weightIndex in indeces[0:10]]
            terms.reverse()
            weights.reverse()
            positions = np.arange(10) + .5

            # plot the strongest terms for each component
            plt.plot(compNum)
            plt.barh(positions, weights, align='center')
            plt.yticks(positions, terms)
            plt.xlabel('Weight')
            plt.title('Strongest terms for component %d' % compNum)
            plt.grid(True)
            plt.savefig(config.topics_results_dir + "topic%d" % compNum +
                        ".png")
            plt.close()
        return
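A trimmed, standalone sketch of the tf-idf → LatentDirichletAllocation step used in build_topic_model above; the corpus, topic count, and iteration count are made-up stand-ins for the config values.

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

corpus = [
    "shader compilation failed on the new build",
    "the build pipeline broke after the update",
    "players report lag spikes on the server",
    "server lag seems worse after the update",
]

vectorizer = TfidfVectorizer(stop_words='english')
training_tfidf = vectorizer.fit_transform(corpus)

lda = LatentDirichletAllocation(n_components=2, max_iter=10,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0).fit(training_tfidf)

feat_names = vectorizer.get_feature_names()
for topic_idx, topic in enumerate(lda.components_):
    top = [feat_names[i] for i in topic.argsort()[:-6:-1]]
    print("Topic %d: %s" % (topic_idx, " ".join(top)))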
Example n. 32
print ("PART 1\n")
#Loading the data
sample_docs = []
with open("top1000_movie_summaries.tsv") as fi:
    tsvReader = csv.reader(fi, delimiter='\t')
    for i, (title, plot) in enumerate(tsvReader):
        sample_docs.append(plot)

#Tfidf Vectorizer
vectorizer = TfidfVectorizer(min_df=1)
tfidf_matrix = vectorizer.fit_transform(sample_docs)
tfidf_array = tfidf_matrix.toarray()


#Displaying Top 10 Terms and their Tfidf Scores
terms_list = [(score, term)  for score, term in zip(tfidf_array[0], vectorizer.get_feature_names()) if score > 0]
terms_list = sorted(terms_list, key=lambda x: x[0],reverse = True)
i = 0
print ("Displaying Top 10 Terms and their Tfidf Scores")
for tuples in terms_list:
    i+=1
    print (tuples)
    if i>=10:
        break

#Using K-Means Algorithm to compute clusters
kmeans = KMeans(n_clusters=20, random_state=0)
kmeans.fit(tfidf_matrix)

#Exploring Cluster Sizes
cluster_labels =  kmeans.labels_
with open('custom_stopwords.txt', 'r') as myfile:
    data=myfile.readlines()#.replace('\n', '')
cus_stop = [i.replace('\n','') for i in data]
cus_stop.append(str(curr_year))

punc = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}',"%"]
stop_words = feature_extraction.text.ENGLISH_STOP_WORDS.union(punc)
stop_words = stop_words.union(cus_stop) #Join English + Custom Stop Words

stopwords = nltk.corpus.stopwords.words('english')

vectorizer = TfidfVectorizer(stop_words=stop_words)
X = vectorizer.fit_transform(df['content'].values.astype('U'))

word_features = vectorizer.get_feature_names()

stemmer = SnowballStemmer('english')
tokenizer = RegexpTokenizer(r'[a-zA-Z\']+')

def tokenize(text):
    return [stemmer.stem(word) for word in tokenizer.tokenize(text.lower())]

vectorizer2 = TfidfVectorizer(stop_words = stop_words, tokenizer = tokenize)
X2 = vectorizer2.fit_transform(df['content'].values.astype('U'))
word_features2 = vectorizer2.get_feature_names()

vectorizer3 = TfidfVectorizer(stop_words = stop_words, tokenizer = tokenize, max_features = 1000)
X3 = vectorizer3.fit_transform(df['content'].values.astype('U'))
words = vectorizer3.get_feature_names()
def give_top_utility_score_features(tweet, corpus, local_ner_corpus):
    final_segments = {}
    local_weight = 0
    global_weight = 0
    final_weight_for_segment = 0
    list_of_segments = ng.generate_ngrams(tweet)
    tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
    tfs = tfidf.fit_transform(corpus)
    #local_ngrams
    response = tfidf.transform([tweet])
    local_list_of_ngrams = tfidf.get_feature_names()
    local_ngrams = {}
    for col in response.nonzero()[1]:
        local_ngrams[local_list_of_ngrams[col]] = response[0, col]
    #print(local_ngrams)

    #local_list_of_ngrams=list(set(local_list_of_ngrams))
    #global_ner
    global_list_of_ners = []
    tokenized_text = word_tokenize(tweet)
    classified_text = st.tag(tokenized_text)
    filteredList = filter(lambda x: x[1] != 'O', classified_text)
    filteredList = list(filteredList)
    if filteredList:
        for objects in filteredList:
            global_list_of_ners.append(objects[0])
            #global_ngram
    global_microsoft_list_of_ngrams = []
    try:
        global_microsoft_list_of_ngrams = microsost_ngram_service(tweet)
    except:
        pass
    for segment in list_of_segments:
        final_weight_for_segment = 0
        if segment in global_list_of_ners:
            global_weight += 0.3
        if segment in global_microsoft_list_of_ngrams:
            global_weight += 0.3
        if segment in local_ner_corpus:
            local_weight += 0.3
        if segment in local_ngrams:
            local_weight += local_ngrams[segment]
        final_weight_for_segment = ((global_weight)) + (local_weight)
        #print(segment," : ",final_weight_for_segment)
        if (final_weight_for_segment >= 0.3):
            final_segments[segment] = final_weight_for_segment
        #print(segment," ; " , final_weight_for_segment)
#             new_final_segments = dict(sorted(final_segments.iteritems(), key=operator.itemgetter(1), reverse=True)[:5])

    new_final_segments = sorted(final_segments,
                                key=final_segments.get,
                                reverse=True)[:5]
    #print(new_final_segments)
    new_final_segments = new_final_segments + list(global_list_of_ners) + list(
        global_microsoft_list_of_ngrams)
    #print(new_final_segments)
    tweet = tp.clean(tweet)
    tweet = ip.imply_preprocess(tweet)
    corpus.append(tweet)

    local_ner_corpus = set(local_ner_corpus) | set(global_list_of_ners)

    return (new_final_segments, corpus, local_ner_corpus)
    #text = [wn.lemmatize(word) for word in text]
    return text


#Removing original text column
del data['text']

list(data)
data.shape

#Vectorizing processed text column i.e. p_text
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['p_text'])
print(X_tfidf.shape)

print(tfidf_vect.get_feature_names())

X_tfidf_df = pd.DataFrame(X_tfidf.toarray())
X_tfidf_df.columns = tfidf_vect.get_feature_names()

#Taking independent variables together
X_features = pd.concat([
    data[data.columns[1:18]].reset_index(drop=True),
    X_tfidf_df.reset_index(drop=True)
],
                       axis=1)

X_features.head()

#Divide data in train and test
X_train, X_test, y_train, y_test = train_test_split(X_features,
# In[26]:


x


# In[27]:


v


# In[28]:


df1 = pd.DataFrame(x.toarray(), columns=v.get_feature_names())


# In[29]:


df.drop('text', axis=1, inplace=True)
df = pd.concat([df, df1], axis=1)


# In[30]:


df.columns #these contains all columns previously selected previously

Example n. 37
def represent():
    try:
        li = os.listdir('CodeExample')
    except FileNotFoundError:
        return

    for name in sorted(li):
        is_ok = True
        rep_cluster_arr = []
        try:
            sample = os.listdir('CodeExampleJson/{}'.format(name))
        except FileNotFoundError:
            continue
        tmp = os.listdir('CodeExample/{}'.format(name))
        sample = [s for s in sample if '{}.txt'.format(s[:-5]) in tmp]
        # sample.sort()
        tmp = []
        for s in sample:
            with open('CodeExampleJson/{}/{}'.format(name, s), 'r') as f:
                data = json.load(f)
                tmp.append([len(data['lines']), s])
        tmp.sort(key=lambda x: x[0])
        sample = [t[1] for t in tmp]
        try:
            sz = max([
                int(df.at[(name, '{}.txt'.format(s[:-5])), 'cluster'])
                for s in sample
            ])
        except KeyError:
            sz = 0

        tfidf_vectorizer = TfidfVectorizer(input='filename',
                                           max_df=0.5,
                                           min_df=1,
                                           max_features=3,
                                           norm='l2')
        for i in range(sz + 1):
            try:
                tmp = [
                    s[:-5] for s in sample
                    if i == int(df.at[(name, '{}.txt'.format(s[:-5])),
                                      'cluster'])
                ]

                files = ['CodeExample/{}/{}.txt'.format(name, t) for t in tmp]
                if not files:
                    is_ok = False
                    break
                try:
                    tfidf = tfidf_vectorizer.fit_transform(files)
                except ValueError:
                    tfidf_vectorizer = TfidfVectorizer(input='filename',
                                                       max_df=1.0,
                                                       min_df=1,
                                                       max_features=3,
                                                       norm='l2')
                    tfidf = tfidf_vectorizer.fit_transform(files)
                feature = tfidf_vectorizer.get_feature_names()

                y = len(tmp)
                for j in range(len(tmp)):
                    x = tmp[j]
                    try:
                        with open('CodeAst/_{}_{}.txt'.format(name, x),
                                  'r') as f:
                            rd = f.read()
                    except FileNotFoundError:
                        continue
                    p = 0
                    with open('ast_seqs.txt', 'r') as f:
                        for k, comm in enumerate(f):
                            if comm == rd:
                                p = k
                                break
                    with open('jd_comms.txt', 'r') as f:
                        z = f.read().split('\n')[p]
                    break
                else:
                    x = tmp[0]

            except KeyError:
                x = sample[0][:-5]

                files = ['CodeExample/{}/{}.txt'.format(name, x)]
                if not files:
                    is_ok = False
                    break
                try:
                    tfidf = tfidf_vectorizer.fit_transform(files)
                except ValueError:
                    tfidf_vectorizer = TfidfVectorizer(input='filename',
                                                       max_df=1.0,
                                                       min_df=1,
                                                       max_features=3,
                                                       norm='l2')
                    tfidf = tfidf_vectorizer.fit_transform(files)
                feature = tfidf_vectorizer.get_feature_names()

                y = 1
                try:
                    with open('CodeAst/_{}_{}.txt'.format(name, x), 'r') as f:
                        rd = f.read()
                except FileNotFoundError:
                    continue
                p = 0
                with open('ast_seqs.txt', 'r') as f:
                    for k, comm in enumerate(f):
                        if comm == rd:
                            p = k
                            break
                with open('jd_comms.txt', 'r') as f:
                    z = f.read().split('\n')[p]

            rep_cluster_arr.append({
                'id': x,
                'num': y,
                'comment': z,
                'feature': feature
            })

        if not is_ok:
            continue

        text = '# {}\n\n***\n\n'.format(name)
        for i, dic in enumerate(rep_cluster_arr):
            with open('CodeExampleJson/{}/{}.json'.format(name, dic['id']),
                      'r') as f:
                data = json.load(f)
                if not data['lines']:
                    continue
                try:
                    text += '## [Cluster {} ({}, {}, {})](./{})\n'.format(
                        i + 1, dic['feature'][0], dic['feature'][1],
                        dic['feature'][2], i + 1)
                except IndexError:
                    text += '## [Cluster {}](./{})\n'.format(i + 1, i + 1)
                text += '{} results\n'.format(dic['num'])
                text += '> {}\n'.format(dic['comment'])
                text += '{% highlight java %}\n'
                for j, line in data['lines'].items():
                    text += '{0}. {1}\n'.format(j, line.split('\n')[0])
                text += '{% endhighlight %}\n\n***\n\n'

            catalog(name, i, dic['feature'])

        os.makedirs('docs/{}'.format(name), exist_ok=True)
        with open('docs/{}/index.md'.format(name), 'w') as f:
            f.write(text)

        print(text)
Example n. 38
def get_text_features(fnum, fname, df, nvalues, vectorize, ngrams_max):
    r"""Transform text features with count vectorization and TF-IDF,
    or alternatively factorization.

    Parameters
    ----------
    fnum : int
        Feature number, strictly for logging purposes
    fname : str
        Name of the text column in the dataframe ``df``.
    df : pandas.DataFrame
        Dataframe containing the column ``fname``.
    nvalues : int
        The number of unique values.
    vectorize : bool
        If ``True``, then attempt count vectorization.
    ngrams_max : int
        The maximum number of n-grams for count vectorization.

    Returns
    -------
    new_features : numpy array
        The vectorized or factorized text features.
    new_fnames : list
        The new feature name(s) for the numerical variable.

    References
    ----------
    To use count vectorization and TF-IDF, you can find more
    information here [TFE]_.

    """
    feature = df[fname]
    min_length = int(feature.astype(str).str.len().min())
    max_length = int(feature.astype(str).str.len().max())
    if len(feature) == nvalues:
        logger.info(
            "Feature %d: %s is a text feature [%d:%d] with maximum number of values %d",
            fnum, fname, min_length, max_length, nvalues)
    else:
        logger.info(
            "Feature %d: %s is a text feature [%d:%d] with %d unique values",
            fnum, fname, min_length, max_length, nvalues)
    # need a null text placeholder for vectorization
    feature.fillna(value=NULLTEXT, inplace=True)
    # vectorization creates many columns, otherwise just factorize
    if vectorize:
        logger.info("Feature %d: %s => Attempting Vectorization", fnum, fname)
        vectorizer = TfidfVectorizer(ngram_range=[1, ngrams_max])
        try:
            new_features = vectorizer.fit_transform(feature)
            new_fnames = vectorizer.get_feature_names()
            logger.info("Feature %d: %s => Vectorization Succeeded", fnum,
                        fname)
        except:
            logger.info("Feature %d: %s => Vectorization Failed", fnum, fname)
            new_features, _ = pd.factorize(feature)
            new_fnames = [USEP.join([fname, 'factor'])]
    else:
        logger.info("Feature %d: %s => Factorization", fnum, fname)
        new_features, _ = pd.factorize(feature)
        new_fnames = [USEP.join([fname, 'factor'])]
    return new_features, new_fnames
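# A compact, self-contained sketch of the vectorize-or-factorize fallback described
# in the docstring above; the column name, null placeholder, and factor suffix are
# illustrative stand-ins for fname, NULLTEXT, and USEP.
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

df = pd.DataFrame({"notes": ["engine misfire at idle", None,
                             "misfire under load", "replaced spark plugs"]})
feature = df["notes"].fillna("nulltext")   # null text placeholder for vectorization

try:
    vectorizer = TfidfVectorizer(ngram_range=(1, 2))
    new_features = vectorizer.fit_transform(feature)
    new_fnames = vectorizer.get_feature_names()
except Exception:
    # fall back to simple factorization when vectorization fails
    new_features, _ = pd.factorize(feature)
    new_fnames = ["notes_factor"]

print(len(new_fnames))
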
from sklearn.feature_extraction import stop_words

count_vect = CountVectorizer()
tfidf_vect = TfidfVectorizer()
#print(stop_words.ENGLISH_STOP_WORDS) # removes stop words

text1 = 'How are you, are you doing fine?'
text2 = "What's up?"

count_test_vect_text = count_vect.fit_transform([text1, text2]) # train count vect
tfidf_vect_text = tfidf_vect.fit_transform([text1, text2]) # train tfidf vect

## tfidf-vectorizer
print(tfidf_vect_text)
print(tfidf_vect_text.toarray())
print(tfidf_vect.get_feature_names())
print(tfidf_vect_text[0])
print("#"*20)
print(tfidf_vect_text[1])
print(tfidf_vect.inverse_transform(tfidf_vect_text[1]))
print("-"*20)

##count vectorizer
print(count_test_vect_text)
print(count_test_vect_text.toarray())
print(count_vect.get_feature_names())
print(count_test_vect_text[0])
print("#"*20)
print(count_test_vect_text[1])
print(count_vect.inverse_transform(count_test_vect_text[0]))
Example n. 40
### (the remainder goes into training)
### feature matrices changed to dense representations for compatibility with
### classifier functions in versions 0.15.2 and earlier
from sklearn import cross_validation
features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
    word_data, authors, test_size=0.1, random_state=42)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True,
                             max_df=0.5,
                             stop_words='english')
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test).toarray()

### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train = labels_train[:150]

### your code goes here
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier()
dtree.fit(features_train, labels_train)
print "score: ", dtree.score(features_test, labels_test)
fimp = dtree.feature_importances_
print "max: ", max(fimp)
print "len: ", len(fimp)
print "idx: ", numpy.where(fimp == max(fimp))
print vectorizer.get_feature_names()[numpy.where(fimp == max(fimp))[0][0]]
Esempio n. 41
0
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a TfidfVectorizer: tfidf
tfidf = TfidfVectorizer()

# Apply fit_transform to document: csr_mat
csr_mat = tfidf.fit_transform(documents)

# Print result of toarray() method
print(csr_mat.toarray())
# [[0.51785612 0.         0.         0.68091856 0.51785612 0.        ]
#  [0.         0.         0.51785612 0.         0.51785612 0.68091856]
#  [0.51785612 0.68091856 0.51785612 0.         0.         0.        ]]

# Get the words: words
words = tfidf.get_feature_names()

# Print words
print(words)
# ['cats', 'chase', 'dogs', 'meow', 'say', 'woof']

# Clustering Wikipedia part I
# You saw in the video that TruncatedSVD is able to perform PCA on sparse arrays
# in csr_matrix format, such as word-frequency arrays. Combine your knowledge of
# TruncatedSVD and k-means to cluster some popular pages from Wikipedia. In this
# exercise, build the pipeline (a sketch of it follows the imports below). In the
# next exercise, you'll apply it to the word-frequency array of some Wikipedia
# articles.

# Create a Pipeline object consisting of a TruncatedSVD followed by KMeans.
# (This time, we've precomputed the word-frequency matrix for you, so there's
# no need for a TfidfVectorizer.)

# The Wikipedia dataset you will be working with was obtained from here.
# Perform the necessary imports
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
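# The snippet stops after the imports; a sketch of the pipeline the exercise asks
# for (the component and cluster counts below are assumptions, not given here):

# Create a TruncatedSVD instance: svd
svd = TruncatedSVD(n_components=50)

# Create a KMeans instance: kmeans
kmeans = KMeans(n_clusters=6)

# Create a pipeline: pipeline
pipeline = make_pipeline(svd, kmeans)

# In the follow-up exercise the pipeline would be fit on the precomputed
# word-frequency array (called e.g. `articles`) and used to predict cluster labels:
# pipeline.fit(articles)
# labels = pipeline.predict(articles)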
Esempio n. 42
0
        #      from_data.append(1)

        from_data.append(0 if name == "sara" else 1)

        email.close()

print "emails processed"
from_sara.close()
from_chris.close()

pickle.dump(word_data, open("your_word_data.pkl", "w"))
pickle.dump(from_data, open("your_email_authors.pkl", "w"))

# The string that you get for word_data[152]
word_data[152]

### in Part 4, do TfIdf vectorization here
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words="english")
X = vectorizer.fit_transform(word_data)
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(X)

vector = vectorizer.get_feature_names()

# How many unique words are there in your Tfldf?
print len(vector)
# What is word number 34597 in your TfIdf?
vector[34597]
Esempio n. 43
0
title = data.Title
#abstract_title = pd.Series()
#for i in range(len(title)):
    #abstract_title[str(i)] = title[i] + abstract[i]

tf = abstract

###############################################################################
#remove dominant words
##tf-idf#######################################################################
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words = 'english')
X = vectorizer.fit_transform(tf)
dense_X = X.todense()
idf = vectorizer.idf_
featurename1 = vectorizer.get_feature_names()
#print(dict(zip(vectorizer.get_feature_names(), idf)))

#get dominant words
one = dense_X > 0
frequency1 = sum(one)
#plt.plot(np.transpose(frequency1))
#By looking at the frequency of each word, find the threshold
#400, frequency > 500 are dominant words
do = pd.Series(frequency1.getA()[0],index = featurename1)
freq_sort1 = do.sort_values(ascending=False)
c1 = freq_sort1[:20].index


index = np.where(frequency1 > 1000)[1]
stopwords = [featurename1[x] for x in index]
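# A possible next step (an assumption, not shown in the original snippet):
# re-vectorize the abstracts with the dominant words passed in as a custom
# stop word list, so they no longer dominate the representation.
vectorizer_filtered = TfidfVectorizer(stop_words=stopwords)
X_filtered = vectorizer_filtered.fit_transform(tf)
print(X_filtered.shape)  # same number of rows as X, fewer columns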
def weighted_embeddings(esco_df, eperusteet_df, model):
    """
    Create TFIDF weighted embeddings for ESCO and ePerusteet.
    The input sentences should be separated with newlines.

    Args:
        esco_df (DataFrame) : Requires cols 'label' and 'text', where 'text' contains textual representation of ESCO.
        eperusteet_df (DataFrame) : Requires cols 'label' and 'text', where 'text' contains textual representation of ePerusteet.
        model (fasttext.model) : Model for word-embeddings.

    Return:
        X_esco (xArray) : Embeddings for ESCO texts.
        X_eperusteet (xArray) : Embeddings for ePerusteet texts.
    """
    assert isinstance(esco_df, pd.DataFrame)
    assert isinstance(eperusteet_df, pd.DataFrame)

    text_esco = esco_df["text"]
    text_eperusteet = eperusteet_df["text"]

    # Do not sort, so we can re-split later using the indices
    combined_texts = pd.concat([text_esco, text_eperusteet], sort=False)

    vectorizer = TfidfVectorizer()
    vectorizer.fit(combined_texts)
    tokenizer = vectorizer.build_tokenizer()
    feature_array = vectorizer.get_feature_names()

    identifiers = []
    embeddings = []

    for _, row in tqdm(esco_df.iterrows(),
                       total=esco_df.shape[0],
                       desc="Computing embeddings for ESCOs"):
        identifiers.append(row["label"])

        texts = row["text"].split("\n")

        # Take average over the sentences
        competence_embedding = xr.DataArray(np.zeros(model.get_dimension()),
                                            dims=["embedding"])

        for text in texts:

            sentence_embedding = xr.DataArray(np.zeros(model.get_dimension()),
                                              dims=["embedding"])

            weights = vectorizer.transform([text])

            nonzero_indexes = weights.nonzero()
            weights = np.asarray(weights[nonzero_indexes][0]).reshape((-1, ))
            weights = [w / sum(weights) for w in weights]

            weight_dict = {
                feature_array[idx]: weights[i]
                for i, idx in enumerate(nonzero_indexes[1])
            }

            for word in text.split(" "):
                try:
                    token = tokenizer(word)[0]
                except IndexError:
                    continue
                weight = weight_dict[token]
                sentence_embedding += (model[word] * weight)

            competence_embedding += sentence_embedding

        # Average over the sentences; if there were none, keep the zero vector
        if texts:
            competence_embedding = competence_embedding / len(texts)

        embeddings.append(competence_embedding)

    embeddings = np.stack(embeddings, axis=0)

    esco_embeddings = xr.DataArray(embeddings,
                                   coords={"ESCO": identifiers},
                                   dims=["ESCO", "embedding"])

    identifiers = []
    embeddings = []

    for _, row in tqdm(eperusteet_df.iterrows(),
                       total=eperusteet_df.shape[0],
                       desc="Computing embeddings for ePerusteet"):
        identifiers.append(row["label"])

        texts = row["text"].split("\n")

        # Take average over the sentences
        degree_embedding = xr.DataArray(np.zeros(model.get_dimension()),
                                        dims=["embedding"])

        for text in texts:
            sentence_embedding = xr.DataArray(np.zeros(model.get_dimension()),
                                              dims=["embedding"])

            weights = vectorizer.transform([text])

            nonzero_indexes = weights.nonzero()
            weights = np.asarray(weights[nonzero_indexes][0]).reshape((-1, ))
            weights = [w / sum(weights) for w in weights]

            weights = {
                feature_array[idx]: weights[i]
                for i, idx in enumerate(nonzero_indexes[1])
            }

            for word in text.split(" "):
                try:
                    token = tokenizer(word)[0]
                except IndexError:
                    continue
                weight = weights[token]
                sentence_embedding += (model[word] * weight)

            degree_embedding += sentence_embedding

        # Average over the sentences; if there were none, keep the zero vector
        if texts:
            degree_embedding = degree_embedding / len(texts)

        embeddings.append(degree_embedding)

    embeddings = np.stack(embeddings, axis=0)

    eperusteet_embeddings = xr.DataArray(embeddings,
                                         coords={"ePerusteet": identifiers},
                                         dims=["ePerusteet", "embedding"])

    return esco_embeddings, eperusteet_embeddings
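# A self-contained sketch (not part of the original module) of the per-sentence
# TF-IDF weighting step used inside weighted_embeddings: build the normalized
# token -> weight dictionary for one sentence. The toy corpus is illustrative.
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

toy_corpus = ["welding and metal work", "reading technical drawings", "welding safety"]
toy_vectorizer = TfidfVectorizer()
toy_vectorizer.fit(toy_corpus)
toy_features = toy_vectorizer.get_feature_names()

sentence = "welding safety"
weights = toy_vectorizer.transform([sentence])
nonzero_indexes = weights.nonzero()
w = np.asarray(weights[nonzero_indexes][0]).reshape((-1,))
w = [x / sum(w) for x in w]
weight_dict = {toy_features[idx]: w[i] for i, idx in enumerate(nonzero_indexes[1])}
print(weight_dict)  # weights for 'welding' and 'safety', summing to 1.0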
Esempio n. 45
0
df["title"] = df['Name'].apply(lambda row: title_extractor(row))
print(df[["Name", "title"]].head(10))

df.title.value_counts()

# Vectorizing text
## tf/idf - method for weighting of each word
### tf = term frequency (# of times word appears in doc)
### idf = inverse document frequency (log of total # of docs / # of docs containing term t)
# Source: https://www.kaggle.com/edchen/tf-idf

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()

corpus = [
    'the brown fox jumped over the brown dog', 'the quick brown fox',
    'the brown brown dog', 'the fox ate the dog'
]

X = vectorizer.fit_transform(corpus)

print(vectorizer.get_feature_names())
print(X.toarray())

print(X.shape)
print(vectorizer.vocabulary_)  # dict mapping each term to its column index

vocab = {v: k for k, v in vectorizer.vocabulary_.items()}  # reverse map: index -> term
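# For reference, scikit-learn's default (smoothed) idf is
#   idf(t) = ln((1 + n_docs) / (1 + df(t))) + 1,
# which can be checked by hand against vectorizer.idf_ (a sketch reusing the
# corpus above; 'brown' appears in 3 of the 4 documents):
import numpy as np

n_docs = len(corpus)
df_brown = sum('brown' in doc.split() for doc in corpus)   # document frequency of 'brown'
manual_idf = np.log((1 + n_docs) / (1 + df_brown)) + 1
brown_col = vectorizer.get_feature_names().index('brown')
print(manual_idf, vectorizer.idf_[brown_col])              # the two values should match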
# Generate a vectorization model
ModeloVectorizacion = TfidfVectorizer(analyzer='word',
                                      ngram_range=(1, 2),
                                      min_df=0.003,
                                      max_df=0.5,
                                      max_features=5000,
                                      stop_words=stopwords_list)
##
# Convert the elements into a list
item_ids = articulos['contentId'].tolist()
# Fit the vectorization model on the title and text columns; the matrix stores
# each word's occurrence values
tfidf_matrix = ModeloVectorizacion.fit_transform(articulos['title'] + " " +
                                                 articulos['text'])
tfidf_feature_names = ModeloVectorizacion.get_feature_names()
print(tfidf_feature_names)

## Model evaluation
# Cross-validate by holding out 20% of the interactions
interactions_train_df, interactions_test_df = train_test_split(
    dataSetInteracciones,
    stratify=dataSetInteracciones['personId'],
    test_size=0.20,
    random_state=42)


#print(interactions_train_df.head())
## User profile design
# Get the profile of the items
def getPerfilObjeto(item_id):
Esempio n. 47
0
from sklearn import tree
clf = tree.DecisionTreeClassifier()

#print features_train[0]
#for i in range(len(features_train[0])):
#    if features_train[0][i]
print len(features_train[0])

t0 = time()
clf.fit(features_train, labels_train)
print "training time:", round(time() - t0, 3), "s"

t0 = time()
pred = clf.predict(features_test)
print "predict time:", round(time() - t0, 3), "s"

t0 = time()
from sklearn.metrics import accuracy_score
score = accuracy_score(labels_test, pred)
print score
print "score time:", round(time() - t0, 3), "s"

for i, f in enumerate(clf.feature_importances_):
    if f > 0.2:
        print i
        print f
        print vectorizer.get_feature_names()[i]

print clf.score(features_test, labels_test)
Esempio n. 48
0
import numpy as np
from sklearn import datasets
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.svm import SVC

newsgroups = datasets.fetch_20newsgroups(
    subset='all', categories=['alt.atheism', 'sci.space'])

transformer = TfidfVectorizer()

transformed = transformer.fit_transform(newsgroups.data)

feature_mapping = transformer.get_feature_names()
for i in feature_mapping:
    print(i)

grid = {'C': np.power(10.0, np.arange(-5, 6))}
cv = KFold(n_splits=5, shuffle=True, random_state=241)
clf = SVC(kernel='linear', random_state=241)
gs = GridSearchCV(estimator=clf,
                  param_grid=grid,
                  scoring='accuracy',
                  cv=cv,
                  return_train_score=True)
gs.fit(transformed, newsgroups.target)
best_clf = gs.best_estimator_  # SVC refit with the best C found by the grid search

# for i in gs.cv_results_:
# print(i.mean_validation_score)
# print(i.parameters)
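# A sketch of the usual final step of this exercise (an assumption about its
# goal): print the 10 terms with the largest absolute weights in the refitted
# linear SVM, using `best_clf` and `feature_mapping` from above. coef_ is a
# sparse matrix here because the model was fit on sparse TF-IDF features.
coefs = np.abs(best_clf.coef_.toarray()).ravel()
top_idx = np.argsort(coefs)[-10:]
top_words = sorted(feature_mapping[i] for i in top_idx)
print(top_words)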
Esempio n. 49
0
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
      "n_samples=%d and n_features=%d..." % (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components, random_state=1, alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in NMF model (Frobenius norm):")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(nmf, tfidf_feature_names, n_top_words)

# Fit the NMF model
print("Fitting the NMF model (generalized Kullback-Leibler divergence) with "
      "tf-idf features, n_samples=%d and n_features=%d..." %
      (n_samples, n_features))
t0 = time()
nmf = NMF(n_components=n_components,
          random_state=1,
          beta_loss='kullback-leibler',
          solver='mu',
          max_iter=1000,
          alpha=.1,
          l1_ratio=.5).fit(tfidf)
print("done in %0.3fs." % (time() - t0))
Esempio n. 50
0
	def get_vocabulary(self, linked_pages, categories_links):
		"""Scrapp a wiki page to get vocabulary for each category"""
		total_vocabulary = {}
		unique_vocabulary = []
		unique_vocabulary_tfidf = []
		# For each category
		for parent, pages in linked_pages.items():
			children_pages = []
			downloaded_pages = []
			# For every page linked to this category on Wiki
			for page in pages:
				sys.stdout.write('\t{} / {} pages downloaded for [{}] category.\r'.format(len(children_pages)-1, len(pages), parent))
				sys.stdout.flush()
				# Get data
				wiki_url = 'https://en.wikipedia.org/wiki/{}'.format(page)
				data = requests.get(wiki_url)
				data_soup = BeautifulSoup(data.text, 'html.parser')
				paragraphs = [str(paragraph) for paragraph in data_soup.find_all('p')]
				paragraphs_joined = ' '.join(paragraphs)
				# Clean, tokenize, stemm and rebuild the document
				page_vocabulary = []
				cleaned_data = self.clean_xml(text=paragraphs_joined.strip())
				tokenized_data = self.tokenizer.tokenize(cleaned_data)
				for token in tokenized_data:
					if token.lower() not in self.stopwords:
						word = self.lemmatizer.lemmatize(token.lower())
						# Check if the word is correct
						if self.english_dict.check(word) is True:
							page_vocabulary.append(word)
							# Track total vocabulary
							if word not in unique_vocabulary:
								unique_vocabulary.append(word)
							# Levenshtein distance could be used here for spelling correction, but it would be slow
				page_nlp_treated = ' '.join(page_vocabulary)
				if len(children_pages) >= self.configuration['options']['pages_per_category'] or len(children_pages) == len(pages):
					break
				else:
					children_pages.append(page_nlp_treated)
					downloaded_pages.append(page)
				# Wikipedia is cool, be cool with their servers.
				time.sleep(self.configuration['options']['waiting_time'])
			# StdOut summary
			print('\n\t\t- ' + '\n\t\t- '.join(downloaded_pages))
			# TF_IDF for vocabulary of each category and get top score
			tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 5), min_df=0, stop_words=self.stopwords)
			try:
				tfidf_matrix = tf.fit_transform(children_pages)
			except ValueError:  # In case of an old empty page
				continue
			feature_names = tf.get_feature_names()
			dense = tfidf_matrix.todense()
			episode = dense[0].tolist()[0]
			phrase_scores = [pair for pair in zip(range(0, len(episode)), episode) if pair[1] > 0]
			sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1] * -1)
			category_words = []
			for word, score in [(feature_names[word_id], score) for (word_id, score) in sorted_phrase_scores][:self.configuration['options']['word_per_page']]:
				category_words.append({word: score})
				if word not in unique_vocabulary_tfidf:
					unique_vocabulary_tfidf.append(word)
			# Get linked categories to category
			linked_categories = []
			for relation in relations:
				if relation[0] == parent and relation[0] not in linked_categories:
					linked_categories.append(relation[1])
				if relation[1] == parent and relation[1] not in linked_categories:
					linked_categories.append(relation[0])
			# Get linked pages to category
			for category, pages in linked_pages.items():
				if category == parent:
					linked_pages_to_category = pages
			category_details = {}
			category_details['terminology'] = category_words
			category_details['linked_pages_to_category'] = linked_pages_to_category
			category_details['linked_categories'] = linked_categories
			total_vocabulary[parent] = category_details
		# Statistics about our terminology
		print('\nA total of {} words have been scanned to extract {} important words covering {} categories.'.format(len(unique_vocabulary), len(unique_vocabulary_tfidf), len(linked_pages)))
		return total_vocabulary
from sklearn.feature_extraction.text import TfidfVectorizer
# http://stackoverflow.com/questions/23792781/
#  tf-idf-feature-weights-using-sklearn-feature-extraction-text-tfidfvectorizer

# http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
corpus = ["This is very strange",
          "This is very nice",
          "This is a flower"]
vectorizer = TfidfVectorizer(min_df=1, ngram_range=(1, 3))
X = vectorizer.fit_transform(corpus)
idf = vectorizer.idf_
d= dict(zip(vectorizer.get_feature_names(), idf)) 
for key in d:
    print("{} = {}".format(key, d[key]))
            ### use str.replace() to remove any instances of the words
            ### ["sara", "shackleton", "chris", "germani"]

            ### append the text to word_data
            word_data.append(text)
            ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
            if name == "sara":
                from_data.append(0)
            else:
                from_data.append(1)

            email.close()

print "emails processed"
from_sara.close()
from_chris.close()

pickle.dump( word_data, open("your_word_data.pkl", "w") )
pickle.dump( from_data, open("your_email_authors.pkl", "w") )
print word_data[152]




### in Part 4, do TfIdf vectorization here
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words="english")
transformed_word_data = vectorizer.fit_transform(word_data)
print "count of words: ", len(vectorizer.get_feature_names())
print "word 34597: ", vectorizer.get_feature_names()[34597]
Esempio n. 53
0
movies.genres = movies.genres.str.split('|')

movies.head()

movies.genres = movies.genres.fillna("").astype('str')
movies.head()

from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres'])
tfidf_matrix.shape

tfidf_matrix

print(tf.get_feature_names())

"""## Cosine Similarity"""

from sklearn.metrics.pairwise import cosine_similarity
sim = cosine_similarity(tfidf_matrix)

sim.shape

sim[:4, :4]

"""##Predictions"""

# Build a 1-dimensional array with movie titles
titles = movies['title']
indices = pd.Series(movies.index, index=movies['title'])
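# The snippet cuts off here; a hedged sketch of how `sim`, `titles` and `indices`
# are typically combined into a genre-based recommender (assumes the default
# integer index on `movies`; the example title is made up):
def genre_recommendations(title, n=10):
    idx = indices[title]                                # row of the movie in `sim`
    scores = list(enumerate(sim[idx]))
    scores = sorted(scores, key=lambda x: x[1], reverse=True)[1:n + 1]  # skip the movie itself
    return titles.iloc[[i for i, _ in scores]]

# genre_recommendations('Toy Story (1995)')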
Esempio n. 54
0
def getFeature(foldername):
    filenamelist = []
    # foldername = 'ratings2020'
    for subdir, dirs, files in os.walk(foldername):
        for file in os.listdir(subdir):
            filepath = subdir + os.sep + file
            re.sub(r"\\", "/", filepath)
            if ".csv" in filepath:
                filenamelist.append(filepath)

    # ----------------> Merging all the data in one csv
    df_merged = (pd.read_csv(filepath_or_buffer=file,
                             sep=',',
                             encoding='utf-16',
                             error_bad_lines=False,
                             engine='python') for file in filenamelist)
    df_merged = pd.concat(df_merged, ignore_index=True)
    df_merged.to_csv("merged.csv")
    df_merged.columns = [
        column.replace(" ", "_") for column in df_merged.columns
    ]
    df = df_merged[[
        "Star_Rating", "Reviewer_Language", "Review_Text", "App_Version_Code"
    ]]
    pd.set_option('mode.chained_assignment',
                  None)  # to suppress SettingWithCopyWarning

    df['Positively_Rated'] = np.where(df['Star_Rating'] >= 3, 1, 0)
    # @@@@@@@@@@@@@@@@@@@ UI FEATURE 1: @@@@@@@@@@@@@@@@@@@@@@@@@@
    total_rating = len(df['Star_Rating'])
    pd.set_option('mode.chained_assignment', None)
    df.dropna(inplace=True, how='any')
    total_reviews = len(df['Review_Text'])

    # In version 1.0, we only check English reviews.
    df = df[df.Reviewer_Language == 'en']

    # Compute the count and proportion of positive ratings for the latest version
    latest_version = max(df["App_Version_Code"])
    VrsnRating = df[df.App_Version_Code ==
                    latest_version].Positively_Rated.mean()

    VrsnRating = round(VrsnRating * 100, 2)

    ########## DATA CLEANING ##########
    df['Review'] = df['Review_Text'].apply(lambda x: x.lower())
    df['Review'] = df['Review'].apply(
        lambda x: re.sub(r"\W", " ", x))  # non -word charactrer
    df['Review'] = df['Review'].apply(
        lambda x: re.sub(r"\d", " ", x))  # removing digits
    df['Review'] = df['Review'].apply(
        lambda x: re.sub("([^\x00-\x7F])+", " ", x))  # removing emojis
    df['Review'] = df['Review'].apply(
        lambda x: re.sub(r' \w{1,4} ', ' ', x))  # removing short words (1-4 characters)
    df['Review'] = df['Review'].apply(lambda x: re.sub(r"\s+", " ", x))
    df['Review'] = lemma(df['Review'])
    df['Review'] = df['Review'].apply(stp)
    nan_value = float("NaN")
    df.replace("", nan_value, inplace=True)
    df.dropna(inplace=True)
    df.isnull()
    df['Review'] = tagme(df['Review'])

    sid = SentimentIntensityAnalyzer()
    df["sentiments"] = df["Review_Text"].apply(lambda x: sid.polarity_scores(
        x))  #'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound':..
    df = pd.concat(
        [df.drop(['sentiments'], axis=1), df['sentiments'].apply(pd.Series)],
        axis=1)

    # add number of characters column
    df["nb_chars"] = df["Review_Text"].apply(lambda x: len(x))
    # add number of words column
    df["nb_words"] = df["Review_Text"].apply(lambda x: len(x.split(" ")))

    documents = [
        TaggedDocument(doc, [i]) for i, doc in enumerate(df["Review"].apply(
            lambda x: str(x).split(" ")))
    ]
    # train a Doc2Vec model with our text data
    model = Doc2Vec(documents,
                    vector_size=30,
                    window=2,
                    min_count=1,
                    workers=4)
    # transform each document into a vector data
    doc2vec_df = df["Review"].apply(
        lambda x: model.infer_vector(str(x).split(" "))).apply(pd.Series)
    doc2vec_df.columns = [
        "doc2vec_vector_" + str(x) for x in doc2vec_df.columns
    ]
    df = pd.concat([df, doc2vec_df], axis=1)

    corpus = []
    for sentences in df["Review"]:
        corpus.append([word for word, tag in sentences])

    df['cln_Reviews'] = [" ".join(review) for review in corpus]

    # add tf-idfs columns
    tfidf = TfidfVectorizer(
        min_df=5)  # ignore terms appearing in fewer than 5 documents
    tfidf_result = tfidf.fit_transform(df["cln_Reviews"]).toarray()
    tfidf_df = pd.DataFrame(tfidf_result, columns=tfidf.get_feature_names())
    tfidf_df.columns = ["word_" + str(x) for x in tfidf_df.columns]
    tfidf_df.index = df.index
    reviews_df = pd.concat([df, tfidf_df], axis=1)

    wrdcldimg = show_wordcloud_fn(corpus)

    best_negsentences = reviews_df[reviews_df["nb_words"] >= 5].sort_values(
        "neg", ascending=False)[["Review_Text"]].head()
    #best_negsentences = reviews_df.sort_values("neg", ascending=False)[["Review_Text"]].head()
    best_negsentences = best_negsentences.to_string(index=False)

    pos_best_sentences = reviews_df[reviews_df["nb_words"] >= 5].sort_values(
        "pos", ascending=False)[["Review_Text"]].head()
    #pos_best_sentences = reviews_df.sort_values("pos", ascending=False)[["Review_Text"]].head()
    pos_best_sentences = pos_best_sentences.to_string(index=False)

    # apprtngimg = appvsrating(reviews_df)

    return (best_negsentences, pos_best_sentences, total_rating, total_reviews,
            VrsnRating, latest_version, wrdcldimg)
Esempio n. 55
0
data.columns = ['labels', 'texts']

# Explore the dataset
print('Out of {} rows, {} are spam, {} are ham'.format(len(data), len(data[data['labels']=='spam']), len(data[data['labels']=='ham'])))
# Check the Number of missing data
print('Number of null in labels: {} and number of null in texts: {}'.format(data['labels'].isnull().sum(), data['texts'].isnull().sum()))

# stopwords removal
stopwords = nltk.corpus.stopwords.words('english')

# Wordnetlemmatizer
wm = nltk.WordNetLemmatizer()
# pre-processing data
def data_clean(texts):
    text = "".join([char for char in texts if char not in string.punctuation])
    tokens = re.split(r'\W+', text)  # split on non-word characters
    text = [wm.lemmatize(word) for word in tokens if word not in stopwords]
    return text

data['cleaned_text'] = data['texts'].apply(lambda x: data_clean(x.lower()))

# Vectorizing
tfidf_vect = TfidfVectorizer(analyzer=data_clean)
# the analyzer already cleans each message, so fit on the raw texts
X_tfidf = tfidf_vect.fit_transform(data['texts'])
import ipdb; ipdb.set_trace()
print(X_tfidf.shape, tfidf_vect.get_feature_names())




Esempio n. 56
0
tv = TfidfVectorizer(min_df=0.,
                     max_df=1.,
                     norm="l2",
                     use_idf=True,
                     smooth_idf=True)

tv_train_features = tv.fit_transform(train_corpus)
tv_test_features = tv.transform(test_corpus)

print('TF-IDF model:> Train features shape:', tv_train_features.shape,
      ' Test features shape:', tv_test_features.shape)
# -

tv_matrix = tv_train_features.toarray()
vocab = tv.get_feature_names()
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

# ### ML algorithms on TF-IDF model

import time
import warnings
warnings.filterwarnings('ignore')
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate

# +
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
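# The snippet breaks off after the imports; a sketch (not the original notebook
# code) of evaluating the two imported models on the TF-IDF features. The label
# array name `train_labels` is an assumption.
mnb = MultinomialNB()
lr = LogisticRegression(max_iter=1000)

for name, model in [('MultinomialNB', mnb), ('LogisticRegression', lr)]:
    scores = cross_val_score(model, tv_train_features, train_labels, cv=5)
    print(name, 'mean CV accuracy:', round(scores.mean(), 4))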
Esempio n. 57
0
class KeywordsGenerator(BaseEstimator, TransformerMixin):
    """Class to extract list of keywords from text.

    It is compatible with scikit-learn API (i.e. contains fit, transform
    methods).

    Parameters
    ----------

    max_tfidf_features : int, optional
        Size of vocabulary for tfidf.
        Default value, 10000.

    keywords : list, optional
        Keywords to extracted as priority.
        Default value, "keywords" list defined in conf file.

    stopwords : list, optional
        Stopwords not to be extracted.
        Default value, "names" and "stopwords" lists defined in conf file.

    resample : bool, optional
        True if dataset must be resampled according to class distribution,
        else False.
        Default value, True.

    n_jobs : int, optional
        Number of cores used for computation.
        Default value, 20.

    copy : bool, optional
        Make a copy of DataFrame.
        Default value, True.

    n_max_keywords : int, optional
        Maximum number of keywords to be returned.
        Default value, 6.

    n_min_keywords : int, optional
        Minimum number of keywords to be returned.
        Default value, 0.

    threshold_keywords : float, optional
        Minimum tf-idf score for word to be selected as keyword.
        Default value, 0.0.

    n_docs_in_class : int, optional
        Number of documents in each classes.
        Default value, 100.

    keywords_coef : int, optional
        Coefficient multiplied with the tf-idf scores of each keywords.
        Default value, 10.

    Attributes
    ----------
    max_tfidf_features, keywords, stopwords, resample, n_jobs, progress_bar,
    copy, n_max_keywords, n_min_keywords, threshold_keywords, n_docs_in_class,
    keywords_coef,

    tfidf_vectorizer : TfidfVectorizer instance from sklearn,

    dict_scores_ : dictionary,
        Tf-idf scores for each tokens.

    max_score_ : np.array,

    Examples
    --------
    >>> from melusine.summarizer.keywords_generator import KeywordsGenerator
    >>> keywords_generator = KeywordsGenerator()
    >>> keywords_generator.fit(X, y)
    >>> keywords_generator.transform(X)
    >>> print(X['keywords'])

    """

    def __init__(self,
                 max_tfidf_features=10000,
                 keywords=keywords,
                 stopwords=stopwords,
                 resample=False,
                 n_jobs=20,
                 progress_bar=True,
                 copy=True,
                 n_max_keywords=6,
                 n_min_keywords=0,
                 threshold_keywords=0.0,
                 n_docs_in_class=100,
                 keywords_coef=10):
        self.max_tfidf_features_ = max_tfidf_features
        self.tfidf_vectorizer = TfidfVectorizer(max_features=max_tfidf_features
                                                )
        self.keywords = keywords
        self.stopwords = stopwords
        self.resample = resample
        self.n_jobs = n_jobs
        self.progress_bar = progress_bar
        self.copy = copy
        self.n_max_keywords = n_max_keywords
        self.n_min_keywords = n_min_keywords
        self.threshold_keywords = threshold_keywords
        self.n_docs_in_class = n_docs_in_class
        self.keywords_coef = keywords_coef

    def fit(self, X, y=None):
        """Fit the weighted tf-idf model with input data.

        If resample attribute is True the dataset will be resampled according
        to class distribution.

        Parameters
        ----------
        X : pandas.DataFrame, shape (n_samples, n_features)
            X must contain ['tokens'] column.

        y : Ignored

        Returns
        -------
        self : object
            Returns the instance itself.
        """
        if self.resample:
            X_resample = self.resample_docs(X, y)
        else:
            X_resample = X

        X_resample['tokens'] = X_resample['tokens'].apply(self._remove_stopwords)

        # fit tf-idf on resample data set
        tokens_joined = X_resample['tokens'].apply(lambda x: ' '.join(x))
        self.tfidf_vectorizer.fit(tokens_joined)

        # modify the idf weights given frequency in the corpus
        idf_weights = self._add_tf_to_idf(X_resample)
        self.tfidf_vectorizer._tfidf._idf_diag = sp.spdiags(idf_weights,
                                                            diags=0,
                                                            m=len(idf_weights),
                                                            n=len(idf_weights))

        # store each token's weighted idf score from the vectorizer
        self.dict_scores_ = dict(zip(self.tfidf_vectorizer.get_feature_names(),
                                     self.tfidf_vectorizer.idf_))
        self.max_score_ = np.max(self.tfidf_vectorizer.idf_)

        return self

    def transform(self, X):
        """Returns list of keywords in apparition order for each document
        with the weighted tf-idf already fitted.

        Parameters
        ----------
        X : pandas.DataFrame, shape (n_samples, n_features)
            X must contain ['tokens'] column.

        Returns
        -------
        X_new : pandas.DataFrame, shape (n_samples, n_components)
        """
        if self.copy:
            X_ = X.copy()
        else:
            X_ = X

        X_['keywords'] = apply_by_multiprocessing(df=X_[['tokens']],
                                                  func=self.get_keywords,
                                                  axis=1,
                                                  workers=self.n_jobs,
                                                  progress_bar=self.progress_bar)

        return X_

    def get_keywords(self, row):
        """Returns list of keywords in apparition order with the
        weighted tf-idf already fitted.

        Parameters
        ----------
        row : row of pd.Dataframe, columns ['tokens']

        Returns
        -------
        list of strings
        """
        tokens = self._remove_stopwords(row['tokens'])
        tokens = [x for x in tokens if not x.isdigit()]
        scores = Counter({t: self.dict_scores_.get(t, 0) for t in tokens})
        n = sum(i > self.threshold_keywords for i in list(scores.values()))
        n = min(n, self.n_max_keywords)
        n = max(n, self.n_min_keywords)
        keywords = [x[0] for x in scores.most_common(n)]
        index_sorted = [(k, tokens.index(k)) for k in keywords if k in tokens]
        index_sorted = sorted(index_sorted, key=lambda x: x[1])
        keywords_sorted = [i[0] for i in index_sorted]

        return keywords_sorted

    def resample_docs(self, X, y=None):
        """Method for resampling documents according to class distribution."""
        X_ = X.copy()
        if y is not None:
            X_['label'] = y
        X_['split'] = 0
        for c in X_.label.unique():
            N_c = X_[X_["label"] == c].shape[0]
            I_c = np.random.randint(0, self.n_docs_in_class+1, N_c)
            X_.loc[X_["label"] == c, 'split'] = I_c

        X_resample = pd.DataFrame(
            X_[['label', 'split', 'tokens']]
            .groupby(['label', 'split'], as_index=False)['tokens']
            .sum()
        )

        return X_resample

    def _remove_stopwords(self, tokens):
        """Method to filter stopwords from potential list of keywords."""
        return [t for t in tokens if t not in self.stopwords]

    def _add_tf_to_idf(self, X):
        """Returns the tf-idf weights of each tokens"""
        tokens_joined = X['tokens'].apply(lambda x: ' '.join(x))
        X_vec = self.tfidf_vectorizer.transform(tokens_joined)
        feature_names = self.tfidf_vectorizer.get_feature_names()
        idf_weights = self._get_weights(X_vec.toarray(),
                                        self.keywords,
                                        feature_names)

        return idf_weights

    def _get_weights(self, X_vec, keywords_list, feature_names):
        """Put max weights for each word of redistributed mails."""
        max_ = np.max(X_vec, axis=0)
        mmax_ = np.max(max_)
        for k in keywords_list:
            if k in feature_names:
                max_[feature_names.index(k)] = mmax_ * self.keywords_coef

        return max_
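# A hedged usage sketch on toy data (explicit keyword/stopword lists are passed
# instead of the conf-file defaults; the tokens themselves are made up):
import pandas as pd

X_toy = pd.DataFrame({'tokens': [['devis', 'assurance', 'habitation'],
                                 ['resiliation', 'contrat', 'auto']]})
kg = KeywordsGenerator(keywords=['devis', 'resiliation'],
                       stopwords=['le', 'la'],
                       n_jobs=1, progress_bar=False)
kg.fit(X_toy)
X_toy = kg.transform(X_toy)
print(X_toy['keywords'])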
Esempio n. 58
0
    files = os.listdir(gen_path + category)
    files = [file for file in files if file.find('txt') > 0]
    for name in files:
        path = gen_path + category + '/' + name
        with open(path, 'r') as f:
            data = f.read()
        text.append(clean_str(data))
        cat_list.append(start)
    start += 1

result = np.zeros((1, len(cat_list)), dtype=np.int)
result = result.tolist()[0]

tfidf = TfidfVectorizer(strip_accents='ascii')
sparse_matrix = tfidf.fit_transform(text)
vocab = tfidf.get_feature_names()
print type(sparse_matrix)
print sparse_matrix.shape

print 'Vocabulary Loaded:'

pkl_file = open('/Users/HENGJIE/Desktop/text repo/bbcsport/w2v_bbc.pkl', 'rb')
w2v = cPickle.load(pkl_file)
pkl_file.close()

pkl_file = open('/Users/HENGJIE/Desktop/text repo/bbcsport/glove_bbc.pkl',
                'rb')
glove = cPickle.load(pkl_file)
pkl_file.close()

# w2v = load_bin_vec('/Users/HENGJIE/Desktop/FYP Python/wv_google.bin',vocab)
Esempio n. 59
0
                    use_idf=True,
                    min_df=1,
                    smooth_idf=True,
                    norm='')
base = pd.read_csv("films.csv")
# add new empty columns for the tf-idf values
base['tfidf1'] = 0
base['tfidf2'] = 0
base['tfidf3'] = 0
base['tfidf4'] = 0

# compute idf
x = v.fit_transform(base.loc[:, 'storyline'].values.astype('U'))
idf = v.idf_
# build a dictionary of the form token -> idf value
dictineri = dict(zip(v.get_feature_names(), idf))

for i, row in base.iterrows():
    accStoryline = list(map(lambda x: x.lower(), row['storyline'].split()))
    trol = dict()
    # store the tf-idf values of the tokens into trol
    for accWord in accStoryline:
        foo = accWord.replace('.', '')
        if foo in dictineri:
            if foo in trol:
                trol[foo] += dictineri[foo]
            else:
                trol[foo] = dictineri[foo]

    # normalize trol using the natural log
    for k, v in trol.items():
Esempio n. 60
0
        for word in bagOfWordsA:
            numOfWordsA[word] += 1
    return numOfWordsA


# array_negative=count_vectorizer.fit_transform(data_negative.splitlines())
# array_positive=count_vectorizer.fit_transform(data_positive.splitlines())
# tfidf_negative = tfidf_vector.fit_transform(data_negative.splitlines())
# tfidf_positive = tfidf_vector.fit_transform(data_positive.splitlines())

# print(tfidf_negative)

data = open("test.txt").read().splitlines()

tf_idf = tfidf_vector.fit_transform(data)
print(tfidf_vector.get_feature_names())

array_train = []
list = []
for i in data_negative.splitlines():
    array_train.append(0)
for i in data_positive.splitlines():
    array_train.append(1)

print(tf_idf.toarray())
# X = count_vectorizer.fit_transform(data)
# tfidf_vector
# print(count_vectorizer.vocabulary_)
# array=X.toarray()
# print(array)
# for x in array: