Example no. 1
def main():
    # create tweets dataframe
    tweets = tfidf.build_corpus_from_csv(dataFile)
    # create just a list of tweets
    tweets_only = [tweet for tweet in tweets['Tweet']]
    # define stopset
    stopset = set(stopwords.words('english'))
    # tokenize the tweets in place
    tweets['Tweet'] = tfidf.tokenize_corpus(tweets['Tweet'], stopset)
    # print the 10 most frequent words for each tweet
    get_most_frequent_words(tweets, 10)

    ##############################

    # create vectorizer
    vectorizer = TfidfVectorizer(input='content', stop_words=stopset)
    # fit the vectorizer
    vectorizer.fit_transform(tweets_only)
    # get feature names
    tweet_features = vectorizer.get_feature_names()

    # Generate frequency distributions for each tweet
    freqs = []
    indices = []
    for (num, entry) in tweets.iterrows():
        freqs.append(FreqDist(entry['Tweet']))
        indices.append(num)
    # loop over the features and insert their frequencies in the dataframe
    for feature in tweet_features:
        tweets[feature] = pd.Series(
            [fd[feature] for fd in freqs],
            index=indices
        )
    # output a csv
    tweets.to_csv('frequencies.csv')
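
The FreqDist loop above rebuilds per-tweet counts even though the fitted vectorizer already produces a document-term matrix. A minimal sketch of reading the table straight from that matrix (placeholder tweets; values are tf-idf weights rather than raw counts):

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

tweets_only = ["first placeholder tweet", "another placeholder tweet"]
vectorizer = TfidfVectorizer(stop_words='english')
weights = vectorizer.fit_transform(tweets_only)
# get_feature_names_out() in scikit-learn >= 1.0
weight_df = pd.DataFrame(weights.toarray(), columns=vectorizer.get_feature_names())
weight_df.to_csv('tfidf_weights.csv')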
Example no. 2
class Q3Transformer(base.BaseEstimator, base.TransformerMixin):
    '''
        Instance attributes: self.col, self.vectorizer
    '''
    def __init__(self):      
        self.col = 'categories' # initialize the column name

    def fit(self, X, y=None):
        # pick the column
        pick_category = pick(self.col, X)
        category_train = [' '.join(pick_category[i].values()[0]) for i in range(0,len(pick_category))]
        
        # transform the training records
        self.vectorizer = TfidfVectorizer(min_df=1)  
        self.vectorizer.fit_transform(category_train)
        
        return self

    
    def transform(self, X):
        # transform the test record
        if type(X) is list:
            pick_category = pick(self.col, X)
            category_X = [' '.join(pick_category[i].values()[0]) for i in range(0,len(pick_category))]
        else:
            category_X = [' '.join(X[self.col])]
        
        X_trans = self.vectorizer.transform(category_X)
        return X_trans 
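
Since Q3Transformer follows the scikit-learn fit/transform protocol, it drops into a Pipeline; a minimal usage sketch, assuming `records` is a list of dicts carrying the 'categories' field that the `pick` helper above expects and `labels` holds the targets:

from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

pipeline = Pipeline([
    ('categories_tfidf', Q3Transformer()),  # fit() learns the vocabulary, transform() vectorizes
    ('clf', MultinomialNB()),
])
pipeline.fit(records, labels)
predictions = pipeline.predict(records)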
Example no. 3
def getNewsContext(newsObj,ent_ind,ents,vocab,window):          
    ent_text = {}
    for e in ent_ind:
        ent_text[e] = ''

    sentencesIn = []            
    sentencesInObj= []            
    entsIn = []

    # binary matrix
    
    indices = []
    indptr = [0]
    for news in newsObj:
        h_ent = news.h_ent
        s = makeEntText(h_ent,ent_text,ent_ind,indices,indptr,window)
        if s:
            sentencesIn.append( s )
            sentencesInObj.append(Sentence(s,news.created_at,h_ent,news.title))
        b_ent = news.b_ent
        for sentence in sent_detector.tokenize(b_ent.strip()):
            s = makeEntText(sentence,ent_text,ent_ind,indices,indptr,window)
            if s:
                sentencesIn.append( s )
                sentencesInObj.append(Sentence(s,news.created_at,sentence,news.title))
    newsVectorizer = TfidfVectorizer(stop_words='english',vocabulary=vocab,#use_idf=False,
        tokenizer=lambda text: news_tokenizer(text,'reg'))
    XN = newsVectorizer.fit_transform(sentencesIn) #

    for e in ents:
        entsIn.append(ent_text[e])
    XEn = newsVectorizer.fit_transform(entsIn)    

    NEb = csr_matrix((np.ones(len(indices)), indices, indptr), shape=(len(sentencesIn),len(ents) ))
    return XN,XEn,NEb,sentencesIn,sentencesInObj,ent_text
Example no. 4
def getTweetContext(tweetsObj,ent_ind,ents,vocab,window):          
    ent_text = {}
    for e in ent_ind:
        ent_text[e] = ''

    t0 = time()
    tweetsIn = []            
    tweetsInObj = []            
    entsIn = []
    indices = []
    indptr = [0]
    for i in tweetsObj:
        tweet = tweetsObj[i]
        tokens_ent = tweet.tokens_ent
        t = makeEntText(tokens_ent,ent_text,ent_ind,indices,indptr,window)
        if t:
            tweetsIn.append( t )
            tweetsInObj.append( tweet )

    print( "append in "+str(time() - t0))
    t0 = time()
    tweetVectorizer = TfidfVectorizer(stop_words='english',vocabulary=vocab,#use_idf=False,
        tokenizer=lambda text: tweet_tokenizer(text,'reg'))
    XT = tweetVectorizer.fit_transform(tweetsIn) 
    print( "vectorize in "+str(time() - t0))
    t0 = time()
    for e in ents:
        entsIn.append(ent_text[e])
    XEt = tweetVectorizer.fit_transform(entsIn)    
    print( "ents append + vec in "+str(time() - t0))

    TEb = csr_matrix((np.ones(len(indices)), indices, indptr), shape=(len(tweetsIn),len(ents) ))
    return XT,XEt,TEb,tweetsIn,tweetsInObj,ent_text
Example no. 5
def get_bow_vect_data_test(classif_data):
	vect = TfidfVectorizer()
	vect.fit_transform([classif_data["corpus"]])

	#Before we begin, get rid of any test articles with no topic
	vect_token_sets = []
	vect_test_sets = []

	#Transform testing and training data
	for i in classif_data["train_tokens"]:
		vect_token_sets.append(vect.transform([i]).toarray())

	for i in classif_data["test_tokens"]:
		vect_test_sets.append(vect.transform([i]).toarray())


	train_set = []
	test_set = []
	for i in vect_token_sets:
		train_set.append(i[0])
	for i in vect_test_sets:
		test_set.append(i[0])

	return {
		"vectorizer": vect,
		"train_vect": train_set,
		"test_vect": test_set
	}
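
The loops above call transform() once per document; transform() also accepts the whole list in one call, which the following sketch relies on (same `classif_data` layout as above):

from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer()
vect.fit([classif_data["corpus"]])
# one call per split instead of one call per document
train_set = vect.transform(classif_data["train_tokens"]).toarray()
test_set = vect.transform(classif_data["test_tokens"]).toarray()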
Example no. 6
class Classifier(object):
	def __init__(self):

		self.classifier = LogisticRegression(intercept_scaling=100)
		self.vectorizer = TfidfVectorizer()
	
	def trainvectorizer(self,corpus):
		
		self.vectorizer.fit_transform(corpus)
		file1 = open("feature_names.txt","w")
		names = self.vectorizer.get_feature_names()
		print len(names)
		for name in names:
			file1.write(name.encode('utf8')+"\n")
		file1.close()
		print "vectrizer train is over...."


	def trainclassifier(self,train_X,train_Y):
		
		self.classifier.fit(train_X,train_Y)
		print "classifier train is over ...."

	def getfeature(self,text):#return a feature array
		matrx = self.vectorizer.transform([text]).toarray()
		array = matrx[0]
		return array
		
	def getresult(self,feature):#return true or false
		
		return self.classifier.predict(feature)
Example no. 7
def tfidf_score(train_set, test_set):

    stopwords = nltk.corpus.stopwords.words('english')
    vectorizer = TfidfVectorizer(min_df=1, stop_words=set(stopwords))
    #Remove all the None Types from the input datasets
    train_set = filter(None, train_set)
    test_set = filter(None, test_set)
    vectorizer.fit_transform(train_set)
    #print "Word Index is {0} \n".format(vectorizer.vocabulary_)
    smatrix = vectorizer.transform(test_set)
    tfidf = TfidfTransformer(norm="l2")
    tfidf.fit(smatrix)
    #print "IDF scores:", tfidf.idf_
    tf_idf_matrix = tfidf.transform(smatrix)
    pairwise_similarity = tf_idf_matrix * tf_idf_matrix.T
    msum = tf_idf_matrix.sum(axis=1)
    cos_sum = pairwise_similarity.sum(axis=1)
    mlist = msum.tolist()
    cos_sim = cos_sum.tolist()
    count = 0
    tfidfscores = {}
    for s in train_set:
        tfidfscores[s] = []
        tfidfscores[s].append(mlist[count][0])
        tfidfscores[s].append(cos_sim[count][0])
        count += 1
    return tfidfscores
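
Because TfidfVectorizer applies l2 normalization by default, the pairwise cosine values can also be read directly from its output, without the extra TfidfTransformer pass; a minimal sketch with placeholder documents:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

docs = ["the cat sat on the mat", "the dog sat on the log", "cats and dogs"]
matrix = TfidfVectorizer(stop_words='english').fit_transform(docs)
pairwise = cosine_similarity(matrix)  # rows are already l2-normalized
cos_sum = pairwise.sum(axis=1)        # analogous to cos_sum above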
Example no. 8
def readFile(filename):
    
    global vectorizer
    
    train_data = pd.read_csv(filename, header=0, delimiter='\t', quoting=3)
    train_size = train_data.shape[0]
    
    
    
    clean_train = []
    for i in xrange(0,train_size):
        clean_train.append(filter(train_data['review'][i]))
        #if i%1000 ==0:
        #    print '%d reviews processed...' %i
   
    
    #vectorizer = CountVectorizer(analyzer = "word", tokenizer = None, preprocessor = None, stop_words = None, max_features = 5000)
    if vectorizer==None:
        vectorizer = TfidfVectorizer(sublinear_tf=True,max_df=0.5, max_features = 50000)
        train_data_feature = vectorizer.fit_transform(clean_train)
    else:
        vec = TfidfVectorizer(vocabulary=vectorizer.vocabulary_)
        train_data_feature = vec.fit_transform(clean_train)
        

    print train_data_feature.shape
    if 'test' in filename:
        return train_data['id'], train_data_feature
    else:
        return train_data['id'], train_data_feature, train_data['sentiment']
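
Building a second vectorizer from vectorizer.vocabulary_, as in the else branch above, recomputes idf weights from the file being read; if the training-time weights should be kept instead, the already-fitted global vectorizer can simply be reused. A sketch of that variant of the branch (same globals and `clean_train` as above):

    if vectorizer is None:
        vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, max_features=50000)
        train_data_feature = vectorizer.fit_transform(clean_train)
    else:
        # reuse the fitted vectorizer so idf comes from the original training data
        train_data_feature = vectorizer.transform(clean_train)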
Example no. 9
def createTDIDF():
    ## Bag of words
    with open("./data/movies.csv") as f:
        train_set1 = [line.lower().rstrip() for line in f]
    with open("./data/dvd.csv") as f:
        train_set2 = [line.lower().rstrip() for line in f]

    train_set = sorted(list(set(train_set1 + train_set2)))
    # Create dictionary to find movie
    dictTrain = dict()
    for i,movie in enumerate(train_set):
        dictTrain[movie] = i

    # Find weights
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix_train = tfidf_vectorizer.fit_transform(train_set)

    ## Tri-grams
    lenGram  = 3
    train_setBigrams = []
    for mov in train_set:
        temp = [mov[i:i+lenGram] for i in range(len(mov)-1)]
        temp = [elem for elem in temp if len(elem) == lenGram]
        train_setBigrams.append(' '.join(temp))

    train_setBigrams = sorted(list(set(train_setBigrams)))
    dictTrainBigrams = dict()
    for i,movie in enumerate(train_setBigrams):
        dictTrainBigrams[movie] = i
    tfidf_vectorizerBigrams = TfidfVectorizer()
    tfidf_matrix_trainBigrams = tfidf_vectorizerBigrams.fit_transform(train_setBigrams)

    return [tfidf_matrix_train,dictTrain,tfidf_matrix_trainBigrams,dictTrainBigrams,lenGram]
Example no. 10
def get_IDF_topn_words(data=[], n=3, vocabulary=None):
    vect = TfidfVectorizer(vocabulary=vocabulary)
    vect.fit_transform(data)
    indices = np.argsort(vect.idf_)[::-1] # idf_ and tfidf could also be used
    features = vect.get_feature_names()
    top_features = [features[i] for i in indices[:n]]
    return top_features
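
A minimal usage sketch for the helper above (placeholder documents; the highest-idf, i.e. rarest, terms come back first):

docs = ["red apple", "green apple", "green pear"]
print(get_IDF_topn_words(data=docs, n=2))  # e.g. ['red', 'pear']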
Example no. 11
def classify_svm(text):

	coarse_X = sets['coarse_training_qs']
	coarse_Y = sets['coarse_training_targets']
	fine_X = sets['fine_training_qs']
	fine_Y = sets['fine_training_targets']

	vectz_coarse = TfidfVectorizer(min_df=2, decode_error="ignore")
	vectz_fine = TfidfVectorizer(min_df=2, decode_error="ignore")
	coarse_X = vectz_coarse.fit_transform(coarse_X)
	fine_X = vectz_fine.fit_transform(fine_X)
	# vectorize the input text once per vocabulary
	coarse_to_classify = vectz_coarse.transform([text]).toarray()
	fine_to_classify = vectz_fine.transform([text]).toarray()

	
	# coarse
	svm_coarse = SVC(C=1000, gamma = 0.001, kernel='rbf')
	svm_coarse.fit(coarse_X, coarse_Y)
	# predict
	coarse_predict = svm_coarse.predict(coarse_to_classify)

	# fine
	svm_fine = SVC(C=1000, gamma = 0.001, kernel='rbf')
	svm_fine.fit(fine_X, fine_Y)
	# predict
	fine_predict = svm_fine.predict(fine_to_classify)

	results={}
	results['coarse_class'] = coarse_predict[0] 
	results['fine_class'] = fine_predict[0]

	return results
Example no. 12
def doTFIDF(train, test1, test2):
	steemedTrain = stemIt(train)
	steemedTest1 = stemIt(test1)
	steemedTest2 = stemIt(test2)
	print "done stemming tweets"

	regTrain = processIt(train)
	regTest1 = processIt(test1)
	regTest2 = processIt(test2)

	vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=1)

	X = vectorizer.fit_transform(regTrain) 
	Xtest1 = vectorizer.transform(regTest1)
	Xtest2 = vectorizer.transform(regTest2)
	scipy.io.mmwrite('train_reg_dataM',X, field='real')
	scipy.io.mmwrite('test1_reg_dataM',Xtest1, field='real')
	scipy.io.mmwrite('test2_reg_dataM',Xtest2, field='real')

	vectorizer = TfidfVectorizer(ngram_range=(1, 3), min_df=1)

	X = vectorizer.fit_transform(steemedTrain) 
	Xtest1 = vectorizer.transform(steemedTest1)
	Xtest2 = vectorizer.transform(steemedTest2)
	scipy.io.mmwrite('train_stem_dataM',X, field='real')
	scipy.io.mmwrite('test1_stem_dataM',Xtest1, field='real')
	scipy.io.mmwrite('test2_stem_dataM',Xtest2, field='real')
Example no. 13
    def get_features(vocab):
        vectorizer_head = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
        X_train_head = vectorizer_head.fit_transform(headlines)

        vectorizer_body = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
        X_train_body = vectorizer_body.fit_transform(bodies)

        # calculates the n most important topics of the bodies. Each topic contains all words, ordered by importance;
        # the more important words of a given topic a body contains, the higher the body's value for that topic
        lda_body = LatentDirichletAllocation(n_topics=n_topics, learning_method='online', random_state=0, n_jobs=3)

        print("latent_dirichlet_allocation_cos: fit and transform body")
        t0 = time()
        lda_body_matrix = lda_body.fit_transform(X_train_body)
        print("done in %0.3fs." % (time() - t0))

        print("latent_dirichlet_allocation_cos: transform head")
        # use the lda trained for body topics on the headlines => if the headlines and bodies share topics
        # their vectors should be similar
        lda_head_matrix = lda_body.transform(X_train_head)

        #print_top_words(lda_body, vectorizer_body.get_feature_names(), 100)

        print('latent_dirichlet_allocation_cos: calculating cosine distance between head and body')
        # calculate cosine distance between the body and head
        X = []
        for i in range(len(lda_head_matrix)):
            X_head_vector = np.array(lda_head_matrix[i]).reshape((1, -1)) #1d array is deprecated
            X_body_vector = np.array(lda_body_matrix[i]).reshape((1, -1))
            cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten()
            X.append(cos_dist.tolist())
        return X
Example no. 14
def get_samples_predictions(all_words, percent):

    def tokenizer(string):
        stemmer = snowball.EnglishStemmer(ignore_stopwords=True)
        regex = re.compile('\w\w+')
        return tuple(stemmer.stem(w) for w in regex.findall(string))

    vectorizer = TfidfVectorizer(
            input='filename',
            tokenizer=tokenizer,
            ngram_range=(1, 3),
            stop_words=stopwords.words(),
            max_df=0.95, # ignore words with a term frequency higher than 95% (corpus specific stopwords)
            vocabulary=all_words,
            use_idf=True, # use inverse-document-frequency reweighting
            sublinear_tf=True # tf is replaced with 1 + log(tf)
            )
    sample_fids, predictions = list(), list()
    for category in CATEGORIES:
        for fid in corpus.fileids(categories=category):
            sample_fids.append(os.path.join(FID_DIRECTORY, fid))
            predictions.append(CATEGORIES.index(category))
    shuffle_list(sample_fids, seed=123)
    shuffle_list(predictions, seed=123)
    training_fids, test_fids = split_list(sample_fids, percent=percent)
    training_samples = vectorizer.fit_transform(training_fids)
    test_samples = vectorizer.fit_transform(test_fids)
    training_predictions, test_predictions = split_list(predictions, percent=percent)
    return training_samples, training_predictions, test_samples, test_predictions
Example no. 15
File: lang.py Project: uotter/weibo
def feature_tfidf(train_lines, test_lines, train_text_index, test_text_index):
    start = time.time()
    train_text_arr, forward_train, comment_train, like_train = file_to_arr(train_lines, train_text_index, 'train')

    test_text_arr = file_to_arr(test_lines, test_text_index, 'test')
    end = time.time()
    print 'train and test file to array finished with: ' + str(end - start)
    start = time.time()
    # debug start
    # train_text_arr_nozero = []
    # comment_train_nozero = []
    # for i in range(len(comment_train)):
    #     if int(comment_train[i]) != 0:
    #         train_text_arr_nozero.append(train_text_arr[i])
    #         comment_train_nozero.append(comment_train[i])
    # train_text_arr = train_text_arr_nozero
    # comment_train = comment_train_nozero
    # debug end

    tv = TfidfVectorizer(sublinear_tf=True, max_df=0.5)
    tfidf_train = tv.fit_transform(train_text_arr)
    tv2 = TfidfVectorizer(vocabulary=tv.vocabulary_)
    tfidf_test = tv2.fit_transform(test_text_arr)
    end = time.time()
    print 'train and test array to tfidf feature finished with: ' + str(end - start)
    return tfidf_train, tfidf_test, forward_train, comment_train, like_train
Example no. 16
class text_similarity():
    
    def __init__(self, booklist):
        self.booklist = booklist
        self.alltext = ''
        self.merged = pd.DataFrame(columns = ['Book', 'Chapter', 'Verse', 'Original Text', 'Formatted Text'])
        self.booklistnames = []
        for i, book in enumerate(booklist):
            self.alltext += book.all_text
            self.merged = pd.merge(self.merged, book.df, how = 'outer')
            self.booklistnames = np.append(self.booklistnames, book.unique_books)
        self.vocabulary = self.alltext.split()
        self.vocabulary = [word for word in self.vocabulary if len(word) > 2]
        self.cosinedf = pd.DataFrame(columns=self.booklistnames, index=self.booklistnames)
        self.merged['Source'] = '' 
        sources = []
        for i, book in enumerate(self.merged['Book']):
            for books in self.booklist:
                if book in books.unique_books:
                    sources.append(books.name)
        self.merged['Source'] = sources
        self.vect = TfidfVectorizer(stop_words='english')
        self.vect.fit_transform(self.vocabulary)
        self.vectorize()
        self.cosine()
    
    def vectorize(self):
        self.tfidf_df = pd.DataFrame(columns= ['Book', 'Vector'])
        self.tfidf_df['Book'] = self.booklistnames
        
        for i, book in enumerate(self.booklistnames):
            joined = " ".join(self.merged[self.merged['Book'] == book]['Formatted Text'].values)
            self.tfidf_df.iloc[i, 1] = self.vect.transform([joined])
        
        vectors = []
        for i, line in enumerate(self.merged['Formatted Text']):
            vectors.append(self.vect.transform([line]))
            if i % 10 == 0:
                print i
        self.merged['Vectors'] = vectors
        print "vectorization complete"

    def cosine(self):
        self.cosinedf['Source'] = ''
        sources = []
        
        for i, book in enumerate(self.cosinedf):
            for books in self.booklist:
                if book in books.unique_books:
                    sources.append(books.name)
        self.cosinedf['Source'] = sources

        for i, book1 in enumerate(self.booklistnames):
            for j, book2 in enumerate(self.booklistnames):
                if book1 == book2:
                    self.cosinedf[book1][book2] = 1.
                elif i<j:
                    self.cosinedf[book1][book2] = cosine_similarity(self.tfidf_df[self.tfidf_df['Book'] == book1]['Vector'].values[0], 
                                  self.tfidf_df[self.tfidf_df['Book'] == book2]['Vector'].values[0])[0][0]
        print "cosine similarity complete"    
Example no. 17
def create_tf_idf_sim_matrix( title_rev_log, desc_rev_log, cr_area_top_level, title_file_name):
    #print "Title- rev", title_rev_log
    #print "Desc-rev", desc_rev_log
    #print "cr_area_top_level", cr_area_top_level
    #print "title_file_name", title_file_name
    
   # tfidf_vectorizer = TfidfVectorizer(stop_words='english',decode_error='ignore')
    tfidf_vectorizer = TfidfVectorizer(decode_error='ignore')
    title_rev_log_tfidf_matrix     = tfidf_vectorizer.fit_transform(title_rev_log)
    desc_rev_log_tfidf_matrix      = tfidf_vectorizer.fit_transform(desc_rev_log)
    cr_area_top_level_tfidf_matrix = tfidf_vectorizer.fit_transform(cr_area_top_level)
    title_file_name_tfidf_matrix   = tfidf_vectorizer.fit_transform(title_file_name)
    
    #print  "size=", title_rev_log_tfidf_matrix.shape,  desc_rev_log_tfidf_matrix.shape,  cr_area_top_level_tfidf_matrix.shape, title_file_name_tfidf_matrix.shape         
    #print  "Title Rev Log=",  title_rev_log_tfidf_matrix
    #print "Desc rev log = ",  desc_rev_log_tfidf_matrix
    #print "cr area top level=", cr_area_top_level_tfidf_matrix
    #print  "title file name=", title_file_name_tfidf_matrix
                        
    title_rev_log_sim_matrix      = cosine_similarity(title_rev_log_tfidf_matrix[0:1], title_rev_log_tfidf_matrix)
    desc_rev_log_sim_matrix       = cosine_similarity(desc_rev_log_tfidf_matrix[0:1], desc_rev_log_tfidf_matrix)
    cr_area_top_level_sim_matrix  = cosine_similarity(cr_area_top_level_tfidf_matrix[0:1], cr_area_top_level_tfidf_matrix)
    title_file_name_sim_matrix    = cosine_similarity( title_file_name_tfidf_matrix[0:1],  title_file_name_tfidf_matrix)
    
    #print "sim title-rev log", title_rev_log_sim_matrix    
    #print "desc rev log", desc_rev_log_sim_matrix      
    #print "cr area top", cr_area_top_level_sim_matrix 
    #print "title file name", title_file_name_sim_matrix
    
    return   title_rev_log_sim_matrix, desc_rev_log_sim_matrix, cr_area_top_level_sim_matrix, title_file_name_sim_matrix
Example no. 18
def readFile(filename):
    global vectorizer
    
    train_data = pd.read_csv(filename, header=0, delimiter='\t', quoting=3)
    train_size = train_data.shape[0]
    
    clean_train = []
    for i in xrange(0,train_size):
        clean_train.append(filter(train_data['review'][i]))
        if i%1000 ==0:
            print '%d reviews processed...' %i
    from sklearn.feature_extraction.text import TfidfVectorizer
    
    if vectorizer==None:
        vectorizer = TfidfVectorizer(sublinear_tf=True,max_df=0.9,ngram_range=(1,3),max_features=100000)
        train_data_feature = vectorizer.fit_transform(clean_train)
    else:
        vec = TfidfVectorizer(vocabulary=vectorizer.vocabulary_)
        train_data_feature = vec.fit_transform(clean_train)
        

    print train_data_feature.shape
    if 'test' in filename:
        return train_data['id'], train_data_feature
    else:
        return train_data['id'], train_data_feature, train_data['sentiment']
Example no. 19
def classify(good_deals,bad_deals,dictionary):
    word_with_low_freq = [word for word in dictionary.elements() if dictionary[word]<1]
    for word in word_with_low_freq:
        del dictionary[word]
    
    tfidf_vectorizer = TfidfVectorizer(vocabulary=dictionary)
    good_tfidf = tfidf_vectorizer.fit_transform(good_deals)
    bad_tfidf = tfidf_vectorizer.fit_transform(bad_deals)
    good_tfidf = good_tfidf.todense()
    bad_tfidf = bad_tfidf.todense()
    svm_data = []
    svm_data.append(good_tfidf)
    svm_data.append(bad_tfidf)
    svm_data = np.concatenate(svm_data)
    svm_pos_lables = np.ones(len(good_tfidf))
    svm_neg_lables = np.zeros(len(bad_tfidf))
    labels= []
    labels.append(svm_pos_lables)
    labels.append(svm_neg_lables)
    svm_labels  = np.concatenate(labels)
    
    param_grid = [
                  {'C': [1, 10, 100, 1000], 'gamma': [1,0.1,0.001, 0.0001],'kernel': ['linear']},
                  {'C': [1, 10, 100, 1000], 'gamma': [1,0.1,0.001, 0.0001], 'kernel': ['rbf']},
                  ]
    svc = svm.SVC()
    clf = grid_search.GridSearchCV(estimator=svc, param_grid=param_grid,n_jobs=1)
    print "Training SVM classifier for grid of C and gamma values to select best parameter\n"
    clf.fit(svm_data,svm_labels)
    print "svm score",clf.best_score
    print "svm gamma value",clf.best_estimator.gamma
    print "svm C value",clf.best_estimator.C
    print "svm kernel",clf.best_estimator.kernel
    return clf
Example no. 20
def score(testStr,candList1):
    batch_sz=1000
    from sklearn.feature_extraction.text import TfidfVectorizer
    totCandidate=[];totInd=[]
    batch_num=int(math.ceil(len(candList1)/float(batch_sz))) #51/50.0->2.0
    for batch in range(batch_num)[:]:
        corpus=np.array(candList1)[batch*batch_sz:(batch+1)*batch_sz];#list print corpus[0],corpus[1] #'北京市 海淀区 西三旗' '人民日报社 爱玛 客 餐厅'
        #############
        # tf idf
        vectorizer = TfidfVectorizer(ngram_range=(1,1),min_df=1)
        corpus=list(corpus)
        corpus.append(testStr)
        rst=vectorizer.fit_transform(corpus)#the last one is testStr
        #print vectorizer.get_feature_names()
        #for w in vectorizer.get_feature_names():
            # print w ##no '客'
        rst=rst.toarray()
        #print 'feature',rst.shape #[n,dim]
        ################
        # calculate distance
        test=rst[-1,:].reshape((1,-1))#[1,d]
        compare=rst[:-1,:] #[n,d]
        dist=calc_EuDistance(test,compare);#print 'eu-dist min max',np.min(dist),np.max(dist)
        rank=np.argsort(dist)[:50]#index ,from smallScore->largeScore sort
        candidateList=[corpus[ii] for ii in rank]#list
        totCandidate=totCandidate+candidateList
        #
        indList=[batch*batch_sz+ij for ij in rank]
        totInd=totInd+indList
        #score=dist[rank] #array
        #for i in range(len(candidateList))[:]:
         #   print candidateList[i],'eu-dist',score[i]
    #############
    print 'tot candidate',len(totCandidate)
    ###################
    # total candidate
    ## idf
    vectorizer = TfidfVectorizer(ngram_range=(1,1),min_df=1)
    corpus=totCandidate
    corpus.append(testStr)
    rst=vectorizer.fit_transform(corpus)
    rst=rst.toarray()
    # distance
    test=rst[-1,:].reshape((1,-1))#[1,d]
    compare=rst[:-1,:] #[n,d]
    dist=calc_EuDistance(test,compare)
    # pick up distance<=1.2
    distInd=np.where(dist<=1.2)[0]#row index
    dist=dist[distInd]
    corpus=[corpus[ij] for ij in distInd]
    totIndArr=np.array(totInd)[distInd]
    #
    rank=np.argsort(dist)#[:20]#index ,from smallScore->largeScore sort
    candidateList=[corpus[ii] for ii in rank]#list
    score=dist[rank] #array
    for i in range(len(candidateList))[:]:
        print strUnique(candidateList[i]),'eu-dist',score[i]
    ############
    return totIndArr
Example no. 21
def vectorize(msg_input):
    from sklearn.feature_extraction.text import TfidfVectorizer
    from nltk.corpus import stopwords

    vectorizer = TfidfVectorizer(stop_words='english')
    vectorizer.fit_transform(msg_input)

    return vectorizer.get_feature_names()
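
A quick usage sketch for the helper above (placeholder messages; note it returns only the learned vocabulary, not the tf-idf matrix):

messages = ["free entry in a weekly competition", "meet me for lunch"]
print(vectorize(messages))  # sorted vocabulary terms, English stop words removed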
Example no. 22
def main():
    start = time.time()
    args = get_args()

    if args.class_file:
        wid_to_class = OrderedDict()
        groups = OrderedDict()
        for line in args.class_file:
            splt = line.strip().split(',')
            groups[splt[1]] = groups.get(splt[1], []) + [int(splt[0])]
            wid_to_class[int(splt[0])] = splt[1]
        classes = groups.keys()

    logger.info(u"Loading CSV...")
    lines = [line.decode(u'utf8').strip() for line in args.infile if line.strip()]
    wid_to_features = OrderedDict([(int(splt[0]), u" ".join(splt[1:])) for splt in
                                   [line.split(u',') for line in lines]
                                   if int(splt[0]) in wid_to_class
                                   ])

    unknowns = OrderedDict([(int(splt[0]), u" ".join(splt[1:])) for splt in
                            [line.split(u',') for line in lines]
                            if int(splt[0]) not in wid_to_class
                            ])

    logger.info(u"Vectorizing...")
    vectorizer = TfidfVectorizer()
    feature_keys, feature_rows = zip(*[(classes.index(wid_to_class[int(key)]), features)
                                       for key, features in wid_to_features.items()
                                       if int(key) in wid_to_class])

    vectorizer.fit_transform(feature_rows)
    logger.info(u"Vectorized feature rows")
    training_vectors = vectorizer.transform(feature_rows).toarray()
    logger.info(u"Vectorized training features")

    logger.info(u"Training %d classifiers" % len(args.classifiers))

    classifiers = dict()
    for classifier_string in args.classifiers:
        clf = Classifiers.get(classifier_string)
        classifier_name = Classifiers.classifier_keys_to_names[classifier_string]

        logger.info(u"Training a %s classifier on %d instances..." % (classifier_name, len(training_vectors)))
        clf.fit(training_vectors, feature_keys)
        classifiers[classifier_string] = clf
        logger.info(u"Trained.")

    for counter, (wid, unknown) in enumerate(unknowns.items()):
        prediction_matrix = [classifier.predict_proba(vectorizer.transform([unknown]).toarray())
                             for classifier in classifiers.values()]
        summed_probabilities = np.sum(prediction_matrix, axis=0)[0]
        unknown_class = classes[list(summed_probabilities).index(max(summed_probabilities))]
        args.outfile.write(u"%s,%s\n" % (wid, unknown_class))
        if counter % 1000 == 0:
            logger.info(counter)

    logger.info(u"Finished in %.2f seconds" % (time.time() - start))
Example no. 23
def train_test(args):
    
    # unpack arguments and make train/test data/label dicts/lists
    train, test, features, classifier = args

    # create tf-idf sparse matrix from training data
    if features == 'tfidf':
        fe = TfidfVectorizer(tokenizer=tokenize, stop_words='english', max_features=1290)
        trainfe = fe.fit_transform(train['data'])
    elif features == 'dict':
        fe = CountVectorizer(tokenizer=tokenize, stop_words='english', binary=True)
        trainfe = fe.fit_transform(train['data'])
    elif features == 'lsa':
        svd = TruncatedSVD(n_components=100, random_state=42)
        fe = TfidfVectorizer(tokenizer=tokenize, stop_words='english', max_df=0.115, max_features=11500)
        trainfe = svd.fit_transform(fe.fit_transform(train['data']))
    elif features == 'rule':
        hamfe = CountVectorizer(tokenizer=tokenize, stop_words='english', max_features=1150)
        spamfe = CountVectorizer(tokenizer=tokenize, stop_words='english', max_features=1150)
        hamfit = hamfe.fit_transform(train['data'].loc[train['labels'] == 0])
        spamfit = spamfe.fit_transform(train['data'].loc[train['labels'] == 1])

    # train multinomial nb classifier on training data
    if classifier == 'mnb':
        from sklearn.naive_bayes import MultinomialNB
        clf = MultinomialNB().fit(trainfe, train['labels'])
    elif classifier == 'gnb':
        from sklearn.naive_bayes import GaussianNB
        clf = GaussianNB().fit(trainfe.toarray(), train['labels'])
    elif classifier == 'svm':
        from sklearn.linear_model import SGDClassifier
        clf = SGDClassifier(loss='squared_hinge', penalty='l2').fit(trainfe, train['labels'])
    elif classifier == 'log':
        from sklearn.linear_model import SGDClassifier
        clf = SGDClassifier(loss='log', penalty='l2').fit(trainfe, train['labels'])
    elif classifier == 'rule':
        hamfeats = hamfe.transform(test['data'])
        spamfeats = spamfe.transform(test['data'])
        hyp = np.array(hamfeats.sum(axis=1) < spamfeats.sum(axis=1)).reshape(-1).T
        
    # extract features from test data
    if features == 'lsa':
        feats = svd.transform(fe.transform(test['data']))
    else:
        feats = fe.transform(test['data'])
    # use trained classifier to generate class predictions from test features
    if classifier == 'gnb':
        hyp = clf.predict(feats.toarray())
    elif classifier == 'rule':
        pass
    else:
        hyp = clf.predict(feats)

    # compare predictions with test labels
    score = np.mean(hyp == test['labels'])

    return score
Example no. 24
def vectorize_on_dict(full_paper):
	""" receives a list of articles and vectorizes on keywords from articles"""
	articles = [' '.join(f.keywords) for f in full_paper]	
	#articles = list(itertools.chain(*articles))
	#print articles[0]
	#print articles
	vectorizer = TfidfVectorizer()
	matrix = vectorizer.fit_transform(articles)
	return vectorizer, matrix
Example no. 25
class Corpus(object):
    def buildCorpus(self, region, time_interval, element_type='photos', paras={}):
        # time_interval should be [start, end]
        text = []
        if element_type == 'photos':
            ei = PhotoInterface()
            cur = ei.rangeQuery(region, time_interval, 'caption.text')
        else:
            ei = TweetInterface()
            cur = ei.rangeQuery(region, time_interval, 'text')
        for t in cur:
            try:
                if element_type == 'photos':
                    text.append(t['caption']['text'])
                else:
                    text.append(t['text'])
            except:
                pass

        # it is not really appropriate to set up stop words here
        self._vectorizer = TfidfVectorizer(max_df=paras.get('max_df', 0.2),
                                           min_df=paras.get('min_df', 0.0),
                                           strip_accents=paras.get('strip_accents', 'ascii'),
                                           preprocessor=paras.get('preprocessor', tool.textPreprocessor),
                                           smooth_idf=paras.get('smooth_idf', True),
                                           sublinear_tf=paras.get('sublinear_tf', True),
                                           norm=paras.get('norm', 'l2'),
                                           analyzer=paras.get('analyzer', 'word'),
                                           ngram_range=paras.get('ngram_range', (1, 1)),
                                           stop_words=paras.get('stop_words', 'english')
        )

        # If the program does not break here, we can ignore the error
        try:
            self._vectorizer.fit_transform(text)
        except Exception as error :
            logging.warn(error)

    def getVectorizer(self):
        return self._vectorizer

    def chooseTopWordWithHighestTDIDF(self, text, k=10):
        voc = self._vectorizer.get_feature_names()
        tf_vec = self._vectorizer.transform([text]).mean(axis=0)
        nonzeros = np.nonzero(tf_vec)[1]
        res_list = nonzeros.ravel().tolist()[0]
        values = []
        words = []
        for n in res_list:
            words.append(voc[n])
            values.append(tf_vec[0, n])
        while len(values) < k:
            values.append(0)
            #return res_list, words, values
        return values
Example no. 26
def get_feature_cosine_similarity(train):
  feature_prod_title = []
  feature_prod_desc = []
  #ensure the size is as required
  vect = TfidfVectorizer(min_df=1)
  for _ , row in train.iterrows():
    cos_prod_title = vect.fit_transform([row["product_title"],row["search_term"]])
    cos_prod_desc = vect.fit_transform([row["product_description"],row["search_term"]])
    feature_prod_title.append((cos_prod_title*cos_prod_title.T).A[0][1])
    feature_prod_desc.append((cos_prod_desc*cos_prod_desc.T).A[0][1])
  return feature_prod_title,feature_prod_desc
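
A minimal usage sketch for the feature builder above, with a tiny placeholder DataFrame standing in for the real training set:

import pandas as pd

sample = pd.DataFrame({
    "product_title": ["steel angle bracket", "flat metal plate"],
    "product_description": ["galvanized steel bracket for shelving", "thin plate of sheet metal"],
    "search_term": ["angle bracket", "metal plate"],
})
title_sims, desc_sims = get_feature_cosine_similarity(sample)
print(title_sims, desc_sims)  # one cosine similarity per row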
Example no. 27
    def init_tfidf(self):
        train_data = pd.read_csv('%s/train.csv' % self.config.get('DIRECTORY', 'origin_pt')).fillna(value="")  # [:100]
        test_data = pd.read_csv('%s/test.csv' % self.config.get('DIRECTORY', 'origin_pt')).fillna(value="")  # [:100]

        tfidf = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
        tfidf_txt = pd.Series(
            train_data['question1'].tolist() + train_data['question2'].tolist() + test_data['question1'].tolist() +
            test_data['question2'].tolist()).astype(str)
        tfidf.fit_transform(tfidf_txt)
        LogUtil.log("INFO", "init tfidf done ")
        return tfidf
Example no. 28
def get_X_train(data, wn=False, ignore=False, max_n_gram=1, lowercase=True, nopunc=False, lemmatize=False, stem=False, remove_stop_words=True, tfidf=False, verbose=True):

    if verbose:
        print('Using n-grams of up to %d words in length' % max_n_gram)

    if lowercase and verbose:
        print('Converting all text to lowercase')

    if lemmatize:
        tokenizer = LemmaTokenizer(nopunc)
        if verbose:
            print('Lemmatizing all words')
    elif stem:
        tokenizer = StemTokenizer(nopunc)
        if verbose:
            print('Stemming all words')
    else:
        tokenizer = None

    if remove_stop_words:
        stop_words = 'english'
        if verbose:
            print('Removing English stop words')
    else:
        stop_words = None

    t0 = time()
    if tfidf:
        if verbose:
            print()
            print('Extracting features from the training data using a tfidf vectorizer')
        vectorizer = TfidfVectorizer(lowercase=lowercase, tokenizer=tokenizer, stop_words=stop_words, ngram_range=(1, max_n_gram))
        X_train = vectorizer.fit_transform(data)
    else:
        if verbose:
            print('Extracting features from the training data using a count vectorizer')
        vectorizer = CountVectorizer(lowercase=lowercase, tokenizer=tokenizer, stop_words=stop_words, ngram_range=(1, max_n_gram))
        if wn:
            print('Learning a vocabulary dictionary with a count vectorizer')
            vectorizer.fit(data)
            print('Done learning vocabulary dictionary')
            vectorizer = WordNetVectorizer(vectorizer)
            print('Getting wordnet based feature vectors...')
            X_train = vectorizer.get_word_net_feature_vecs(data, ignore)
            print('Done getting wordnet based feature vectors')
        else:
            X_train = vectorizer.fit_transform(data)
    duration = time() - t0
    if verbose:
        data_train_size_mb = size_mb(data)
        print('done in %fs at %0.3fMB/s' % (duration, data_train_size_mb / duration))
        print('n_samples: %d, n_features: %d' % X_train.shape)
        print()
    return X_train, vectorizer
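
A minimal usage sketch for get_X_train above (placeholder documents, tf-idf features, bigrams, quiet output):

docs = ["The quick brown fox jumps over the lazy dog",
        "Never jump over a lazy dog again"]
X_train, vectorizer = get_X_train(docs, max_n_gram=2, tfidf=True, verbose=False)
print(X_train.shape)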
Example no. 29
class Cluster:

    def __init__(self):
        self.train_file = os.path.join('data', 'sample')

    def run_main(self):
        self.load_data()
        self.vectorize()

        #KMeans - K++
        print "KMeans - K++"
        self.kmeans = KMeans(n_clusters=3, init='k-means++', n_init=10000)
        self.train()
        self.get_metrics()

        #MiniBatchKMeans - K++
        print "MiniBatchKMeans - K++"
        self.kmeans = MiniBatchKMeans(n_clusters=3, init='k-means++', n_init=10000)       
        self.train()
        self.get_metrics()
 
        #KMeans - Random
        print "KMeans - Random"
        self.kmeans = KMeans(n_clusters=3, init='random', n_init=10000)
        self.train()
        self.get_metrics()

        #MiniBatchKMeans - K++
        print "MiniBatchKMeans - Random"
        self.kmeans = MiniBatchKMeans(n_clusters=3, init='random', n_init=10000)       
        self.train()
        self.get_metrics()


    def load_data(self):
        self.training_data = []
        with open(self.train_file, 'r') as fd:
            for line in fd.readlines():
                self.training_data.append(line)

    def vectorize(self):
        self.vect = TfidfVectorizer(stop_words='english')  
        self.X = self.vect.fit_transform(self.training_data)

    def train(self):
        self.kmeans.fit(self.X)        

    def get_metrics(self):
        print self.kmeans.labels_ 

    def test(self):
        self.test_data = ["I know both Ashok and Harini"]
        self.Y = self.vect.transform(self.test_data)
        print self.kmeans.predict(self.Y)
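
A short driver for the Cluster class above (it expects a data/sample file with one document per line):

if __name__ == '__main__':
    cluster = Cluster()
    cluster.run_main()  # vectorize the sample file and fit each k-means variant
    cluster.test()      # assign a held-out sentence to one of the learned clusters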
Example no. 30
def tf_idf_threading(table,dates):
    for date in dates:
        corpus = reviews[date]
        if len(corpus) == 1 and len(corpus[0]) == 1:
            continue
        print("TF-IDF processing " + date)

        vectorizer = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
        vectorizer.fit_transform(corpus)
        idf = vectorizer.idf_
        table[date] = dict(zip(vectorizer.get_feature_names(), idf))
    return
Example no. 31
vectorizer = CountVectorizer(max_features=2000)
ingredients = train['ingredients']
words_list = [' '.join(x) for x in ingredients]

#Make label encoder
le = preprocessing.LabelEncoder()
le.fit(train["cuisine"])

#create a bag of words and convert to a array and then print the shape
bag_of_words = vectorizer.fit(words_list)
bag_of_words = vectorizer.transform(words_list).toarray()
print(bag_of_words.shape)

vectorizertfidf = TfidfVectorizer(min_df=1)
tfidf = vectorizertfidf.fit_transform(words_list).toarray()
print tfidf.shape

X = bag_of_words
y = le.transform(train["cuisine"])

print X.shape
print y.shape

dtrain = xgb.DMatrix(X, label=y)

n_classes = len(list(set(y)))

param = {
    'max_depth': 14,
    'eta': 1,
Example no. 32
"""# Content Based

## TF-IDF
"""

movies.genres = movies.genres.str.split('|')

movies.head()

movies.genres = movies.genres.fillna("").astype('str')
movies.head()

from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['genres'])
tfidf_matrix.shape

tfidf_matrix

print(tf.get_feature_names())

"""## Cosine Similarity"""

from sklearn.metrics.pairwise import cosine_similarity
sim = cosine_similarity(tfidf_matrix)

sim.shape

sim[:4, :4]
Example no. 33
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC as svc 
from sklearn.metrics import make_scorer, roc_auc_score
from scipy import stats
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

 
# DATA PREPARATION
data = pd.read_csv('/home/sergi/CityRoad_Disruptions/DataSet/preprocessed.csv', sep='\t',
                    lineterminator='\n')
text = data['Text']
target = data['Class']

tf_vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2))
train_matrix = tf_vectorizer.fit_transform(text.values.astype('U')).toarray()
X_train, X_test, y_train, y_test = train_test_split(train_matrix, target, test_size=.3)
 
# DEFINE MODEL AND PERFORMANCE MEASURE
mdl = svc(probability = True, random_state = 1)
auc = make_scorer(roc_auc_score)
 
# GRID SEARCH FOR 20 COMBINATIONS OF PARAMETERS
grid_list = {"C": np.arange(2, 10, 2),
             "gamma": np.arange(0.1, 1, 0.2)}
 
grid_search = GridSearchCV(mdl, param_grid = grid_list, n_jobs = 4, cv = 3, scoring = auc) 
grid_search.fit(X_train, y_train) 
grid_search.cv_results_
 
def report(results, n_top=3):
Example no. 34
def tfidf_vec(corpus):
    tfidf = TfidfVectorizer()
    train_vec = tfidf.fit_transform(corpus)
    # for test data
    # tfidf.transform(['ya Allah meri sister Affia ki madad farma', 'khud chahta a is umar main shadi'])
    return train_vec, tfidf
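
A small usage sketch for tfidf_vec above: the second return value is the fitted vectorizer, which is what gets reused on unseen text (placeholder documents):

corpus = ["this film was great", "this film was terrible"]
train_vec, tfidf = tfidf_vec(corpus)
test_vec = tfidf.transform(["a truly great film"])  # reuse the fitted vocabulary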
Example no. 35
for row in dataset.itertuples():
    # make sure its a string
    if isinstance(row.SYMPTOM_TEXT, str):
        curr_words = []
        tokenized = nltk.tokenize.word_tokenize(row.SYMPTOM_TEXT)
        for word in tokenized:

            pre = preprocess(word)

            if pre != '':
                pre = nltk.PorterStemmer().stem(pre)
                curr_words.append(pre)
        sentences.append(' '.join(curr_words))

vect = TfidfVectorizer(max_features=1500, min_df=5, max_df=0.7)
tfidf_matrix = vect.fit_transform(sentences).toarray()
feature_names = vect.get_feature_names()

dataset['SERIOUS'][100:1000] = 'Y'

X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix,
                                                    dataset['SERIOUS'],
                                                    test_size=0.2,
                                                    random_state=0)

classifier = RandomForestClassifier(n_estimators=1000, random_state=0)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

print(confusion_matrix(y_test, y_pred))

Example no. 36
# In[24]:


df.columns


# # Step 2: TF-IDF Factorization of text column

# In[25]:


from sklearn.feature_extraction.text import TfidfVectorizer
v = TfidfVectorizer(sublinear_tf=True, max_df=0.3,min_df=0.10,max_features=500, analyzer='word', stop_words='english', ngram_range =(1,3), use_idf = True)
x = v.fit_transform(df['text'])


# In[26]:


x


# In[27]:


v


# In[28]:
Example no. 37
        bagOfWordsA = x.split()
        for word in bagOfWordsA:
            numOfWordsA[word] += 1
    return numOfWordsA


# array_negative=count_vectorizer.fit_transform(data_negative.splitlines())
# array_positive=count_vectorizer.fit_transform(data_positive.splitlines())
# tfidf_negative = tfidf_vector.fit_transform(data_negative.splitlines())
# tfidf_positive = tfidf_vector.fit_transform(data_positive.splitlines())

# print(tfidf_negative)

data = open("test.txt").read().splitlines()

tf_idf = tfidf_vector.fit_transform(data)
print(tfidf_vector.get_feature_names())

array_train = []
list = []
for i in data_negative.splitlines():
    array_train.append(0)
for i in data_positive.splitlines():
    array_train.append(1)

print(tf_idf.toarray())
# X = count_vectorizer.fit_transform(data)
# tfidf_vector
# print(count_vectorizer.vocabulary_)
# array=X.toarray()
# print(array)
Example no. 38
              columns=['category', 'Train Count', 'Test Count'
                       ]).sort_values(by=['Train Count', 'Test Count'],
                                      ascending=False))

# ### TF-IDF Model

# +
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(min_df=0.,
                     max_df=1.,
                     norm="l2",
                     use_idf=True,
                     smooth_idf=True)

tv_train_features = tv.fit_transform(train_corpus)
tv_test_features = tv.transform(test_corpus)

print('TF-IDF model:> Train features shape:', tv_train_features.shape,
      ' Test features shape:', tv_test_features.shape)
# -

tv_matrix = tv_train_features.toarray()
vocab = tv.get_feature_names()
pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)

# ### ML algorithms on TF-IDF model

import time
import warnings
warnings.filterwarnings('ignore')
Example no. 39
                data.append(twt)
                labels.append(c)
                print feature, twt
        # break

L = len(full_data)
random.shuffle(full_data)
train_data = [i[1] for i in full_data[:int(0.8 * L)]]
train_features = [i[0] for i in full_data[:int(0.8 * L)]]
train_labels = [i[2] for i in full_data[:int(0.8 * L)]]
test_data = [i[1] for i in full_data[int(0.8 * L):]]
test_features = [i[0] for i in full_data[int(0.8 * L):]]
test_labels = [i[2] for i in full_data[int(0.8 * L):]]
vectorizer = TfidfVectorizer(min_df=5,
                             max_df=0.8,
                             sublinear_tf=True,
                             use_idf=True,
                             decode_error='ignore')

train_vectors = vectorizer.fit_transform(train_data)
test_vectors = vectorizer.transform(test_data)
final_train = np.hstack([train_features, train_vectors.toarray()])
final_test = np.hstack([test_features, test_vectors.toarray()])
print final_train

# classifier_rbf = svm.SVC(kernel='rbf')
# classifier_rbf.fit(final_train,train_labels)
# prediction_rbf=classifier_rbf.predict(final_test)
# print(classification_report(test_labels, prediction_rbf))
# print(accuracy_score(test_labels, prediction_rbf))
Example no. 40
from nlpia.data.loaders import harry_docs as docs
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = docs
vectorizer = TfidfVectorizer(min_df=1)
model = vectorizer.fit_transform(corpus)
print(model.todense().round(2))
Example no. 41
def train_sklearn():
    good_heads = [
        "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12", "13",
        "14", "15", "16"
    ]
    bad = pandas.read_csv("bad.csv", delimiter=";")['address'].sample(frac=1)
    good = pandas.read_csv("good.csv", delimiter=";",
                           names=good_heads)["2"].sample(frac=1)

    train_good = good[:130000]
    train_bad = bad[:130000]
    test_good = good[130000:]
    test_bad = bad[130000:]

    train_data = []
    for i in train_good:
        train_data.append([i, 1])
    for i in train_bad:
        train_data.append([i, 0])

    test_data = []
    for i in test_good:
        test_data.append([i, 1])
    for i in test_bad:
        test_data.append([i, 0])

    np.random.shuffle(train_data)
    np.random.shuffle(test_data)

    train_x = []
    train_y = []
    for i in train_data:
        train_x.append(i[0])
        train_y.append(i[1])

    test_x = []
    test_y = []
    for i in test_data:
        test_x.append(i[0])
        test_y.append(i[1])

    vectorizer = TfidfVectorizer(min_df=5)
    train_x = vectorizer.fit_transform(train_x)
    test_x = vectorizer.transform(test_x)

    model = LogisticRegression(random_state=42)
    # model = GradientBoostingClassifier(n_estimators=250, random_state=42, verbose=1, max_features='sqrt')
    # model = RandomForestClassifier(n_estimators=10, verbose=1, random_state=241, n_jobs=-1, max_features='sqrt')
    model.fit(train_x, train_y)

    # scores_train = list(map(lambda i: roc_auc_score(train_y, i[:, 1]), list(model.staged_predict_proba(train_x))))
    # scores_test = list(map(lambda i: roc_auc_score(test_y, i[:, 1]), list(model.staged_predict_proba(test_x))))

    # scores_train = list(model.staged_predict_proba(train_x))
    # scores_test = list(model.staged_predict_proba(test_x))

    # plt.figure()
    # plt.plot(scores_train, 'r', linewidth=2)
    # plt.plot(scores_test, 'g', linewidth=2)
    # plt.legend(['test', 'train'])
    # plt.show()

    # score = roc_auc_score(test_y, model.predict_proba(test_x)[:, 1])
    pred = model.predict(test_x)
    score_f1 = f1_score(test_y, pred)
    score_recall = recall_score(test_y, pred)
    score_accuracy = accuracy_score(test_y, pred)

    print("f1:       ", score_f1)
    print("recall:   ", score_recall)
    print("accuracy: ", score_accuracy)

    if not os.path.exists('models'):
        os.makedirs('models')
    dump(model, 'models/lr1.model')
    dump(vectorizer, 'models/vectorizer1.model')
Example no. 42
authors = pickle.load(open(authors_file, "r"))

### test_size is the percentage of events assigned to the test set (the
### remainder go into training)
### feature matrices changed to dense representations for compatibility with
### classifier functions in versions 0.15.2 and earlier
#from sklearn import cross_validation
from sklearn.model_selection import train_test_split
features_train, features_test, labels_train, labels_test = train_test_split(
    word_data, authors, test_size=0.1, random_state=42)

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(sublinear_tf=True,
                             max_df=0.5,
                             stop_words='english')
features_train = vectorizer.fit_transform(features_train)
features_test = vectorizer.transform(features_test).toarray()

### a classic way to overfit is to use a small number
### of data points and a large number of features;
### train on only 150 events to put ourselves in this regime
features_train = features_train[:150].toarray()
labels_train = labels_train[:150]

### your code goes here
from sklearn import tree
from sklearn.metrics import accuracy_score

clf = tree.DecisionTreeClassifier()
clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
Example no. 43
with open('datasets\\dataset_10.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    documents = [""]
    comptageLigne = 0
    for row in csv_reader:
        documents = numpy.append(documents, [row[3]])
        comptageLigne += 1
        documents = numpy.append(documents, [row[3]])
        comptageLigne += 1
csv_file.close()

#Initialize the list of words to ignore when searching the file
listeCompleteMotsBloques = stopwords.words('english') + stopwords.words(
    'spanish')
vectoriseur = TfidfVectorizer(stop_words=listeCompleteMotsBloques)
X = vectoriseur.fit_transform(documents)

#Initialize the cluster search
nombreClusters = 100
modele = KMeans(n_clusters=nombreClusters,
                init='k-means++',
                max_iter=6000,
                n_init=1)
modele.fit(X)

ordreCentroides = modele.cluster_centers_.argsort()[:, ::-1]
termes = vectoriseur.get_feature_names()

#Detect the clusters and save them to the motsClusters.csv file
with open('results\\motsClusters.csv', mode='w') as clusters_file:
    motsClusters = csv.writer(clusters_file,
Example no. 44
    config = DBConfig(working_dir + "/db.ini").read_db_config()
    # Open database connection
    db = MySQLdb.connect(**config)
    data = sql.read_sql(queryAnswers % question_id, db)
    cursor = db.cursor()
    vectorizer = TfidfVectorizer(tokenizer=process_text,
                                 stop_words=stopwords,
                                 max_df=max_df,
                                 min_df=min_df,
                                 use_idf=True,
                                 lowercase=True)

    docs = data['Value'].tolist()
    ids = data['ID'].tolist()

    tfidf_model = vectorizer.fit_transform(docs)

    km = KMeans(n_clusters=num_clusters,
                init='k-means++',
                max_iter=100000,
                n_init=1)
    km.fit(tfidf_model)

    clusters = km.labels_.tolist()

    # create main data frame
    frame = pd.DataFrame({
        'ids': ids,
        'answers': docs,
        'cluster': clusters
    },
Example no. 45
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics.pairwise import cosine_similarity

s1 = 'The field of study that focuses on the interactions between human language and computers is called Natural Language Processing, or NLP for short'
s2 = 'Natural Language Processing is a field that covers computer understanding and manipulation of human language, and its ripe with possibilities for new gathering'
s3 = 'NLP is a way for computers to analyze, understand, and derive meaning from human language in a smart and useful way'
s4 = 'NLP is characterized as a hard problem in computer science.'
s5 = 'NLP algorithms are typically based on machine learning algorithms. Instead of hand-coding large sets of rules'

query = 'NLP sits at the intersection of computer science, artificial intelligence, and computational linguistics'

tfidf = TfidfVectorizer()

dataset = [query, s1, s2, s3, s4, s5]

matrix = tfidf.fit_transform(dataset)

dic = tfidf.vocabulary_

for key in dic.keys():
    print('{0}   {1}'.format(key, dic[key]))

print(cosine_similarity(matrix[0:1], matrix))
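
The last line above prints the query's similarity to every document, including itself; a short follow-on sketch that turns those scores into a ranking of the five sentences:

import numpy as np

scores = cosine_similarity(matrix[0:1], matrix[1:]).flatten()  # query vs. s1..s5
for idx in np.argsort(scores)[::-1]:                           # most similar first
    print('s{}: {:.3f}'.format(idx + 1, scores[idx]))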
Example no. 46
data.columns = ['labels', 'texts']

# Explore the dataset
print('Out of {} rows, {} are spam, {} are ham'.format(len(data), len(data[data['labels']=='spam']), len(data[data['labels']=='ham'])))
# Check the Number of missing data
print('Number of null in labels: {} and number of null in texts: {}'.format(data['labels'].isnull().sum(), data['texts'].isnull().sum()))

# stopwords removal
stopwords = nltk.corpus.stopwords.words('english')

# Wordnetlemmatizer
wm = nltk.WordNetLemmatizer()
# pre-processing data
def data_clean(texts):
    text = "".join([char for char in texts if char not in string.punctuation])
    tokens = re.split(r'\W+', text)
    text = [wm.lemmatize(word) for word in tokens if word not in stopwords]
    return text

data['cleaned_text'] = data['texts'].apply(lambda x: data_clean(x.lower()))

# Vectorizing
tfidf_vect = TfidfVectorizer(analyzer=data_clean)
X_tfidf = tfidf_vect.fit_transform(data['cleaned_text'])
import ipdb; ipdb.set_trace()
print(X_tfidf.shape, tfidf_vect.get_feature_names())




Example no. 47
#%%
# Bag of word
#%%
corpus = []
train_corpus = []
test_corpus = []
for text in train:
    corpus.append(" ".join(text))
for text in xtrain:
    train_corpus.append(" ".join(text))
for text in xtest:
    test_corpus.append(" ".join(text))
#%%
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
trainData = vectorizer.fit_transform(corpus)
xtrain1 = vectorizer.transform(train_corpus)
xtest1 = vectorizer.transform(test_corpus)
xtrain1 = xtrain1.toarray()
xtest1 = xtest1.toarray()
#%%
from sklearn import linear_model
log = linear_model.LogisticRegression(C=1).fit(xtrain1, ytrain)
#%%
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
y_log1 = log.predict(xtest1)
y_log2 = log.predict(xtrain1)
print("Accuracy score is: ", accuracy_score(y_log1, ytest))
print("F1 score is: ", f1_score(y_log1, ytest, average="macro"))
print("precision score is: ", precision_score(y_log1, ytest, average="macro"))
print("recall score is: ", recall_score(y_log1, ytest, average="macro"))
Example no. 48
if 'MLCOMP_DATASETS_HOME' not in os.environ:
    print "MLCOMP_DATASETS_HOME not set; please follow the above instructions"
    sys.exit(0)

# Load the training set
print "Loading 20 newsgroups training set... "
news_train = load_mlcomp('20news-18828', 'train')
print news_train.DESCR
print "%d documents" % len(news_train.filenames)
print "%d categories" % len(news_train.target_names)

print "Extracting features from the dataset using a sparse vectorizer"
t0 = time()
vectorizer = TfidfVectorizer(charset='latin1')
X_train = vectorizer.fit_transform(
    (open(f).read() for f in news_train.filenames))
print "done in %fs" % (time() - t0)
print "n_samples: %d, n_features: %d" % X_train.shape
assert sp.issparse(X_train)
y_train = news_train.target

print("Loading 20 newsgroups test set... ")
news_test = load_mlcomp('20news-18828', 'test')
t0 = time()
print("done in %fs" % (time() - t0))

print("Predicting the labels of the test set...")
print("%d documents" % len(news_test.filenames))
print("%d categories" % len(news_test.target_names))

print("Extracting features from the dataset using the same vectorizer")
Esempio n. 49
0
	def get_vocabulary(self, linked_pages, categories_links):
		"""Scrape the linked Wikipedia pages to build a vocabulary for each category"""
		total_vocabulary = {}
		unique_vocabulary = []
		unique_vocabulary_tfidf = []
		# For each category
		for parent, pages in linked_pages.items():
			children_pages = []
			downloaded_pages = []
			# For every page linked to this category on Wikipedia
			for page in pages:
				sys.stdout.write('\t{} / {} pages downloaded for [{}] category.\r'.format(len(children_pages), len(pages), parent))
				sys.stdout.flush()
				# Get data
				wiki_url = 'https://en.wikipedia.org/wiki/{}'.format(page)
				data = requests.get(wiki_url)
				data_soup = BeautifulSoup(data.text, 'html.parser')
				paragraphs = [str(paragraph) for paragraph in data_soup.find_all('p')]
				paragraphs_joined = ' '.join(paragraphs)
				# Clean, tokenize, stemm and rebuild the document
				page_vocabulary = []
				cleaned_data = self.clean_xml(text=paragraphs_joined.strip())
				tokenized_data = self.tokenizer.tokenize(cleaned_data)
				for token in tokenized_data:
					if token.lower() not in self.stopwords:
						word = self.lemmatizer.lemmatize(token.lower())
						# Check if the word is correct
						if self.english_dict.check(word) is True:
							page_vocabulary.append(word)
							# Track total vocabulary
							if word not in unique_vocabulary:
								unique_vocabulary.append(word)
							# Levenshtein distance could be used here for spell correction, but it would be slow
				page_nlp_treated = ' '.join(page_vocabulary)
				if len(children_pages) >= self.configuration['options']['pages_per_category'] or len(children_pages) == len(pages):
					break
				else:
					children_pages.append(page_nlp_treated)
					downloaded_pages.append(page)
				# Wikipedia is cool, be cool with their servers.
				time.sleep(self.configuration['options']['waiting_time'])
			# StdOut summary
			print('\n\t\t- ' + '\n\t\t- '.join(downloaded_pages))
			# TF_IDF for vocabulary of each category and get top score
			tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 5), min_df=0, stop_words=self.stopwords)
			try:
				tfidf_matrix = tf.fit_transform(children_pages)
			except ValueError:  # In case of an old empty page
				continue
			feature_names = tf.get_feature_names()
			dense = tfidf_matrix.todense()
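			# take the tf-idf row of the first downloaded page and keep only the terms with nonzero scores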
			episode = dense[0].tolist()[0]
			phrase_scores = [pair for pair in zip(range(0, len(episode)), episode) if pair[1] > 0]
			sorted_phrase_scores = sorted(phrase_scores, key=lambda t: t[1] * -1)
			category_words = []
			for word, score in [(feature_names[word_id], score) for (word_id, score) in sorted_phrase_scores][:self.configuration['options']['word_per_page']]:
				category_words.append({word: score})
				if word not in unique_vocabulary_tfidf:
					unique_vocabulary_tfidf.append(word)
			# Get linked categories to category
			linked_categories = []
			for relation in relations:
				if relation[0] == parent and relation[0] not in linked_categories:
					linked_categories.append(relation[1])
				if relation[1] == parent and relation[1] not in linked_categories:
					linked_categories.append(relation[0])
			# Get linked pages to category
			for category, pages in linked_pages.items():
				if category == parent:
					linked_pages_to_category = pages
			category_details = {}
			category_details['terminology'] = category_words
			category_details['linked_pages_to_category'] = linked_pages_to_category
			category_details['linked_categories'] = linked_categories
			total_vocabulary[parent] = category_details
		# Statistics about our terminology
		print('\nA total of {} words have been scanned to extract {} important words covering {} categories.'.format(len(unique_vocabulary), len(unique_vocabulary_tfidf), len(linked_pages)))
		return total_vocabulary
Esempio n. 50
0
# Words occurring in only one document or in at least 95% of the documents are removed.

print("Loading dataset...")
t0 = time()
data_samples = pd.read_csv('data/raw_data/dataset.csv', sep=';',
                           index_col=0)['Required skill'].tolist()
print("done in %0.3fs." % (time() - t0))

# Use tf-idf features for NMF.
print("Extracting tf-idf features for NMF...")
tfidf_vectorizer = TfidfVectorizer(max_df=0.95,
                                   min_df=2,
                                   max_features=n_features,
                                   stop_words='english')
t0 = time()
tfidf = tfidf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95,
                                min_df=2,
                                max_features=n_features,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(data_samples)
print("done in %0.3fs." % (time() - t0))
print()

# Fit the NMF model
print("Fitting the NMF model (Frobenius norm) with tf-idf features, "
Esempio n. 51
0
    doc = normalize_corpus_words([text.lower()], synonyms=synonyms, stopwords=stopwords)[0]
    stems = [w for w in doc.split() if w in vocabulary]
    return stems


fun_words = vocabulary = 'cat dog apple lion nyc love big small'
fun_stems = normalize_corpus_words([fun_words])[0].split()[:NUM_WORDS]
fun_words = fun_words.split()


if SAVE_SORTED_CORPUS:
    tfidfer = TfidfVectorizer(min_df=2, max_df=.6, stop_words=None, token_pattern=r'(?u)\b\w+\b')

    corpus = get_data('cats_and_dogs')[:NUM_DOCS]
    docs = normalize_corpus_words(corpus, stemmer=None)
    tfidf_dense = pd.DataFrame(tfidfer.fit_transform(docs).todense())
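    # vocabulary_ maps each word to its column index; invert and sort it to name the dense columns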
    id_words = [(i, w) for (w, i) in tfidfer.vocabulary_.items()]
    tfidf_dense.columns = list(zip(*sorted(id_words)))[1]


    word_tfidf_dense = pd.DataFrame(tfidfer.transform(fun_stems).todense())
    word_tfidf_dense.columns = list(zip(*sorted(id_words)))[1]
    word_tfidf_dense.index = fun_stems
    """
    >>> word_tfidf_dense[fun_stems]
          cat  dog  anim  pet  citi  appl  nyc  car  bike  hat
    cat   1.0  0.0   0.0  0.0   0.0   0.0  0.0  0.0   0.0  0.0
    dog   0.0  1.0   0.0  0.0   0.0   0.0  0.0  0.0   0.0  0.0
    anim  0.0  0.0   1.0  0.0   0.0   0.0  0.0  0.0   0.0  0.0
    pet   0.0  0.0   0.0  1.0   0.0   0.0  0.0  0.0   0.0  0.0
    citi  0.0  0.0   0.0  0.0   1.0   0.0  0.0  0.0   0.0  0.0
print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))
print()

labels_true = dataset.target
true_k = np.unique(labels_true).shape[0]

print("Extracting features from the training dataset "
      "using a sparse vectorizer")

# build the tf-idf matrix and term weights
vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,
                                 min_df=2, stop_words='english',
                                 use_idf=True)
matrix = vectorizer.fit_transform(dataset.data)


print("n_samples: %d, n_features: %d" % matrix.shape)
print()

# dimensionality reduction
print("Performing dimensionality reduction using LSA")
t0 = time()
svd = TruncatedSVD(2)  # number of dimensions
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

matrix_l = lsa.fit_transform(matrix)
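# matrix_l: 2-D LSA embedding of the documents, L2-normalised by the Normalizer step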

# 2D embedding of the dataset
Esempio n. 53
0
def main():
    args = sys.argv
    # param = Params(args[0])
    # csv_file = "/home/thiagodepaulo/exp/text-collections/Sequence_of_words_CSV/CSTR.csv"
    # n_pos = 5
    # k = 4
    # local_itr = 10
    # global_itr = 10
    # alpha = 0.05
    # beta = 0.0001
    csv_file = args[1]
    n_pos = int(args[2])
    k = int(args[3])
    local_itr = int(args[4])
    global_itr = int(args[5])
    alpha = float(args[6])
    beta = float(args[7])

    loader = pbg.util.Loader()
    X, y = loader.load_csv(csv_file, text_column="Text", class_column="Class")
    target_name = list(set(y))
    n_class = len(target_name)
    vect = TfidfVectorizer()
    X = vect.fit_transform(X)

    model = TPBG(
        k,
        alpha=alpha,
        beta=beta,
        local_max_itr=local_itr,
        global_max_itr=global_itr,
        local_threshold=1e-6,
        global_threshold=1e-6,
        save_interval=-1,
        feature_names=vect.get_feature_names_out(),
        silence=False,
    )

    # randomly select one class and n_pos labeled examples
    choosed_cls = target_name[randint(0, n_class - 1)]
    selected_idx = np.random.choice(np.where(y == choosed_cls)[0],
                                    size=n_pos,
                                    replace=False)

    # mark all remaining examples as unlabeled (-1)
    y_train = np.copy(y)
    y_train[[i for i in range(len(y)) if i not in selected_idx]] = -1

    X_test, y_test = remove_rows(X, y, selected_idx)

    def eval_func(model):
        y_predict = model.predict(X_test)
        y_predict = [1 if c == choosed_cls else 0 for c in y_predict]
        y_test2 = [1 if c == choosed_cls else 0 for c in y_test]

        # compute the metrics
        labels = [0, 1]
        names = ["others", choosed_cls]
        report = classification_report(y_test2,
                                       y_predict,
                                       labels=labels,
                                       target_names=names)
        print('\n' + report + '\n')

    # set the evaluation function
    model.eval_func = eval_func

    # train the model
    model.fit(X, y_train)
Esempio n. 54
0
    def remote_css(url):
        st.markdown(f'<link href="{url}" rel="stylesheet">',
                    unsafe_allow_html=True)

    local_css("style.css")
    remote_css('https://fonts.googleapis.com/icon?family=Material+Icons')

    #
    search_input = st.text_input("Enter keyword/s", "")
    button_clicked = st.button("Go")

    df['Title'] = df['Title'].astype(str)
    df['Keywords'] = df['Keywords'].astype(str)
    tfidf = TfidfVectorizer()
    tfidf_features = tfidf.fit_transform(df.Title)
    df = df.astype({'Dominant_Topic': int})
    df_topics = df.groupby(['Dominant_Topic',
                            'Keywords']).size().to_frame().reset_index()
    topics = df_topics[['Dominant_Topic', 'Keywords']]
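    # map each dominant-topic id to its keyword string (labels_map)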
    topics_dict = topics.set_index('Dominant_Topic').T.to_dict('list')
    keys_values = topics_dict.items()
    new_dict = {int(key): str(value) for key, value in keys_values}
    labels_map = new_dict

    X = df['Title'].astype(str)
    y = df['Dominant_Topic']

    random.seed(42)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score

captions = []
caption_file = open("cap.txt", encoding="utf8")
for caption in caption_file:
    captions.append(caption.split(' ', 1)[1])

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(captions)

true_k = 5
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1)
model.fit(X)

print("Top terms per cluster:")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
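# each row of order_centroids lists feature indices sorted by descending centroid weight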
terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])

print("\n")
print("Prediction")

Y = vectorizer.transform(["FLOOD WATERS CHURNING JUST UNDER THE DUCK RIVER BRIDGE ON HIGHWAY 50 NEAR I-40 EXIT 148."])
prediction = model.predict(Y)
print("FLOOD WATERS CHURNING JUST UNDER THE DUCK RIVER BRIDGE ON HIGHWAY 50 NEAR I-40 EXIT 148.")
print(prediction)
            ### use str.replace() to remove any instances of the words
            ### ["sara", "shackleton", "chris", "germani"]

            ### append the text to word_data
            word_data.append(text)
            ### append a 0 to from_data if email is from Sara, and 1 if email is from Chris
            if name == "sara":
                from_data.append(0)
            else:
                from_data.append(1)

            email.close()

print("emails processed")
from_sara.close()
from_chris.close()

pickle.dump( word_data, open("your_word_data.pkl", "wb") )
pickle.dump( from_data, open("your_email_authors.pkl", "wb") )
print(word_data[152])




### in Part 4, do TfIdf vectorization here
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words="english")
transformed_word_data = vectorizer.fit_transform(word_data)
print("count of words: ", len(vectorizer.get_feature_names()))
print("word 34597: ", vectorizer.get_feature_names()[34597])
Esempio n. 57
0
    
def main():
    process_resume_list()


save_model = 'finalized_model.sav'
save_vector = 'finalized_vectorizer.sav'

if __name__ == '__main__':
    main()

    label=np.array(labelList)
    
    vectorizer = TfidfVectorizer(analyzer='word', stop_words='english',max_features=250)
    resumes_train,resumes_test,y_train,y_test=train_test_split(resume_list,label,test_size=0.33,random_state=1)
    X_train = vectorizer.fit_transform(resumes_train)
    X_test = vectorizer.transform(resumes_test)  # reuse the vocabulary fitted on the training set
    
    X_train_array = X_train.toarray()
    X_test_array  = X_test.toarray()
    y_test1=y_test.reshape(-1,1)
    
    
    print(vectorizer.get_feature_names())
    pickle.dump(vectorizer, open(save_vector, 'wb'))
    
    #Implementing Bernoulli Naive Bayes
    naive_bayes = BernoulliNB(alpha=1.0)
    naive_bayes.fit(X_train_array, y_train)
    predictions = naive_bayes.predict(X_test_array)
    naivescore=(naive_bayes.score(X_test_array, y_test1))*100
Esempio n. 58
0
def getFeature(foldername):
    filenamelist = []
    # foldername = 'ratings2020'
    for subdir, dirs, files in os.walk(foldername):
        for file in os.listdir(subdir):
            filepath = subdir + os.sep + file
            filepath = re.sub(r"\\", "/", filepath)
            if ".csv" in filepath:
                filenamelist.append(filepath)

    # ----------------> Merging all the data in one csv
    df_merged = (pd.read_csv(filepath_or_buffer=file,
                             sep=',',
                             encoding='utf-16',
                             error_bad_lines=False,
                             engine='python') for file in filenamelist)
    df_merged = pd.concat(df_merged, ignore_index=True)
    df_merged.to_csv("merged.csv")
    df_merged.columns = [
        column.replace(" ", "_") for column in df_merged.columns
    ]
    df = df_merged[[
        "Star_Rating", "Reviewer_Language", "Review_Text", "App_Version_Code"
    ]]
    pd.set_option('mode.chained_assignment',
                  None)  # to remove SettingwithcopyWarning

    df['Positively_Rated'] = np.where(df['Star_Rating'] >= 3, 1, 0)
    # @@@@@@@@@@@@@@@@@@@ UI FEATURE 1: @@@@@@@@@@@@@@@@@@@@@@@@@@
    total_rating = len(df['Star_Rating'])
    pd.set_option('mode.chained_assignment', None)
    df.dropna(inplace=True, how='any')
    total_reviews = len(df['Review_Text'])

    # In version 1.0, we'll only be checking English reviews
    df = df[df.Reviewer_Language == 'en']

    # Telling the positive and negative Cont and propotion for a particular version
    latest_version = max(df["App_Version_Code"])
    VrsnRating = df[df.App_Version_Code ==
                    latest_version].Positively_Rated.mean()

    VrsnRating = round(VrsnRating * 100, 2)

    ########## DATA CLEANING ##########
    df['Review'] = df['Review_Text'].apply(lambda x: x.lower())
    df['Review'] = df['Review'].apply(
        lambda x: re.sub(r"\W", " ", x))  # non -word charactrer
    df['Review'] = df['Review'].apply(
        lambda x: re.sub(r"\d", " ", x))  # removing digits
    df['Review'] = df['Review'].apply(
        lambda x: re.sub("([^\x00-\x7F])+", " ", x))  # removing emojis
    df['Review'] = df['Review'].apply(
        lambda x: re.sub(' \w{1,4} ', ' ', x))  # removing short words (1-4 characters)
    df['Review'] = df['Review'].apply(lambda x: re.sub(r"\s+", " ", x))
    df['Review'] = lemma(df['Review'])
    df['Review'] = df['Review'].apply(stp)
    nan_value = float("NaN")
    df.replace("", nan_value, inplace=True)
    df.dropna(inplace=True)
    df.isnull()
    df['Review'] = tagme(df['Review'])

    sid = SentimentIntensityAnalyzer()
    df["sentiments"] = df["Review_Text"].apply(lambda x: sid.polarity_scores(
        x))  #'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound':..
    df = pd.concat(
        [df.drop(['sentiments'], axis=1), df['sentiments'].apply(pd.Series)],
        axis=1)

    # add number of characters column
    df["nb_chars"] = df["Review_Text"].apply(lambda x: len(x))
    # add number of words column
    df["nb_words"] = df["Review_Text"].apply(lambda x: len(x.split(" ")))

    documents = [
        TaggedDocument(doc, [i]) for i, doc in enumerate(df["Review"].apply(
            lambda x: str(x).split(" ")))
    ]
    # train a Doc2Vec model with our text data
    model = Doc2Vec(documents,
                    vector_size=30,
                    window=2,
                    min_count=1,
                    workers=4)
    # transform each document into a vector data
    doc2vec_df = df["Review"].apply(
        lambda x: model.infer_vector(str(x).split(" "))).apply(pd.Series)
    doc2vec_df.columns = [
        "doc2vec_vector_" + str(x) for x in doc2vec_df.columns
    ]
    df = pd.concat([df, doc2vec_df], axis=1)

    corpus = []
    for sentences in df["Review"]:
        corpus.append([word for word, tag in sentences])

    df['cln_Reviews'] = [" ".join(review) for review in corpus]

    # add tf-idfs columns
    tfidf = TfidfVectorizer(
        min_df=5)  # ignore terms appearing less than 5 documents
    tfidf_result = tfidf.fit_transform(df["cln_Reviews"]).toarray()
    tfidf_df = pd.DataFrame(tfidf_result, columns=tfidf.get_feature_names())
    tfidf_df.columns = ["word_" + str(x) for x in tfidf_df.columns]
    tfidf_df.index = df.index
    reviews_df = pd.concat([df, tfidf_df], axis=1)

    wrdcldimg = show_wordcloud_fn(corpus)

    best_negsentences = reviews_df[reviews_df["nb_words"] >= 5].sort_values(
        "neg", ascending=False)[["Review_Text"]].head()
    #best_negsentences = reviews_df.sort_values("neg", ascending=False)[["Review_Text"]].head()
    best_negsentences = best_negsentences.to_string(index=False)

    pos_best_sentences = reviews_df[reviews_df["nb_words"] >= 5].sort_values(
        "pos", ascending=False)[["Review_Text"]].head()
    #pos_best_sentences = reviews_df.sort_values("pos", ascending=False)[["Review_Text"]].head()
    pos_best_sentences = pos_best_sentences.to_string(index=False)

    # apprtngimg = appvsrating(reviews_df)

    return (best_negsentences, pos_best_sentences, total_rating, total_reviews,
            VrsnRating, latest_version, wrdcldimg)
Esempio n. 59
0
v = TfidfVectorizer(stop_words='english',
                    analyzer="word",
                    use_idf=True,
                    min_df=1,
                    smooth_idf=True,
                    norm=None)  # keep raw tf-idf weights (no L1/L2 normalisation)
base = pd.read_csv("films.csv")
# add new empty columns for the tf-idf values
base['tfidf1'] = 0
base['tfidf2'] = 0
base['tfidf3'] = 0
base['tfidf4'] = 0

# compute the idf values
x = v.fit_transform(base.loc[:, 'storyline'].values.astype('U'))
idf = v.idf_
# build a dictionary in the form token -> idf value
dictineri = dict(zip(v.get_feature_names(), idf))

for i, row in base.iterrows():
    accStoryline = list(map(lambda x: x.lower(), row['storyline'].split()))
    trol = dict()
    # store the tf-idf values with their tokens in trol
    for accWord in accStoryline:
        foo = accWord.replace('.', '')
        if foo in dictineri:
            if foo in trol:
                trol[foo] += dictineri[foo]
            else:
                trol[foo] = dictineri[foo]
Esempio n. 60
0
data = pd.read_csv("D:/BERKELEY-GRADUATE/E295/round4/expansion_4_1.csv")
#data = data.dropna(axis=0, how='any')
abstract = data.Abstract
title = data.Title
#abstract_title = pd.Series()
#for i in range(len(title)):
    #abstract_title[str(i)] = title[i] + abstract[i]

tf = abstract

###############################################################################
#remove dominant words
## tf-idf #####################################################################
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words = 'english')
X = vectorizer.fit_transform(tf)
dense_X = X.todense()
idf = vectorizer.idf_
featurename1 = vectorizer.get_feature_names()
#print(dict(zip(vectorizer.get_feature_names(), idf)))

#get dominant words
one = dense_X > 0
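# summing the boolean matrix over documents gives each term's document frequency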
frequency1 = sum(one)
#plt.plot(np.transpose(frequency1))
# By looking at the frequency of each word, find a threshold (around 400);
# words with document frequency > 500 are treated as dominant words
do = pd.Series(frequency1.getA()[0],index = featurename1)
freq_sort1 = do.sort_values(ascending=False)
c1 = freq_sort1[:20].index