Example #1
import numpy as np
from scipy.cluster.vq import kmeans, vq

def train_clusterer(model, commentList):
    # Collect every comment word that is in the word2vec vocabulary,
    # keeping the word list so cluster ids can be mapped back to words later
    # (comment_to_wordlist is the project's tokenizer helper)
    words = []
    index2word_set = set(model.index2word)
    for commList in commentList.values():
        for comm in commList:
            for word in comment_to_wordlist(comm.body, True):
                if word in index2word_set:
                    words.append(word)
    print len(words)

    # Look up the 300-dimensional vector for each collected word
    word_vectors = np.zeros((len(words), 300))
    for i, word in enumerate(words):
        word_vectors[i] = model[word]

    # Use roughly one cluster per 20 word vectors
    num_clusters = word_vectors.shape[0] / 20
    print word_vectors.shape

    # Initialize a k-means clusterer and use it to extract centroids
    #idx = KMeans(n_clusters=num_clusters).fit_predict(word_vectors)
    centroids, _ = kmeans(word_vectors, num_clusters)
    # Assign every word vector to its nearest centroid
    idx, _ = vq(word_vectors, centroids)

    print "TRAINED"

    # Map each collected word to its cluster id; zipping the collected words
    # (not model.index2word) keeps words and cluster assignments aligned
    word_centroid_map = dict(zip(words, idx))

    # For the first 10 clusters, print the words assigned to each
    for cluster in xrange(0, 10):
        print "\nCluster %d" % cluster
        cluster_words = [word for word, c in word_centroid_map.items() if c == cluster]
        print cluster_words

    return word_centroid_map, num_clusters
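
The clustering above uses scipy.cluster.vq (kmeans to find centroids, vq to assign vectors to them) rather than the commented-out scikit-learn KMeans. A minimal self-contained sketch of that same cluster-then-assign pattern on random toy vectors, independent of the word2vec model and the comment helpers:

import numpy as np
from scipy.cluster.vq import kmeans, vq

vectors = np.random.rand(100, 300)          # toy "word vectors"

centroids, distortion = kmeans(vectors, 5)  # 5 centroids plus mean distortion
cluster_ids, _ = vq(vectors, centroids)     # nearest-centroid id per vector

# Group vector indices by cluster, mirroring word_centroid_map above
clusters = dict((c, np.where(cluster_ids == c)[0]) for c in range(5))
print len(clusters), distortion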
Example #2
import math
import numpy as np
import nltk

def tfidf_weighted_sum_features(model, df_comments):
    M = model.syn0.shape[1]      # word vector dimensionality
    N = df_comments.shape[0]     # number of comments (documents)
    feature_matrix = np.empty((N, M), dtype="float32")

    index = 0
    index2word_set = set(model.index2word)

    # Build a corpus-wide frequency distribution over all comment words;
    # the total occurrence counts stand in for document frequencies below
    global_body = []
    for _, row in df_comments.iterrows():
        comm = row['comment_content']
        global_body += comment_to_wordlist(comm, True)
    global_freq_dist = nltk.FreqDist(global_body)

    IN_COUNT = 1
    OUT_COUNT = 1
    for _, row in df_comments.iterrows():
        comm = row['comment_content']
        weighted_sum = np.zeros((M,), dtype="float32")
        clean_comm = comment_to_wordlist(comm, True)
        fdist = nltk.FreqDist(clean_comm)
        for word in clean_comm:
            # Augmented term frequency: 0.5 + 0.5 * f(word) / max term frequency
            tf = 0.5 + 0.5 * float(fdist[word]) / np.max(fdist.values())
            count = global_freq_dist[word]
            # Smoothed inverse frequency based on corpus-wide counts
            idf = math.log(N / float(1 + count))
            if word in index2word_set:
                IN_COUNT += 1
                # Accumulate the TF-IDF weighted word vector
                weighted_sum += tf * idf * model[word]
            else:
                OUT_COUNT += 1
        feature_matrix[index] = weighted_sum
        index += 1

    print IN_COUNT, 'words found'
    print OUT_COUNT, 'words not found'
    return feature_matrix
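
The weighting above combines an augmented term frequency (0.5 + 0.5 * f / max f) with a smoothed log inverse frequency computed from corpus-wide word counts (total occurrences rather than true document counts). A small self-contained illustration of just those two formulas; the token lists here are purely made up:

import math
import nltk

comments = [["good", "movie", "good"],
            ["bad", "movie"],
            ["great", "acting"],
            ["bad", "plot"],
            ["great", "movie"]]
N = len(comments)

# Corpus-wide counts, standing in for document frequency as in the function above
global_freq_dist = nltk.FreqDist(w for c in comments for w in c)

doc = comments[0]
fdist = nltk.FreqDist(doc)
max_f = max(fdist.values())
for word in set(doc):
    tf = 0.5 + 0.5 * float(fdist[word]) / max_f             # augmented term frequency
    idf = math.log(N / float(1 + global_freq_dist[word]))    # smoothed inverse frequency
    print word, tf, idf    # e.g. 'good': tf = 1.0, idf = log(5/3)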
Example #3
import numpy as np

def bag_of_centroids_features(commentList, commentCount, num_clusters, centroid_map):
    # Pre-allocate an array for the bag-of-centroids features (for speed)
    feature_matrix = np.zeros((getCommentCount(commentList), num_clusters),
                              dtype="float32")

    # Transform each comment into a bag of centroids using the word -> cluster map
    index = 0
    for commList in commentList.values():
        for comm in commList:
            feature_matrix[index] = create_bag_of_centroids(
                comment_to_wordlist(comm.body, True), centroid_map)
            index += 1

    return feature_matrix
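
create_bag_of_centroids and getCommentCount are helpers defined elsewhere in the project and are not shown here. Judging from how it is called, the former presumably counts, per comment, how many of its words fall into each cluster of the centroid map (the standard bag-of-centroids recipe). A hedged sketch of such a helper, where the name and exact behaviour are assumptions:

import numpy as np

def create_bag_of_centroids_sketch(wordlist, word_centroid_map):
    # Assumed behaviour: one counter per cluster, incremented for every word
    # of the comment that appears in the word -> cluster map
    num_centroids = max(word_centroid_map.values()) + 1
    bag_of_centroids = np.zeros(num_centroids, dtype="float32")
    for word in wordlist:
        if word in word_centroid_map:
            bag_of_centroids[word_centroid_map[word]] += 1
    return bag_of_centroids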
Example #4
import numpy as np
from collections import Counter
from nltk import FreqDist
from scipy.cluster.vq import kmeans, vq

def train_clusterer(model, commentList):
    # Collect every comment word that is in the word2vec vocabulary
    words = []
    index2word_set = set(model.index2word)
    for commList in commentList.values():
        for comm in commList:
            for word in comment_to_wordlist(comm.body, True):
                if word in index2word_set:
                    words.append(word)

    # Drop the 10 most frequent words and deduplicate the rest
    fdist1 = FreqDist(words)
    most_common = [item[0] for item in fdist1.most_common(10)]
    uniques = Counter(words).keys()
    print fdist1
    print len(most_common)
    print len(uniques)
    wordlist = [item for item in uniques if item not in most_common]
    N = len(wordlist)
    print N

    # Look up the 200-dimensional vector for each remaining word
    word_vectors = np.zeros((N, 200))
    for i, word in enumerate(wordlist):
        word_vectors[i] = model[word]

    # Use roughly one cluster per 10 word vectors
    num_clusters = word_vectors.shape[0] / 10
    print word_vectors.shape
    print num_clusters

    # Initialize a k-means clusterer and use it to extract centroids
    #idx = KMeans(n_clusters=num_clusters).fit_predict(word_vectors)
    centroids, _ = kmeans(word_vectors, num_clusters)
    # Assign every word vector to its nearest centroid
    idx, _ = vq(word_vectors, centroids)

    print "TRAINED"

    # Map each word to its cluster id; zipping wordlist (not model.index2word)
    # keeps words and cluster assignments aligned
    word_centroid_map = dict(zip(wordlist, idx))

    # For the first 10 clusters, print the words assigned to each
    for cluster in xrange(0, 10):
        print "\nCluster %d" % cluster
        cluster_words = [word for word, c in word_centroid_map.items() if c == cluster]
        print cluster_words

    return word_centroid_map, num_clusters
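
Example #4 differs from Example #1 mainly in that it drops the 10 most frequent words before clustering and works with 200-dimensional vectors. The filtering step relies on nltk's FreqDist.most_common and collections.Counter; a tiny illustration of that step on made-up tokens:

from collections import Counter
from nltk import FreqDist

words = ["good", "movie", "good", "bad", "movie", "good"]

fdist = FreqDist(words)
print fdist.most_common(2)     # [('good', 3), ('movie', 2)]

# Counter(...).keys() yields the unique words (in arbitrary order)
uniques = Counter(words).keys()
print sorted(uniques)          # ['bad', 'good', 'movie']

# Keep the unique words that are not among the most common ones
most_common = [w for w, _ in fdist.most_common(2)]
wordlist = [w for w in uniques if w not in most_common]
print wordlist                 # ['bad']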
Example #5
    def __iter__(self):
        # Stream one tokenised comment at a time: each line of the corpus file
        # is "|"-delimited, with the comment body in the second field
        for line in open(self.file):
            body = line.split("|")[1]
            yield comment_to_wordlist(body, remove_stops=True)
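
This __iter__ belongs to a streaming corpus class that yields one tokenised comment per line of a "|"-delimited file, which lets gensim train without loading the whole corpus into memory. A hedged sketch of how such a class might be wrapped up and passed to Word2Vec; the class name, file path, stand-in tokenizer, and training parameters are assumptions (gensim pre-4.0 argument names, matching the model.index2word / model.syn0 usage in the examples above):

from gensim.models import Word2Vec

def comment_to_wordlist(body, remove_stops=True):
    # Stand-in for the project's tokenizer: lower-case and split on whitespace
    return body.lower().split()

class CommentCorpus(object):
    def __init__(self, file):
        self.file = file

    def __iter__(self):
        # Same pattern as the example above: body is the second "|"-delimited field
        for line in open(self.file):
            body = line.split("|")[1]
            yield comment_to_wordlist(body, remove_stops=True)

# gensim iterates over the corpus object several times while training
sentences = CommentCorpus("comments.txt")   # hypothetical path
model = Word2Vec(sentences, size=300, min_count=5, workers=4)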