Example #1
import nltk.data
from nltk.corpus import stopwords

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
stops = set(stopwords.words("english"))





retrain = False
model_type = 1
if __name__ == '__main__':   
    
    articleList, commentList, parentList, commentCount = read_comments(comment_data_path + 'trainTestDataSet.txt')
     
    
    model = get_model(model_type)

    vectors = model.syn0                 # one row of word-vector weights per vocabulary word
    cluster_ids, cluster_sums, numClusters = crp(vectors)

    print numClusters, "Clusters created"
    idx = model.index2word               # maps a row index in syn0 back to its word
    # For each cluster
    for i, cluster in enumerate(cluster_ids.values()):
        #
        # Print the cluster number  
        print "\nCluster %d" % i
        #
        # Find all of the words for that cluster number, and print them out
        words = []
        for id in cluster:
            words.append(idx[id])
        print words
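The crp helper itself is not part of these snippets; from the way its results are consumed, it groups the word vectors into an open-ended set of clusters and returns a dict mapping each cluster to vocabulary row indices, the running vector sum per cluster, and the cluster count. The name suggests a Chinese Restaurant Process; purely as an illustration (not the project's actual implementation), a toy similarity-threshold stand-in with the same interface could look like this:

import numpy as np

def crp(vectors, threshold=0.4):
    # Toy stand-in: assign each word vector to the most similar existing
    # cluster mean, or open a new cluster when nothing is similar enough.
    cluster_ids = {}     # cluster index -> list of row indices into `vectors`
    cluster_sums = {}    # cluster index -> running sum of the member vectors
    for row, vec in enumerate(vectors):
        best, best_sim = None, threshold
        for c, total in cluster_sums.items():
            mean = total / len(cluster_ids[c])
            sim = np.dot(vec, mean) / (np.linalg.norm(vec) * np.linalg.norm(mean) + 1e-12)
            if sim > best_sim:
                best, best_sim = c, sim
        if best is None:                 # nothing close enough: start a new cluster
            best = len(cluster_ids)
            cluster_ids[best] = []
            cluster_sums[best] = np.zeros(vectors.shape[1])
        cluster_ids[best].append(row)
        cluster_sums[best] += vec
    return cluster_ids, cluster_sums, len(cluster_ids)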
Example #2
import numpy as np
import nltk


def min_max_mean_features(model_type, commentList, commentCount):
    """Build one feature row per comment by concatenating the element-wise
    min, max and mean of the comment's token vectors (3*M values per row)."""
    model = get_model(model_type)
    M = model.syn0.shape[1]              # dimensionality of the word vectors
    feature_matrix = np.empty([getCommentCount(model_type, commentList), M*3])
    
    index = 0
    if model_type == 2: #POS
        for commList in commentList.values():
            for comm in commList:
                body = comm.pos_body.split(" ")
                comment_matrix = np.zeros([len(body), M])
                for i in range(len(body)):
                    tok = body[i]
                    comment_matrix[i] = model[tok]
                    
                if len(body) < 1:        # skip empty comments, as the other branches do
                    continue
                minVec = np.min(comment_matrix, axis=0)
                maxVec = np.max(comment_matrix, axis=0)
                meanVec = np.mean(comment_matrix, axis=0)
                feature_matrix[index] = np.concatenate([minVec, maxVec, meanVec])
                index += 1
    
    elif model_type == 4: # SENTENCE
        for commList in commentList.values():
            for comm in commList:
                sentences = nltk.sent_tokenize(comm.lemma_body)
                # First pass: count the tokens so comment_matrix can be pre-allocated
                size = 0
                for sent in sentences:
                    body = nltk.word_tokenize(sent.strip('.'))
                    size += len(body)
                    
                comment_matrix = np.zeros([size, M])
                ind = 0
                for sent in sentences:
                    body = nltk.word_tokenize(sent.strip('.'))
                    for i in range(len(body)):
                        tok = body[i]
                        comment_matrix[ind] = model[tok]
                        ind += 1
                if size < 1:             # skip comments that produced no tokens at all
                    continue
                minVec = np.min(comment_matrix, axis=0)
                maxVec = np.max(comment_matrix, axis=0)
                meanVec = np.mean(comment_matrix, axis=0)
                feature_matrix[index] = np.hstack((minVec, maxVec, meanVec))
                index += 1
    elif model_type == 5: #BIGRAMS
        for commList in commentList.values():
            for comm in commList:
                bigrams = generate_bigrams(comm.lemma_body) # Make the bigrams
                comment_matrix = np.zeros([len(bigrams), M])
                for i in range(len(bigrams)):
                    tok = bigrams[i]
                    mod, er = model[tok]
                    if er == 1:          # only keep the returned vector when the second value flags success
                        comment_matrix[i] = mod
                if len(bigrams) < 1:
                    continue
                    
                minVec = np.min(comment_matrix, axis=0)
                maxVec = np.max(comment_matrix, axis=0)
                meanVec = np.mean(comment_matrix, axis=0)
                feature_matrix[index] = np.hstack((minVec, maxVec, meanVec))
                index += 1
                
    elif model_type == 99: # GOOGLE
        for commList in commentList.values():
            for comm in commList:
                body = words(comm.body)
                comment_matrix = np.zeros([len(body), M])
                for i in range(len(body)):
                    tok = body[i]
                    mod, er = model[tok]
                    if er == 1:
                        comment_matrix[i] = mod
                if len(body) < 1:
                    continue
                minVec = np.min(comment_matrix, axis=0)
                maxVec = np.max(comment_matrix, axis=0)
                meanVec = np.mean(comment_matrix, axis=0)
                feature_matrix[index] = np.hstack((minVec, maxVec, meanVec))
                index += 1
        
    
    return feature_matrix  
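Every branch above reduces a comment to the same 3*M-dimensional row: the element-wise minimum, maximum and mean of its token vectors, concatenated. A small self-contained illustration of that pooling step (the vectors are made up):

import numpy as np

# Hypothetical comment of 4 tokens with M = 3 dimensional word vectors.
comment_matrix = np.array([[ 0.1, -0.2,  0.3],
                           [ 0.4,  0.0, -0.1],
                           [-0.3,  0.5,  0.2],
                           [ 0.0,  0.1,  0.0]])

feature = np.hstack((np.min(comment_matrix, axis=0),
                     np.max(comment_matrix, axis=0),
                     np.mean(comment_matrix, axis=0)))
print(feature.shape)    # (9,) -- one 3*M row of feature_matrix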
Example #3
import numpy as np
import nltk.data
from nltk.corpus import stopwords


tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
stops = set(stopwords.words("english"))         





retrain = False
model_type = 1
if __name__ == '__main__':   
         
    
    model = get_model(model_type)

    vectors = model.syn0
    cluster_ids, cluster_sums, numClusters = crp(vectors)
    
    print numClusters, "Clusters created"
    idx = model.index2word
    # For each cluster
    for i, cluster in enumerate(cluster_ids.values()):
        #
        # Print the cluster number  
        print "\nCluster %d" % i
        #
        # Find all of the words for that cluster number, and print them out
        words = []
        for id in cluster:
            words.append(idx[id])
        print words