import nltk
from nltk.corpus import stopwords

# read_comments, get_model, crp and comment_data_path are project-level helpers
# defined elsewhere in the repository.

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
stops = set(stopwords.words("english"))
retrain = False
model_type = 1

if __name__ == '__main__':
    articleList, commentList, parentList, commentCount = read_comments(
        comment_data_path + 'trainTestDataSet.txt')

    model = get_model(model_type)
    vectors = model.syn0
    cluster_ids, cluster_sums, numClusters = crp(vectors)
    print numClusters, "Clusters created"
    idx = model.index2word

    # For each cluster, print the cluster number, then collect and print
    # the words assigned to it
    for i, cluster in enumerate(cluster_ids.values()):
        print "\nCluster %d" % i
        words = []
        for id in cluster:
            words.append(idx[id])
        print words
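The crp function called above is defined elsewhere in the project. The sketch below is only an illustration of the kind of Chinese-Restaurant-Process-style online clustering the call suggests, returning the same (cluster_ids, cluster_sums, numClusters) triple; the cosine-similarity weighting, the alpha parameter, and the name crp_sketch are assumptions, not the project's actual implementation.

import numpy as np


def crp_sketch(vectors, alpha=1.0, seed=0):
    """Illustrative CRP-style online clustering over the rows of `vectors`.

    Each vector joins an existing cluster with probability proportional to the
    cluster's size weighted by its fit to the cluster mean, or opens a new
    cluster with probability proportional to alpha.  Returns the same triple
    as the project's crp(): {cluster id: [row indices]},
    {cluster id: sum of member vectors}, and the number of clusters.
    """
    rng = np.random.RandomState(seed)
    cluster_ids = {}   # cluster id -> list of row indices in `vectors`
    cluster_sums = {}  # cluster id -> running sum of the member vectors

    for row, vec in enumerate(vectors):
        weights = []
        existing = sorted(cluster_ids)
        for cid in existing:
            mean = cluster_sums[cid] / len(cluster_ids[cid])
            sim = np.dot(vec, mean) / (np.linalg.norm(vec) * np.linalg.norm(mean) + 1e-12)
            # existing table: weight = table size * how well the vector fits it
            weights.append(len(cluster_ids[cid]) * max(sim, 0.0))
        weights.append(alpha)  # weight for opening a new table
        probs = np.array(weights) / np.sum(weights)
        choice = rng.choice(len(weights), p=probs)

        if choice == len(weights) - 1:
            # open a new cluster
            cid = len(cluster_ids)
            cluster_ids[cid] = [row]
            cluster_sums[cid] = np.array(vec, dtype=float)
        else:
            # join an existing cluster
            cid = existing[choice]
            cluster_ids[cid].append(row)
            cluster_sums[cid] += vec

    return cluster_ids, cluster_sums, len(cluster_ids)

Called on model.syn0 this yields cluster_ids in the shape the printing loop above expects; the project's real crp may score candidate clusters differently.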
import numpy as np
import nltk

# get_model, getCommentCount, generate_bigrams and words are project-level
# helpers defined elsewhere in the repository.


def min_max_mean_features(model_type, commentList, commentCount):
    """Build one row per comment of concatenated min/max/mean word vectors."""
    model = get_model(model_type)
    M = model.syn0.shape[1]
    feature_matrix = np.empty([getCommentCount(model_type, commentList), M * 3])
    index = 0

    if model_type == 2:  # POS
        for commList in commentList.values():
            for comm in commList:
                body = comm.pos_body.split(" ")
                comment_matrix = np.zeros([len(body), M])
                for i in range(len(body)):
                    tok = body[i]
                    comment_matrix[i] = model[tok]
                minVec = np.min(comment_matrix, axis=0)
                maxVec = np.max(comment_matrix, axis=0)
                meanVec = np.mean(comment_matrix, axis=0)
                feature_matrix[index] = np.concatenate([minVec, maxVec, meanVec])
                index += 1

    elif model_type == 4:  # SENTENCE
        for commList in commentList.values():
            for comm in commList:
                sentences = nltk.sent_tokenize(comm.lemma_body)
                size = 0
                for sent in sentences:
                    body = nltk.word_tokenize(sent.strip('.'))
                    size += len(body)
                if size < 1:
                    # skip comments with no tokens (np.min/np.max fail on empty input)
                    continue
                comment_matrix = np.zeros([size, M])
                ind = 0
                for sent in sentences:
                    body = nltk.word_tokenize(sent.strip('.'))
                    for i in range(len(body)):
                        tok = body[i]
                        comment_matrix[ind] = model[tok]
                        ind += 1
                minVec = np.min(comment_matrix, axis=0)
                maxVec = np.max(comment_matrix, axis=0)
                meanVec = np.mean(comment_matrix, axis=0)
                feature_matrix[index] = np.concatenate([minVec, maxVec, meanVec])
                index += 1

    elif model_type == 5:  # BIGRAMS
        for commList in commentList.values():
            for comm in commList:
                bigrams = generate_bigrams(comm.lemma_body)  # Make the bigrams
                if len(bigrams) < 1:
                    continue
                comment_matrix = np.zeros([len(bigrams), M])
                for i in range(len(bigrams)):
                    tok = bigrams[i]
                    # lookup here returns a (vector, found-flag) pair
                    mod, er = model[tok]
                    if er == 1:
                        comment_matrix[i] = mod
                minVec = np.min(comment_matrix, axis=0)
                maxVec = np.max(comment_matrix, axis=0)
                meanVec = np.mean(comment_matrix, axis=0)
                feature_matrix[index] = np.concatenate([minVec, maxVec, meanVec])
                index += 1

    elif model_type == 99:  # GOOGLE
        for commList in commentList.values():
            for comm in commList:
                body = words(comm.body)
                if len(body) < 1:
                    continue
                comment_matrix = np.zeros([len(body), M])
                for i in range(len(body)):
                    tok = body[i]
                    # lookup here returns a (vector, found-flag) pair
                    mod, er = model[tok]
                    if er == 1:
                        comment_matrix[i] = mod
                minVec = np.min(comment_matrix, axis=0)
                maxVec = np.max(comment_matrix, axis=0)
                meanVec = np.mean(comment_matrix, axis=0)
                feature_matrix[index] = np.concatenate([minVec, maxVec, meanVec])
                index += 1

    return feature_matrix
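As a quick sanity check of the pooling scheme used above, the snippet below builds one comment's feature row by hand; the three 4-dimensional toy "word vectors" are made up for illustration and stand in for the model lookups.

import numpy as np

# Toy word vectors for a three-token comment (M = 4 here, made up for illustration).
comment_matrix = np.array([[ 0.1, -0.2,  0.0, 0.5],
                           [ 0.4,  0.1, -0.3, 0.2],
                           [-0.1,  0.3,  0.2, 0.0]])

minVec = np.min(comment_matrix, axis=0)    # element-wise minimum over tokens
maxVec = np.max(comment_matrix, axis=0)    # element-wise maximum over tokens
meanVec = np.mean(comment_matrix, axis=0)  # element-wise mean over tokens

feature_row = np.concatenate([minVec, maxVec, meanVec])
print feature_row.shape  # (12,), i.e. 3 * M, matching one row of feature_matrix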