Example #1
	# Load saved cluster assignments and probabilities of cluster assignments.
	idx_name = "gmm_latestclusmodel_len2alldata_80.pkl"
	idx_proba_name = "gmm_prob_latestclusmodel_len2alldata_80.pkl"
	idx, idx_proba = read_GMM(idx_name, idx_proba_name)

	# Create a Word / Index dictionary, mapping each vocabulary word to
	# a cluster number.
	word_centroid_map = dict(zip(model.index2word, idx))
	# Create a Word / Probability of cluster assignment dictionary, mapping
	# each vocabulary word to its list of cluster-assignment probabilities.
	word_centroid_prob_map = dict(zip(model.index2word, idx_proba))

	# Pre-process each document into a cleaned, space-joined word list.
	traindata = []
	for i in range(len(all["news"])):
		traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(all["news"][i], True)))

	num_topwords = int(sys.argv[3])

	prob_word = get_probability_words(word_centroid_map,traindata)
	doc_freq,doc_cofreq = get_doccofrequency(traindata)
	prob_topic,topic_centroid_prob_map = get_probability_topic_vectors(word_centroid_map, num_clusters, prob_word)
	topic_coherence, overall_coherence, top10words = get_coherence(doc_cofreq,doc_freq,num_clusters,num_topwords)
	topic_pmi, overall_pmi, top10words_pmi = get_pmi(doc_cofreq,doc_freq,num_clusters,num_topwords)

	outfile = open("coherence_j.txt", "w")
	outfile.write(str(overall_coherence))
	outfile.write("\n")
	for i in range(num_clusters):
		for item in top10words[i]:
			outfile.write(str(item) + " ")
		outfile.write("\n")
	outfile.close()
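
A minimal sketch of the `read_GMM` helper used above, assuming it simply unpickles the two arrays saved by an earlier clustering run; the body is an assumption, not the original source:

	# Hypothetical sketch of read_GMM: unpickle the saved hard cluster
	# assignments and the per-word cluster-probability vectors.
	import pickle

	def read_GMM(idx_name, idx_proba_name):
		with open(idx_name, "rb") as f:
			idx = pickle.load(f)          # one cluster index per vocabulary word
		with open(idx_proba_name, "rb") as f:
			idx_proba = pickle.load(f)    # one probability vector per word
		return idx, idx_proba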
Example #2
	start = time.time()
	# The csv file may contain very large fields, so raise the field_size_limit to the maximum.
	csv.field_size_limit(sys.maxsize)
	# Read train data.
	train_word_vector = pd.read_pickle('all.pkl')
	# Use the NLTK tokenizer to split the paragraph into sentences.
	tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
	sentences = []
	print "Parsing sentences from training set..."	

	# Loop over each news article.
	for review in train_word_vector["text"]:
		try:
			# Split a review into parsed sentences.
			sentences += KaggleWord2VecUtility.review_to_sentences(review, tokenizer)
		except Exception:
			# Skip articles that cannot be parsed.
			continue

	logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',\
        level=logging.INFO)

	num_features = int(sys.argv[1])     # Word vector dimensionality
	min_word_count = 20   # Minimum word count
	num_workers = 40       # Number of threads to run in parallel
	context = 10          # Context window size
	downsampling = 1e-3   # Downsample setting for frequent words

	print "Training Word2Vec model..."
	# Train Word2Vec model.
	model = Word2Vec(sentences, workers=num_workers, hs = 1, sg = 1, negative = 10, iter = 25,\
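
Once trained, the model can be persisted and sanity-checked; a brief illustrative sketch (the query word and file name are arbitrary):

	# Usage sketch (illustrative): normalise the vectors, persist the model,
	# and inspect nearest neighbours in the learned embedding space.
	model.init_sims(replace=True)
	model.save("word2vec_" + str(num_features) + "features")
	print(model.most_similar("market", topn=5))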
Example #3
    #idx_name = "gmm_latestclusmodel_len2alldata.pkl"
    #idx_proba_name = "gmm_prob_latestclusmodel_len2alldata.pkl"
    #idx, idx_proba = read_GMM(idx_name, idx_proba_name)

    # Create a Word / Index dictionary, mapping each vocabulary word to
    # a cluster number
    word_centroid_map = dict(zip(model.wv.index2word, idx))
    # Create a Word / Probability of cluster assignment dictionary, mapping
    # each vocabulary word to its list of cluster-assignment probabilities.
    word_centroid_prob_map = dict(zip(model.wv.index2word, idx_proba))

    # Computing tf-idf values.
    traindata = []
    for i in range(len(all["text"])):
        traindata.append(" ".join(
            KaggleWord2VecUtility.review_to_wordlist(all["text"][i], True)))

    tfv = TfidfVectorizer(strip_accents='unicode', dtype=np.float32)
    tfidfmatrix_traindata = tfv.fit_transform(traindata)
    featurenames = tfv.get_feature_names()
    idf = tfv.idf_  # public accessor for the fitted idf weights

    # Create a dictionary mapping each word to its idf value.
    print("Creating word-idf dictionary for Training set...")

    word_idf_dict = {}
    for pair in zip(featurenames, idf):
        word_idf_dict[pair[0]] = pair[1]

    # Pre-computing probability word-cluster vectors.
    prob_wordvecs = get_probability_word_vectors(featurenames,
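
For context, a sketch of how probability word-cluster vectors are typically formed in an SCDV-style pipeline; the body below is an assumption (it reuses `model`, `word_centroid_prob_map`, and `num_features` from the surrounding code), not the original helper:

    # Hypothetical sketch: for every vocabulary word, concatenate over the
    # clusters its word vector weighted by P(cluster | word), then scale the
    # whole vector by the word's idf weight.
    def get_probability_word_vectors(featurenames, word_centroid_map,
                                     num_clusters, word_idf_dict):
        prob_wordvecs = {}
        for word in word_centroid_map:
            if word not in word_idf_dict:
                continue
            vec = np.zeros(num_clusters * num_features, dtype="float32")
            for k in range(num_clusters):
                lo, hi = k * num_features, (k + 1) * num_features
                vec[lo:hi] = model.wv[word] * word_centroid_prob_map[word][k]
            prob_wordvecs[word] = vec * word_idf_dict[word]
        return prob_wordvecs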
Example #4
	for pair in zip(featurenames, idf):
		word_idf_dict[pair[0]] = pair[1]

	temp_time = time.time() - start
	print "Creating Cluster Vectors and Graded Weighted Bag of Word Vectors...:", temp_time, "seconds."

	

	# bowv and gwbowv hold, per document, the normalised bag-of-centroids
	# vector and the normalised graded weighted bag-of-words vector.
	bowv = np.zeros((train["news"].size, num_clusters * num_features), dtype="float32")
	gwbowv = np.zeros((train["news"].size, num_clusters * (num_features + 1)), dtype="float32")

	counter = 0

	for review in train["news"]:
		words = KaggleWord2VecUtility.review_to_wordlist( review, \
            remove_stopwords=True )
		bowv[counter], gwbowv[counter] = create_cluster_vector_and_gwbowv(words, word_centroid_map, num_features, word_idf_dict, featurenames)
		counter+=1
		if counter % 1000 == 0:
			print "Train News Covered : ",counter

	# Save the gwbowv matrix so it can be reloaded later.
	gwbowv_name = "GWBOWV_" + str(num_clusters) + "cluster_" + str(num_features) + "feature_matrix_orig.npy"
	np.save(gwbowv_name, gwbowv)

	#gwbowv = np.load(gwbowv_name)

	endtime_gwbowv = time.time() - start
	print "Created gwbowv_train: ", endtime_gwbowv, "seconds."	

	bowv_test = np.zeros((test["news"].size, num_clusters * num_features), dtype="float32")
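
A hypothetical sketch of what `create_cluster_vector_and_gwbowv` might compute, inferred from the matrix shapes above (per document: a bag-of-centroids vector of size `num_clusters * num_features`, plus a graded variant with one extra idf slot per cluster); the body is an assumption, reusing `model` and `num_clusters` from the surrounding code:

	# Hypothetical sketch: accumulate each word's vector into the slice of
	# its cluster (idf-weighted for the graded variant), then L2-normalise.
	def create_cluster_vector_and_gwbowv(words, word_centroid_map,
	                                     num_features, word_idf_dict,
	                                     featurenames):
		bag = np.zeros(num_clusters * num_features, dtype="float32")
		graded = np.zeros(num_clusters * (num_features + 1), dtype="float32")
		for word in words:
			if word in word_centroid_map and word in word_idf_dict:
				k = word_centroid_map[word]
				# model[word]: old-style gensim lookup (model.wv[word] in newer versions).
				bag[k * num_features:(k + 1) * num_features] += model[word]
				base = k * (num_features + 1)
				graded[base:base + num_features] += model[word] * word_idf_dict[word]
				graded[base + num_features] += word_idf_dict[word]  # extra idf slot
		# Normalise so document vectors are comparable in magnitude.
		for vec in (bag, graded):
			norm = np.linalg.norm(vec)
			if norm > 0:
				vec /= norm
		return bag, graded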
Example #5
    idx_name = "gmm_latestclusmodel_len2alldata_80.pkl"
    idx_proba_name = "gmm_prob_latestclusmodel_len2alldata_80.pkl"
    idx, idx_proba = read_GMM(idx_name, idx_proba_name)

    # Create a Word / Index dictionary, mapping each vocabulary word to
    # a cluster number.
    word_centroid_map = dict(zip(model.index2word, idx))
    # Create a Word / Probability of cluster assignment dictionary, mapping
    # each vocabulary word to its list of cluster-assignment probabilities.
    word_centroid_prob_map = dict(zip(model.index2word, idx_proba))

    # Pre-process each document into a cleaned, space-joined word list.
    traindata = []
    for i in range(len(all["news"])):
        traindata.append(" ".join(
            KaggleWord2VecUtility.review_to_wordlist(all["news"][i], True)))

    num_topwords = int(sys.argv[3])

    prob_word = get_probability_words(word_centroid_map, traindata)
    doc_freq, doc_cofreq = get_doccofrequency(traindata)
    prob_topic, topic_centroid_prob_map = get_probability_topic_vectors(
        word_centroid_map, num_clusters, prob_word)
    topic_coherence, overall_coherence, top10words = get_coherence(
        doc_cofreq, doc_freq, num_clusters, num_topwords)
    topic_pmi, overall_pmi, top10words_pmi = get_pmi(doc_cofreq, doc_freq,
                                                     num_clusters,
                                                     num_topwords)

    outfile = open("coherence_j.txt", "w")
    outfile.write(str(overall_coherence))
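
For reference, a minimal sketch of a UMass-style coherence score of the kind `get_coherence` presumably reports, computed from the document frequencies and co-frequencies above; the tuple key format of `doc_cofreq` is an assumption:

    # Hypothetical sketch: UMass topic coherence over one topic's top words,
    # using document frequencies and co-frequencies; +1 smoothing avoids log(0).
    import math

    def umass_coherence(topwords, doc_freq, doc_cofreq):
        score = 0.0
        for i in range(1, len(topwords)):
            for j in range(i):
                wi, wj = topwords[i], topwords[j]
                score += math.log((doc_cofreq[(wi, wj)] + 1.0) / doc_freq[wj])
        return score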
Example #6
	word_centroid_map = dict(zip(model.index2word, idx))

	lb = MultiLabelBinarizer()
	Y = lb.fit_transform(all.tags)
	train_data, test_data, Y_train, Y_test = train_test_split(all["text"], Y, test_size=0.3, random_state=42)
	
	train = DataFrame({'text': []})
	test = DataFrame({'text': []})

	train["text"] = train_data.reset_index(drop=True)
	test["text"] = test_data.reset_index(drop=True)

	traindata = []
	
	for i in range(len(train["text"])):
		traindata.append(" ".join(KaggleWord2VecUtility.review_to_wordlist(train["text"][i], True)))
	
	tfv = TfidfVectorizer(min_df=5, strip_accents='unicode', dtype=np.float32)
	tfidfmatrix_traindata = tfv.fit_transform(traindata)
	featurenames = tfv.get_feature_names()
	idf = tfv.idf_  # public accessor for the fitted idf weights

	# Create a dictionary mapping each word to its idf value.
	print("Creating word-idf dictionary for Training texts...")

	word_idf_dict = {}
	for pair in zip(featurenames, idf):
		word_idf_dict[pair[0]] = pair[1]

	print "Creating Cluster Vectors and Graded Weighted Bag of Word Vectors..."