Example #1
def main():  # argv parsing is commented out; topic count and iteration limit are hard-coded below
    print("Usage: python ./main.py <number_of_topics> <maxiteration>")
    # load stop words list from file
    stopwordsfile = open("stopwords.txt", "r", encoding='utf-8')
    for word in stopwordsfile:  # a stop word in each line
        word = word.strip()  # strip line endings (handles both \n and \r\n)
        STOP_WORDS_SET.add(word)

    corpus = plsa.Corpus()  # instantiate corpus
    # iterate over the files in the directory.
    document_paths = ['./texts/grimm_fairy_tales', './texts/tech_blog_posts', './texts/nyt']
    # document_paths = ['./test/']
    for document_path in document_paths:
        for document_file in glob.glob(os.path.join(document_path, '*.txt')):
            print(document_file)
            document = plsa.Document(document_file)  # instantiate document
            document.split(STOP_WORDS_SET)  # tokenize
            corpus.add_document(document)  # push onto corpus documents list

    corpus.build_vocabulary()
    print("Vocabulary size:" + str(len(corpus.vocabulary)))
    print("Number of documents:" + str(len(corpus.documents)))
    #print("here:", argv)

    number_of_topics = 5 #int(argv[1])
    max_iterations = 20 #int(argv[2])
    corpus.plsa(number_of_topics, max_iterations)

    # print corpus.document_topic_prob
    # print corpus.topic_word_prob
    # cPickle.dump(corpus, open('./models/corpus.pickle', 'w'))

    print_topic_word_distribution(corpus, number_of_topics, 20, "./topic-word.txt")
    print_document_topic_distribution(corpus, number_of_topics, 10, "./document-topic.txt")
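This snippet relies on module-level setup that is not shown: the local plsa helper module, glob/os, and a shared STOP_WORDS_SET. A minimal sketch of what that preamble might look like (names are taken from the snippet itself; the layout is an assumption):

# Assumed module-level preamble for Example #1 (a sketch, not the original file):
import os
import glob

import plsa  # local module providing Corpus and Document

STOP_WORDS_SET = set()  # populated in main() from stopwords.txt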
Example #2
def main(argv):
    print "Usage: python ./main.py <number_of_topics> <maxiteration>"
    # load stop words list from file
    stopwordsfile = open("stopwords.txt", "r")
    for word in stopwordsfile: # a stop word in each line
        word = word.strip()  # strip line endings (handles both \n and \r\n)
        STOP_WORDS_SET.add(word)
    
    corpus = plsa.Corpus() # instantiate corpus
    # iterate over the files in the directory.
    document_paths = ['./texts/grimm_fairy_tales', './texts/tech_blog_posts', './texts/nyt']
    #document_paths = ['./test/']
    for document_path in document_paths:
        for document_file in glob.glob(os.path.join(document_path, '*.txt')):
            document = plsa.Document(document_file) # instantiate document
            document.split(STOP_WORDS_SET) # tokenize
            corpus.add_document(document) # push onto corpus documents list

    corpus.build_vocabulary() # Construct a list of unique words in the corpus.
    print "Vocabulary size:" + str(len(corpus.vocabulary))
    print "Number of documents:" + str(len(corpus.documents))
    
    number_of_topics = int(argv[1])
    max_iterations = int(argv[2])
    corpus.plsa(number_of_topics, max_iterations)
    
    #print corpus.document_topic_prob
    #print corpus.topic_word_prob

    # print the top 20 most probable words for each of the number_of_topics topics
    print_topic_word_distribution(corpus, number_of_topics, 20, "./topic-word.txt")
    # print the per-document topic distribution to file
    print_document_topic_distribution(corpus, number_of_topics, 10, "./document-topic.txt")
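Examples #1 and #2 call print_topic_word_distribution and print_document_topic_distribution, which are defined elsewhere in the same script. A plausible sketch of those helpers, assuming (as in Example #3) that corpus.topic_word_prob is a topics-by-vocabulary numpy array and corpus.document_topic_prob is a documents-by-topics numpy array; this is illustrative, not the original code.

# Sketch of the two output helpers (shapes and numpy usage are assumptions):
import numpy as np

def print_topic_word_distribution(corpus, number_of_topics, topk, filepath):
    with open(filepath, "w") as f:
        for k in range(number_of_topics):
            word_prob = corpus.topic_word_prob[k, :]
            top_indices = np.argsort(word_prob)[::-1][:topk]  # highest-probability words first
            words = [corpus.vocabulary[i] for i in top_indices]
            f.write("Topic #%d: %s\n" % (k, " ".join(words)))

def print_document_topic_distribution(corpus, number_of_topics, topk, filepath):
    with open(filepath, "w") as f:
        for d in range(len(corpus.documents)):
            topic_prob = corpus.document_topic_prob[d, :]
            top_topics = np.argsort(topic_prob)[::-1][:min(topk, number_of_topics)]
            f.write("Document #%d: %s\n" % (d, " ".join(str(t) for t in top_topics)))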
Example #3
def plsa(data_paths=[PATH_TO_RAW_DATA],
         topics_num=TOPICS_NUM,
         write_results_to=FILE_DEFAULT_PLSA_RESULTS):
    """ Performs topic modeling with PLSA method. """

    import plsa
    import glob
    import os
    from operator import itemgetter  # needed for sorting words by probability below
    corpus = plsa.Corpus()  # instantiate corpus
    document_paths = data_paths
    for document_path in document_paths:
        for document_file in glob.glob(os.path.join(document_path, '*.txt')):
            document = plsa.Document(document_file)  # instantiate document
            document.split(list_stopwords(lang='en'))  # tokenize
            corpus.add_document(document)  # push onto corpus documents list

    corpus.build_vocabulary()
    corpus.plsa(topics_num, 1)  # note: only a single EM iteration

    V = len(corpus.vocabulary)
    assert (topics_num < V)  # need more vocabulary terms than topics
    f = open(write_results_to, "w")
    for k in range(topics_num):
        word_prob = corpus.topic_word_prob[k, :]
        word_index_prob = []
        for i in range(V):
            word_index_prob.append([i, word_prob[i]])
        word_index_prob = sorted(word_index_prob,
                                 key=itemgetter(1),
                                 reverse=True)  # sort by word probability, descending
        f.write("Topic #" + str(k) + ":\n")
        for i in range(topics_num):  # write the topics_num highest-probability words for this topic
            index = word_index_prob[i][0]
            f.write(corpus.vocabulary[index] + " ")
        f.write("\n")

    f.close()
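A quick call sketch for this wrapper; the path, topic count, and output file name below are placeholders, not values from the original project:

# Illustrative invocation of plsa() (all argument values are assumed placeholders):
plsa(data_paths=["./texts/nyt"],
     topics_num=5,
     write_results_to="./plsa-topics.txt")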
Example #4
def main(argv):
	try:
		document_topk = int(argv[1])
		topic_topk = int(argv[2])
		number_of_topics = int(argv[3])
		max_iterations = int(argv[4])

		if document_topk > number_of_topics:
			raise Exception
	except:
		print "Usage: python ./main.py <document_topk> <topic_topk> <number_of_topics> <maxiteration> "
		print "Necessary condition: document_topk < number_of_topics"
		sys.exit(0)

	# load stop words list from file
	stopwordsfile = open("stopwords.txt", "r")
	for word in stopwordsfile: # a stop word in each line
		word = word.strip()  # strip line endings (handles both \n and \r\n)
		STOP_WORDS_SET.add(word)
	stopwordsfile.close()
	
	corpus = plsa.Corpus() # instantiate corpus
	# iterate over the files in the directory.
	document_paths =['./texts/txt']
	#document_paths = ['./test/']
	for document_path in document_paths:
		for document_file in glob.glob(os.path.join(document_path, '*.txt')):
			document = plsa.Document(document_file) # instantiate document
			document.split(STOP_WORDS_SET) # tokenize
			corpus.add_document(document) # push onto corpus documents list

	corpus.build_vocabulary()
	print "Vocabulary size:" + str(len(corpus.vocabulary))
	print "Number of documents:" + str(len(corpus.documents))

	corpus.plsa(number_of_topics, max_iterations)

	# My Code from here! [Saurabh]
	corpus.calculate_lts()
	corpus.calculate_lte()
	corpus.calculate_stat_lte()
	datapath = ['./texts/txt']
	doc_id = 0
	line_number = 1
	for document_path in datapath:
		for document_file in glob.glob(os.path.join(document_path, '*.txt')):
			sentenceList = []
			f = open(document_file, "r")
			print "Reading file ... ",os.path.basename(document_file)
			print "Calculating sentence score..."
			line_number = 1
			for line in f:
				bScore = BIGRAM_WEIGHT*bigramScore(line)
				tScore = TRIGRAM_WEIGHT*trigramScore(line)
				sScore = STAT_WEIGHT*statScore(line,doc_id,corpus)
				sentenceList.append([line, bScore + tScore + sScore, line_number])
				line_number = line_number+ 1 
			l = int(len(sentenceList)*TOP)
			print ("Extracting the top %0.2f percent sentences for summarization." %(TOP*100))
			sentenceList = sorted(sentenceList, key = lambda x: x[1], reverse = True)[:l]  # keep the top-l highest-scoring sentences
			sentenceList = sorted(sentenceList, key = lambda x: x[2])  # restore original sentence order
			new_filename = "./dataset/summaries/set2"+os.path.basename(document_file).split('.')[0] + "Summaries.txt"
			print "Writing to file...", os.path.basename(new_filename)
			s = ""
			with open(new_filename, "w+") as f2:
				for ele in sentenceList:
					s += ele[0].encode('utf-8') + "\n"
				f2.write(s)
			print ("-------------------------------------------------------------")
			f.close()
			doc_id += 1
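Example #4 depends on several module-level constants and scoring helpers (bigramScore, trigramScore, statScore) that are not shown. A sketch of one plausible configuration; the numeric values are assumptions, and the helper signatures are inferred only from how they are called above:

# Assumed configuration for Example #4 (values are illustrative, not from the source):
STOP_WORDS_SET = set()

BIGRAM_WEIGHT = 0.3   # weight of the bigram language-model score
TRIGRAM_WEIGHT = 0.3  # weight of the trigram language-model score
STAT_WEIGHT = 0.4     # weight of the PLSA-based statistical score
TOP = 0.2             # keep the top 20% of sentences in each summary

# Inferred signatures of the scoring helpers used above (bodies not shown in the snippet):
#   bigramScore(line) -> float
#   trigramScore(line) -> float
#   statScore(line, doc_id, corpus) -> float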