def main():  # argv
    print("Usage: python ./main.py <number_of_topics> <maxiteration>")

    # load stop words list from file
    stopwordsfile = open("stopwords.txt", "r", encoding='utf-8')
    for word in stopwordsfile:  # a stop word in each line
        word = word.replace("\n", '')
        word = word.replace("\r\n", '')
        STOP_WORDS_SET.add(word)

    corpus = plsa.Corpus()  # instantiate corpus
    # iterate over the files in the directory
    document_paths = ['./texts/grimm_fairy_tales', './texts/tech_blog_posts', './texts/nyt']
    # document_paths = ['./test/']
    for document_path in document_paths:
        for document_file in glob.glob(os.path.join(document_path, '*.txt')):
            print(document_file)
            document = plsa.Document(document_file)  # instantiate document
            document.split(STOP_WORDS_SET)           # tokenize
            corpus.add_document(document)            # push onto corpus documents list

    corpus.build_vocabulary()
    print("Vocabulary size:" + str(len(corpus.vocabulary)))
    print("Number of documents:" + str(len(corpus.documents)))

    # print("here:", argv)
    number_of_topics = 5   # int(argv[1])
    max_iterations = 20    # int(argv[2])
    corpus.plsa(number_of_topics, max_iterations)

    # print corpus.document_topic_prob
    # print corpus.topic_word_prob
    # cPickle.dump(corpus, open('./models/corpus.pickle', 'w'))
    print_topic_word_distribution(corpus, number_of_topics, 20, "./topic-word.txt")
    print_document_topic_distribution(corpus, number_of_topics, 10, "./document-topic.txt")
def main(argv): print "Usage: python ./main.py <number_of_topics> <maxiteration>" # load stop words list from file stopwordsfile = open("stopwords.txt", "r") for word in stopwordsfile: # a stop word in each line word = word.replace("\n", '') word = word.replace("\r\n", '') STOP_WORDS_SET.add(word) corpus = plsa.Corpus() # instantiate corpus # iterate over the files in the directory. document_paths = ['./texts/grimm_fairy_tales', './texts/tech_blog_posts', './texts/nyt'] #document_paths = ['./test/'] for document_path in document_paths: for document_file in glob.glob(os.path.join(document_path, '*.txt')): document = plsa.Document(document_file) # instantiate document document.split(STOP_WORDS_SET) # tokenize corpus.add_document(document) # push onto corpus documents list corpus.build_vocabulary() # Construct a list of unique words in the corpus. print "Vocabulary size:" + str(len(corpus.vocabulary)) print "Number of documents:" + str(len(corpus.documents)) number_of_topics = int(argv[1]) max_iterations = int(argv[2]) corpus.plsa(number_of_topics, max_iterations) #print corpus.document_topic_prob #print corpus.topic_word_prob #print top-k(20) word which occurs frequent in topic-(number_of_topics) print_topic_word_distribution(corpus, number_of_topics, 20, "./topic-word.txt") # print_document_topic_distribution(corpus, number_of_topics, 10, "./document-topic.txt")
def plsa(data_paths=[PATH_TO_RAW_DATA], topics_num=TOPICS_NUM, write_results_to=FILE_DEFAULT_PLSA_RESULTS):
    """ Performs topic modeling with the PLSA method. """
    import plsa
    import glob
    import os
    from operator import itemgetter

    corpus = plsa.Corpus()  # instantiate corpus
    document_paths = data_paths
    for document_path in document_paths:
        for document_file in glob.glob(os.path.join(document_path, '*.txt')):
            document = plsa.Document(document_file)    # instantiate document
            document.split(list_stopwords(lang='en'))  # tokenize
            corpus.add_document(document)              # push onto corpus documents list
    corpus.build_vocabulary()
    corpus.plsa(topics_num, 1)

    V = len(corpus.vocabulary)
    assert topics_num < V

    f = open(write_results_to, "w")
    for k in range(topics_num):
        word_prob = corpus.topic_word_prob[k, :]
        word_index_prob = []
        for i in range(V):
            word_index_prob.append([i, word_prob[i]])
        word_index_prob = sorted(word_index_prob, key=itemgetter(1), reverse=True)  # sort by word probability
        f.write("Topic #" + str(k) + ":\n")
        for i in range(topics_num):
            index = word_index_prob[i][0]
            f.write(corpus.vocabulary[index] + " ")
        f.write("\n")
    f.close()
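
# Usage sketch (hypothetical call; PATH_TO_RAW_DATA, TOPICS_NUM, FILE_DEFAULT_PLSA_RESULTS
# and list_stopwords() are assumed to be defined elsewhere in this project):
#
#     plsa(data_paths=['./texts/nyt'], topics_num=10, write_results_to='./plsa-topics.txt')
#
# This writes a short list of top-ranked words for each topic to the given results file.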
def main(argv):
    try:
        document_topk = int(argv[1])
        topic_topk = int(argv[2])
        number_of_topics = int(argv[3])
        max_iterations = int(argv[4])
        if document_topk > number_of_topics:
            raise Exception
    except:
        print "Usage: python ./main.py <document_topk> <topic_topk> <number_of_topics> <maxiteration>"
        print "Necessary condition: document_topk <= number_of_topics"
        sys.exit(0)

    # load stop words list from file
    stopwordsfile = open("stopwords.txt", "r")
    for word in stopwordsfile:  # a stop word in each line
        word = word.replace("\n", '')
        word = word.replace("\r\n", '')
        STOP_WORDS_SET.add(word)
    stopwordsfile.close()

    corpus = plsa.Corpus()  # instantiate corpus
    # iterate over the files in the directory
    document_paths = ['./texts/txt']
    # document_paths = ['./test/']
    for document_path in document_paths:
        for document_file in glob.glob(os.path.join(document_path, '*.txt')):
            document = plsa.Document(document_file)  # instantiate document
            document.split(STOP_WORDS_SET)           # tokenize
            corpus.add_document(document)            # push onto corpus documents list

    corpus.build_vocabulary()
    print "Vocabulary size:" + str(len(corpus.vocabulary))
    print "Number of documents:" + str(len(corpus.documents))
    corpus.plsa(number_of_topics, max_iterations)

    # My code from here! [Saurabh]
    corpus.calculate_lts()
    corpus.calculate_lte()
    corpus.calculate_stat_lte()

    datapath = ['./texts/txt']
    doc_id = 0
    for document_path in datapath:
        for document_file in glob.glob(os.path.join(document_path, '*.txt')):
            sentenceList = []
            f = open(document_file, "r")
            print "Reading file ... ", os.path.basename(document_file)
            print "Calculating sentence score..."
            line_number = 1
            for line in f:
                bScore = BIGRAM_WEIGHT * bigramScore(line)
                tScore = TRIGRAM_WEIGHT * trigramScore(line)
                sScore = STAT_WEIGHT * statScore(line, doc_id, corpus)
                sentenceList.append([line, bScore + tScore + sScore, line_number])
                line_number = line_number + 1

            l = int(len(sentenceList) * TOP)
            print ("Extracting the top %0.2f percent sentences for summarization." % (TOP * 100))
            # keep the highest-scoring sentences, then restore document order by line number
            sentenceList = sorted(sentenceList, key=lambda x: x[1], reverse=True)[:l]
            sentenceList = sorted(sentenceList, key=lambda x: x[2], reverse=False)

            new_filename = "./dataset/summaries/set2" + os.path.basename(document_file).split('.')[0] + "Summaries.txt"
            print "Writing to file...", os.path.basename(new_filename)
            s = ""
            with open(new_filename, "w+") as f2:
                for ele in sentenceList:
                    s += ele[0].encode('utf-8') + "\n"
                f2.write(s)
            print ("-------------------------------------------------------------")
            f.close()
            doc_id += 1
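
# Illustrative sketch (not part of the original script) of the selection idiom used
# above: keep the TOP fraction of highest-scoring sentences, then re-sort the kept
# sentences by their original line number so the summary reads in document order.
def select_top_sentences(sentence_list, fraction):
    # sentence_list holds [sentence, score, line_number] triples
    k = int(len(sentence_list) * fraction)
    best = sorted(sentence_list, key=lambda x: x[1], reverse=True)[:k]
    return sorted(best, key=lambda x: x[2])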