def wiki_50M_model_test(
        answer_path='/Users/apple/Dropbox/NLP/GREVerbal.txt',
        model_path='/Users/apple/graduate/Courses/544NLP/data/wiki_article/wiki_part_model.model',
        word2id_path='/Users/apple/graduate/Courses/544NLP/data/wiki_article/wiki_wordids.txt',
        sentence_path="/Users/apple/Dropbox/NLP/bi_d_plaintext.txt"):
    """Evaluate the LSA model built from the 50M Wikipedia dump on GRE questions.

    Loads the answer key, the saved LSA model and its word-to-id dictionary,
    runs the definition-based sentence-completion selection, then prints the
    accuracy followed by the number and contents of ``unseen_word``.

    Results recorded by the original author for this model: normal accuracy
    24.2% with 2100 unseen words; definition accuracy 17% with 4000+ unseen
    words.

    Args:
        answer_path: File passed to ``load_gre_answer``.
        model_path: Saved model file passed to ``lsamodel.load``.
        word2id_path: Dictionary text file passed to
            ``lsamodel.load_word2id``.
        sentence_path: Sentence file passed to
            ``load_gre_sentence_definition``.

    All defaults are the original author's local paths; override them to run
    on another machine.
    """
    load_gre_answer(answer_path=answer_path)
    print('finish loading answers')

    lsi = lsamodel.load(model_path)
    print('finish loading lsa model')

    word2id = lsamodel.load_word2id(dic_txt_file=word2id_path)
    print('finish loading word2id dictionary')

    # Definition variant of the task. The non-definition run used
    # load_gre_sentence(...) on "/Users/apple/Dropbox/NLP/bi_plaintext.txt"
    # with the same lsi / word2id arguments.
    load_gre_sentence_definition(sentence_path=sentence_path, lsi=lsi, word2id=word2id)
    print('finish loading and selecting gre sentence completion task answers')

    # answer, cal_ans and unseen_word are module-level names; presumably they
    # are filled in by the loader calls above -- TODO confirm.
    print('accuracy: ' + str(eval.eval_accuracy(answer, cal_ans)))
    print(len(unseen_word))
    print(unseen_word)
def wiki_10G_model_test(
        answer_path='/Users/apple/Dropbox/NLP/GREVerbal.txt',
        model_path='/Users/apple/graduate/Courses/544NLP/data/wiki_article/wiki_latest_model/lsi.model',
        word2id_path='/Users/apple/graduate/Courses/544NLP/data/wiki_article/wiki_latest_model/wiki_en_wordids.txt',
        sentence_path="/Users/apple/Dropbox/NLP/bi_d_plaintext.txt"):
    """Evaluate the LSA model built from the 10G Wikipedia data on GRE questions.

    Loads the answer key, the saved LSA model and its word-to-id dictionary,
    runs the definition-based sentence-completion selection with the
    ``TOTAL_SIMILARITY_WITH_RAKE`` algorithm, then prints the accuracy
    followed by the number and contents of ``unseen_word``.

    Results recorded by the original author for this model: 753 unseen words
    with 23.4% accuracy; definition accuracy 20.6% with 1810 unseen words.

    NOTE(review): this file contains a second, later definition named
    ``wiki_10G_model_test``; that one rebinds the name, so this version is
    only reachable if called before the later definition executes.

    Args:
        answer_path: File passed to ``load_gre_answer``.
        model_path: Saved model file passed to ``lsamodel.load``.
        word2id_path: Dictionary text file passed to
            ``lsamodel.load_word2id``.
        sentence_path: Sentence file passed to
            ``load_gre_sentence_definition``.

    All defaults are the original author's local paths; override them to run
    on another machine.
    """
    load_gre_answer(answer_path=answer_path)
    print('finish loading answers')

    lsi = lsamodel.load(model_path)
    print('finish loading lsa model')

    word2id = lsamodel.load_word2id(dic_txt_file=word2id_path)
    print('finish loading word2id dictionary')

    # Definition variant of the task. The non-definition run used
    # load_gre_sentence(...) on "/Users/apple/Dropbox/NLP/bi_plaintext.txt"
    # with the same lsi / word2id / algorithm arguments.
    load_gre_sentence_definition(
        sentence_path=sentence_path,
        lsi=lsi,
        word2id=word2id,
        algorithm=TOTAL_SIMILARITY_WITH_RAKE)
    print('finish loading and selecting gre sentence completion task answers')

    # answer, cal_ans and unseen_word are module-level names; presumably they
    # are filled in by the loader calls above -- TODO confirm.
    print('accuracy: ' + str(eval.eval_accuracy(answer, cal_ans)))
    print(len(unseen_word))
    print(unseen_word)
def wiki_10G_model_test(
        answer_path='/Users/junchen/Documents/CSCI544/project/GREVerbal.txt',
        model_path='/Users/junchen/Documents/CSCI544/project/lsi model/lsi.model',
        word2id_path='/Users/junchen/Documents/CSCI544/project/wiki_data/wiki_en_wordids.txt',
        sentence_path="/Users/junchen/Documents/CSCI544/project/bi_plaintext.txt"):
    """Evaluate the 10G-Wikipedia LSA model on GRE questions for each ``k``.

    Loads the answer key, the saved LSA model and its word-to-id dictionary
    once. Then, for every ``k`` in ``range(2, 3)`` (currently only ``k = 2``),
    runs ``load_gre_sentence`` with the ``TOTAL_SIMILARITY_WITH_RAKE``
    algorithm, prints the accuracy prefixed by ``k`` and calls
    ``clear_answer()``. Finally prints the number and contents of
    ``unseen_word``.

    Results recorded by the original author for the 10G model (the exact
    configuration is not stated): 753 unseen words with 23.4% accuracy;
    definition accuracy 20.6% with 1810 unseen words.

    NOTE(review): an earlier definition in this file has the same name; as
    the later one, this definition is what ``wiki_10G_model_test`` refers to
    once the visible part of the module has executed.

    Args:
        answer_path: File passed to ``load_gre_answer``.
        model_path: Saved model file passed to ``lsamodel.load``.
        word2id_path: Dictionary text file passed to
            ``lsamodel.load_word2id``.
        sentence_path: Sentence file passed to ``load_gre_sentence``.

    All defaults are the original author's local paths; override them to run
    on another machine.
    """
    load_gre_answer(answer_path=answer_path)
    print('finish loading answers')

    lsi = lsamodel.load(model_path)
    print('finish loading lsa model')

    word2id = lsamodel.load_word2id(dic_txt_file=word2id_path)
    print('finish loading word2id dictionary')

    # Widen this range to compare several values of k in one run.
    for i in range(2, 3):
        # The definition variant would instead call
        # load_gre_sentence_definition(...) on
        # "/Users/junchen/Documents/CSCI544/project/bi_d_plaintext.txt".
        load_gre_sentence(
            sentence_path=sentence_path,
            lsi=lsi,
            word2id=word2id,
            algorithm=TOTAL_SIMILARITY_WITH_RAKE,
            k=i)
        print('finish loading and selecting gre sentence completion task answers')
        print(str(i) + ': accuracy: ' + str(eval.eval_accuracy(answer, cal_ans)))
        # Called between runs; presumably resets the collected answers so the
        # next k starts clean -- TODO confirm what clear_answer() resets.
        clear_answer()

    # NOTE(review): the original indentation was lost, so it is unclear
    # whether these two prints belonged inside the loop. With the single
    # iteration above the output is the same either way.
    print(len(unseen_word))
    print(unseen_word)