import logging
import timeit

import numpy as np
from gensim.models import Word2Vec
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

import docvector
import setting
from documents import Documents

logger = logging.getLogger(__name__)


def main(model_name, train_dir, test_dir):
    # log which build method the configured option maps to
    build_method = "average word vector"
    if setting.build_option == 1:
        build_method = "average word vector with tf-idf"
    elif setting.build_option == 2:
        build_method = "cluster word vector"
    logger.debug("text process option: %s", str(setting.process_option))
    logger.debug("use %s to build doc vector", build_method)

    model = Word2Vec.load(model_name)
    logger.info("finished loading model %s", model_name)

    # build training set document vectors
    logger.info("start building training set doc vector")
    start_time = timeit.default_timer()
    train_documents = Documents(train_dir)
    train_fv = docvector.build_doc_vector(train_documents, model, setting.build_option,
                                          setting.save_fv, setting.train_fv_name)
    print(train_fv)
    logger.info("training set doc vector built in %.4fs", timeit.default_timer() - start_time)
    logger.info("training set doc vector saved to %s", setting.train_fv_name)
    logger.debug("training size: %s", str(train_fv.shape))

    # train the classifier with a (single-point) grid search;
    # n_jobs is a parallelism setting, not a hyperparameter, so it belongs on the
    # estimator and the search itself rather than in the parameter grid
    logger.info("start training classifier")
    start_time = timeit.default_timer()
    forest = GridSearchCV(RandomForestClassifier(n_jobs=-1),
                          {'n_estimators': [100]},
                          cv=5, scoring='f1_weighted', n_jobs=-1)
    train_labels = list(train_documents.field_iterator(setting.csv_option.sentiment_name))
    best_model = forest.fit(train_fv, train_labels)
    logger.info("finished training classifier in %.4fs", timeit.default_timer() - start_time)

    # build test set document vectors and evaluate
    logger.info("start building test set doc vector")
    start_time = timeit.default_timer()
    test_documents = Documents(test_dir)
    test_fv = docvector.build_doc_vector(test_documents, model, setting.build_option,
                                         setting.save_fv, setting.test_fv_name)
    print(test_fv)
    logger.info("test set doc vector built in %.4fs", timeit.default_timer() - start_time)
    logger.info("test set doc vector saved to %s", setting.test_fv_name)
    logger.debug("test size: %s", str(test_fv.shape))

    logger.info("start predicting test set sentiment")
    start_time = timeit.default_timer()
    predicted_sentiment = best_model.predict(test_fv)
    logger.info("finished prediction in %.4fs", timeit.default_timer() - start_time)

    test_labels = list(test_documents.field_iterator(setting.csv_option.sentiment_name))
    accuracy = np.mean(predicted_sentiment == test_labels)
    report = metrics.classification_report(test_labels, predicted_sentiment,
                                           target_names=['0', '1'])
    # the weighted-average precision/recall/f1 are the last three numbers in the report
    reports = report.split()[-4:-1]
    print(report)
    print("Test Set Accuracy =", accuracy)
    print(reports)
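# main() only dispatches on setting.build_option; the actual vector construction
# lives in docvector.build_doc_vector, which is not shown here. The sketch below
# illustrates the plain averaged-word-vector case (build option 0), assuming a
# gensim Word2Vec model and an iterable of token lists. The function name and
# signature are illustrative assumptions, not the project's implementation.
import numpy as np


def average_doc_vector(tokenized_docs, model):
    """Average the word vectors of each document's in-vocabulary tokens;
    a document with no known tokens gets a zero vector."""
    dim = model.vector_size
    rows = []
    for tokens in tokenized_docs:
        vecs = [model.wv[w] for w in tokens if w in model.wv]
        rows.append(np.mean(vecs, axis=0) if vecs else np.zeros(dim, dtype=np.float32))
    return np.vstack(rows)  # shape: (n_documents, vector_size)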
import unittest

from documents import Documents


class TestDocuments(unittest.TestCase):
    """Test class that defines test cases for the Documents class behavior."""

    def setUp(self):
        """Set up method to run before each test case."""
        self.new_documents = Documents("Facebook", "12345")

    def test_documents_instance(self):
        """Test whether new_documents has been instantiated correctly."""
        self.assertEqual(self.new_documents.account_name, "Facebook")
        self.assertEqual(self.new_documents.account_password, "12345")

    def test_save_documents(self):
        """Test whether the new document created has been saved."""
        self.new_documents.save_documents()
        self.assertEqual(len(Documents.documents_list), 1)

    def test_save_multiple_documents(self):
        """Test saving multiple documents to documents_list."""
        self.new_documents.save_documents()
        new_test_documents = Documents("Twitter", "56789")
        new_test_documents.save_documents()
        self.assertEqual(len(Documents.documents_list), 2)

    def tearDown(self):
        """Clear documents_list after every test so state does not leak between tests."""
        Documents.documents_list = []

    def test_find_documents_by_name(self):
        """Test that we can find a document by account name and display its information."""
        self.new_documents.save_documents()
        new_test_documents = Documents("Twitter", "56789")
        new_test_documents.save_documents()
        found_document = Documents.find_by_name("Twitter")
        self.assertEqual(found_document.account_name, new_test_documents.account_name)

    def test_display_all_documents(self):
        """Test whether all saved documents can be displayed."""
        self.assertEqual(Documents.display_documents(), Documents.documents_list)
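# The tests above pin down the API of the class under test without showing it.
# Below is a minimal sketch of a Documents class that would satisfy them,
# inferred purely from the assertions; the project's real implementation may differ.
class Documents:
    """Stores account credentials in a class-level list."""

    documents_list = []

    def __init__(self, account_name, account_password):
        self.account_name = account_name
        self.account_password = account_password

    def save_documents(self):
        """Append this document to the shared list."""
        Documents.documents_list.append(self)

    @classmethod
    def find_by_name(cls, name):
        """Return the first saved document whose account_name matches, else None."""
        for document in cls.documents_list:
            if document.account_name == name:
                return document
        return None

    @classmethod
    def display_documents(cls):
        """Return the full list of saved documents."""
        return cls.documents_list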
from documents import Documents
from config import settings

documents = Documents(settings['blogs_root'])
documents.process()
documents.dump()
    if setting.to_scale:
        doc_vector = scale(doc_vector, copy=False)
    return doc_vector


if __name__ == "__main__":
    model = Word2Vec.load(setting.model_name)
    with open(setting.sentic_corpus, 'r') as f:
        sentic_dic = json.load(f)
    for i in range(4):
        logger.debug("use %s to build doc vector", setting.build_methods[i])
        for j in range(3):
            if i <= 1:
                # train
                documents = Documents(setting.dbprefix + str(j) + "/train")
                train_fv = build_doc_vector(i, sentic_dic, documents=documents, model=model,
                                            save_file=setting.saveprefix + "train_fv_" + str(i) + "_" + str(j))
                if i == 0:
                    train_label = np.array(list(documents.field_iterator(setting.csv_option.sentiment_name)))
                    np.save(setting.saveprefix + "train_label_" + str(j), train_label)
                # test
                documents = Documents(setting.dbprefix + str(j) + "/test")
                test_fv = build_doc_vector(i, sentic_dic, documents=documents, model=model,
                                           save_file=setting.saveprefix + "test_fv_" + str(i) + "_" + str(j))
                if i == 0:
                    test_label = np.array(list(documents.field_iterator(setting.csv_option.sentiment_name)))
                    np.save(setting.saveprefix + "test_label_" + str(j), test_label)
            else:
                train_fv, test_fv = build_doc_vector(i, sentic_dic,
                                                     train_file_name=setting.fsprefix + str(j) + "/train.p",
                                                     test_file_name=setting.fsprefix + str(j) + "/test.p",
def check_existing_documents(name):
    """Check whether a particular account and its documents exist based on the searched account_name."""
    return Documents.find_by_name(name)
def find_document(account_name):
    """Function that finds documents based on the account_name given."""
    return Documents.find_by_name(account_name)
import os

from gensim.models import Word2Vec
from gensim.models.phrases import Phraser, Phrases

from documents import Documents
from sentences import Sentences

# result_dir = '/home/marcin/Documents/AGH/PJN/data/json/word2vec/'
data_dir = "/home/marcin/Documents/AGH/PJN/data/json/word2vec/new"

print("finding 3-word phrases")
documents = Documents(data_dir)
bigram = Phraser(Phrases(documents))
trigram = Phraser(Phrases(bigram[documents]))

print("saving sentences to file")
with open(os.path.join(data_dir, "sentences-3.txt"), "w+") as sentences:
    # each s is one phrased sentence, i.e. a list of tokens
    for s in trigram[bigram[documents]]:
        for token in s:
            sentences.write("{}\t".format(token))
        sentences.write("\n")

# trigram = Phraser(Phrases(bigram[sentences]))
# trigram = Phraser.load(os.path.join(data_dir, "phrases"))
sentences = Sentences(os.path.join(data_dir, "sentences-3.txt"))

# print("training model")
# model = Word2Vec(sentences=sentences, window=5, min_count=3, sg=0, size=300)
# model.save(os.path.join(data_dir, "word2vec_model"))
""" 读入索引 解析 查询 进行索引 """ from time import time from query import Answer_query, INDEX from documents import Documents # DATA_DIR = '/home/luo/HomeWork/news_retrieval/Data/cnn_samples' # # q = "loop" # qa = Answer_query(q) # # start = time() # qa.search(DATA_DIR) # print("cost {}s".format(time()-start)) if __name__ == "__main__": DATA_DIR = "/home/luo/HomeWork/news_retrieval/Data/cnn_5000" start = time() print("正在建造索引...") docs = Documents(DATA_DIR, output_dir="DATA/cnn_5000/") docs.generate_index() point1 = time() print("花费时间: {}s ".format(point1 - start)) #39s float64 位存储 726M
def search(self, query_string, limit=0):
    """Search for documents in the database and wrap the match set."""
    mset = self._search(query_string, limit)
    return Documents(self, mset)
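# A usage sketch, assuming search() lives on a database wrapper (here called
# Database) and that the returned Documents object is iterable over matches;
# everything beyond search(query_string, limit) is an assumption.
db = Database("path/to/index")
for doc in db.search("machine learning", limit=10):
    print(doc)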