Example #1
    def test_find_documents_by_name(self):
        """Test to check if we can find documents and display information"""
        self.new_documents.save_documents()
        new_test_documents = Documents("Twitter", "56789")
        new_test_documents.save_documents()

        found_document = Documents.find_by_name("Twitter")

        self.assertEqual(found_document.account_name, new_test_documents.account_name)
Example #2
import logging
import timeit

import numpy as np
from gensim.models import Word2Vec
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

import docvector  # project-local module that builds document vectors
import setting    # project-local configuration module
from documents import Documents

logger = logging.getLogger(__name__)


def main(model_name, train_dir, test_dir):

    # logger info
    build_method = "average word vector"
    if setting.build_option == 1:
        build_method = "average word vector with tf-idf"
    elif setting.build_option == 2:
        build_method = "cluster word vector"
    logger.debug("text process option: %s", str(setting.process_option))
    logger.debug("use %s to build doc vector", build_method)

    model = Word2Vec.load(model_name)
    logger.info("finish loading model %s", model_name)

    # get doc vector
    logger.info("start building training set doc vector")
    start_time = timeit.default_timer()
    train_documents = Documents(train_dir)
    train_fv = docvector.build_doc_vector(train_documents, model, setting.build_option, setting.save_fv, setting.train_fv_name)
    print(train_fv)
    logger.info("training set doc vector built in %.4lfs", timeit.default_timer() - start_time)
    logger.info("training set doc vector saved to %s", setting.train_fv_name)
    logger.debug("training size: %s", str(train_fv.shape))

    # train classifier
    logger.info("start training classifier")
    start_time = timeit.default_timer()
    forest = GridSearchCV(RandomForestClassifier(), {'n_estimators': [100], 'n_jobs': [100]},
                          cv=5, scoring='f1_weighted', n_jobs=100)
    best_model = forest.fit(train_fv, list(train_documents.field_iterator(setting.csv_option.sentiment_name)))
    logger.info("finished training classifier in %.4lfs", timeit.default_timer() - start_time)


    # evaluate on test set
    logger.info("start building test set doc vector")
    start_time = timeit.default_timer()
    test_documents = Documents(test_dir)
    test_fv = docvector.build_doc_vector(test_documents, model, setting.build_option, setting.save_fv, setting.test_fv_name)
    print(test_fv)
    logger.info("test set doc vector built in %.4lfs", timeit.default_timer() - start_time)
    logger.info("test set doc vector saved to %s", setting.test_fv_name)
    logger.debug("test size: %s", str(test_fv.shape))

    logger.info("start predicting test set sentiment")
    start_time = timeit.default_timer()
    predicted_sentiment = best_model.predict(test_fv)
    logger.info("finished prediction in %.4lfs", timeit.default_timer() - start_time)

    accuracy = np.mean(predicted_sentiment == list(test_documents.field_iterator(setting.csv_option.sentiment_name)))
    report = metrics.classification_report(list(test_documents.field_iterator(setting.csv_option.sentiment_name)), \
            predicted_sentiment, target_names=['0', '1'])
    # Grab the averaged precision/recall/f1 values from the report's last row.
    reports = report.split()[-4:-1]

    print(report)
    print("Test Set Accuracy =", accuracy)
    print(reports)
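A minimal driver for this pipeline might look like the following sketch; the model file and corpus directories are placeholders, not paths from the original project:

if __name__ == "__main__":
    # Hypothetical paths: a trained gensim Word2Vec model plus train/test corpora.
    main("models/word2vec.model", "data/train", "data/test")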
Example #3
class TestDocuments(unittest.TestCase):
    """Test class that defines test cases for the Documents class behavior
    """

    def setUp(self):
        """Set up method to run befor before each test case"""
        self.new_documents = Documents("Facebook", "12345")

    def test_documents_instance(self):
        """Method that tests whether the new_documents have been instantiated correctly"""
        self.assertEqual(self.new_documents.account_name, "Facebook")
        self.assertEqual(self.new_documents.account_password, "12345")

    def test_save_documents(self):
        """Method that tests whether the new document created has been saved"""
        self.new_documents.save_documents()
        self.assertEqual(len(Documents.documents_list), 1)

    def test_save_multiple_documents(self):
        """Method that saves multiple documents to documents_list"""
        self.new_documents.save_documents()
        new_test_documents = Documents("Twitter", "56789")
        new_test_documents.save_documents()
        self.assertEqual(len(Documents.documents_list), 2)

    def tearDown(self):
        """Method that clears the documents_list after every test to ensure that there is no error"""
        Documents.documents_list = []

    def test_find_documents_by_name(self):
        """Test to check if we can find documents and display information"""
        self.new_documents.save_documents()
        new_test_documents = Documents("Twitter", "56789")
        new_test_documents.save_documents()

        found_document = Documents.find_by_name("Twitter")

        self.assertEqual(found_document.account_name, new_test_documents.account_name)

    def test_display_all_documents(self):
        """TestCase to test whether all contacts can be displayed"""
        self.assertEqual(Documents.display_documents(), Documents.documents_list)
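The test cases above pin down the interface of the Documents class: a constructor taking an account name and password, a class-level documents_list, and save_documents, find_by_name, and display_documents methods. A minimal sketch that would satisfy these tests (the real implementation is not shown on this page, so details such as what find_by_name returns for a missing name are assumptions):

class Documents:
    """Minimal sketch of the class under test, inferred from the tests above."""

    documents_list = []  # class-level store shared by all instances

    def __init__(self, account_name, account_password):
        self.account_name = account_name
        self.account_password = account_password

    def save_documents(self):
        # Append this instance to the shared class-level list.
        Documents.documents_list.append(self)

    @classmethod
    def find_by_name(cls, name):
        # Return the first saved document whose account_name matches;
        # returning None on a miss is an assumption, not covered by the tests.
        for document in cls.documents_list:
            if document.account_name == name:
                return document
        return None

    @classmethod
    def display_documents(cls):
        # The test only requires this to return the list itself.
        return cls.documents_list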
Example #4
from documents import Documents
from config import settings

documents = Documents(settings['blogs_root'])
documents.process()
documents.dump()
Example #5
    if setting.to_scale:
        doc_vector = scale(doc_vector, copy=False)

    return doc_vector

if __name__ == "__main__":
    model = Word2Vec.load(setting.model_name)
    with open(setting.sentic_corpus, 'r') as f:
        sentic_dic = json.load(f)

    # i selects the doc-vector build method, j selects one of three dataset splits.
    for i in range(4):
        logger.debug("use %s to build doc vector", setting.build_methods[i])
        for j in range(3):
            if i <= 1:
                # train
                documents = Documents(setting.dbprefix + str(j) + "/train")
                train_fv = build_doc_vector(i, sentic_dic, documents=documents, model=model, \
                    save_file=setting.saveprefix + "train_fv_" + str(i) + "_" + str(j))
                if i == 0:
                    train_label = np.array(list(documents.field_iterator(setting.csv_option.sentiment_name)))
                    np.save(setting.saveprefix + "train_label_" + str(j), train_label)
                # test
                documents = Documents(setting.dbprefix + str(j) + "/test")
                test_fv = build_doc_vector(i, sentic_dic, documents=documents, model=model, \
                    save_file=setting.saveprefix + "test_fv_" + str(i) + "_" + str(j))
                if i == 0:
                    test_label = np.array(list(documents.field_iterator(setting.csv_option.sentiment_name)))
                    np.save(setting.saveprefix + "test_label_" + str(j), test_label)
            else:
                train_fv, test_fv = build_doc_vector(i, sentic_dic, \
                    train_file_name=setting.fsprefix + str(j) + "/train.p", test_file_name=setting.fsprefix + str(j) + "/test.p", \
Example #6
def check_existing_documents(name):
    """Method that checks whether a particular account and its documents exist based on searched account_name"""
    return Documents.find_by_name(name)
Example #7
def find_document(account_name):
    """Function that finds documents based on account_name given"""
    return Documents.find_by_name(account_name)
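Both helpers are thin wrappers around Documents.find_by_name. A quick usage check, with hypothetical data and the Documents sketch from Example #3:

Documents("Twitter", "56789").save_documents()
print(find_document("Twitter").account_name)  # -> Twitter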
Example #8
 def setUp(self):
     """Set up method to run befor before each test case"""
     self.new_documents = Documents("Facebook", "12345")
Example #9
 def test_display_all_documents(self):
     """TestCase to test whether all contacts can be displayed"""
     self.assertEqual(Documents.display_documents(), Documents.documents_list)
Example #10
 def test_save_multiple_documents(self):
     """Method that saves multiple documents to documents_list"""
     self.new_documents.save_documents()
     new_test_documents = Documents("Twitter", "56789")
     new_test_documents.save_documents()
     self.assertEqual(len(Documents.documents_list), 2)
Example #11
from gensim.models.phrases import Phraser, Phrases
from documents import Documents
from sentences import Sentences
from gensim.models import Word2Vec
import os

# resultDir='/home/marcin/Documents/AGH/PJN/data/json/word2vec/'
data_dir = "/home/marcin/Documents/AGH/PJN/data/json/word2vec/new"
# print("finding 3-word phrases")
documents = Documents(data_dir)

bigram = Phraser(Phrases(documents))
trigram = Phraser(Phrases(bigram[documents]))
print("saving sentences to file")
with open(os.path.join(data_dir, "sentences-3.txt"), "w+") as sentences_file:
    for s in trigram[bigram[documents]]:
        for sentence in s:
            sentences_file.write("{}\t".format(sentence))
        sentences_file.write("\n")

# trigram = Phraser(Phrases(bigram[sentences]))

#trigram = Phraser.load(os.path.join(data_dir,"phrases"))

sentences = Sentences(os.path.join(data_dir, "sentences-3.txt"))
#
# print("training model")
# model = Word2Vec(sentences=sentences,window=5,min_count=3,sg=0,size=300)
# model.save(os.path.join(data_dir,"word2vec_model"))
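If the commented-out training lines above were run, the saved model could be loaded back and probed for neighbors, for example (a sketch using gensim's standard API; the query token is a placeholder):

model = Word2Vec.load(os.path.join(data_dir, "word2vec_model"))
# wv.most_similar returns (token, cosine similarity) pairs for the query token.
print(model.wv.most_similar("kraków", topn=5))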
"""
读入索引
解析 查询

进行索引
"""
from time import time

from query import Answer_query, INDEX
from documents import Documents

# DATA_DIR = '/home/luo/HomeWork/news_retrieval/Data/cnn_samples'
#
# q = "loop"
# qa = Answer_query(q)
#
# start = time()
# qa.search(DATA_DIR)
# print("cost {}s".format(time()-start))

if __name__ == "__main__":
    DATA_DIR = "/home/luo/HomeWork/news_retrieval/Data/cnn_5000"
    start = time()
    print("正在建造索引...")
    docs = Documents(DATA_DIR, output_dir="DATA/cnn_5000/")
    docs.generate_index()
    point1 = time()
    print("花费时间: {}s ".format(point1 - start))
    #39s  float64 位存储  726M
Example #13
 def search(self, query_string, limit=0):
     """Search for documents in the database."""
     mset = self._search(query_string, limit)
     return Documents(self, mset)
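A caller would presumably use it along these lines (hypothetical usage; it assumes db is an instance of the enclosing class and that the Documents wrapper is iterable, neither of which is shown here):

results = db.search("machine learning", limit=10)
for doc in results:
    print(doc)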