def test_find_documents_by_name(self):
        """Verify that a saved document can be looked up by account name.

        Saves the document held in ``self.new_documents`` plus a second
        "Twitter" document, then checks that
        ``Documents.find_by_name("Twitter")`` returns an object whose
        ``account_name`` equals that of the Twitter document just saved.
        """
        self.new_documents.save_documents()

        twitter_documents = Documents("Twitter", "56789")
        twitter_documents.save_documents()

        match = Documents.find_by_name("Twitter")
        self.assertEqual(match.account_name, twitter_documents.account_name)
from documents import Documents
from config import settings

# Build a Documents instance from the configured 'blogs_root' setting, then
# call process() followed by dump() on it. Both methods are defined in the
# documents module, which is not shown here.
documents = Documents(settings['blogs_root'])
documents.process()
documents.dump()
Exemple #3
0
 def get_session_documents(self):
     """Fetch the resource at ``self.documents`` and wrap it in ``Documents``.

     Issues a GET request to ``self.documents``, parses the response text
     with ``xmltodict.parse`` and returns a ``Documents`` object built from
     the parsed data.
     """
     xml_text = requests.get(self.documents).text
     return Documents(xmltodict.parse(xml_text))
Exemple #4
0
def create_new_document(account_name, account_password):
    """Build and return a ``Documents`` object for the given credentials.

    Args:
        account_name: Passed as the first ``Documents`` argument.
        account_password: Passed as the second ``Documents`` argument.

    Returns:
        The newly constructed ``Documents`` instance. Nothing is saved here.
    """
    return Documents(account_name, account_password)
 def setUp(self):
     """Prepare a fresh ``Documents`` fixture before each test case runs."""
     facebook_documents = Documents("Facebook", "12345")
     self.new_documents = facebook_documents
 def test_save_multiple_documents(self):
     """Check that saving two documents leaves two entries in documents_list."""
     self.new_documents.save_documents()
     Documents("Twitter", "56789").save_documents()

     saved_count = len(Documents.documents_list)
     self.assertEqual(saved_count, 2)
Exemple #7
0
from gensim.models.phrases import Phraser, Phrases
from documents import Documents
from sentences import Sentences
from gensim.models import Word2Vec
import os

# resultDir='/home/marcin/Documents/AGH/PJN/data/json/word2vec/'
data_dir = "/home/marcin/Documents/AGH/PJN/data/json/word2vec/new"
# #
# print("finding 3-word phrases")
documents = Documents(data_dir)

# First pass learns two-word phrases; the second pass runs over the
# bigram-transformed corpus so that phrases of up to three words are joined.
bigram = Phraser(Phrases(documents))
trigram = Phraser(Phrases(bigram[documents]))

# Write one sentence per line with every token followed by a tab. The context
# manager closes the file even if iterating the corpus raises part-way.
with open(os.path.join(data_dir, "sentences-3.txt"), "w+") as sentences_file:
    print("saving sentences to file")
    for sentence in trigram[bigram[documents]]:
        for token in sentence:
            sentences_file.write("{}\t".format(token))
        sentences_file.write("\n")

# trigram = Phraser(Phrases(bigram[sentences]))

#trigram = Phraser.load(os.path.join(data_dir,"phrases"))

# Re-open the file just written through the project's Sentences wrapper.
sentences = Sentences(os.path.join(data_dir, "sentences-3.txt"))
#
# print("training model")
# model = Word2Vec(sentences=sentences,window=5,min_count=3,sg=0,size=300)
# model.save(os.path.join(data_dir,"word2vec_model"))
"""
Read in the index
Parse the query

Build the index
"""
from time import time

from query import Answer_query, INDEX
from documents import Documents

# DATA_DIR = '/home/luo/HomeWork/news_retrieval/Data/cnn_samples'
#
# q = "loop"
# qa = Answer_query(q)
#
# start = time()
# qa.search(DATA_DIR)
# print("cost {}s".format(time()-start))

if __name__ == "__main__":
    # Build the index for the cnn_5000 data directory and report the
    # wall-clock time the build took.
    DATA_DIR = "/home/luo/HomeWork/news_retrieval/Data/cnn_5000"
    start = time()
    print("正在建造索引...")

    index_builder = Documents(DATA_DIR, output_dir="DATA/cnn_5000/")
    index_builder.generate_index()

    elapsed = time() - start
    print("花费时间: {}s ".format(elapsed))
    # Recorded note from an earlier run: 39s, float64 storage, 726M
Exemple #9
0
 def search(self, query_string, limit=0):
     """Run a query and return the matching documents.

     Passes ``query_string`` and ``limit`` to ``self._search`` and wraps the
     result in a ``Documents`` object constructed with this instance.

     NOTE(review): what ``limit=0`` means is decided by ``_search``, which
     is not visible here -- presumably "no limit"; confirm.
     """
     return Documents(self, self._search(query_string, limit))