def test_find_documents_by_name(self):
    """Verify that a saved document can be retrieved again by account name."""
    self.new_documents.save_documents()
    extra_document = Documents("Twitter", "56789")
    extra_document.save_documents()
    located = Documents.find_by_name("Twitter")
    self.assertEqual(located.account_name, extra_document.account_name)
"""Process the configured blog root and dump the resulting documents."""
from documents import Documents
from config import settings

blog_documents = Documents(settings['blogs_root'])
blog_documents.process()
blog_documents.dump()
def get_session_documents(self):
    """Fetch the documents endpoint, parse the XML body, and wrap it.

    Returns a Documents instance built from the parsed response.
    """
    raw_response = requests.get(self.documents)
    parsed = xmltodict.parse(raw_response.text)
    return Documents(parsed)
def create_new_document(account_name, account_password):
    """Build and return a Documents record for the given account credentials."""
    return Documents(account_name, account_password)
def setUp(self):
    """Create a fresh Documents fixture before each test runs."""
    self.new_documents = Documents("Facebook", "12345")
def test_save_multiple_documents(self):
    """Check that saving two documents leaves two entries in documents_list."""
    self.new_documents.save_documents()
    second_document = Documents("Twitter", "56789")
    second_document.save_documents()
    self.assertEqual(len(Documents.documents_list), 2)
"""Detect up-to-3-word phrases in the corpus and dump phrased sentences to disk."""
from gensim.models.phrases import Phraser, Phrases
from documents import Documents
from sentences import Sentences
from gensim.models import Word2Vec
import os

data_dir = "/home/marcin/Documents/AGH/PJN/data/json/word2vec/new"

# Learn bigram phrases first, then trigrams over the bigram-merged token stream.
documents = Documents(data_dir)
bigram = Phraser(Phrases(documents))
trigram = Phraser(Phrases(bigram[documents]))

# Emit each phrased sentence as one tab-separated line.
sentences = open(os.path.join(data_dir, "sentences-3.txt"), "w+")
print("saving sentences to file")
for phrased_sentence in trigram[bigram[documents]]:
    for token in phrased_sentence:
        sentences.write("{}\t".format(token))
    sentences.write("\n")
sentences.close()

sentences = Sentences(os.path.join(data_dir, "sentences-3.txt"))
# Word2Vec training over the phrased sentences was left disabled in the original:
# model = Word2Vec(sentences=sentences, window=5, min_count=3, sg=0, size=300)
# model.save(os.path.join(data_dir, "word2vec_model"))
""" 读入索引 解析 查询 进行索引 """ from time import time from query import Answer_query, INDEX from documents import Documents # DATA_DIR = '/home/luo/HomeWork/news_retrieval/Data/cnn_samples' # # q = "loop" # qa = Answer_query(q) # # start = time() # qa.search(DATA_DIR) # print("cost {}s".format(time()-start)) if __name__ == "__main__": DATA_DIR = "/home/luo/HomeWork/news_retrieval/Data/cnn_5000" start = time() print("正在建造索引...") docs = Documents(DATA_DIR, output_dir="DATA/cnn_5000/") docs.generate_index() point1 = time() print("花费时间: {}s ".format(point1 - start)) #39s float64 位存储 726M
def search(self, query_string, limit=0):
    """Run query_string against the database and wrap the match set.

    The limit value is forwarded unchanged to the backend search; the
    resulting match set is returned wrapped in a Documents instance.
    """
    matches = self._search(query_string, limit)
    return Documents(self, matches)