def __save_bucket(self, to_db=True):
    """Persist the accumulated ``self.crawlset_bucket``.

    Args:
        to_db: when truthy (default), insert the bucket into the
            ``crawlset`` collection of the ``crawldb`` database;
            otherwise bulk-write it to ``FileStorage`` under 'content'.
    """
    # PEP 8: test truthiness directly instead of `to_db == True`.
    if to_db:
        storage = Storage()
        storage.connect('crawldb')
        storage.set_collection('crawlset')
        # Log how many documents are about to be inserted.
        Helper.log("Insert count", len(self.crawlset_bucket))
        storage.insert_documents(self.crawlset_bucket)
    else:
        FileStorage.bulk_write(self.crawlset_bucket, 'content')
def retrieve_crawlsets(to_crawlset = False):
    """Fetch every document from the ``crawlset`` collection of ``crawldb``.

    Args:
        to_crawlset: when not the literal ``False``, convert the raw
            documents with ``Crawlset.list_from_documents`` before
            returning them.

    Returns:
        A list of Crawlset objects, or the raw storage documents.
    """
    # NOTE(review): an identical `retrieve_crawlsets` definition appears
    # immediately after this one and shadows it — this copy is dead code.
    # Confirm and remove one of the duplicates.
    storage = Storage()
    storage.connect('crawldb')
    storage.set_collection('crawlset')
    documents = storage.retrieve_all_documents()
    # Identity test against False: any other value (including None or 0)
    # takes the conversion branch — presumably callers pass a bool.
    if to_crawlset is not False:
        crawlsets = Crawlset.list_from_documents(documents)
        return crawlsets
    return documents
def retrieve_crawlsets(to_crawlset=False):
    """Fetch every document from the ``crawlset`` collection of ``crawldb``.

    Args:
        to_crawlset: when truthy, convert the raw documents with
            ``Crawlset.list_from_documents`` before returning them.

    Returns:
        A list of Crawlset objects, or the raw storage documents.
    """
    # NOTE(review): this function is defined twice in this file with an
    # identical body — confirm and remove the duplicate.
    storage = Storage()
    storage.connect('crawldb')
    storage.set_collection('crawlset')
    documents = storage.retrieve_all_documents()
    # Truthiness test replaces the non-idiomatic `is not False` identity
    # check; the flag is a boolean, so the two are equivalent in practice.
    if to_crawlset:
        return Crawlset.list_from_documents(documents)
    return documents
def get_html(id):
    """Return the stored HTML content of the crawlset document *id*,
    with line breaks stripped.

    The parameter name ``id`` shadows the builtin but is kept for
    backward compatibility with existing callers.

    Args:
        id: document identifier passed to ``storage.find_one_by_id``.

    Returns:
        The document's "content" field as a single-line string.
    """
    storage = Storage()
    storage.connect("crawldb")
    storage.set_collection("crawlset")
    document = storage.find_one_by_id(id)
    content = document["content"]
    if isinstance(content, bytes):
        # Proper encoding fix (was a TODO): decode the raw bytes instead
        # of str()-ing them into a "b'...'" repr. UTF-8 is assumed —
        # TODO confirm against the crawler's fetch encoding; undecodable
        # bytes are replaced rather than raising.
        html = content.decode("utf-8", errors="replace")
        html = html.replace("\r\n", "").replace("\n", "")
    else:
        # Legacy path: content was stored as str(bytes), i.e. "b'...'".
        # The original cleanup stripped the leading "b'" but forgot the
        # trailing quote — remove both here.
        html = str(content)
        if html.startswith("b'"):
            html = html[2:]
            if html.endswith("'"):
                html = html[:-1]
        # In a repr string the line breaks are literal backslash escapes.
        html = html.replace(r"\r\n", "").replace(r"\n", "")
    return html