Beispiel #1
0
 def __save_bucket(self, to_db=True):
     """Persist the documents collected in ``self.crawlset_bucket``.

     Args:
         to_db: When truthy (the default), connect a ``Storage`` with
             ``'crawldb'``, select the ``'crawlset'`` collection, log the
             number of documents and insert the whole bucket. When falsy,
             pass the bucket to ``FileStorage.bulk_write`` together with
             ``'content'`` instead.
     """
     # Plain truthiness instead of ``== True`` (PEP 8): with the equality
     # test a truthy flag such as "yes" silently fell through to the file
     # branch. Real booleans behave exactly as before.
     if to_db:
         storage = Storage()
         storage.connect('crawldb')
         storage.set_collection('crawlset')
         Helper.log("Insert count", len(self.crawlset_bucket))
         storage.insert_documents(self.crawlset_bucket)
     else:
         FileStorage.bulk_write(self.crawlset_bucket, 'content')
Beispiel #2
0
def retrieve_crawlsets(to_crawlset=False):
    """Fetch every document stored in the ``crawlset`` collection.

    Args:
        to_crawlset: When truthy, pass the retrieved documents through
            ``Crawlset.list_from_documents`` before returning them.

    Returns:
        The result of ``Crawlset.list_from_documents(documents)`` when
        ``to_crawlset`` is truthy; otherwise the documents exactly as
        ``Storage.retrieve_all_documents`` returned them.
    """
    storage = Storage()
    storage.connect('crawldb')
    storage.set_collection('crawlset')
    documents = storage.retrieve_all_documents()
    # Truthiness rather than ``is not False``: the identity test treated
    # falsy values such as None or 0 as a request for conversion. Callers
    # passing real booleans see no difference.
    if to_crawlset:
        return Crawlset.list_from_documents(documents)
    return documents
Beispiel #3
0
def retrieve_crawlsets(to_crawlset=False):
    """Fetch every document stored in the ``crawlset`` collection.

    Args:
        to_crawlset: When truthy, pass the retrieved documents through
            ``Crawlset.list_from_documents`` before returning them.

    Returns:
        The result of ``Crawlset.list_from_documents(documents)`` when
        ``to_crawlset`` is truthy; otherwise the documents exactly as
        ``Storage.retrieve_all_documents`` returned them.
    """
    storage = Storage()
    storage.connect('crawldb')
    storage.set_collection('crawlset')
    documents = storage.retrieve_all_documents()
    # Truthiness rather than ``is not False``: the identity test treated
    # falsy values such as None or 0 as a request for conversion. Callers
    # passing real booleans see no difference.
    if to_crawlset:
        return Crawlset.list_from_documents(documents)
    return documents
Beispiel #4
0
def get_html(id):
    """Return the stored page content of one crawl document as text.

    Looks the document up by ``id`` in the ``crawlset`` collection and
    returns its ``"content"`` field as a ``str`` with line breaks removed.

    Args:
        id: Identifier handed to ``Storage.find_one_by_id``. The name
            shadows the ``id`` builtin but is kept so keyword callers
            (``get_html(id=...)``) continue to work.

    Returns:
        The page HTML as a string without ``\\r\\n`` / ``\\n`` line breaks.
    """
    storage = Storage()
    storage.connect("crawldb")
    storage.set_collection("crawlset")
    document = storage.find_one_by_id(id)
    content = document["content"]

    if isinstance(content, (bytes, bytearray)):
        # Decode the raw bytes instead of str()-ing them. str(bytes) yields
        # the Python repr ("b'...'"), which left a trailing quote and
        # \x.. / \' escape sequences in the result, and the old "b'" strip
        # could even remove text from the middle of the page when the repr
        # was double-quoted.
        # NOTE(review): UTF-8 is an assumption; the page's real charset is
        # not recorded here. errors="replace" keeps this from raising on
        # other encodings -- TODO detect the charset properly.
        html = bytes(content).decode("utf-8", errors="replace")
        return html.replace("\r\n", "").replace("\n", "")

    # Non-bytes content: keep the historical handling unchanged, in case
    # stored values already hold the textual repr of a bytes object.
    html = str(content)
    html = html.replace(r"b'", "", 1)
    html = html.replace(r"\r\n", "")
    html = html.replace(r"\n", "")
    return html