Example #1
 def save_bulk_data(self, data):
     for url_id in data:
         doc_id = slugify(url_id)
         self.es.index(id=doc_id,
                       body=data[url_id],
                       doc_type='document',
                       index=self.project_name + "-crawler-documents")
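Note on slugify: every example here uses slugify to turn a URL or plugin name into a string that is safe to use as a document ID or filename. Assuming this is the python-slugify package, the conversion looks roughly like this:

 from slugify import slugify  # assumed to be the python-slugify package

 # Non-alphanumeric characters collapse into hyphens, giving a stable,
 # filesystem- and ID-safe key for each URL.
 print(slugify('https://example.com/Some Page?id=42'))
 # e.g. 'https-example-com-some-page-id-42'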
Example #2
 def get_seen_urls(self):
     slugged_url = slugify(self.plugin_name)
     seen_urls = set()
     for fn in os.listdir(self.server['documents']):
         if slugged_url in fn:
             with open(os.path.join(self.server['documents'], fn)) as f:
                 seen_urls.add(json.load(f)['url'])
     return seen_urls
Example #3
 def get_seen_urls(self):
     slugged_url = slugify(self.plugin_name)
     seen_urls = set()
     for fn in os.listdir(self.server['documents']):
         if slugged_url in fn:
             with open(os.path.join(self.server['documents'], fn)) as f:
                 seen_urls.add(json.load(f)['url'])
     return seen_urls
Example #4
 def load_index(self):
     # Map each of this plugin's cache filenames to False.
     cache_data = {}
     slugged_plugin = slugify(self.plugin_name)
     for fn in os.listdir(self.server['cache']):
         if slugged_plugin in fn:
             cache_data[fn] = False
     self.dict = cache_data
Example #5
 def save_bulk_data(self, data):
     for url_id in data:
         data[url_id]['_id'] = slugify(url_id)
     # save in bulk, at most 10k documents at a time
     for chunk in chunker(data.values(), 10000):
         if not chunk:
             return
         self.dbs['documents'].bulk_docs(*[x for x in chunk if x is not None]).result()
Example #6
 def save_bulk_data(self, data):
     for url_id in data:
         data[url_id]['_id'] = slugify(url_id)
     # save in bulk, at most 10k documents at a time
     for chunk in chunker(data.values(), 10000):
         if not chunk:
             return
         self.dbs['documents'].bulk_docs(
             *[x for x in chunk if x is not None]).result()
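Examples #5 and #6 depend on a chunker helper that is not shown in any of these listings. Because both filter None values out of each chunk before calling bulk_docs, it presumably pads the last group, like the classic itertools grouper recipe. A sketch under that assumption (the original helper may differ):

 from itertools import zip_longest

 def chunker(iterable, size, fillvalue=None):
     # Hypothetical stand-in for the helper used above: group an iterable
     # into fixed-size tuples, padding the last one with fillvalue.
     args = [iter(iterable)] * size
     return zip_longest(*args, fillvalue=fillvalue)

That padding would also explain the 'if x is not None' filter applied to each chunk before it is sent to bulk_docs.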
Example #7
 def get_documents(self, maximum_number_of_documents=1000000):
     slugged_url = slugify(self.plugin_name)
     results = {}
     for num, fn in enumerate(os.listdir(self.server['documents'])):
         if num == maximum_number_of_documents:
             break
         if slugged_url in fn:
             with open(os.path.join(self.server['documents'], fn)) as f:
                 # Every match is stored under the same key, so only the
                 # last matching document is kept in the dict.
                 results[slugged_url] = json.load(f)
     return results
Example #8
 def get_documents(self, maximum_number_of_documents=1000000):
     slugged_url = slugify(self.plugin_name)
     results = []
     for num, fn in enumerate(os.listdir(self.server['documents'])):
         if num == maximum_number_of_documents:
             break
         if slugged_url in fn:
             with open(os.path.join(self.server['documents'], fn)) as f:
                 results.append(json.load(f))
     return results
Example #9
 def init_cache_storage(self):
     root = self.storage_object['path']

     self.prefix = slugify(self.plugin_name)

     self.server = {
         'cache': os.path.join(root, self.project_name + '-crawler-cache',
                               self.prefix)
     }

     if self.flush_cache:
         self.delete_cache()

     for paths in self.server.values():
         os.makedirs(paths, exist_ok=True)
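Example #9 creates one cache directory per plugin under a project-wide root. To make the resulting layout concrete, here is a small sketch with assumed values (the real root path, project name, and plugin name come from the crawler's configuration):

 import os
 from slugify import slugify

 root = '/tmp/crawler-storage'      # assumed storage_object['path']
 project_name = 'myproject'         # assumed
 plugin_name = 'My News Plugin'     # assumed

 cache_dir = os.path.join(root, project_name + '-crawler-cache',
                          slugify(plugin_name))
 os.makedirs(cache_dir, exist_ok=True)
 print(cache_dir)  # /tmp/crawler-storage/myproject-crawler-cache/my-news-plugin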
Example #10
 def save_data(self, data):
     self.server['documents'][slugify(data['url'])] = data
     transaction.commit()
Example #11
 def save_bulk_data(self, data):
     for url_id in data:
         self.server['documents'][slugify(url_id)] = data[url_id]
     transaction.commit()
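Examples #10 and #11 assign into self.server['documents'] and then call transaction.commit(), which looks like a ZODB-style transactional mapping. A minimal sketch of that pattern, assuming ZODB with an OOBTree stored under the root object (the actual backend may be wired differently):

 import transaction
 from BTrees.OOBTree import OOBTree
 from ZODB import DB
 from ZODB.FileStorage import FileStorage
 from slugify import slugify

 db = DB(FileStorage('documents.fs'))   # hypothetical file-backed store
 conn = db.open()
 root = conn.root()
 if 'documents' not in root:
     root['documents'] = OOBTree()

 doc = {'url': 'https://example.com/page', 'title': 'Example'}
 root['documents'][slugify(doc['url'])] = doc
 transaction.commit()                   # persist, as in the examples above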
Example #12
 def save_data(self, data):
     with open(os.path.join(self.server['documents'], slugify(data['url'])),
               'w') as f:
         json.dump(data, f)
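Example #12 and the later file-based variants store each document as one JSON file named after the slugified URL, which is exactly what the get_seen_urls and get_documents readers above scan for. A self-contained round-trip sketch with an assumed documents directory:

 import json
 import os
 import tempfile
 from slugify import slugify

 documents_dir = tempfile.mkdtemp()     # stand-in for self.server['documents']

 # Write one JSON file per document, keyed by the slugified URL.
 doc = {'url': 'https://example.com/article', 'title': 'Example'}
 with open(os.path.join(documents_dir, slugify(doc['url'])), 'w') as f:
     json.dump(doc, f)

 # Read the URLs back the way get_seen_urls does (minus the plugin filter).
 seen = set()
 for fn in os.listdir(documents_dir):
     with open(os.path.join(documents_dir, fn)) as f:
         seen.add(json.load(f)['url'])
 print(seen)  # {'https://example.com/article'}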
Example #13
 def save_bulk_data(self, data):
     for url_id in data:
         doc_id = slugify(url_id)
         self.es.index(id=doc_id, body=data[url_id], doc_type='document',
                       index=self.project_name + "-crawler-documents")
Example #14
 def save_bulk_data(self, data):
     for url_id in data:
         data[url_id]['_id'] = slugify(url_id)
     return self.dbs['documents'].bulk_docs(*list(data.values())).result()
Example #15
 def save_data(self, data):
     with open(os.path.join(self.server['documents'], slugify(data['url'])), 'w') as f:
         json.dump(data, f)
Example #16
 def save_bulk_data(self, data):
     for url_id in data:
         self.server['documents'][slugify(url_id)] = data[url_id]
     transaction.commit()
Example #17
 def delete_doc_url(self, url=None):
     return self.delete_doc_id(slugify(url))
Example #18
 def save_bulk_data(self, data):
     for row in data:
         with open(
                 os.path.join(self.server['documents'],
                              slugify(data[row]['url'])), 'w') as f:
             json.dump(data[row], f)
Example #19
 def save_data(self, data):
     self.server['documents'][slugify(data['url'])] = data
     transaction.commit()
Example #20
 def save_bulk_data(self, data):
     for row in data:
         with open(os.path.join(self.server['documents'], slugify(data[row]['url'])), 'w') as f:
             json.dump(data[row], f)
Example #21
 def save_data(self, data):
     self.es.index(index=self.project_name + "-crawler-documents",
                   doc_type='document',
                   id=slugify(data['url']),
                   body=data)
Example #22
 def save_data(self, data):
     try:
         self.dbs['documents'][slugify(data['url'])] = data
     except requests.exceptions.HTTPError:
         print('conflict error', slugify(data['url']))
Example #23
 def save_data(self, data):
     self.es.index(index=self.project_name + "-crawler-documents", doc_type='document',
                   id=slugify(data['url']), body=data)
Example #24
 def save_data(self, data):
     try:
         self.dbs['documents'][slugify(data['url'])] = data
     except requests.exceptions.HTTPError:
         print('conflict error', slugify(data['url']))
Example #25
 def delete_doc_url(self, url=None):
     return self.delete_doc_id(slugify(url))