def save_bulk_data(self, data):
    for url_id in data:
        doc_id = slugify(url_id)
        self.es.index(id=doc_id, body=data[url_id], doc_type='document',
                      index=self.project_name + "-crawler-documents")
def get_seen_urls(self):
    slugged_url = slugify(self.plugin_name)
    seen_urls = set()
    for fn in os.listdir(self.server['documents']):
        if slugged_url in fn:
            with open(os.path.join(self.server['documents'], fn)) as f:
                seen_urls.add(json.load(f)['url'])
    return seen_urls
def load_index(self):
    # mark every cached file belonging to this plugin as not-yet-loaded
    cache_data = {}
    slugged_plugin = slugify(self.plugin_name)
    for fn in os.listdir(self.server['cache']):
        if slugged_plugin in fn:
            cache_data[fn] = False
    self.dict = cache_data
def save_bulk_data(self, data):
    for url_id in data:
        data[url_id]['_id'] = slugify(url_id)
    # save in bulk, at most 10k documents per request
    for chunk in chunker(data.values(), 10000):
        if not chunk:
            return
        self.dbs['documents'].bulk_docs(*[x for x in chunk if x is not None]).result()
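# The chunked bulk save above relies on a `chunker` helper that is not defined
# in these snippets. A minimal sketch of what it could look like, assuming it
# simply yields fixed-size lists from any iterable (name and behavior are an
# assumption, not part of the original code):

from itertools import islice

def chunker(iterable, chunk_size):
    # Hypothetical helper: yield successive lists of at most chunk_size items.
    # If the original project pads short chunks with None (the classic
    # zip_longest "grouper" recipe), that would explain the "x is not None"
    # filter in save_bulk_data; with this variant the filter is harmless.
    it = iter(iterable)
    while True:
        chunk = list(islice(it, chunk_size))
        if not chunk:
            return
        yield chunk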
def get_documents(self, maximum_number_of_documents=1000000):
    slugged_url = slugify(self.plugin_name)
    results = {}
    for num, fn in enumerate(os.listdir(self.server['documents'])):
        if num == maximum_number_of_documents:
            break
        if slugged_url in fn:
            with open(os.path.join(self.server['documents'], fn)) as f:
                # key by filename so each matching document is kept
                results[fn] = json.load(f)
    return results
def get_documents(self, maximum_number_of_documents=1000000):
    slugged_url = slugify(self.plugin_name)
    results = []
    for num, fn in enumerate(os.listdir(self.server['documents'])):
        if num == maximum_number_of_documents:
            break
        if slugged_url in fn:
            with open(os.path.join(self.server['documents'], fn)) as f:
                results.append(json.load(f))
    return results
def init_cache_storage(self):
    root = self.storage_object['path']
    self.prefix = slugify(self.plugin_name)
    self.server = {'cache': os.path.join(root, self.project_name + '-crawler-cache', self.prefix)}
    if self.flush_cache:
        self.delete_cache()
    for path in self.server.values():
        os.makedirs(path, exist_ok=True)
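# `init_cache_storage` calls a `delete_cache` method that is not shown in these
# snippets. A minimal sketch, assuming flushing the cache just means removing
# the per-plugin cache directory (this implementation is an assumption):

import shutil

def delete_cache(self):
    # Hypothetical sketch: remove the plugin's cache directory if it exists.
    # Assumes self.server['cache'] has already been set by init_cache_storage.
    shutil.rmtree(self.server['cache'], ignore_errors=True)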
def save_data(self, data):
    self.server['documents'][slugify(data['url'])] = data
    transaction.commit()
def save_bulk_data(self, data):
    for url_id in data:
        self.server['documents'][slugify(url_id)] = data[url_id]
    transaction.commit()
def save_data(self, data):
    with open(os.path.join(self.server['documents'], slugify(data['url'])), 'w') as f:
        json.dump(data, f)
def save_bulk_data(self, data):
    for url_id in data:
        data[url_id]['_id'] = slugify(url_id)
    return self.dbs['documents'].bulk_docs(*list(data.values())).result()
def delete_doc_url(self, url=None):
    return self.delete_doc_id(slugify(url))
def save_bulk_data(self, data):
    for row in data:
        with open(os.path.join(self.server['documents'], slugify(data[row]['url'])), 'w') as f:
            json.dump(data[row], f)
def save_data(self, data):
    self.es.index(index=self.project_name + "-crawler-documents", doc_type='document',
                  id=slugify(data['url']), body=data)
def save_data(self, data):
    try:
        self.dbs['documents'][slugify(data['url'])] = data
    except requests.exceptions.HTTPError:
        print('conflict error', slugify(data['url']))
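# All of the save_bulk_data variants expect the same input shape: a dict keyed
# by URL (or URL id), whose values are document dicts that carry at least a
# 'url' field. A hedged usage sketch; `storage` and the fields beyond 'url'
# ('title', 'text') are assumptions for illustration only.

documents = {
    'https://example.com/page-1': {'url': 'https://example.com/page-1',
                                   'title': 'Page 1', 'text': '...'},
    'https://example.com/page-2': {'url': 'https://example.com/page-2',
                                   'title': 'Page 2', 'text': '...'},
}
storage.save_bulk_data(documents)                             # persist all documents at once
storage.save_data(documents['https://example.com/page-1'])    # or save a single document
seen = storage.get_seen_urls()                                 # URLs already stored (file backend)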