def finish_volume(self):
    """Close out the volume currently being packed.

    Does nothing when no volume is open (``self.volume_id`` is ``None``).
    Otherwise the volume's ``directory`` row is stamped as written, the
    ``locality`` rows of its pages are deleted when ``self.inst_id`` is
    set, and the loose body and header files of every page recorded in
    ``content`` for the volume are passed to ``self.cond_remove``.
    Finally the packed-page count is reported (when positive) and both
    the counter and the volume id are reset.
    """
    volume_id = self.volume_id
    if volume_id is None:
        return

    params = (volume_id, )
    self.cur.execute(
        """update directory set written=localtimestamp where id=%s""",
        params)

    if self.inst_id:
        self.cur.execute(
            """delete from locality where url_id in ( select url_id from content where volume_id=%s )""",
            params)

    self.cur.execute(
        """select url_id from content where volume_id=%s order by url_id""",
        params)
    for record in self.cur.fetchall():
        page_id = record[0]
        # body file first, then the header file
        self.cond_remove(get_loose_path(page_id))
        self.cond_remove(get_loose_path(page_id, True))

    packed = self.member_count
    if packed > 0:
        print("packed %d pages" % (packed, ))

    self.member_count = 0
    self.volume_id = None
def open_repre(self, url_id, alt_repre):
    """Open a page's loose file for the requested alternative representation.

    Returns a handle opened for binary reading, or ``None`` when the
    file does not exist. The caller is responsible for closing it.
    """
    path = get_loose_path(url_id, alt_repre=alt_repre)
    if not os.path.exists(path):
        return None

    return open(path, "rb")
def write_file(self, url_id, hdr, reader):
    """Copy everything readable from ``reader`` into a loose file.

    The destination is ``get_loose_path(url_id, hdr)``, opened for
    binary writing (an existing file is truncated). The destination is
    closed even when the copy fails.
    """
    with open(get_loose_path(url_id, hdr), "wb") as sink:
        shutil.copyfileobj(reader, sink)
def write(self, data):
    """Append ``data`` to the body file, creating it on first use.

    The body file is opened lazily, and only when ``self.retrieve_body``
    is set. When the body is not wanted, nothing is written and ``-1``
    is returned (presumably an abort signal for the caller driving the
    transfer -- confirm against the caller). Otherwise the result of the
    underlying ``write`` call is returned.
    """
    target = self.body_target
    if target is None:
        if not self.retrieve_body:
            return -1

        target = open(get_loose_path(self.url_id), 'wb')
        self.body_target = target

    return target.write(data)
def delete_storage(self, url_id):
    """Remove the loose header and body files of ``url_id`` if present.

    Deletion only happens when ``self.check_locality`` is set; without
    locality checking the call is a no-op.
    """
    if not self.check_locality:
        return

    for is_header in (True, False):
        path = get_loose_path(url_id, is_header)
        if not os.path.exists(path):
            continue

        os.remove(path)
def run(self) -> None:
    """Download every queued URL through the browser and store its source.

    Returns immediately when ``download_queue`` is empty. Otherwise work
    items are popped one at a time until ``pop_work_item`` returns a
    falsy value. For each item the page is loaded in ``self.br``, its
    source is written to the loose file of the *requested* URL id, and
    ``finish_page`` is called with the requested id, the effective id
    (different after a redirect) and a success flag.
    """
    self.cur.execute("""select count(*) from download_queue""")
    row = self.cur.fetchone()
    # despite the name, this holds the number of rows in download_queue
    num_conn = row[0]
    if not num_conn:
        return
    self.lazy_init()
    batch_processed = 0
    row = self.pop_work_item()
    while row:
        url_id = row[0]
        url = self.get_url(url_id)
        self.br.get(url)
        error_code = None
        try:
            # the page counts as loaded once it contains at least one <a>
            # element; 10 is the wait timeout (seconds, assuming this is
            # Selenium's WebDriverWait -- confirm against the imports)
            WebDriverWait(self.br, 10).until( EC.presence_of_element_located((By.TAG_NAME, 'a')))
        except exceptions.TimeoutException:
            # a timeout is recorded as error code 500; processing continues
            error_code = 500
        eff_id = url_id
        eff_url = self.br.current_url
        msg = "got " + eff_url
        if error_code:
            msg += " with %d" % error_code
        print(msg, file=sys.stderr)
        if url != eff_url:
            # the browser ended up on a different URL; `known` is not used
            # afterwards
            eff_id, known = self.add_redirect(url_id, eff_url)
        body = self.br.page_source
        # NOTE(review): text mode without an explicit encoding uses the
        # platform default, while other code in this file opens loose
        # files as binary -- confirm the intended on-disk encoding
        with open(get_loose_path(url_id), 'w') as f:
            f.write(body)
        if error_code:
            # upsert the failure, then replace the browser instance
            self.cur.execute( """insert into download_error(url_id, error_code, failed) values(%s, %s, localtimestamp) on conflict(url_id) do update set error_code=%s, failed=localtimestamp""", (url_id, error_code, error_code))
            self.br.close()
            self.br = None
            self.lazy_init()
        self.finish_page(url_id, eff_id, not error_code)
        batch_processed += 1
        if batch_processed >= self.notification_threshold:
            # notify once per batch of notification_threshold pages
            self.cond_notify()
            batch_processed = 0
        row = self.pop_work_item()
def close(self):
    """Close the open targets and hand the finished page to the owner.

    The header target is always closed; the body target only when one
    was opened. When the effective id differs from the requested one,
    the header file is renamed to the effective id and a body file
    stored under the requested id is either renamed likewise (body
    wanted) or removed (body not wanted). ``owner.finish_page`` is
    called last with the requested id, the effective id and the
    ``retrieve_body`` flag.
    """
    self.header_target.close()
    self.header_target = None

    body = self.body_target
    if body:
        body.close()
        self.body_target = None

    requested_id, effective_id = self.url_id, self.eff_id
    if requested_id != effective_id:
        os.rename(get_loose_path(requested_id, True),
                  get_loose_path(effective_id, True))

        stale_body = get_loose_path(requested_id)
        if os.path.exists(stale_body):
            if self.retrieve_body:
                os.rename(stale_body, get_loose_path(effective_id))
            else:
                os.remove(stale_body)

    self.owner.finish_page(requested_id, effective_id, self.retrieve_body)
def open_page_ex(self, url_id, volume_id):
    """Open a page, honouring the configured alternative representation.

    With ``self.alt_repre`` set, the loose file of that representation
    is opened for binary reading (``None`` when it does not exist).
    Without it the call is delegated to ``self.open_page``.
    """
    alt = self.alt_repre
    if alt:
        path = get_loose_path(url_id, alt_repre=alt)
        return open(path, "rb") if os.path.exists(path) else None

    return self.open_page(url_id, volume_id)
def __init__(self, owner, url, url_id):
    """Set up the download state for one URL and open its header file.

    ``owner`` is the object whose ``finish_page`` is called elsewhere in
    this class once the download is closed. The header file of
    ``url_id`` is opened for binary writing right away; the body target
    starts out unset.
    """
    self.owner = owner
    self.url = url
    # until changed elsewhere, the effective id equals the requested one
    self.url_id = self.eff_id = url_id
    self.header_target = open(get_loose_path(url_id, True), 'wb')
    self.body_target = None
    self.retrieve_body = True
    # response details start out unknown
    self.http_code = self.http_phrase = self.retry_after = None
def get_body_size_ex(self, url_id, volume_id):
    """Return a page's size, honouring the alternative representation.

    With ``self.alt_repre`` set, the size in bytes of that
    representation's loose file is returned (``None`` when the file does
    not exist). Without it the call is delegated to
    ``self.get_body_size``.
    """
    alt = self.alt_repre
    if not alt:
        return self.get_body_size(url_id, volume_id)

    path = get_loose_path(url_id, alt_repre=alt)
    if os.path.exists(path):
        return os.stat(path).st_size

    return None
def open_headers(self, url_id, volume_id=None):
    """Open the stored headers of a page for reading.

    Without ``volume_id`` the loose header file is used. With it, the
    member named ``<url_id>h`` is read from the archive ``self.zp``,
    after switching to the requested volume when a different one is
    current. Returns ``None`` when the file or member does not exist.
    """
    if volume_id is None:
        path = get_loose_path(url_id, True)
        return open(path, "rb") if os.path.exists(path) else None

    if volume_id != self.volume_id:
        self.change_volume(volume_id)

    try:
        return self.zp.open(self.zp.getinfo(str(url_id) + 'h'))
    except KeyError:
        # no such member in the archive
        return None
def get_body_size(self, url_id, volume_id=None):
    """Return the size in bytes of a page's stored body.

    Without ``volume_id`` the loose body file is measured. With it, the
    uncompressed size of the member named ``<url_id>`` in the archive
    ``self.zp`` is returned, after switching to the requested volume
    when a different one is current. Returns ``None`` when the file or
    member does not exist.
    """
    if volume_id is None:
        path = get_loose_path(url_id)
        return os.stat(path).st_size if os.path.exists(path) else None

    if volume_id != self.volume_id:
        self.change_volume(volume_id)

    try:
        return self.zp.getinfo(str(url_id)).file_size
    except KeyError:
        # no such member in the archive
        return None
def purge_fast(self, url_id):
    """Remove every trace of ``url_id`` from storage and the database.

    A page that is not in a volume has its loose files removed right
    away. For a page inside a volume, the volume is only recorded in
    ``self.shrunk`` and the page in ``self.doomed``. In both cases the
    page is purged from the from/to sets and its rows are deleted from
    all related tables.
    """
    volume_id = self.get_volume_id(url_id)
    if volume_id is not None:
        self.shrunk.add(volume_id)
        self.doomed.add(url_id)
    else:
        body_path = get_loose_path(url_id)
        self.ensure_removed(body_path)
        # NOTE(review): the header path is derived by appending 'h' here,
        # while other code calls get_loose_path(url_id, True) -- confirm
        # the two always agree
        self.ensure_removed(body_path + 'h')

    self.purge_from_set(url_id)
    self.purge_to_set(url_id)

    single = (url_id,)
    double = (url_id, url_id)
    statements = (
        ("""delete from edges where from_id=%s or to_id=%s""", double),
        ("""delete from nodes where url_id=%s""", single),
        ("""delete from extra where url_id=%s""", single),
        ("""delete from redirect where from_id=%s or to_id=%s""", double),
        ("""delete from parse_queue where url_id=%s""", single),
        ("""delete from content where url_id=%s""", single),
        ("""delete from download_error where url_id=%s""", single),
        ("""delete from download_queue where url_id=%s""", single),
        ("""delete from field where id=%s""", single),
    )
    for sql, params in statements:
        self.cur.execute(sql, params)
def make_target(self):
    """Open the loose file ``get_loose_path(self.url_id, True)`` for writing.

    The file is opened in binary write mode, truncating any existing
    content; the caller owns the returned handle.
    """
    target_path = get_loose_path(self.url_id, True)
    return open(target_path, 'wb')
def add_member_half(self, url_id, hdr):
    """Add one loose file of a page to the archive ``self.zip_front``.

    ``hdr`` selects which loose file of the page is meant (it is passed
    straight to ``get_loose_path``). The archive member is named after
    the file's base name. A missing file is silently skipped.
    """
    member_path = get_loose_path(url_id, hdr)
    if not os.path.exists(member_path):
        return

    self.zip_front.write(member_path, os.path.basename(member_path))