Beispiel #1
0
    def finish_volume(self):
        """Mark the current volume as written and drop its loose files.

        Does nothing when no volume is active (``self.volume_id`` is None).
        Otherwise it stamps the volume's ``directory`` row, removes the
        ``locality`` rows of the volume's pages when ``self.inst_id`` is
        set, conditionally removes both loose files of every page in the
        volume, reports the packed page count and clears ``self.volume_id``.
        """
        volume_id = self.volume_id
        if volume_id is None:
            return

        params = (volume_id, )

        self.cur.execute(
            "update directory\n"
            "set written=localtimestamp\n"
            "where id=%s", params)

        if self.inst_id:
            self.cur.execute(
                "delete from locality\n"
                "where url_id in (\n"
                "        select url_id\n"
                "        from content\n"
                "        where volume_id=%s\n"
                ")", params)

        self.cur.execute(
            "select url_id\n"
            "from content\n"
            "where volume_id=%s\n"
            "order by url_id", params)
        for record in self.cur.fetchall():
            page_id = record[0]
            # each page has two loose files; remove whichever still exist
            self.cond_remove(get_loose_path(page_id))
            self.cond_remove(get_loose_path(page_id, True))

        if self.member_count > 0:
            print("packed %d pages" % (self.member_count, ))
            self.member_count = 0

        self.volume_id = None
Beispiel #2
0
    def open_repre(self, url_id, alt_repre):
        """Open the loose file of an alternative representation of a page.

        Returns a binary read handle for the path that ``get_loose_path``
        yields for ``url_id`` and ``alt_repre``, or None when no such file
        exists. The caller is responsible for closing the handle.
        """
        repre_path = get_loose_path(url_id, alt_repre=alt_repre)
        if not os.path.exists(repre_path):
            return None

        return open(repre_path, "rb")
Beispiel #3
0
 def write_file(self, url_id, hdr, reader):
     """Copy everything readable from *reader* into a loose file.

     The destination is the path ``get_loose_path`` yields for ``url_id``
     and ``hdr``; it is opened for binary writing and always closed, even
     when the copy fails.
     """
     with open(get_loose_path(url_id, hdr), "wb") as sink:
         shutil.copyfileobj(reader, sink)
Beispiel #4
0
    def write(self, data):
        """Write a chunk of body data, opening the body file on first use.

        Returns whatever the body file's ``write`` returns, or -1 when no
        body file is open and ``self.retrieve_body`` is false.
        """
        if self.body_target is None:
            if not self.retrieve_body:
                return -1

            # first chunk: create the loose body file lazily
            self.body_target = open(get_loose_path(self.url_id), 'wb')

        return self.body_target.write(data)
Beispiel #5
0
    def delete_storage(self, url_id):
        """Delete both loose files of a page, if they exist.

        Nothing is deleted unless ``self.check_locality`` is set: for
        deletion, checking locality is required.
        """
        if not self.check_locality:
            return

        for is_header in (True, False):
            target = get_loose_path(url_id, is_header)
            if not os.path.exists(target):
                continue

            os.remove(target)
Beispiel #6
0
    def run(self):
        self.cur.execute("""select count(*)
from download_queue""")
        row = self.cur.fetchone()
        num_conn = row[0]
        if not num_conn:
            return

        self.lazy_init()

        batch_processed = 0
        row = self.pop_work_item()
        while row:
            url_id = row[0]
            url = self.get_url(url_id)
            self.br.get(url)
            error_code = None
            try:
                WebDriverWait(self.br, 10).until(
                    EC.presence_of_element_located((By.TAG_NAME, 'a')))
            except exceptions.TimeoutException:
                error_code = 500

            eff_id = url_id
            eff_url = self.br.current_url
            msg = "got " + eff_url
            if error_code:
                msg += " with %d" % error_code

            print(msg, file=sys.stderr)

            if url != eff_url:
                eff_id, known = self.add_redirect(url_id, eff_url)

            body = self.br.page_source
            with open(get_loose_path(url_id), 'w') as f:
                f.write(body)

            if error_code:
                self.cur.execute(
                    """insert into download_error(url_id, error_code, failed)
values(%s, %s, localtimestamp)
on conflict(url_id) do update
set error_code=%s, failed=localtimestamp""", (url_id, error_code, error_code))

                self.br.close()
                self.br = None
                self.lazy_init()

            self.finish_page(url_id, eff_id, not error_code)

            batch_processed += 1
            if batch_processed >= self.notification_threshold:
                self.cond_notify()
                batch_processed = 0

            row = self.pop_work_item()
Beispiel #7
0
    def close(self):
        """Close the target files and hand the page over to the owner.

        The header file is always closed; the body file only when one was
        opened. When the effective id differs from the original id, the
        header file is renamed to the effective id and an existing body
        file is either renamed as well or, when ``self.retrieve_body`` is
        false, removed. Finally ``self.owner.finish_page`` is notified.
        """
        self.header_target.close()
        self.header_target = None

        if self.body_target:
            self.body_target.close()
            self.body_target = None

        redirected = self.url_id != self.eff_id
        if redirected:
            os.rename(get_loose_path(self.url_id, True), get_loose_path(self.eff_id, True))

            stale_body = get_loose_path(self.url_id)
            if os.path.exists(stale_body):
                if not self.retrieve_body:
                    os.remove(stale_body)
                else:
                    os.rename(stale_body, get_loose_path(self.eff_id))

        self.owner.finish_page(self.url_id, self.eff_id, self.retrieve_body)
Beispiel #8
0
    def open_page_ex(self, url_id, volume_id):
        """Open a page, honouring the configured alternative representation.

        Without ``self.alt_repre`` this simply delegates to ``open_page``.
        With it, the loose file of that representation is opened for binary
        reading; None is returned when the file does not exist.
        """
        if self.alt_repre:
            repre_path = get_loose_path(url_id, alt_repre=self.alt_repre)
            return open(repre_path, "rb") if os.path.exists(repre_path) else None

        return self.open_page(url_id, volume_id)
Beispiel #9
0
 def __init__(self, owner, url, url_id):
     """Set up the state for downloading one URL.

     Opens the file at ``get_loose_path(url_id, True)`` for binary writing
     right away; the body file is left unset here. The effective id
     starts out equal to ``url_id``.
     """
     self.owner, self.url = owner, url
     self.url_id = self.eff_id = url_id
     self.header_target = open(get_loose_path(url_id, True), 'wb')
     self.body_target = None
     self.retrieve_body = True
     # response details, unknown until set elsewhere
     self.http_code = self.http_phrase = self.retry_after = None
Beispiel #10
0
    def get_body_size_ex(self, url_id, volume_id):
        """Return a page's size, honouring the alternative representation.

        Without ``self.alt_repre`` this delegates to ``get_body_size``.
        With it, the size in bytes of that representation's loose file is
        returned, or None when the file does not exist.
        """
        if self.alt_repre:
            repre_path = get_loose_path(url_id, alt_repre=self.alt_repre)
            if not os.path.exists(repre_path):
                return None

            return os.stat(repre_path).st_size

        return self.get_body_size(url_id, volume_id)
Beispiel #11
0
    def open_headers(self, url_id, volume_id=None):
        """Open the stored headers of a page for reading.

        With ``volume_id`` None the loose header file is opened in binary
        mode. Otherwise the requested volume is made current (via
        ``change_volume`` when it differs from ``self.volume_id``) and the
        archive member named ``<url_id>h`` is opened from ``self.zp``.
        Returns None when the file or member does not exist.
        """
        if volume_id is None:
            header_path = get_loose_path(url_id, True)
            return open(header_path, "rb") if os.path.exists(header_path) else None

        if volume_id != self.volume_id:
            self.change_volume(volume_id)

        member_name = str(url_id) + 'h'
        try:
            return self.zp.open(self.zp.getinfo(member_name))
        except KeyError:
            # the archive has no such member
            return None
Beispiel #12
0
    def get_body_size(self, url_id, volume_id=None):
        """Return the size in bytes of a page's stored body.

        With ``volume_id`` None the size of the loose file is reported.
        Otherwise the requested volume is made current (via
        ``change_volume`` when it differs from ``self.volume_id``) and the
        uncompressed size of the archive member named ``<url_id>`` is
        taken from ``self.zp``. Returns None when the file or member does
        not exist.
        """
        if volume_id is None:
            body_path = get_loose_path(url_id)
            if not os.path.exists(body_path):
                return None

            return os.stat(body_path).st_size

        if volume_id != self.volume_id:
            self.change_volume(volume_id)

        try:
            return self.zp.getinfo(str(url_id)).file_size
        except KeyError:
            # the archive has no such member
            return None
Beispiel #13
0
    def purge_fast(self, url_id):
        """Remove every trace of a page from storage and the database.

        A page without a volume has its loose file and the sibling path
        with an ``h`` suffix removed directly. A page inside a volume is
        only recorded in ``self.doomed``, and its volume in
        ``self.shrunk``. Afterwards the page is purged from the from/to
        sets and its rows are deleted from all tables that reference it.
        """
        volume_id = self.get_volume_id(url_id)
        if volume_id is not None:
            self.shrunk.add(volume_id)
            self.doomed.add(url_id)
        else:
            body_path = get_loose_path(url_id)
            self.ensure_removed(body_path)
            self.ensure_removed(body_path + 'h')

        self.purge_from_set(url_id)
        self.purge_to_set(url_id)

        single = (url_id,)
        pair = (url_id, url_id)
        # executed strictly in this order
        statements = (
            ("delete from edges\nwhere from_id=%s or to_id=%s", pair),
            ("delete from nodes\nwhere url_id=%s", single),
            ("delete from extra\nwhere url_id=%s", single),
            ("delete from redirect\nwhere from_id=%s or to_id=%s", pair),
            ("delete from parse_queue\nwhere url_id=%s", single),
            ("delete from content\nwhere url_id=%s", single),
            ("delete from download_error\nwhere url_id=%s", single),
            ("delete from download_queue\nwhere url_id=%s", single),
            ("delete from field\nwhere id=%s", single),
        )
        for sql, params in statements:
            self.cur.execute(sql, params)
Beispiel #14
0
 def make_target(self):
     """Open the file at ``get_loose_path(self.url_id, True)`` for binary writing.

     Returns the open file object; closing it is up to the caller.
     """
     target_path = get_loose_path(self.url_id, True)
     return open(target_path, 'wb')
Beispiel #15
0
 def add_member_half(self, url_id, hdr):
     """Add one loose file of a page to ``self.zip_front``, if it exists.

     The file is stored under its base name; a missing file is skipped
     silently.
     """
     member_path = get_loose_path(url_id, hdr)
     if not os.path.exists(member_path):
         return

     self.zip_front.write(member_path, os.path.basename(member_path))