Example #1
    def update_db(self, p=1):
        # TODO: fix hole, update
        db_max_id = db.ses.query(func.max(New.id)).one()[0]

        site_res = self.get_page_result(1)

        logging.info("records on site = %r, max_id in db = %r" %
                     (site_res.total_records, db_max_id))
        news_id = site_res.total_records

        if db_max_id < site_res.total_records:
            n = site_res.total_records - db_max_id
            logging.info("will update %r news" % n)
            # update news here
            # assume that n <= 12
            for new in site_res.news_list:
                if n <= 0:
                    break
                n -= 1
                logging.debug("n = %r" % n)
                # do update
                src = urlopen(SITE_URL + new['link']).read()
                detail = par.ssdut_news_parse(src)
                r = New(
                    id=news_id,
                    raw=detail.raw,
                    title=detail.title,
                    link=new['link'],
                    body=detail.body,
                    clean_body=detail.clean_body,
                    date=detail.date,
                    publisher=detail.publisher,
                    source=detail.source,
                    source_link=new['source_link'],
                    sha1=detail.sha1,
                    search_text=detail.search_text)
                logging.info("%r added to db, id = %r" % (r, r.id))
                db.ses.add(r)
                db.ses.commit()
                news_id -= 1
        else:
            logging.info("no news to be update")
        logging.debug("update finished")
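
A side note on the db.ses.query(func.max(New.id)).one()[0] pattern used here: on an empty table the aggregate row is (None,), so db_max_id becomes None; under Python 2 the later "None < site_res.total_records" comparison is silently True, and the subtraction that follows raises a TypeError. A minimal, self-contained sketch of an explicit fallback (recent SQLAlchemy, with a one-column stand-in for the project's New model):

    from sqlalchemy import Column, Integer, create_engine, func
    from sqlalchemy.orm import declarative_base, sessionmaker

    Base = declarative_base()

    class New(Base):  # stand-in: the real model has many more columns
        __tablename__ = 'news'
        id = Column(Integer, primary_key=True)

    engine = create_engine('sqlite://')  # in-memory database
    Base.metadata.create_all(engine)
    ses = sessionmaker(bind=engine)()

    # scalar() also yields None on an empty table; "or 0" makes the
    # fallback explicit instead of relying on Python 2's None ordering.
    db_max_id = ses.query(func.max(New.id)).scalar() or 0
    print(db_max_id)  # -> 0 on a fresh database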
Example #2
    def reset_news_db(self):
        '''Get the first 219 pages of news and store them in the db.

        Disabled: the early return below makes the rest unreachable.'''
        logging.warning("reset_news_db called, but will have no effect")
        return  # disabled on purpose; nothing below ever runs

        # delete all records in db
        for r in New.query.all():
            db.ses.delete(r)
        db.ses.commit()
        logging.debug("delete all news records in db")

        # get all the news links
        res_list = []
        for p in xrange(1, 220):
            res_list.append(self.get_page_result(p))

        # get news detail and store in db
        news_id = res_list[0].total_records
        for page in res_list:
            for new in page.news_list:
                src = urlopen(SITE_URL + new['link']).read()
                detail = par.ssdut_news_parse(src)
                r = New(
                        id=news_id,
                        raw=detail.raw,
                        title=detail.title,
                        link=new['link'],
                        body=detail.body,
                        clean_body=detail.clean_body,
                        date=detail.date,
                        publisher=detail.publisher,
                        source=detail.source,
                        source_link=new['source_link'],
                        sha1=detail.sha1,
                        search_text=detail.search_text)
                db.ses.add(r)
                db.ses.commit()
                logging.info("%r, added, link=%r, page_no = %r" %
                                 (r, r.link, page.page_no))
                news_id -= 1
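
reset_news_db commits once per record, which costs a database round-trip for every row. A hedged sketch of committing once per page instead; session, pages, and build_record are hypothetical stand-ins for the snippet's db.ses, res_list, and New(...) construction:

    def store_pages(session, pages, build_record):
        '''Insert every news item, committing once per page.'''
        for page in pages:
            for item in page.news_list:
                session.add(build_record(item))
            session.commit()  # one round-trip per page, not per record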
Example #3
    def update_db(self, p=1):
        # TODO: fix hole, update
        self.do_one_post_in_q()  # do one post

        db_max_id = db.ses.query(func.max(New.id)).one()[0]
        try:
            db.ses.commit()
        except:
            db.ses.rollback()
            db_max_id = 100000  # large sentinel: skips the update branch below

        site_res = self.get_page_result(1)

        logging.info("records on site = %r, max_id in db = %r" %
                     (site_res.total_records, db_max_id))
        news_id = site_res.total_records

        if db_max_id < site_res.total_records:
            n = site_res.total_records - db_max_id
            logging.info("will update %r news" % n)
            # update news here
            # assume that n <= 12
            for new in site_res.news_list:
                if n <= 0:
                    break
                n -= 1
                logging.debug("n = %r" % n)
                # do update
                try:
                    src = urlopen(SITE_URL + new['link']).read()
                except:
                    logging.error("urlopen() ERROR, link = %r" % new['link'])
                    news_id -= 1
                    continue
                detail = par.ssdut_news_parse(src)
                r = New(
                    id=news_id,
                    raw=detail.raw,
                    title=detail.title,
                    link=new['link'],
                    body=detail.body,
                    clean_body=detail.clean_body,
                    date=detail.date,
                    publisher=detail.publisher,
                    source=detail.source,
                    source_link=new['source_link'],
                    sha1=detail.sha1,
                    search_text=detail.search_text)
                logging.info("%r added to db, id = %r" % (r, r.id))
                db.ses.add(r)
                try:
                    db.ses.commit()
                except:
                    db.ses.rollback()
                    logging.error("session commit error, when add %r" % r)

                #  add a post to queue
                s = self.add_new_post_to_q(news_id)

                news_id -= 1
        else:
            pass
        logging.debug("update finished")
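
The bare "except:" around urlopen() above also swallows KeyboardInterrupt and plain programming errors. A sketch of narrowing it to the network failures urlopen is documented to raise; the version-bridging import is an assumption, since the snippets themselves are Python 2:

    import logging
    try:
        from urllib.request import urlopen      # Python 3
        from urllib.error import URLError
    except ImportError:
        from urllib2 import urlopen, URLError   # Python 2, as in the snippets

    def fetch(url, timeout=10):
        '''Return the page body, or None if the fetch failed.'''
        try:
            return urlopen(url, timeout=timeout).read()
        except URLError as e:
            logging.error("urlopen() ERROR, link = %r (%s)", url, e)
            return None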
Example #4
    def update_db(self, p=1):
        self.do_one_post_in_q()  # do one post

        db_max_id = db.ses.query(func.max(New.id)).one()[0]
        db_max_record = New.query.filter(New.id == db_max_id).one()

        try:
            db.ses.commit()
        except:
            db.ses.rollback()
            db_max_id = 100000
            logging.warn("get max db record faild")
            return

        site_res = self.get_page_result(1)

        logging.info(
            "records on site = %r, max_id in db = %r, max_url_db = %r, max_url_site = %r"
            % (site_res.total_records, db_max_id, db_max_record.link,
               site_res.news_list[0]['link']))
        news_id = site_res.total_records

        # db max id lags the site's total, or the newest link differs
        if db_max_id < site_res.total_records or db_max_record.link != site_res.news_list[
                0]['link']:
            n = site_res.total_records - db_max_id
            logging.info("max_record_on_site - max_id_in_db = %r" % n)
            # update news here
            # assume that n <= 12
            to_be_added_list = []
            for new in site_res.news_list:
                #if n <= 0:
                #    break
                n -= 1
                logging.info("n=%r, link=%r" % (n, new['link']))
                # do update
                try:
                    src = urlopen(SITE_URL + new['link']).read()
                except:
                    logging.error("urlopen() ERROR, link = %r" % new['link'])
                    news_id -= 1
                    continue
                detail = par.ssdut_news_parse(src)

                # stop as soon as we hit a record that is already in the db

                if new['link'] == db_max_record.link:
                    logging.info(
                        "encounter same url, update stop, site_url = %r, db_max_url=%r"
                        % (new['link'], db_max_record.link))
                    break
                elif detail['title'] == db_max_record.title:
                    logging.info(
                        "encounter same title, update stop, site_url = %r, db_max_url=%r, site_title=%r, db_title=%r"
                        % (new['link'], db_max_record.link, detail['title'],
                           db_max_record.title))
                    break
                elif detail['sha1'] == db_max_record.sha1:
                    logging.info(
                        "encounter same sha1, update stop, site_url = %r, db_max_url=%r, site_sha1=%r, db_sha1=%r"
                        % (new['link'], db_max_record.link, detail['sha1'],
                           db_max_record.sha1))
                    break
                elif detail['body'] == db_max_record.body:
                    logging.info(
                        "encounter same body, update stop, site_url = %r, db_max_url=%r, site_sha1=%r, db_sha1=%r"
                        % (new['link'], db_max_record.link, detail['body'],
                           db_max_record.body))
                    break
                else:
                    logging.info(
                        "! a new thread find, new_url = %r, db_max_url= %r" %
                        (new['link'], db_max_record.link))
                    to_be_added_list.append(new)
            to_be_added_len = len(to_be_added_list)
            logging.info("%r  records will be added" % to_be_added_len)

            for new in to_be_added_list:

                try:
                    src = urlopen(SITE_URL + new['link']).read()
                except:
                    logging.error("urlopen() ERROR, link = %r" % new['link'])
                    news_id -= 1
                    continue
                finally:
                    # decrement even when the fetch fails so the remaining
                    # position-based ids stay aligned
                    to_be_added_len -= 1
                detail = par.ssdut_news_parse(src)
                r = New(id=to_be_added_len + db_max_id + 1,
                        raw=detail.raw,
                        title=detail.title,
                        link=new['link'],
                        body=detail.body,
                        clean_body=detail.clean_body,
                        date=detail.date,
                        publisher=detail.publisher,
                        source=detail.source,
                        source_link=new['source_link'],
                        sha1=detail.sha1,
                        search_text=detail.search_text)
                logging.info("%r added to db, id = %r, link = %r" %
                             (r, r.id, r.link))
                db.ses.add(r)
                try:
                    db.ses.commit()
                except:
                    db.ses.rollback()
                    logging.error("session commit error, when add %r" % r)

                #  add a post to queue
                s = self.add_new_post_to_q(r.id)
        else:
            pass
        logging.debug("update finished")
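
The cascade above falls back from link to title to sha1 to full body when deciding whether it has reached a record that is already stored. The sha1 column is presumably a content fingerprint produced by the parser; a small hashlib sketch of computing one (exactly what the original hashes is an assumption):

    import hashlib

    def content_sha1(text):
        # encode first so the same call works for unicode text
        return hashlib.sha1(text.encode('utf-8')).hexdigest()

    print(content_sha1(u'hello'))
    # -> 'aaf4c61ddcc5e8a2dabede0f3b482cd9aea9434d'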
Example #5
    def update_db(self, p=1):
        self.do_one_post_in_q()  # do one post

        db_max_id = db.ses.query(func.max(New.id)).one()[0]
        db_max_record = New.query.filter(New.id == db_max_id).one()

        try:
            db.ses.commit()
        except:
            db.ses.rollback()
            db_max_id = 100000
            logging.warn("get max db record faild")
            return

        site_res = self.get_page_result(1)

        logging.info("records on site = %r, max_id in db = %r, max_url_db = %r, max_url_site = %r" %
                     (site_res.total_records, db_max_id, db_max_record.link, site_res.news_list[0]['link']))
        news_id = site_res.total_records

        # db max id lags the site's total, or the newest link differs
        if db_max_id < site_res.total_records or db_max_record.link != site_res.news_list[0]['link']:
            n = site_res.total_records - db_max_id
            logging.info("max_record_on_site - max_id_in_db = %r" % n)
            # update news here
            # assume that n <= 12
            to_be_added_list = []
            for new in site_res.news_list:
                #if n <= 0:
                #    break
                n -= 1
                logging.info("n=%r, link=%r" % (n, new['link']))
                # do update
                try:
                    src = urlopen(SITE_URL + new['link']).read()
                except:
                    logging.error("urlopen() ERROR, link = %r" % new['link'])
                    news_id -= 1
                    continue
                detail = par.ssdut_news_parse(src)
                
                # stop as soon as we hit a record that is already in the db
                
                if new['link'] == db_max_record.link:
                    logging.info("encounter same url, update db stop here, site_url = %r, db_max_url=%r" %(new['link'], db_max_record.link))
                    break
                else:
                    logging.info("! a new url find, new_url = %r, db_max_url= %r" % (new['link'], db_max_record.link))
                    to_be_added_list.append(new)
            to_be_added_len = len(to_be_added_list)
            logging.info("%r  records will be added" % to_be_added_len)

            for new in to_be_added_list:

                try:
                    src = urlopen(SITE_URL + new['link']).read()
                except:
                    logging.error("urlopen() ERROR, link = %r" % new['link'])
                    news_id -= 1
                    continue
                finally:
                    # decrement even when the fetch fails so the remaining
                    # position-based ids stay aligned
                    to_be_added_len -= 1
                detail = par.ssdut_news_parse(src)
                r = New(
                    id=to_be_added_len + db_max_id + 1,
                    raw=detail.raw,
                    title=detail.title,
                    link=new['link'],
                    body=detail.body,
                    clean_body=detail.clean_body,
                    date=detail.date,
                    publisher=detail.publisher,
                    source=detail.source,
                    source_link=new['source_link'],
                    sha1=detail.sha1,
                    search_text=detail.search_text)
                logging.info("%r added to db, id = %r, link = %r" % (r, r.id, r.link))
                db.ses.add(r)
                try:
                    db.ses.commit()
                except:
                    db.ses.rollback()
                    logging.error("session commit error, when add %r" % r)

                #  add a post to queue
                s = self.add_new_post_to_q(r.id)
        else:
            pass
        logging.debug("update finished")
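
Why "id=to_be_added_len + db_max_id + 1" produces sensible ids: to_be_added_list is collected newest-first, and the counter is decremented in the finally: clause before each insert, so the newest record receives the highest id and the block stays contiguous with the existing maximum. A hypothetical dry run:

    db_max_id = 100
    to_be_added_list = ['newest', 'middle', 'oldest']  # site order
    to_be_added_len = len(to_be_added_list)
    for item in to_be_added_list:
        to_be_added_len -= 1              # mirrors the finally: clause
        print(item, to_be_added_len + db_max_id + 1)
    # newest 103
    # middle 102
    # oldest 101  -> ids continue upward from db_max_id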
Example #6
    def update_db(self, p=1):
        # TODO: fix hole, update
        self.do_one_post_in_q()  # do one post

        db_max_id = db.ses.query(func.max(New.id)).one()[0]
        try:
            db.ses.commit()
        except:
            db.ses.rollback()
            db_max_id = 100000  # large sentinel: skips the update branch below

        site_res = self.get_page_result(1)

        logging.info("records on site = %r, max_id in db = %r" %
                     (site_res.total_records, db_max_id))
        news_id = site_res.total_records

        if db_max_id < site_res.total_records:
            n = site_res.total_records - db_max_id
            logging.info("will update %r news" % n)
            # update news here
            # assume that n <= 12
            for new in site_res.news_list:
                if n <= 0:
                    break
                n -= 1
                logging.debug("n = %r" % n)
                # do update
                try:
                    src = urlopen(SITE_URL + new['link']).read()
                except:
                    logging.error("urlopen() ERROR, link = %r" % new['link'])
                    news_id -= 1
                    continue
                detail = par.ssdut_news_parse(src)
                r = New(id=news_id,
                        raw=detail.raw,
                        title=detail.title,
                        link=new['link'],
                        body=detail.body,
                        clean_body=detail.clean_body,
                        date=detail.date,
                        publisher=detail.publisher,
                        source=detail.source,
                        source_link=new['source_link'],
                        sha1=detail.sha1,
                        search_text=detail.search_text)
                logging.info("%r added to db, id = %r" % (r, r.id))
                db.ses.add(r)
                try:
                    db.ses.commit()
                except:
                    db.ses.rollback()
                    logging.error("session commit error, when add %r" % r)

                #  add a post to queue
                s = self.add_new_post_to_q(news_id)

                news_id -= 1
        else:
            pass
        logging.debug("update finished")
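
Every example repeats the same commit/rollback pair. A sketch of factoring it into a helper; the name safe_commit is made up, and catching SQLAlchemy's base exception class is one reasonable choice rather than the project's documented approach:

    import logging
    from sqlalchemy.exc import SQLAlchemyError

    def safe_commit(session, record=None):
        '''Commit, rolling back on failure so the session stays usable.'''
        try:
            session.commit()
            return True
        except SQLAlchemyError:
            session.rollback()  # required before the session can be reused
            logging.error("session commit error, when add %r", record)
            return False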