def update_db(self, p=1):  # TODO: fix hole, update
    db_max_id = db.ses.query(func.max(New.id)).one()[0]
    site_res = self.get_page_result(1)
    logging.info("records on site = %r, max_id in db = %r" %
                 (site_res.total_records, db_max_id))
    news_id = site_res.total_records
    if db_max_id < site_res.total_records:
        n = site_res.total_records - db_max_id
        logging.info("will update %r news" % n)
        # update the news here; assumes n <= 12 (at most one page of results)
        for new in site_res.news_list:
            if n <= 0:
                break
            n -= 1
            print n
            # do the update: fetch the page, parse it, store a New record
            src = urlopen(SITE_URL + new['link']).read()
            detail = par.ssdut_news_parse(src)
            r = New(
                id=news_id,
                raw=detail.raw,
                title=detail.title,
                link=new['link'],
                body=detail.body,
                clean_body=detail.clean_body,
                date=detail.date,
                publisher=detail.publisher,
                source=detail.source,
                source_link=new['source_link'],
                sha1=detail.sha1,
                search_text=detail.search_text)
            logging.info("%r added to db, id = %r" % (r, r.id))
            db.ses.add(r)
            db.ses.commit()
            news_id -= 1
    else:
        logging.info("no news to be updated")
    logging.debug("update finished")
def reset_news_db(self):
    '''Rebuild the news table: fetch every listing page (1..219) and store
    each news item in the db. Currently disabled -- it returns immediately.'''
    logging.warn("reset_news_db called, but will have no effect")
    return  # just return; the body below is kept for reference

    # delete all records in db
    for r in New.query.all():
        db.ses.delete(r)
    db.ses.commit()
    logging.debug("deleted all news records in db")

    # get all the news links
    res_list = []
    for p in xrange(1, 220):
        res_list.append(self.get_page_result(p))

    # get the news details and store them in db
    news_id = res_list[0].total_records
    for page in res_list:
        for new in page.news_list:
            src = urlopen(SITE_URL + new['link']).read()
            detail = par.ssdut_news_parse(src)
            r = New(
                id=news_id,
                raw=detail.raw,
                title=detail.title,
                link=new['link'],
                body=detail.body,
                clean_body=detail.clean_body,
                date=detail.date,
                publisher=detail.publisher,
                source=detail.source,
                source_link=new['source_link'],
                sha1=detail.sha1,
                search_text=detail.search_text)
            db.ses.add(r)
            db.ses.commit()
            logging.info("%r added, link=%r, page_no=%r" %
                         (r, r.link, page.page_no))
            news_id -= 1
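# NOTE: the fetch/parse/build-New sequence above is repeated verbatim in the
# update_db() variants below. A minimal sketch of a shared helper is given here
# for illustration only: the method name _fetch_news_record is hypothetical and
# is not called anywhere in this module; SITE_URL, urlopen, par.ssdut_news_parse
# and the New model are the names already used above.
def _fetch_news_record(self, new, news_id):
    '''Hypothetical helper: download one news page and build an (uncommitted)
    New row from the parsed detail.'''
    src = urlopen(SITE_URL + new['link']).read()  # may raise on network errors
    detail = par.ssdut_news_parse(src)            # parse the raw HTML page
    return New(
        id=news_id,
        raw=detail.raw,
        title=detail.title,
        link=new['link'],
        body=detail.body,
        clean_body=detail.clean_body,
        date=detail.date,
        publisher=detail.publisher,
        source=detail.source,
        source_link=new['source_link'],
        sha1=detail.sha1,
        search_text=detail.search_text)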
def update_db(self, p=1):  # TODO: fix hole, update
    self.do_one_post_in_q()  # handle one queued post first
    db_max_id = db.ses.query(func.max(New.id)).one()[0]
    try:
        db.ses.commit()
    except:
        db.ses.rollback()
        db_max_id = 100000
    site_res = self.get_page_result(1)
    logging.info("records on site = %r, max_id in db = %r" %
                 (site_res.total_records, db_max_id))
    news_id = site_res.total_records
    if db_max_id < site_res.total_records:
        n = site_res.total_records - db_max_id
        logging.info("will update %r news" % n)
        # update the news here; assumes n <= 12
        for new in site_res.news_list:
            if n <= 0:
                break
            n -= 1
            print n
            # do the update: fetch the page, parse it, store a New record
            try:
                src = urlopen(SITE_URL + new['link']).read()
            except:
                logging.error("urlopen() ERROR, link = %r" % new['link'])
                news_id -= 1
                continue
            detail = par.ssdut_news_parse(src)
            r = New(
                id=news_id,
                raw=detail.raw,
                title=detail.title,
                link=new['link'],
                body=detail.body,
                clean_body=detail.clean_body,
                date=detail.date,
                publisher=detail.publisher,
                source=detail.source,
                source_link=new['source_link'],
                sha1=detail.sha1,
                search_text=detail.search_text)
            logging.info("%r added to db, id = %r" % (r, r.id))
            db.ses.add(r)
            try:
                db.ses.commit()
            except:
                db.ses.rollback()
                logging.error("session commit error, when adding %r" % r)
            # add a post about this news item to the queue
            self.add_new_post_to_q(news_id)
            news_id -= 1
    logging.debug("update finished")
def update_db(self, p=1):
    self.do_one_post_in_q()  # handle one queued post first
    db_max_id = db.ses.query(func.max(New.id)).one()[0]
    db_max_record = New.query.filter(New.id == db_max_id).one()
    try:
        db.ses.commit()
    except:
        db.ses.rollback()
        db_max_id = 100000
        logging.warn("getting the max db record failed")
        return
    site_res = self.get_page_result(1)
    logging.info(
        "records on site = %r, max_id in db = %r, "
        "max_url_db = %r, max_url_site = %r" %
        (site_res.total_records, db_max_id,
         db_max_record.link, site_res.news_list[0]['link']))
    news_id = site_res.total_records

    # update if the db id falls behind the site count, or the newest links differ
    if (db_max_id < site_res.total_records or
            db_max_record.link != site_res.news_list[0]['link']):
        n = site_res.total_records - db_max_id
        logging.info("max_record_on_site - max_id_in_db = %r" % n)
        # collect the new items here; assumes n <= 12
        to_be_added_list = []
        for new in site_res.news_list:
            # if n <= 0:
            #     break
            n -= 1
            logging.info("n=%r, link=%r" % (n, new['link']))
            try:
                src = urlopen(SITE_URL + new['link']).read()
            except:
                logging.error("urlopen() ERROR, link = %r" % new['link'])
                news_id -= 1
                continue
            detail = par.ssdut_news_parse(src)
            # stop as soon as the newest record already stored in the db shows up
            if new['link'] == db_max_record.link:
                logging.info(
                    "encountered same url, update stops, site_url = %r, db_max_url = %r" %
                    (new['link'], db_max_record.link))
                break
            elif detail['title'] == db_max_record.title:
                logging.info(
                    "encountered same title, update stops, site_url = %r, db_max_url = %r, "
                    "site_title = %r, db_title = %r" %
                    (new['link'], db_max_record.link,
                     detail['title'], db_max_record.title))
                break
            elif detail['sha1'] == db_max_record.sha1:
                logging.info(
                    "encountered same sha1, update stops, site_url = %r, db_max_url = %r, "
                    "site_sha1 = %r, db_sha1 = %r" %
                    (new['link'], db_max_record.link,
                     detail['sha1'], db_max_record.sha1))
                break
            elif detail['body'] == db_max_record.body:
                logging.info(
                    "encountered same body, update stops, site_url = %r, db_max_url = %r, "
                    "site_body = %r, db_body = %r" %
                    (new['link'], db_max_record.link,
                     detail['body'], db_max_record.body))
                break
            else:
                logging.info(
                    "! a new item found, new_url = %r, db_max_url = %r" %
                    (new['link'], db_max_record.link))
                to_be_added_list.append(new)

        to_be_added_len = len(to_be_added_list)
        logging.info("%r records will be added" % to_be_added_len)
        for new in to_be_added_list:
            try:
                src = urlopen(SITE_URL + new['link']).read()
            except:
                logging.error("urlopen() ERROR, link = %r" % new['link'])
                news_id -= 1
                continue
            finally:
                to_be_added_len -= 1
            detail = par.ssdut_news_parse(src)
            r = New(
                id=to_be_added_len + db_max_id + 1,
                raw=detail.raw,
                title=detail.title,
                link=new['link'],
                body=detail.body,
                clean_body=detail.clean_body,
                date=detail.date,
                publisher=detail.publisher,
                source=detail.source,
                source_link=new['source_link'],
                sha1=detail.sha1,
                search_text=detail.search_text)
            logging.info("%r added to db, id = %r, link = %r" %
                         (r, r.id, r.link))
            db.ses.add(r)
            try:
                db.ses.commit()
            except:
                db.ses.rollback()
                logging.error("session commit error, when adding %r" % r)
            # add a post about this news item to the queue
            self.add_new_post_to_q(r.id)
    logging.debug("update finished")
def update_db(self, p=1):
    self.do_one_post_in_q()  # handle one queued post first
    db_max_id = db.ses.query(func.max(New.id)).one()[0]
    db_max_record = New.query.filter(New.id == db_max_id).one()
    try:
        db.ses.commit()
    except:
        db.ses.rollback()
        db_max_id = 100000
        logging.warn("getting the max db record failed")
        return
    site_res = self.get_page_result(1)
    logging.info(
        "records on site = %r, max_id in db = %r, "
        "max_url_db = %r, max_url_site = %r" %
        (site_res.total_records, db_max_id,
         db_max_record.link, site_res.news_list[0]['link']))
    news_id = site_res.total_records

    # update if the db id falls behind the site count, or the newest links differ
    if (db_max_id < site_res.total_records or
            db_max_record.link != site_res.news_list[0]['link']):
        n = site_res.total_records - db_max_id
        logging.info("max_record_on_site - max_id_in_db = %r" % n)
        # collect the new items here; assumes n <= 12
        to_be_added_list = []
        for new in site_res.news_list:
            # if n <= 0:
            #     break
            n -= 1
            logging.info("n=%r, link=%r" % (n, new['link']))
            try:
                src = urlopen(SITE_URL + new['link']).read()
            except:
                logging.error("urlopen() ERROR, link = %r" % new['link'])
                news_id -= 1
                continue
            detail = par.ssdut_news_parse(src)
            # stop as soon as the newest record already stored in the db shows up
            if new['link'] == db_max_record.link:
                logging.info(
                    "encountered same url, update stops here, site_url = %r, db_max_url = %r" %
                    (new['link'], db_max_record.link))
                break
            else:
                logging.info(
                    "! a new url found, new_url = %r, db_max_url = %r" %
                    (new['link'], db_max_record.link))
                to_be_added_list.append(new)

        to_be_added_len = len(to_be_added_list)
        logging.info("%r records will be added" % to_be_added_len)
        for new in to_be_added_list:
            try:
                src = urlopen(SITE_URL + new['link']).read()
            except:
                logging.error("urlopen() ERROR, link = %r" % new['link'])
                news_id -= 1
                continue
            finally:
                to_be_added_len -= 1
            detail = par.ssdut_news_parse(src)
            r = New(
                id=to_be_added_len + db_max_id + 1,
                raw=detail.raw,
                title=detail.title,
                link=new['link'],
                body=detail.body,
                clean_body=detail.clean_body,
                date=detail.date,
                publisher=detail.publisher,
                source=detail.source,
                source_link=new['source_link'],
                sha1=detail.sha1,
                search_text=detail.search_text)
            logging.info("%r added to db, id = %r, link = %r" %
                         (r, r.id, r.link))
            db.ses.add(r)
            try:
                db.ses.commit()
            except:
                db.ses.rollback()
                logging.error("session commit error, when adding %r" % r)
            # add a post about this news item to the queue
            self.add_new_post_to_q(r.id)
    logging.debug("update finished")
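# A minimal sketch of how update_db() might be driven on a schedule. Everything
# here is illustrative: the class name NewsSpider is an assumption (the class
# that actually defines update_db() above is not shown in this file), and the
# five-minute polling interval is arbitrary.
if __name__ == "__main__":
    import time

    spider = NewsSpider()    # hypothetical: whatever class defines update_db()
    while True:
        spider.update_db()   # pick up any news newer than the db's max record
        time.sleep(300)      # then wait five minutes before polling again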