def get_timelines(self, uid):
    """
    Collect the timelines of the user with this uid into ``self.timeline_list``.

    The page count and the first page both come from
    ``self.get_timeline_page_num(uid)``.  Every further page (2..N) is fetched
    in three requests (``bnum`` 0, 1, 2) through
    ``self.fetch_timelines_by_page_bar``; a request that returns ``None`` is
    skipped.  After each page the time elapsed since ``self.start_time`` is
    compared with ``Config.ACCOUNT_CHANGE_TIME`` (in seconds) and, when it is
    exceeded, ``self.main_fetcher`` is advanced with ``loop_increase`` and
    ``self.start_time`` is reset.

    :param uid: id of the user whose timelines are collected
    :return: None; parsed timelines are appended to ``self.timeline_list``
    """
    timeline_page_num, first_page = self.get_timeline_page_num(uid)
    if timeline_page_num == 0:
        # Single-argument print(...) prints the same text under the
        # Python 2 print statement and the Python 3 function.
        print('No any posts.')
        return

    # The first page was already downloaded while counting the pages.
    for pt in first_page:
        self.timeline_list.extend(
            self.parser.parse_timelines(pt, uid, datetime.now()))
    if timeline_page_num == 1:
        print('He/She just has one page timeline.')
        return

    for pnum in range(2, timeline_page_num + 1):
        print('There are totally %d timeline pages.' % (timeline_page_num,))
        for bnum in range(3):
            html = self.fetch_timelines_by_page_bar(uid, pnum, bnum)
            # Random pause between two consecutive requests of one page.
            time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES,
                                      2 * Config.SLEEP_BETWEEN_2FPAGES))
            if html is not None:
                self.timeline_list.extend(
                    self.parser.parse_timelines(html, uid, datetime.now()))

        self.end_time = datetime.now()
        duration = self.end_time - self.start_time
        # total_seconds() includes whole days; the .seconds attribute alone
        # only holds the sub-day remainder and wraps around every 24 hours.
        if duration.total_seconds() > Config.ACCOUNT_CHANGE_TIME:
            self.main_fetcher = loop_increase(self.main_fetcher,
                                              len(self.fetchers))
            self.start_time = datetime.now()
            emphasis_print('Account changed!!!')
            emphasis_print('Now %d of %d accounts are working!'
                           % (self.main_fetcher + 1, len(self.fetchers)))

        # Random pause before moving on to the next timeline page.
        time.sleep(random.randint(Config.SLEEP_BETWEEN_TIMELINE_PAGES,
                                  2 * Config.SLEEP_BETWEEN_TIMELINE_PAGES))
def get_timelines(self, uid):
    """
    Collect the timelines of the user with this uid into ``self.timeline_list``.

    The page count and the first page both come from
    ``self.get_timeline_page_num(uid)``.  Every further page (2..N) is fetched
    in three requests (``bnum`` 0, 1, 2) through
    ``self.fetch_timelines_by_page_bar``; a request that returns ``None`` is
    skipped.  After each page the time elapsed since ``self.start_time`` is
    compared with ``Config.ACCOUNT_CHANGE_TIME`` (in seconds) and, when it is
    exceeded, ``self.main_fetcher`` is advanced with ``loop_increase`` and
    ``self.start_time`` is reset.

    :param uid: id of the user whose timelines are collected
    :return: None; parsed timelines are appended to ``self.timeline_list``
    """
    timeline_page_num, first_page = self.get_timeline_page_num(uid)
    if timeline_page_num == 0:
        # Single-argument print(...) prints the same text under the
        # Python 2 print statement and the Python 3 function.
        print('No any posts.')
        return

    # The first page was already downloaded while counting the pages.
    for pt in first_page:
        self.timeline_list.extend(
            self.parser.parse_timelines(pt, uid, datetime.now()))
    if timeline_page_num == 1:
        print('He/She just has one page timeline.')
        return

    for pnum in range(2, timeline_page_num + 1):
        print('There are totally %d timeline pages.' % (timeline_page_num,))
        for bnum in range(3):
            html = self.fetch_timelines_by_page_bar(uid, pnum, bnum)
            # Random pause between two consecutive requests of one page.
            time.sleep(random.randint(Config.SLEEP_BETWEEN_2FPAGES,
                                      2 * Config.SLEEP_BETWEEN_2FPAGES))
            if html is not None:
                self.timeline_list.extend(
                    self.parser.parse_timelines(html, uid, datetime.now()))

        self.end_time = datetime.now()
        duration = self.end_time - self.start_time
        # total_seconds() includes whole days; the .seconds attribute alone
        # only holds the sub-day remainder and wraps around every 24 hours.
        if duration.total_seconds() > Config.ACCOUNT_CHANGE_TIME:
            self.main_fetcher = loop_increase(self.main_fetcher,
                                              len(self.fetchers))
            self.start_time = datetime.now()
            emphasis_print('Account changed!!!')
            emphasis_print('Now %d of %d accounts are working!'
                           % (self.main_fetcher + 1, len(self.fetchers)))

        # Random pause before moving on to the next timeline page.
        time.sleep(random.randint(Config.SLEEP_BETWEEN_TIMELINE_PAGES,
                                  2 * Config.SLEEP_BETWEEN_TIMELINE_PAGES))
while True: # in case of connection lost try: spider.save() break except Exception as e: print e.message, uid if 'Lost connection to MySQL server during query' in e.message: continue else: break crawled_list.append(uid) spider.end_time = datetime.now() duration = spider.end_time - spider.start_time if duration.seconds > ACCOUNT_CHANGE_TIME: spider.main_fetcher = loop_increase( spider.main_fetcher, len(spider.fetchers)) spider.start_time = datetime.now() emphasis_print('Account changed!!!') print 'Complete a batch of tasks!' print 'Getting new tasks...' uid_list = get_tasks(TASK_NUM) if len(uid_list) == 0: print 'No tasks to proceed!' exit(-1) except Exception as e: print e.message log.error('Problematic UID: %s' % (uid, )) finally: reset(user_list, uid_list, crawled_list) # reset
while True: # in case of connection lost try: spider.save() break except Exception as e: print e.message, uid if 'Lost connection to MySQL server during query' in e.message: continue else: break crawled_list.append(uid) spider.end_time = datetime.now() duration = spider.end_time - spider.start_time if duration.seconds > ACCOUNT_CHANGE_TIME: spider.main_fetcher = loop_increase(spider.main_fetcher, len(spider.fetchers)) spider.start_time = datetime.now() emphasis_print('Account changed!!!') print 'Complete a batch of tasks!' print 'Getting new tasks...' uid_list = get_tasks(TASK_NUM) if len(uid_list) == 0: print 'No tasks to proceed!' exit(-1) except Exception as e: print e.message log.error('Problematic UID: %s' % (uid, )) finally: reset(user_list, uid_list, crawled_list) # reset