def getMicroblog(idd, pages, opener, MIDs, batch=20):
    """Fetch pages ``1..pages`` for ``idd`` and hand every feed item to ``store``.

    Each page is requested in three parts (pagebar 0, 1 and 2) through
    ``GetPage.tryGetPage``.  The returned text is split on the feed-item
    marker and every resulting chunk is passed to ``store``.  Between
    fetches the function sleeps for ``sleepTime`` plus a random fraction
    of ``randomTimes`` (both are module-level values that are only read
    here).

    After the first page a posting density is computed; when it is above
    0.5 the function stops early and returns ``False``.

    Parameters:
        idd: identifier forwarded to ``GetPage.tryGetPage`` and ``store``.
        pages: number of pages to walk, starting at page 1.
        opener: forwarded unchanged to ``GetPage.tryGetPage``.
        MIDs: forwarded unchanged to ``store``.
        batch: accepted but not used by this function.

    Returns:
        ``True`` normally, ``False`` when the first-page density check
        trips.

    Raises:
        TimeLimitException: when ``store`` returns a falsy value for a
            chunk, or when a fetch yields a single chunk for which
            ``isLastOne`` is true.  It is not caught here; only the
            ``finally`` message is printed before it propagates.
    """
    keep_going = True
    try:
        for page_no in range(1, pages + 1):
            chunk_total = 0
            fetch_started = time.time()
            # One-element list handed to store(); presumably store() writes
            # a millisecond timestamp into slot 0 -- TODO confirm.
            earliest = [0.0]

            for pagebar in range(0, 3):
                try:
                    # tryGetPage is opaque from here; its details are unknown.
                    html = GetPage.tryGetPage(idd, page_no, pagebar, opener)
                except (
                    GetPage.FailureOverTimesException,
                    GetPage.LoadingFailingException,
                    GetPage.NetworkBusyException,
                ) as err:
                    # A failed pagebar is reported and skipped.
                    print(err)
                    continue

                chunks = html.split("div action-type=\"feed_list_item\" ")
                chunk_total += len(chunks)

                if len(chunks) == 1 and isLastOne(chunks[0]):
                    raise TimeLimitException

                # store() reports whether crawling should go on; all()
                # stops at the first falsy answer, like an explicit loop.
                if not all(store(idd, chunk, MIDs, earliest) for chunk in chunks):
                    raise TimeLimitException

                pause = randomTimes * random.random() + sleepTime
                time.sleep(pause)

            if page_no != 1:
                continue

            # Density check, first page only.
            earliest[0] /= 1000
            days = (fetch_started - earliest[0]) / (3600 * 24) + 0.1
            # More than 0.5 chunks per day: report False to the caller
            # (the original note says comments then won't be crawled).
            if (chunk_total / days) > 0.5:
                keep_going = False
                print("post too frequently")
                break
    finally:
        print("end grab microblog", idd)
    return keep_going
def getPageNumber(idd, opener):
    """Return the page count read from the first page fetched for ``idd``.

    The page is requested with ``GetPage.tryGetPage(idd, 1, 3, opener)``.

    Parameters:
        idd: identifier forwarded to ``GetPage.tryGetPage``.
        opener: forwarded unchanged to ``GetPage.tryGetPage``.

    Returns:
        * ``0`` when ``GetPage.NoMicroblogException`` is raised, or when
          no space-delimited number is found in the text;
        * ``1`` when the text does not contain the "微博列表" marker;
        * otherwise the first space-delimited integer found in the text.
    """
    try:
        # Arguments are (idd, page, pagebar, opener); why pagebar is 3
        # here is an open question carried over from the original author.
        page_text = GetPage.tryGetPage(idd, 1, 3, opener)
    except GetPage.NoMicroblogException:
        return 0

    if "微博列表" not in page_text:
        return 1

    # NOTE: the original author flagged this pattern as needing repair.
    found = re.search(r' (\d+) ', page_text)
    return 0 if found is None else int(found.group(1))