# -*- coding: utf-8 -*-
import re
import time
import random
import datetime
import threading
import Queue

from BeautifulSoup import BeautifulSoup

# WeiboCrawler and basepath are assumed to be provided by this project's own
# crawler module; they are not defined in this snippet.


class GetLatestBlog(threading.Thread):
    """Worker thread that fetches the latest micro blogs for queued uids."""

    def __init__(self, jobs_queue, results_queue, gsid, proxy=None):
        threading.Thread.__init__(self)
        self.jobs_queue = jobs_queue
        self.results_queue = results_queue
        self.gsid = gsid
        self.proxy = proxy
        self.wc = WeiboCrawler()
        self.wc.setGsid(self.gsid)
        self.wc.setProxy(self.proxy)

    def run(self):
        while True:
            # Throttle requests so the account is less likely to be blocked.
            time.sleep(random.randint(2, 4))
            uid, page = self.jobs_queue.get()
            self.jobs_queue.task_done()
            if page is None:
                page = "1"
            resp = self.wc.getMicroBlogs(uid, page)
            if resp is None:
                # The fetch failed: requeue the whole (uid, page) job and move on.
                self.jobs_queue.put((uid, page))
                continue
            soup = BeautifulSoup(resp)
            body = soup.body
            mblogs = body.findAll("div", {"class": "c", "id": re.compile(u"M_")})
            if not mblogs:
                # findAll returns an empty list when the page has no micro blogs.
                continue
            blogs_file = open("%s/data/blogs/%s.blog" % (basepath, datetime.date.today()), "a")
            for mblog in mblogs:
                blogs_file.write("[%s]:%s\n" % (uid, mblog))
            blogs_file.close()
def main():
    results_queue = Queue.Queue()
    jobs_queue = Queue.Queue()
    wc = WeiboCrawler()
    accounts = wc.getAllGsidProxyPair()
    gsid, proxy = accounts[0][0], accounts[0][1]
    if proxy == "None":
        proxy = None
    wc.setGsid(gsid)
    wc.setProxy(proxy)

    # Read the total page count from the "mp" input inside the pagelist div.
    res = wc.getMicroBlogs("1646194541")
    soup = BeautifulSoup(res)
    pagelist = soup.find("div", {"id": "pagelist"})
    mp = pagelist.find("input", {"name": "mp"})
    uid = "xxxxxxxxx"
    for page in range(1, int(mp["value"]) + 1):
        jobs_queue.put((uid, page))

    # Start one fetcher thread per available account.
    for account in accounts:
        gsid = account[0]
        proxy = account[1]
        if proxy == "None":
            proxy = None
        glb = GetLatestBlog(jobs_queue, results_queue, gsid, proxy)
        glb.setDaemon(True)
        glb.start()
    jobs_queue.join()
import time
import threading

# WeiboCrawler is assumed to be provided by this project's crawler module.


class StatusUpdater(threading.Thread):
    """Worker thread that takes one (gsid, proxy, content, image_info) job
    and posts a status update through WeiboCrawler."""

    def __init__(self, jobs):
        threading.Thread.__init__(self)
        self.wc = WeiboCrawler()
        self.jobs = jobs

    def run(self):
        if self.jobs.empty():
            return
        gsid, proxy, content, image_info = self.jobs.get()
        self.jobs.task_done()
        self.setGsid(gsid)
        self.setProxy(proxy)
        #self.wc.sendBlog(content, image_info)
        #self.wc.setBirthday("1987", "4", "30")
        #self.wc.setAvatar("2.gif", "image/gif", "2.gif")
        time.sleep(2)

    def setGsid(self, gsid):
        if gsid is None:
            return
        self.wc.setGsid(gsid)
        self.wc.is_login = True

    def setProxy(self, proxy):
        if proxy is None:
            return
        self.wc.setProxy(proxy)

    def login(self, username, password, proxy=None):
        self.wc.login(username, password, proxy)

    def isLogin(self):
        return self.wc.is_login
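# --- Minimal usage sketch (illustrative, not from the original source) ---
# Enqueue one (gsid, proxy, content, image_info) job and let a StatusUpdater
# consume it; the gsid and content values below are placeholders.
if __name__ == "__main__":
    import Queue

    jobs = Queue.Queue()
    jobs.put(("<gsid placeholder>", None, u"hello weibo", None))
    updater = StatusUpdater(jobs)
    updater.setDaemon(True)
    updater.start()
    jobs.join()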
# Fragment: date, delta, days, keyword, keywords, jobs, results_queue,
# accounts_list, task and logger are all defined earlier in the original file
# (see the setup sketch after this fragment).

# Build one (keyword, date) job per day in the range.
for i in range(days):
    datestr = date.strftime("%Y%m%d")
    keywords.put((keyword.strip(), datestr))
    date = date + delta

# One crawler per account; each also drives a KeyWord producer thread.
objects = []
for account in accounts_list:
    gsid = account[0]
    if account[1] == "None":
        proxy = None
    else:
        proxy = account[1]
    c = WeiboCrawler()
    c.setGsid(gsid)
    c.setProxy(proxy)
    objects.append(c)
    kw = KeyWord(c, keywords, jobs)
    kw.setDaemon(True)
    kw.start()
keywords.join()

# Once every keyword has been expanded into jobs, run the Search consumers.
for crawler in objects:
    s = Search(crawler, jobs, results_queue)
    s.setDaemon(True)
    s.start()
jobs.join()
logger.info("[%s] Done!" % task)
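# --- Setup sketch for the fragment above (assumed, not from the source) ---
# The queue and variable names match the fragment; the concrete keyword,
# start date and range length below are placeholders.
import datetime
import Queue

keywords = Queue.Queue()       # (keyword, "YYYYMMDD") jobs for KeyWord threads
jobs = Queue.Queue()           # expanded search jobs consumed by Search threads
results_queue = Queue.Queue()  # where Search threads put parsed results

keyword = u"placeholder keyword"
date = datetime.date(2012, 1, 1)    # assumed start of the crawl window
delta = datetime.timedelta(days=1)  # advance one day per job
days = 30                           # assumed number of days to cover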
# Fragment: the MongoDB connection ("connect") is created earlier in the
# original file; ASCENDING comes from pymongo, and WeiboSocialGraph and
# PrepareAttsJobs are this project's own worker threads.
sys.exit(1)  # tail of the (omitted) connection error handling above

# Get a database handle.
dbh = connect['weibo']
fans = Queue.Queue()
atts = Queue.Queue()
jobs = Queue.Queue()
wc = WeiboCrawler()
accounts = wc.getAllGsidProxyPair()

# One social-graph worker and one jobs-preparation worker per account.
for account in accounts:
    gsid, proxy = account[0], account[1]
    if proxy == "None":
        # Normalize the "None" placeholder, as the other scripts do.
        proxy = None
    print gsid, proxy
    wct = WeiboCrawler(gsid=gsid, proxy=proxy)
    wct.setGsid(gsid)
    wct.setProxy(proxy)
    wsg = WeiboSocialGraph(jobs, dbh, wct)
    wsg.setDaemon(True)
    wsg.start()
    paj = PrepareAttsJobs(atts, jobs, dbh, wct)
    paj.setDaemon(True)
    paj.start()

# Keep feeding users whose attentions have not been checked yet.
while True:
    users = dbh.user.find({"checkAtts": 0}, {"_id": 1},
                          limit=10, sort=[("iTime", ASCENDING)])
    # users = dbh.user.find({"checkFans": 0, "checkAtts": 0,
    #                        "fans": {"$lte": 5000}},
    #                       {"_id": 1}, limit=10, sort=[("iTime", ASCENDING)])
    for user in users:
        print user['_id']
# Fragment: the tail of GetProfile.run(); url, res, uid, number, logger and
# basepath are defined earlier in the original file, and json, datetime,
# Queue and WeiboCrawler are imported at its top.
            js = json.loads(res)
            if int(js['ok']) == 1:
                with open("%s/data/profile/%s.info" % (basepath, datetime.date.today()), "a") as profilef:
                    profilef.write("%s\n" % res)
                self.jobs_queue.task_done()
            else:
                raise Exception("Unknown exception")
        except Exception, e:
            # Log the failure and requeue the uid with a bumped retry count.
            logger.info(url)
            logger.info(res)
            logger.error(e)
            number = number + 1
            self.jobs_queue.put((uid, number))
            self.jobs_queue.task_done()
            continue


if __name__ == "__main__":
    jobs_queue = Queue.Queue()
    jobs_queue.put(("1804147667", 0))
    wc = WeiboCrawler()
    accounts = wc.getAllGsidProxyPair()
    gsid, proxy = accounts[0]
    if proxy == "None":
        proxy = None
    print gsid, proxy
    wc.setGsid(gsid)
    wc.setProxy(proxy)
    gp = GetProfile(wc, jobs_queue)
    gp.setDaemon(False)
    gp.start()