class GetLatestBlog(threading.Thread):
    def __init__(self, jobs_queue, results_queue, gsid, proxy=None):
        threading.Thread.__init__(self)
        self.jobs_queue = jobs_queue
        self.results_queue = results_queue
        self.gsid = gsid
        self.proxy = proxy
        self.wc = WeiboCrawler()
        self.wc.setGsid(self.gsid)
        self.wc.setProxy(self.proxy)

    def run(self):
        while True:
            time.sleep(random.randint(2, 4))
            uid, page = self.jobs_queue.get()
            self.jobs_queue.task_done()
            if page is None:
                page = "1"
            resp = self.wc.getMicroBlogs(uid, page)
            if resp is None:
                # Fetch failed: requeue the whole (uid, page) job, not just
                # the uid, and skip parsing the None response.
                self.jobs_queue.put((uid, page))
                continue
            soup = BeautifulSoup(resp)
            body = soup.body
            mblogs = body.findAll("div", {"class": "c", "id": re.compile(u"M_")})
            if not mblogs:  # findAll returns an empty list, never None
                continue  # no micro blogs on this page
            #print mblogs
            blogs_file = open("%s/data/blogs/%s.blog" % (basepath, datetime.date.today()), "a")
            for mblog in mblogs:
                blogs_file.write("[%s]:%s\n" % (uid, mblog))
            blogs_file.close()
def getWeibos(self, keyword, page=1, count=None):
    # Hexun expects the keyword %-escaped, e.g. u'\u80a1\u7968' -> '%u80a1%u7968';
    # json.dumps produces the \uXXXX escapes, then the replaces reshape them.
    url = 'http://t.hexun.com/k/topic.html?type=1&value=%s&pg=%d' % (
        json.dumps(keyword).replace('\\', '%').replace('"', ''), page)
    result = WeiboCrawler.request(self, url, self.headers)
    if 'result' in result and result['result']:
        infos = result['info'].decode('gb2312')
        soup = BeautifulSoup(infos)
        total_soup = soup.select('.headerR1')[0]
        # The total sits in text like u'共123条' ("123 results in total");
        # split on the unicode literals since get_text() returns unicode.
        total_num = total_soup.get_text().split(u'共')[-1].split(u'条')[0].strip()
        return_val = {'total_count': int(total_num), 'msgs': []}
        allmsgs = []
        msgs_soup = soup.select('.nr_con')
        for msg_soup in msgs_soup:
            avatar = 'http://t.hexun.com%s' % msg_soup.select('.nr_conLa > a')[0].get('href')
            # Each entry reads "nickname:text"; split only on the first colon
            # so message bodies containing colons stay intact.
            nickandtext = msg_soup.select('.nr_shuo')[0].get_text().split(':', 1)
            nickname = nickandtext[0]
            text = nickandtext[1]
            ts = msg_soup.select('.nr_tan > h3 > a')[0].get_text()
            allmsgs.append({
                'avatar': avatar,
                'nickname': nickname,
                'text': text,
                'datetime': ts,
            })
        return_val['msgs'] = allmsgs
        return return_val
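# Hedged usage sketch for the Hexun search above. The enclosing class name
# (HexunCrawler here) and its no-argument constructor are assumptions; only
# getWeibos() appears in this excerpt.
#
#   hc = HexunCrawler()
#   res = hc.getWeibos(u'\u80a1\u7968')   # search for "stocks"
#   print res['total_count']
#   for msg in res['msgs']:
#       print msg['nickname'], msg['datetime']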
def main():
    results_queue = Queue.Queue()
    jobs_queue = Queue.Queue()
    wc = WeiboCrawler()
    accounts = wc.getAllGsidProxyPair()
    gsid, proxy = accounts[0][0], accounts[0][1]
    if proxy == "None":
        proxy = None
    wc.setGsid(gsid)
    wc.setProxy(proxy)

    # Fetch the uid's first page to discover the total page count (the hidden
    # "mp" input), then enqueue one job per page for that same uid.
    uid = "1646194541"
    res = wc.getMicroBlogs(uid)
    soup = BeautifulSoup(res)
    pagelist = soup.find("div", {"id": "pagelist"})
    mp = pagelist.find("input", {"name": "mp"}).get("value")
    for page in range(1, int(mp) + 1):
        jobs_queue.put((uid, page))

    for account in accounts:
        gsid = account[0]
        proxy = account[1]
        if proxy == "None":
            proxy = None
        glb = GetLatestBlog(jobs_queue, results_queue, gsid, proxy)
        glb.setDaemon(True)
        glb.start()
    jobs_queue.join()
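# Self-contained sketch (Python 2) of the queue/daemon-worker pattern main()
# relies on: enqueue every job up front, start daemon workers, then block on
# join() until each job has been marked task_done(). All names here are
# illustrative, not part of the project.
import Queue
import threading

def worker(q):
    while True:
        item = q.get()
        # ... process item here ...
        q.task_done()

if __name__ == '__main__':
    q = Queue.Queue()
    for i in range(10):
        q.put(i)
    for _ in range(3):
        t = threading.Thread(target=worker, args=(q,))
        t.setDaemon(True)  # daemon threads die with the main thread after join()
        t.start()
    q.join()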
def getWeibos(self, keyword, page=1, size=10, sid=None):
    if not sid:
        sid = self.sid
    url = ('http://ti.3g.qq.com/touch/s?sid=%s&aid=vaction&more=1&mst=33&ac=60'
           '&keyword=%s&dl2=1&dumpJSON=1&pageid=search&pid=%d&psize=%d'
           % (sid, keyword, page, size))
    result = WeiboCrawler.request(self, url, self.headers)
    if 'result' in result and result['result']:
        json_info = json.loads(result['info'])
        # result == '0' means the search succeeded.
        if 'result' in json_info and json_info['result'] == '0':
            msgs = json_info['jsonDump']['msgs']
            total_info = json_info['info']
            return {'msgs': msgs,
                    'total_pages': total_info['pageCount'],
                    'total_count': total_info['totalCount']}
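# Hedged usage sketch for the QQ microblog search above. The class name
# QQWeiboCrawler and its constructor signature are assumptions; only getSid()
# and getWeibos() are shown in this excerpt.
#
#   qc = QQWeiboCrawler('12345678', 'pmd5-of-password')
#   if qc.getSid():
#       res = qc.getWeibos(u'keyword', page=1, size=10)
#       print res['total_count'], res['total_pages']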
def getSid(self):
    url = ('http://pt.3g.qq.com/login?act=json&format=2&bid_code=microblogLogin'
           '&r=%f&qq=%s&pmd5=%s'
           '&go_url=http://ti.3g.qq.com/touch/iphone/index.jsp?g_f=18106'
           % (random.random(), self.username, self.password))
    result = WeiboCrawler.request(self, url, self.headers)
    if 'result' in result and result['result']:
        # Strip the JSONP wrapper 'pt.handleLoginResult(...);' to keep the list.
        info = result['info'].replace('pt.handleLoginResult(', '')[:-2]
        json_info = json.loads(info)
        if len(json_info) == 8:
            sid = json_info[4]  # the sid is the fifth field
            self.sid = sid
            return sid
    return None
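# A self-contained sketch of the JSONP unwrapping getSid() performs above: the
# login endpoint answers with 'pt.handleLoginResult([...]);' and the sid sits
# at index 4 of the embedded list. The sample payload below is made up.
import json

def strip_jsonp(payload):
    # Take everything between the outermost parentheses and parse it as JSON.
    return json.loads(payload[payload.index('(') + 1:payload.rindex(')')])

if __name__ == '__main__':
    sample = 'pt.handleLoginResult(["0","","","","FAKESID42","","",""]);'
    print strip_jsonp(sample)[4]  # -> FAKESID42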
def getWeibos(self, keyword, page=1, count=10):
    url = 'http://m.weibo.cn/searchs/weibo?key=%s&page=%d&count=%d' % (keyword, page, count)
    result = WeiboCrawler.request(self, url, self.headers)
    if 'result' in result and result['result']:
        infos = result['info']
        json_infos = json.loads(infos)
        if 'ok' in json_infos and json_infos['ok']:
            return_val = {'total_count': json_infos['total_number'],
                          'total_pages': json_infos['maxPage'],
                          'msgs': json_infos['mblogList']}
            return return_val
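# A hedged aside: the %-formatting above inserts the raw keyword into the URL,
# so non-ASCII search terms are not percent-encoded. A sketch of the same URL
# built with urllib.urlencode (endpoint and parameter names taken from the
# code above; the keyword is illustrative):
import urllib

params = {'key': u'\u5317\u4eac'.encode('utf-8'), 'page': 1, 'count': 10}
print 'http://m.weibo.cn/searchs/weibo?' + urllib.urlencode(params)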
class StatusUpdater(threading.Thread):
    def __init__(self, jobs):
        threading.Thread.__init__(self)
        self.wc = WeiboCrawler()
        self.jobs = jobs

    def run(self):
        if self.jobs.empty():
            return
        gsid, proxy, content, image_info = self.jobs.get()
        self.jobs.task_done()
        self.setGsid(gsid)
        self.setProxy(proxy)
        #self.wc.sendBlog(content, image_info)
        #self.wc.setBirthday("1987", "4", "30")
        #self.wc.setAvatar("2.gif", "image/gif", "2.gif")
        time.sleep(2)

    def setGsid(self, gsid):
        if gsid is None:
            return
        self.wc.setGsid(gsid)
        self.wc.is_login = True

    def setProxy(self, proxy):
        if proxy is None:
            return
        self.wc.setProxy(proxy)

    def login(self, username, password, proxy=None):
        self.wc.login(username, password, proxy)

    def isLogin(self):
        return self.wc.is_login
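# Hedged usage sketch for StatusUpdater: each job is the 4-tuple
# (gsid, proxy, content, image_info) that run() unpacks. The gsid and content
# values below are placeholders.
import Queue

if __name__ == '__main__':
    jobs = Queue.Queue()
    jobs.put(('some-gsid', None, 'hello world', None))
    su = StatusUpdater(jobs)
    su.start()
    jobs.join()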
                # users.append(user_doc)
                with open("baby_search_result.dat", "a") as sr:
                    sr.write("%s\n" % user_doc)
                break
            self.jobs.task_done()


if __name__ == "__main__":
    results_queue = Queue.Queue()
    # pj = PersistJobs(results_queue)
    # pj.setDaemon(True)
    # pj.start()
    crawler = WeiboCrawler()
    accounts_list = crawler.getAllGsidProxyPair()
    with open(sys.argv[1].strip()) as tasks:
        for task in tasks:
            if len(task.strip()) == 0:
                continue
            items = task.split("|")
            keyword = items[0].strip()
            year = int(items[1].strip())
            month = int(items[2].strip())
            day = int(items[3].strip())
            days = int(items[4].strip())
            keywords = Queue.Queue()
            jobs = Queue.Queue()
            start = datetime.date(year, month, day)
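# Example of a task line consumed by the loop above
# (fields: keyword|year|month|day|days; the values are illustrative):
#
#   baby|2013|5|1|7
#
# meaning: search "baby" for 7 days starting from 2013-05-01.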
def main():
    try:
        connect = Connection(host="localhost", port=27017)
        print "Connected Successfully"
    except ConnectionFailure, e:
        sys.stderr.write("Could not connect to MongoDB: %s\n" % e)
        sys.exit(1)
    # Get a database handle
    dbh = connect['weibo']
    fans = Queue.Queue()
    atts = Queue.Queue()
    jobs = Queue.Queue()
    wc = WeiboCrawler()
    accounts = wc.getAllGsidProxyPair()
    for account in accounts:
        gsid, proxy = account[0], account[1]
        print gsid, proxy
        wct = WeiboCrawler(gsid=gsid, proxy=proxy)
        wct.setGsid(gsid)
        wct.setProxy(proxy)
        wsg = WeiboSocialGraph(jobs, dbh, wct)
        wsg.setDaemon(True)
        wsg.start()
        paj = PrepareAttsJobs(atts, jobs, dbh, wct)
        paj.setDaemon(True)
        paj.start()
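# A minimal sketch of the imports main() assumes: the old pymongo 2.x API, in
# which Connection was the client class (replaced by MongoClient in pymongo 3).
from pymongo import Connection
from pymongo.errors import ConnectionFailure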
class ProfileFiller:
    def __init__(self, username, password, proxy=None):
        self.wc = WeiboCrawler()
        self.wc.login(username, password, proxy)
        self.is_login = False
        if self.wc.is_login:
            self.is_login = True

    def isLogin(self):
        return self.is_login

    def fillProfile(self, kwds):
        nick = kwds['nick']
        self.wc.setNick(nick)
        domain = kwds['domain']
        self.wc.setDomain(domain)
        description = kwds['description']
        self.wc.setDescription(description)
        tag = kwds['tag']
        self.wc.setTag(tag)
        gender = kwds['gender']
        self.wc.setGender(gender)
        provid, cityid = kwds['location']['provid'], kwds['location']['cityid']
        self.wc.setLocation(provid, cityid)
        schoolid, inyear, department = kwds['school']['id'], kwds['school']['in'], kwds['school']['department']
        self.wc.setSchool(schoolid, inyear, department)
        companyname, inyear, outyear, department = kwds['company']['name'], kwds['company']['in'], kwds['company']['out'], kwds['company']['department']
        self.wc.setCompany(companyname, inyear, outyear, department)
        year, month, day = kwds['birth']['year'], kwds['birth']['month'], kwds['birth']['day']
        self.wc.setBirthday(year, month, day)
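# Hedged usage sketch for ProfileFiller: the kwds layout is inferred from the
# keys fillProfile() reads above; the credentials and profile values are
# placeholders.
if __name__ == '__main__':
    pf = ProfileFiller('user@example.com', 'secret')
    if pf.isLogin():
        pf.fillProfile({
            'nick': 'demo',
            'domain': 'demodomain',
            'description': 'just a test account',
            'tag': 'reading',
            'gender': 'm',
            'location': {'provid': '11', 'cityid': '1'},
            'school': {'id': '123', 'in': '2006', 'department': 'CS'},
            'company': {'name': 'ACME', 'in': '2010', 'out': '2012', 'department': 'R&D'},
            'birth': {'year': '1987', 'month': '4', 'day': '30'},
        })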