def run(self):
    """Run the crawl selected by ``self.website`` and notify the window.

    For ``settings.SINA_WEIBO`` the crawl is delegated to
    ``sina_weibo.main``; ``self.ids`` is passed as ``uids`` when
    ``self.ids_type`` is ``'uid'`` and as ``msg_urls`` when it is
    ``'msg_url'`` (any other ``ids_type`` starts nothing).  Every other
    website only produces a "not implemented" log line.

    All window updates go through ``wx.CallAfter`` so they are queued on
    the wx event loop instead of being executed by the calling thread.
    NOTE(review): presumably this method runs on a worker thread --
    confirm against the enclosing class.

    ``self.window.finished`` is scheduled even when the crawl raises, so
    the window is not left waiting; the exception still propagates.
    """
    try:
        if self.website == settings.SINA_WEIBO:
            if self.ids_type == 'uid':
                sina_weibo.main(fetcher=self.fetcher,
                                uids=self.ids,
                                fetch_data=self.fetch_data,
                                store_path=self.store_path,
                                window=self.window)
            elif self.ids_type == 'msg_url':
                sina_weibo.main(fetcher=self.fetcher,
                                msg_urls=self.ids,
                                store_path=self.store_path,
                                window=self.window)
        else:
            if self.website == settings.TWITTER:
                msg = 'For twitter, not implemented in current version.'
            elif self.website == settings.FACEBOOK:
                msg = 'For facebook, not implemented in current version.'
            else:
                msg = ('For %s, not implemented in current version.'
                       % self.website)
            # Queue the log call like the ``finished`` call below rather
            # than touching the window directly from this thread.
            wx.CallAfter(self.window.write_logs, msg)
    finally:
        # Always tell the window that the work is over.
        wx.CallAfter(self.window.finished)
def run(self):
    """Start the crawl for ``self.website`` and report completion.

    ``settings.COMWEIBO`` crawls with ``weibo_com=True``, passing
    ``self.ids`` as ``uids`` or ``msg_urls`` depending on
    ``self.ids_type``; ``settings.CNWEIBO`` crawls ``self.ids`` as uids
    with ``weibo_com=False``.  Any other website only logs a
    "not implemented" message.  ``self.window.finished`` is scheduled
    through ``wx.CallAfter`` at the end.
    """
    site = self.website
    crawl_args = None  # keyword arguments for sina_weibo.main, if any
    notice = None      # log text for unsupported websites, if any

    if site == settings.COMWEIBO:
        if self.ids_type == 'uid':
            crawl_args = dict(uids=self.ids)
        elif self.ids_type == 'msg_url':
            crawl_args = dict(msg_urls=self.ids)
        if crawl_args is not None:
            crawl_args.update(store_path=self.store_path, weibo_com=True)
    elif site == settings.CNWEIBO:
        # NOTE(review): unlike the weibo.com calls, no store_path is
        # passed here -- confirm this is intended.
        crawl_args = dict(uids=self.ids, weibo_com=False)
    elif site == settings.TWITTER:
        notice = 'For twitter, not implemented in current version.'
    elif site == settings.FACEBOOK:
        notice = 'For facebook, not implemented in current version.'
    else:
        notice = 'For %s, not implemented in current version.' % site

    if crawl_args is not None:
        sina_weibo.main(fetcher=self.fetcher,
                        fetch_data=self.fetch_data,
                        window=self.window,
                        **crawl_args)
    if notice is not None:
        wx.CallAfter(self.window.write_logs, str(notice))

    # finished
    wx.CallAfter(self.window.finished)
from thread_pool import WorkerManager fetcher = ComWeiboFetcher(username=account.user, password=account.pwd) login_ok = fetcher.check_cookie() if not login_ok: print 'login failed.' sys.exit() fans = [] follows = [] sw.main(fetcher, fetch_data='follows', store_path='./file/', uids=memstorage.users_id_moniterd, uids_storage=follows) sw.main(fetcher, fetch_data='fans', store_path='./file/', uids=memstorage.users_id_moniterd, uids_storage=fans) friends_list = list(set(fans) | set(follows)) print friends_list #host's weibo sw.main(fetcher, fetch_data='weibos', store_path='./file/',
import memstorage import account from thread_pool import WorkerManager fetcher = ComWeiboFetcher(username=account.user, password=account.pwd) login_ok = fetcher.check_cookie() if not login_ok: print 'login failed.' sys.exit() fans = [] follows = [] sw.main(fetcher, fetch_data='follows', store_path='./file/', uids=memstorage.users_id_moniterd, uids_storage=follows) sw.main(fetcher, fetch_data='fans', store_path='./file/', uids=memstorage.users_id_moniterd, uids_storage=fans) friends_list = list(set(fans)|set(follows)) print friends_list #host's weibo sw.main(fetcher,fetch_data='weibos',store_path='./file/',uids=memstorage.users_id_moniterd) #friends' weibo n_threads = 10 n_paritions = 10 len_partition = len(friends_list)/n_paritions worker_manager = WorkerManager(n_threads) for i in range(0,len(friends_list),len_partition): worker_manager.add_job(sw.main, fetcher, fetch_data='weibos',store_path='./file/',
def do_task(self):
    '''Run the task described by ``task.dat`` and archive the results.

    Task file format (one ``key:value`` entry per line):
        task_id:**(time format) time.strftime('%Y-%m-%d-%H-%M', time.localtime())
        id_type:**(uid/msg_url)
        fetch_data: weibos/follows/fans/infos
        uids:(separated by semicolon)
        msg_urls:(separated by semicolon)

    The task file is read from ``TASK_PATH``.  Depending on ``id_type``
    the valid uids or message urls are handed to ``sina_weibo.main``;
    the matching ``.csv`` files found in ``self.store_path`` are then
    added to a tar archive together with the task file, moved to
    ``self.upload_path``, and the task file is deleted.

    Returns:
        The (already closed) ``tarfile.TarFile`` that was written, or
        ``None`` when there is no task file or nothing was archived.
    '''
    task_id = ''
    id_type = 'uid'
    fetch_data = 'infos'
    uids = []
    msg_urls = []
    tar_file = None
    # Stays empty when no crawl is started, so the archive step is
    # skipped instead of failing on an unbound name.
    files = []

    f_task = os.path.join(TASK_PATH, 'task.dat')
    if not os.path.exists(f_task):
        return tar_file

    with codecs.open(f_task, 'r', 'utf-8') as fp:
        data = fp.readlines()

    # Compiled once for all lines; the dot is escaped so that only the
    # literal host weibo.com is accepted.
    url_pattern = re.compile(r'^http[s]?://weibo\.com/\d*/[A-Za-z0-9]+$',
                             re.U)

    # parse
    for line in data:
        line = line.strip()
        if line.startswith('task_id:'):
            task_id = line.split('task_id:')[-1]
        elif line.startswith('id_type:'):
            id_type = line.split('id_type:')[-1]
        elif line.startswith('fetch_data:'):
            fetch_data = line.split('fetch_data:')[-1].lower()
        elif line.startswith('uids:'):
            # Iterate the values parsed from the line (the result list
            # itself is still empty here) and keep the numeric ones.
            for uid in line.split('uids:')[-1].split(';'):
                uid = uid.strip().encode('utf-8')
                try:
                    int(uid)
                except ValueError:
                    continue
                uids.append(uid)
        elif line.startswith('msg_urls:'):
            for msg_url in line.split('msg_urls:')[-1].split(';'):
                match = url_pattern.search(msg_url.strip())
                if match is not None:
                    msg_urls.append(match.group(0))
        else:
            # NOTE(review): nesting reconstructed from a
            # whitespace-collapsed source; confirm this branch belongs
            # to the per-line chain.
            msg = 'Task format error.'
            logger.info(msg)
            wx.CallAfter(self.window.write_logs, msg)

    # start
    if id_type == 'uid' and uids:
        sina_weibo.main(fetcher=self.fetcher,
                        fetch_data=fetch_data,
                        uids=uids,
                        store_path=self.store_path,
                        window=self.window)
        files = [f for f in os.listdir(self.store_path)
                 if fetch_data in f and f.endswith('.csv')]
    elif id_type == 'msg_url' and msg_urls:
        sina_weibo.main(fetcher=self.fetcher,
                        msg_urls=msg_urls,
                        store_path=self.store_path,
                        window=self.window)
        # Parenthesised so the .csv check applies to both name patterns.
        files = [f for f in os.listdir(self.store_path)
                 if ('reposts' in f or 'comments' in f)
                 and f.endswith('.csv')]

    # compress and upload
    if files:
        tar_f = str(self.host_fetcher.username) + str(task_id) + '.tar.gz'
        tar_f = os.path.join(self.store_path, tar_f)
        # NOTE(review): the archive is bz2-compressed although it is
        # named .tar.gz; left unchanged because the consumer of the
        # archive is not visible here -- confirm.
        tar_file = tarfile.open(tar_f, 'w:bz2')
        try:
            tar_file.add(f_task, arcname='task.dat')
            for f in files:
                f_name = os.path.join(self.store_path, f)
                tar_file.add(f_name, arcname=f)
                os.rename(f_name, os.path.join(self.upload_path, f))
        finally:
            tar_file.close()
        # f_task is already the full path used to read and archive the
        # task file, so it is removed as-is.
        # NOTE(review): placement inside this branch reconstructed from
        # a whitespace-collapsed source -- confirm.
        os.remove(f_task)
    return tar_file
# encoding: utf-8
from sina_weibo.fetcher import ComWeiboFetcher
import sina_weibo
import sys
import time
import memstorage

# NOTE(review): the account credentials are hard-coded in source;
# consider loading them from outside the repository.
user = '******'
pwd = 'ecnupass'

fetcher = ComWeiboFetcher(username=user, password=pwd)
login_ok = fetcher.check_cookie()
if not login_ok:
    print('login failed.')
    # Non-zero status so a calling shell or scheduler sees the failure.
    sys.exit(1)

# Fetch 'weibos' data for the uids listed in memstorage, using
# './file/' as the store path.
sina_weibo.main(fetcher,
                fetch_data='weibos',
                store_path='./file/',
                uids=memstorage.users_id_moniterd)
def do_task(self):
    '''Execute the task stored in ``task.dat`` and pack what it produced.

    Task file format (one ``key:value`` entry per line):
        task_id:**(time format) time.strftime('%Y-%m-%d-%H-%M', time.localtime())
        id_type:**(uid/msg_url)
        fetch_data: weibos/follows/fans/infos
        uids:(separated by semicolon)
        msg_urls:(separated by semicolon)

    The file is looked up under ``TASK_PATH``.  Valid uids or message
    urls (selected by ``id_type``) are passed to ``sina_weibo.main``;
    afterwards the matching ``.csv`` files in ``self.store_path`` are
    written to a tar archive along with the task file, moved into
    ``self.upload_path``, and the task file is removed.

    Returns:
        The closed ``tarfile.TarFile`` object of the written archive, or
        ``None`` if the task file is missing or no file was archived.
    '''
    task_id = ''
    id_type = 'uid'
    fetch_data = 'infos'
    uids = []
    msg_urls = []
    tar_file = None
    # Pre-bound so that the archive step below is simply skipped when
    # neither crawl branch runs.
    files = []

    f_task = os.path.join(TASK_PATH, 'task.dat')
    if not os.path.exists(f_task):
        return tar_file

    with codecs.open(f_task, 'r', 'utf-8') as fp:
        data = fp.readlines()

    # One compiled pattern for every line; the escaped dot restricts
    # matches to the literal host weibo.com.
    url_pattern = re.compile(r'^http[s]?://weibo\.com/\d*/[A-Za-z0-9]+$',
                             re.U)

    # parse
    for line in data:
        line = line.strip()
        if line.startswith('task_id:'):
            task_id = line.split('task_id:')[-1]
        elif line.startswith('id_type:'):
            id_type = line.split('id_type:')[-1]
        elif line.startswith('fetch_data:'):
            fetch_data = line.split('fetch_data:')[-1].lower()
        elif line.startswith('uids:'):
            # Walk the candidates parsed from this line -- not the
            # (still empty) result list -- keeping only numeric ids.
            for uid in line.split('uids:')[-1].split(';'):
                uid = uid.strip().encode('utf-8')
                try:
                    int(uid)
                except ValueError:
                    continue
                uids.append(uid)
        elif line.startswith('msg_urls:'):
            for msg_url in line.split('msg_urls:')[-1].split(';'):
                match = url_pattern.search(msg_url.strip())
                if match is not None:
                    msg_urls.append(match.group(0))
        else:
            # NOTE(review): nesting reconstructed from a
            # whitespace-collapsed source; confirm this branch belongs
            # to the per-line chain.
            msg = 'Task format error.'
            logger.info(msg)
            wx.CallAfter(self.window.write_logs, msg)

    # start
    if id_type == 'uid' and uids:
        sina_weibo.main(fetcher=self.fetcher,
                        fetch_data=fetch_data,
                        uids=uids,
                        store_path=self.store_path,
                        window=self.window)
        files = [f for f in os.listdir(self.store_path)
                 if fetch_data in f and f.endswith('.csv')]
    elif id_type == 'msg_url' and msg_urls:
        sina_weibo.main(fetcher=self.fetcher,
                        msg_urls=msg_urls,
                        store_path=self.store_path,
                        window=self.window)
        # Explicit grouping: both name patterns must also end in .csv.
        files = [f for f in os.listdir(self.store_path)
                 if ('reposts' in f or 'comments' in f)
                 and f.endswith('.csv')]

    # compress and upload
    if files:
        tar_f = str(self.host_fetcher.username) + str(task_id) + '.tar.gz'
        tar_f = os.path.join(self.store_path, tar_f)
        # NOTE(review): opened with bz2 compression despite the .tar.gz
        # file name; kept as-is since the archive's consumer is not
        # visible here -- confirm.
        tar_file = tarfile.open(tar_f, 'w:bz2')
        try:
            tar_file.add(f_task, arcname='task.dat')
            for f in files:
                f_name = os.path.join(self.store_path, f)
                tar_file.add(f_name, arcname=f)
                os.rename(f_name, os.path.join(self.upload_path, f))
        finally:
            tar_file.close()
        # f_task already is the path the task was read and archived
        # from, so remove exactly that path.
        # NOTE(review): placement inside this branch reconstructed from
        # a whitespace-collapsed source -- confirm.
        os.remove(f_task)
    return tar_file
# encoding: utf-8
from sina_weibo.fetcher import ComWeiboFetcher
import sina_weibo
import sys
import time
import memstorage

# NOTE(review): the account credentials are hard-coded in source;
# consider loading them from outside the repository.
user = '******'
pwd = 'ecnupass'

fetcher = ComWeiboFetcher(username=user, password=pwd)
login_ok = fetcher.check_cookie()
if not login_ok:
    print('login failed.')
    # Non-zero status so a calling shell or scheduler sees the failure.
    sys.exit(1)

# Fetch 'weibos' data for the uids listed in memstorage, using
# './file/' as the store path.  memstorage.weibos_url_moniterd is
# passed as weibos_storage.
# NOTE(review): presumably main() fills that list in place -- confirm.
sina_weibo.main(fetcher,
                fetch_data='weibos',
                store_path='./file/',
                uids=memstorage.users_id_moniterd,
                weibos_storage=memstorage.weibos_url_moniterd)
def TestWeibo__init__(user, pwd, weibo_com):
    """Log in and run each crawl type once against fixed sample ids.

    ``weibo_com`` selects ``ComWeiboFetcher`` (true) or
    ``CnWeiboFetcher`` (false) and is forwarded to every
    ``sina_weibo.main`` call.  The process exits if the cookie check
    fails.  A progress line is printed before each crawl, followed by a
    final line with the fetcher's connection count and the elapsed
    whole seconds.
    """
    fetcher_cls = ComWeiboFetcher if weibo_com else CnWeiboFetcher
    fetcher = fetcher_cls(username=user, password=pwd)
    if not fetcher.check_cookie():
        print("login failed.")
        sys.exit()

    uids = [1000000253, 10057, 10029]
    msg_urls = ["http://weibo.com/1000000253/ezC36cq3i6G",
                "http://weibo.com/1713926427/A2V5CENGU"]

    # (progress message, fetch_data value, target keyword arguments)
    jobs = (
        ("crawl weibos", "weibos", {"uids": uids}),
        ("crawl follows", "follows", {"uids": uids}),
        ("crawl fans", "fans", {"uids": uids}),
        ("crawl infos", "infos", {"uids": uids}),
        ("crawl reposts", "repost", {"msg_urls": msg_urls}),
        ("crawl comments", "comment", {"msg_urls": msg_urls}),
    )

    start = time.time()
    for banner, kind, targets in jobs:
        print(banner)
        sina_weibo.main(fetcher,
                        fetch_data=kind,
                        store_path="./file/",
                        weibo_com=weibo_com,
                        **targets)

    cost_time = int(time.time() - start)
    print("finished: # connections: %s, cost time: %s"
          % (fetcher.n_connections, cost_time))
def TestWeibo__init__(user, pwd, weibo_com):
    '''Exercise every crawl mode once with a fixed set of sample ids.

    A ``ComWeiboFetcher`` is created when ``weibo_com`` is true and a
    ``CnWeiboFetcher`` otherwise; the flag is also forwarded to each
    ``sina_weibo.main`` call.  If the cookie check fails the process
    exits.  Each crawl is announced with a printed line and a summary
    with the fetcher's connection count and the elapsed whole seconds
    is printed at the end.
    '''
    if weibo_com:
        make_fetcher = ComWeiboFetcher
    else:
        make_fetcher = CnWeiboFetcher
    fetcher = make_fetcher(username=user, password=pwd)

    logged_in = fetcher.check_cookie()
    if not logged_in:
        print('login failed.')
        sys.exit()

    uids = [1000000253, 10057, 10029]
    msg_urls = [
        'http://weibo.com/1000000253/ezC36cq3i6G',
        'http://weibo.com/1713926427/A2V5CENGU',
    ]
    # Options shared by every crawl call below.
    shared = dict(store_path='./file/', weibo_com=weibo_com)

    start = time.time()

    # User-based crawls: the progress text and fetch_data share a name.
    for kind in ('weibos', 'follows', 'fans', 'infos'):
        print('crawl ' + kind)
        sina_weibo.main(fetcher, fetch_data=kind, uids=uids, **shared)

    # Message-based crawls: fetch_data is the singular of the label.
    for banner, kind in (('crawl reposts', 'repost'),
                         ('crawl comments', 'comment')):
        print(banner)
        sina_weibo.main(fetcher, fetch_data=kind, msg_urls=msg_urls,
                        **shared)

    cost_time = int(time.time() - start)
    print('finished: # connections: %s, cost time: %s' % (
        fetcher.n_connections, cost_time))
import sys
import time
import memstorage

fetcher = ComWeiboFetcher(username=memstorage.user, password=memstorage.pwd)
login_ok = fetcher.check_cookie()
if not login_ok:
    print('login failed.')
    # Non-zero status so a calling shell or scheduler sees the failure.
    sys.exit(1)

start = time.time()

# Step 1: 'follows' crawl for the monitored uids;
# memstorage.uids_url_moniterd is passed as uids_storage.
# NOTE(review): presumably main() fills the storage lists in place,
# since they are used as inputs by the later steps -- confirm.
sina_weibo.main(fetcher,
                fetch_data='follows',
                store_path='./file/',
                uids=memstorage.users_id_moniterd,
                uids_storage=memstorage.uids_url_moniterd)

# Disabled: 'fans' crawl and intersection of follows and fans.
#sina_weibo.main(fetcher, fetch_data='fans', store_path='./file/', uids=memstorage.users_id_moniterd, uids_storage=memstorage.uids_url2_moniterd)
#a = set(memstorage.uids_url_moniterd)
#b = set(memstorage.uids_url2_moniterd)
#print a & b
#c = list(a&b)

# Step 2: 'weibos' crawl for the uids stored by step 1.
sina_weibo.main(fetcher,
                fetch_data='weibos',
                store_path='./file/',
                uids=memstorage.uids_url_moniterd,
                weibos_storage=memstorage.weibos_url_moniterd)

# Step 3: message-url crawl over the weibos stored by step 2.
print('crawl reposts and comments')
sina_weibo.main(fetcher,
                store_path='./file/',
                msg_urls=memstorage.weibos_url_moniterd)
def do_task(self):
    """Process the task in ``task.dat`` and bundle the resulting files.

    Task file format (one ``key:value`` entry per line):
        task_id:**(time format) time.strftime('%Y-%m-%d-%H-%M', time.localtime())
        id_type:**(uid/msg_url)
        fetch_data: weibos/follows/fans/infos
        uids:(separated by semicolon)
        msg_urls:(separated by semicolon)

    The task file lives under ``TASK_PATH``.  According to ``id_type``
    the valid uids or message urls are given to ``sina_weibo.main``;
    the matching ``.csv`` files in ``self.store_path`` are then put
    into a tar archive with the task file, moved to
    ``self.upload_path``, and the task file is deleted.

    Returns:
        The closed ``tarfile.TarFile`` of the archive that was written,
        or ``None`` when no task file exists or nothing was archived.
    """
    task_id = ""
    id_type = "uid"
    fetch_data = "infos"
    uids = []
    msg_urls = []
    tar_file = None
    # Bound up front so the archive step is skipped, not crashed, when
    # no crawl branch is taken.
    files = []

    f_task = os.path.join(TASK_PATH, "task.dat")
    if not os.path.exists(f_task):
        return tar_file

    with codecs.open(f_task, "r", "utf-8") as fp:
        data = fp.readlines()

    # Compiled a single time; the escaped dot limits matches to the
    # literal host weibo.com.
    url_pattern = re.compile(r"^http[s]?://weibo\.com/\d*/[A-Za-z0-9]+$", re.U)

    # parse
    for line in data:
        line = line.strip()
        if line.startswith("task_id:"):
            task_id = line.split("task_id:")[-1]
        elif line.startswith("id_type:"):
            id_type = line.split("id_type:")[-1]
        elif line.startswith("fetch_data:"):
            fetch_data = line.split("fetch_data:")[-1].lower()
        elif line.startswith("uids:"):
            # Loop over the values parsed from the line (the result
            # list is still empty at this point); keep numeric ids.
            for uid in line.split("uids:")[-1].split(";"):
                uid = uid.strip().encode("utf-8")
                try:
                    int(uid)
                except ValueError:
                    continue
                uids.append(uid)
        elif line.startswith("msg_urls:"):
            for msg_url in line.split("msg_urls:")[-1].split(";"):
                match = url_pattern.search(msg_url.strip())
                if match is not None:
                    msg_urls.append(match.group(0))
        else:
            # NOTE(review): nesting reconstructed from a
            # whitespace-collapsed source; confirm this branch belongs
            # to the per-line chain.
            msg = "Task format error."
            logger.info(msg)
            wx.CallAfter(self.window.write_logs, msg)

    # start
    if id_type == "uid" and uids:
        sina_weibo.main(
            fetcher=self.fetcher,
            fetch_data=fetch_data,
            uids=uids,
            store_path=self.store_path,
            window=self.window,
        )
        files = [
            f
            for f in os.listdir(self.store_path)
            if fetch_data in f and f.endswith(".csv")
        ]
    elif id_type == "msg_url" and msg_urls:
        sina_weibo.main(
            fetcher=self.fetcher,
            msg_urls=msg_urls,
            store_path=self.store_path,
            window=self.window,
        )
        # Grouped so the .csv requirement covers both name patterns.
        files = [
            f
            for f in os.listdir(self.store_path)
            if ("reposts" in f or "comments" in f) and f.endswith(".csv")
        ]

    # compress and upload
    if files:
        tar_f = str(self.host_fetcher.username) + str(task_id) + ".tar.gz"
        tar_f = os.path.join(self.store_path, tar_f)
        # NOTE(review): bz2 compression is used although the file is
        # named .tar.gz; unchanged because the archive's consumer is
        # not visible here -- confirm.
        tar_file = tarfile.open(tar_f, "w:bz2")
        try:
            tar_file.add(f_task, arcname="task.dat")
            for f in files:
                f_name = os.path.join(self.store_path, f)
                tar_file.add(f_name, arcname=f)
                os.rename(f_name, os.path.join(self.upload_path, f))
        finally:
            tar_file.close()
        # f_task is the complete path used for reading and archiving
        # the task file, so delete that path directly.
        # NOTE(review): placement inside this branch reconstructed from
        # a whitespace-collapsed source -- confirm.
        os.remove(f_task)
    return tar_file