def __init__(self, account, campaign, url, queue):
    """Bind this worker thread to one (account, campaign, url) job."""
    MyThread.__init__(self)
    # Job identity handed to the worker.
    self.account, self.campaign, self.url = account, campaign, url
    # Cross-thread plumbing: output queue and stop signal.
    self.queue = queue
    self.finish_flag = False
def __init__(self, account, campaign, url, queue):
    """Initialise the worker with its job description and result queue."""
    MyThread.__init__(self)
    self.account = account
    self.campaign = campaign
    self.url = url
    self.queue = queue
    # Set by the owner to ask the thread to stop.
    self.finish_flag = False
def __init__(self, account, campaign, url, queue):
    """Worker bound to one feed URL; the URL is normalised on the way in."""
    MyThread.__init__(self)
    self.account = account
    self.campaign = campaign
    # Drop a single trailing slash so later path joins stay stable.
    self.url = url[:-1] if url.endswith("/") else url
    self.finish_flag = False
    self.queue = queue
def __init__(self, url, username, password, queue):
    """Stream-reader thread: endpoint, credentials, and output queue."""
    MyThread.__init__(self)
    self.url = url
    # Credentials used to authenticate against the stream endpoint.
    self.username, self.password = username, password
    self.queue = queue
    self.finish_flag = False
    # Holds a partial line carried over between successive stream reads.
    self.remainder = ''
def __init__(self, account, campaign, url, queue):
    """Worker bound to one (account, campaign) feed URL."""
    MyThread.__init__(self)
    self.account = account
    self.campaign = campaign
    # Normalise: strip exactly one trailing slash, if present.
    normalized = url
    if normalized.endswith("/"):
        normalized = normalized[:-1]
    self.url = normalized
    self.finish_flag = False
    self.queue = queue
def __create_workers(self):
    """Create and start the parser worker threads.

    Fire-and-forget: thread references are not retained here.
    :return: Nothing
    """
    for _ in range(Properties.PARSER_MAX_THREADS):
        # Each worker gets its own service instance from a fresh factory.
        service = HTMLServiceFactory(self.html_service_type).get_instance()
        worker = MyThread(service, self)
        worker.start()
def create_workers(self):
    """Create, register, and start the crawler worker threads.

    Unlike the parser, the crawler keeps every thread in self.__threads
    so they can be managed later.
    :return: Nothing
    """
    for _ in range(Properties.CRAWLER_MAX_THREADS):
        # Each worker gets its own DAO instance for the configured backend.
        dao = UrlDAOFactory(self.db_type).get_instance()
        worker = MyThread(dao, self, self.bucket)
        self.__threads.append(worker)
        worker.start()
def getBrokenlinks(url):
    """Count broken outgoing links on *url*'s page.

    Spawns one worker thread per valid absolute link; a worker whose
    getResult() is truthy marks that link as broken.

    :param url: page object exposing getsoup() and geturl()
    :return: number of links whose check reported broken
    :raises WebcredError: when the page itself cannot be fetched
    """
    broken_links = 0
    threads = []
    try:
        soup = url.getsoup()
    except WebcredError:
        # FIX: re-raise the original error instead of rebuilding a new
        # WebcredError from e.message (which lost the traceback).
        raise
    except Exception:
        # FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt.
        raise WebcredError('Url is broken')
    for link in soup.find_all('a', href=True):
        uri = link.get('href')
        # TODO should it include inner links as well?
        if not uri.startswith(('http://', 'https://')):
            # Relative link: resolve against the page's own URL.
            uri = url.geturl() + uri
        if validators.url(uri):
            t = MyThread(Method='funcBrokenllinks', Name='brokenlinks', Url=uri)
            t.start()
            threads.append(t)
    for t in threads:
        t.join()
        # A truthy result means the worker found the link broken.
        if t.getResult():
            broken_links += 1
    return broken_links
def collect(self):
    """Run one checker thread per (remark, url) pair and yield each result.

    Results are yielded in the same order as self.urls.
    """
    workers = []
    for remark, url in self.urls:
        worker = MyThread(self.check, args=(remark, url))
        worker.start()
        workers.append(worker)
    for worker in workers:
        # Wait for completion before asking for the result.
        worker.join()
        yield worker.get_result()
def getImgratio(url):
    """Return the ratio of text size to total (text + images) size of a page.

    One worker thread fetches the size of each reachable image.

    :param url: Urlattributes-like object exposing getsize(), getsoup(),
        geturl()
    :return: float ratio; on failure to fetch the page itself, returns the
        error message string (legacy behaviour kept for existing callers)
    :raises WebcredError: when the ratio cannot be computed
    """
    total_img_size = 0
    threads = []
    try:
        text_size = url.getsize()
    except WebcredError as e:
        # NOTE(review): returns the message string instead of raising —
        # kept as-is because callers may depend on it.
        return e.message
    soup = url.getsoup()
    # Accumulate the size of every image, one worker thread per image.
    for link in soup.find_all('img', src=True):
        uri = link.get('src', None)
        if not uri.startswith(('http://', 'https://')):
            # Relative src: resolve against the page URL.
            uri = url.geturl() + uri
        if validators.url(uri):
            try:
                uri = Urlattributes(uri)
                t = MyThread(Method='funcImgratio', Name='Imgratio', Url=uri)
                t.start()
                threads.append(t)
            except WebcredError:
                # Best-effort: an inaccessible image is simply skipped.
                pass
    for t in threads:
        t.join()
        t.freemem()
        size = t.getResult()
        # Workers may return non-int sentinels on failure; count ints only.
        if isinstance(size, int):
            total_img_size += size
    try:
        total_size = total_img_size + text_size
        ratio = float(text_size) / total_size
    except (ValueError, ZeroDivisionError):
        # FIX: a zero total size raises ZeroDivisionError, which the old
        # `except ValueError` handler never caught.
        raise WebcredError('Error in fetching images')
    return ratio
# NOTE(review): the first two statements are the tail of a method whose
# definition begins before this chunk — presumably a sibling of
# getAllHistoryFeedURLs that accumulates (acc, camp, url) tuples the same
# way; indentation below is reconstructed, not original.
                res.append((acc, camp, url))
        return res

    def getAllHistoryFeedURLs(self):
        """Return (account, campaign, url) triples for every forum of an
        active campaign whose history has not been fetched yet."""
        res = []
        accs = MongoManager.getActiveAccounts()
        for acc in accs:
            for camp in acc.getActiveCampaigns():
                hff = camp.getHistoryFetchedForums()
                for url in camp.getForums():
                    # Skip forums whose history was already fetched.
                    if url not in hff:
                        res.append((acc, camp, url))
        return res


if __name__ == "__main__":
    #f = FeedFetcher('http://blogdeunaembarazada.com/comments/feed')
    #'http://www.mamitips.com.pe/comments/feed/')
    #f.start()
    fm = FeedManager()
    fm.startWorking()
    try:
        # Keep the main thread alive until Ctrl-C; the manager's worker
        # threads do the actual work.  (Python 2 syntax below.)
        while True:
            #pprint(fm.getStats())
            time.sleep(1)
    except KeyboardInterrupt, e:
        print "Terminando.\n"
        fm.stopWorking()
        print "Terminado.\n"
        MyThread.checkFinalization()
# NOTE(review): the first three lines are the tail of a loop that starts
# before this chunk — the single-threaded pass over sections; indentation
# is reconstructed, not original.
        ret_code, ret_msg, relation_ds, out_ds_list[section] = deal_csv(section, file_dict, glob_config, relation_ds)
        if(ret_code!=0):
            raise Exception("section[{}] func[{}] error[{}]".format(section, 'deal_csv', ret_msg))

# Fan the remaining sections out to worker threads, one thread per section.
threads=[]
threads_num=0
for section in section_list_multiple:
    # Optional filter: when one section was requested by name, skip others.
    if(section_name!=None):
        if(section!=section_name):
            continue
    file_dict=config_dic.get(section, None)
    if(file_dict==None):
        raise Exception("section[{}] not found".format(section))
    t=MyThread(deal_csv, args=(section, file_dict, glob_config, relation_ds))
    threads.append(t)
    t.start()
    threads_num+=1
    # print("\n\nNow begin deal section[{}]".format(section))
    # ret_code, ret_msg, relation_ds, out_ds =deal_csv(file_dict, glob_config, relation_ds, out_ds)
    # if(ret_code!=0):
    #     raise Exception("section[{}] func[{}] error[{}]".format(section, 'deal_csv', ret_msg))

for t in threads:
    # Must join here: otherwise the main thread outruns the workers and the
    # results would not be available yet.
    t.join()
    # Each worker returns (ret_code, ret_msg, relation_ds, out_ds); results
    # are keyed by the section name stored as the thread's first arg.
    ret_code, ret_msg, relation_ds, out_ds_list[t.args[0]] = t.get_result()
    if(ret_code!=0):
        raise Exception("thread[{}] func[{}] error[{}]".format(t, 'deal_csv', ret_msg))
# out_ds=pd.DataFrame({'openday':[], 'detail_type':[], 'detail_cnt':[], 'detail_amt':[]})
if __name__ == "__main__":
    # Note: this automatically reconnects to the stream upon being disconnected
    # SECURITY NOTE(review): account credentials are hard-coded in source;
    # they should be moved to environment variables or a config file.
    UN = 'pablobesada'
    PWD = 'pdbpdb'
    ACC = 'promored'
    gcm = GnipCollectionManager(ACC, UN, PWD)
    gcm.startWorking()
    UN = '*****@*****.**'
    PWD = 'ladedarin'
    ACC = 'promored'
    gtm = GnipTwitterManager(ACC, UN, PWD)
    gtm.startWorking()
    try:
        # Keep the main thread alive until Ctrl-C; both managers run their
        # own worker threads.  (Python 2 syntax below.)
        while True:
            #pprint(gcm.getStats())
            time.sleep(1)
    except KeyboardInterrupt, e:
        print "Terminando.\n"
        gcm.stopWorking()
        gtm.stopWorking()
        pprint(gcm.getStats())
        pprint(gtm.getStats())
        print "Terminado.\n"
        MyThread.checkFinalization()
        sys.exit(0)
def __init__(self):
    """Initialise the rules monitor thread and register it as the
    process-wide instance before constructing the rule managers."""
    MyThread.__init__(self)
    self.finish_flag = False
    # Singleton-style registration.  NOTE(review): this runs BEFORE the two
    # managers below are constructed — presumably so they can look up
    # RulesMonitor.INSTANCE; preserve this ordering (TODO confirm).
    RulesMonitor.INSTANCE = self
    self.gnipTwitterRulesManager = GnipTwitterRulesManager()
    self.gnipCollectionRulesManager = GnipCollectionRulesManager()
def __init__(self, buf, queue):
    """Worker over a buffer, reporting through *queue*."""
    # NOTE(review): attributes are assigned BEFORE the base initialiser
    # runs — if MyThread.__init__ starts the thread, buf/queue must already
    # be set; preserve this ordering (TODO confirm).
    self.buf = buf
    self.queue = queue
    MyThread.__init__(self)
def main(cross_num=5, exp_path='', sav_dir='', conf='', cfg_sec='', bool_vad=False):
    """Extract IEMOCAP-style spectrogram features per session (one thread
    per session), compute a global bias, and write cross-validation splits.

    :param cross_num: 5 for five-fold (leave-one-session-out) splits;
        any other value triggers the ten-fold branch
    :param exp_path: working directory; per-session subdirectories created
    :param sav_dir: output directory for the five-fold splits
    :param conf: config file path forwarded to get_emo_data
    :param cfg_sec: config section forwarded to get_emo_data
    :param bool_vad: forwarded to get_emo_data and store
    """
    if not os.path.exists(exp_path):
        os.mkdir(exp_path)
    for i in range(1, 6):
        session_name = 'Session' + str(i)
        session_path = exp_path + '/' + session_name
        if not os.path.exists(session_path):
            os.mkdir(session_path)
    if not os.path.exists(sav_dir):
        os.mkdir(sav_dir)
    thread_list = []
    lab_list = []
    # One worker per session; each returns a dict of labelled features.
    for i in range(5):
        # NOTE(review): this joins exp_path and 'Session…' WITHOUT a '/',
        # unlike exp_path + '/' + session_name above — only equivalent if
        # exp_path ends with '/'; TODO confirm.
        t = MyThread(
            get_emo_data,
            args=('Session' + str(i + 1), True, bool_vad, 'spectrogram', 3,
                  exp_path + 'Session' + str(i + 1) + '/', conf, cfg_sec))
        thread_list.append(t)
    for t in thread_list:
        t.start()
    for t in thread_list:
        t.join()
        lab_list.append(t.get_result())
    # Merge the per-session results into one dict keyed by session/speaker.
    lab_dict = dict()
    for item in lab_list:
        lab_dict.update(item)
    min_val = np.Inf
    freq_bag = set()
    time_bag = set()
    for sess_spk in lab_dict.keys():
        # sess_spk aims to specific session of F/M,
        # wav_file_dict: {wav_file_name: [[mat1,lab1], [mat2, lab2] ...], wav_file_name: [[mat1,lab1], [mat2,
        # lab2] ...] , ..}
        wav_file_info_dict = lab_dict.get(sess_spk)
        print(sess_spk, len(list(wav_file_info_dict.keys())))
        for wav_file in wav_file_info_dict:
            info_list = wav_file_info_dict.get(wav_file)
            # Scan all matrices for the global minimum and the observed
            # time/frequency dimensions.
            for mat, label, valence, arouse, domain in info_list:
                min_val = min(min_val, mat.min())
                # max_length = max(max_length, mat.shape[1])
                # min_length = min(min_length, mat.shape[1])
                time_bag.add(mat.shape[1])
                freq_bag.add(mat.shape[0])
    # print('Time bag:\n\t', time_bag)
    print('Freq bag:\n\t', freq_bag)
    print(max(time_bag), min(time_bag))
    # Bias shifts all values strictly above zero (min becomes 1).
    bias = min_val - 1
    print(bias)
    sess_spk = set(lab_dict.keys())
    if cross_num == 5:
        for i in range(1, 6):
            # construct the dataset for five-fold cross validation
            cross_val = 'leave_' + str(i)
            file_path = sav_dir + '/' + cross_val
            # NOTE(review): dev_key is removed from train_key but never
            # passed to store() — TODO confirm the dev split is intentional.
            dev_key = {'Session' + str(i) + '_F'}
            test_key = {'Session' + str(i) + '_M'}
            train_key = sess_spk - test_key - dev_key
            if not os.path.exists(file_path):
                os.mkdir(file_path)
            print(test_key)
            store(file_path=file_path, train_key=train_key, test_key=test_key, data_dict=lab_dict,
                  bias=bias, bool_vad=bool_vad)
            print()
    else:
        sess_spk_list = list(sess_spk)
        # construct the dataset for ten-fold cross validation
        # NOTE(review): data_path_prefix is not defined in this function —
        # presumably a module-level global; verify before using this branch.
        for i in range(1, 11):
            idx = sess_spk_list[i - 1]
            cross_val = 'leave_' + str(i)
            file_path = data_path_prefix + '10cross_set/' + cross_val
            test_key = {idx}
            train_key = sess_spk - test_key
            if not os.path.exists(file_path):
                os.mkdir(file_path)
            store(file_path=file_path, train_key=train_key, test_key=test_key, data_dict=lab_dict)
def setUp(self):
    """Build the system under test with a mocked collaborator."""
    collaborator = MyThreadMock()
    self.sut = MyThread(None, collaborator)