def draw_histogram(args):
    """Draw per-metric histograms of pairwise distances and save them to a file.

    For every selected distance column, overlays two step histograms:
    distances between same-cluster pairs (green) and different-cluster
    pairs (red), as loaded by load_pairwise_dist().

    args fields used:
        pairwise_dist_file -- input file for load_pairwise_dist()
        dist_cols          -- 'na' for all columns, else comma-separated ints
        normalized         -- passed through as matplotlib's `normed` flag
        histogram_fig      -- output figure path
    """
    logPrint('[draw_histogram] starts')
    # load data
    dist_label_dic, pairwise_dist_dic = load_pairwise_dist(
        args.pairwise_dist_file)
    if args.dist_cols == 'na':
        dist_cols = [c for c in dist_label_dic.keys()]
    else:
        dist_cols = [
            int(col) for col in args.dist_cols.strip().split(',')
            if col != ''
        ]
    # draw histogram
    if len(dist_cols) > 1:
        fig, axes = plt.subplots(nrows=len(dist_cols))
        # BUG FIX: index axes by subplot position (ax_idx), not by the metric
        # column id i. The original used axes[i], which raises IndexError (or
        # draws on the wrong subplot) whenever dist_cols is not exactly 0..n-1
        # (e.g. --dist_cols 2,3).
        for ax_idx, i in enumerate(dist_cols):
            # NOTE: `normed` was removed in matplotlib 3.x (use `density`);
            # kept here because the rest of this codebase targets the old API.
            axes[ax_idx].hist(pairwise_dist_dic[(0, i)],
                              50,
                              normed=args.normalized,
                              facecolor='green',
                              histtype='step',
                              label='same cluster')
            axes[ax_idx].hist(pairwise_dist_dic[(1, i)],
                              50,
                              normed=args.normalized,
                              facecolor='red',
                              histtype='step',
                              label='different cluster')
            axes[ax_idx].set_title('%s' % dist_label_dic[i])
            axes[ax_idx].legend()
    else:
        fig, ax = plt.subplots()
        ax.hist(pairwise_dist_dic[(0, dist_cols[0])],
                50,
                normed=args.normalized,
                facecolor='green',
                histtype='step',
                label='same cluster')
        ax.hist(pairwise_dist_dic[(1, dist_cols[0])],
                50,
                normed=args.normalized,
                facecolor='red',
                histtype='step',
                label='different cluster')
        # BUG FIX: title the column actually drawn, not column 0.
        ax.set_title('%s' % dist_label_dic[dist_cols[0]])
        ax.legend()
    # save all
    plt.tight_layout()
    plt.savefig(args.histogram_fig)
    logPrint('[draw_histogram] finished. %s written' % args.histogram_fig)
    return
def siamese_seq2seq_data_pretrain(param_dic, args, data_dir):
    """Generate seq2seq (cgk) pretraining pairs under <data_dir>/pretrain/.

    Always produces the training pair file; additionally produces a
    validation pair file when args.purpose == 'train' and the config asks
    for a positive number of validation clusters.
    """
    logPrint('siamese_seq2seq_data_pretrain')

    def _gen_seq2seq_cmd(output_path, num):
        # Assemble the simulate_data.py gen_seq2seq invocation for one output.
        pieces = [
            'python simulate_data.py gen_seq2seq ',
            '--output %s ' % output_path,
            '--seq_type %s ' % param_dic['seq_type'],
            '--seq2seq_type %s ' % param_dic['seq2seq_type'],
            '--num %s ' % num,
            '--length %s ' % param_dic['cluster_len'],
        ]
        return ''.join(pieces)

    # seq2seq_cgk pairs for the seq2seq architecture (training set)
    seq2seq_pair = '%s/pretrain/seq2seq_cgk.txt' % data_dir
    run_cmd(_gen_seq2seq_cmd(seq2seq_pair, param_dic['n_clusters']))

    # optional validation set
    if args.purpose == 'train' and param_dic['n_clusters_validation'] > 0:
        seq2seq_pair_validation = ('%s/pretrain/seq2seq_cgk_validation.txt'
                                   % data_dir)
        run_cmd(_gen_seq2seq_cmd(seq2seq_pair_validation,
                                 param_dic['n_clusters_validation']))
    return
def export_embedding(args):
    """Write the embedding vector of every sequence in args.input_fa to
    args.embed_output in FASTA-like form: a '>description' header line
    followed by the flattened embedding as comma-joined values.
    """
    logPrint('[export_embedding] starts')
    predictor = Predict(args.seq_type, args.model_prefix)
    transformer = seq2nn(predictor.seq_type, predictor.maxlen,
                         predictor.blocklen)
    transformed_seqs = transformer.transform_seqs_from_fa(args.input_fa)
    counter = iterCounter(len(transformed_seqs), 'export_embedding')
    with open(args.embed_output, 'w') as fout:
        for sq in transformed_seqs:
            counter.inc()
            flat_embed = predictor.get_embed(sq.tseq).flatten()
            embed_str = ','.join(str(v) for v in flat_embed)
            fout.write('>%s\n%s\n' % (sq.description, embed_str))
        counter.finish()
    logPrint('[export_embedding] finished. %s written.' % args.embed_output)
    return
def _handle_events(self, events): for event in events: name = event[0] # logPrint ['event: '] + event if name in self.event_handlers: # can handle this event if len(event) > 1: try: self.event_handlers[name](*event[1:]) except TypeError: util.logPrint( (name, event, self.event_handlers), level='debug') else: self.event_handlers[name]() else: util.logPrint('Unhandled event: %s' % event)
def event(self):
    # Poll the server's event endpoint for this client (Python 2 code:
    # urllib.urlencode and `except URLError, e`).
    url = self.EVENTS_URL % self.server
    data = {'id': self.client_id}
    try:
        #print self.name + ' state: ' + self.state
        if self.state == 'CONNECTED':
            # Connected: plain blocking open with the browser's default timeout.
            response = self.browser.open(url, urllib.urlencode(data))
        else:
            # Not yet connected: bound the wait so we can detect "nobody found".
            response = self.browser.open(url, urllib.urlencode(data),
                                         util.WAIT_TIMEOUT)
        # NOTE: `data` is rebound here from the request dict to the decoded
        # JSON event payload.
        data = json.load(response)
    except URLError, e:
        # A timeout while searching by topic means no partner was found.
        # NOTE(review): `e.reason.message` is fragile — it assumes the URLError
        # wraps a socket.timeout; confirm against the urllib2 version in use.
        if (self.topics[0] != None) and (e.reason.message == 'timed out'):
            util.logPrint('Failed to find anyone with interest ' +
                          self.topics[0])
            self.stop()
            # TODO: send stoplookingforcommonlikes to omegle server
    return
def batch_test_train_1job(input_args):
    """Run one siamese training job for a single parameter combination.

    input_args is the tuple (param_dic, param_desc, args) — packed into one
    argument so the function can be used with multiprocessing.Pool.map.
    Creates the data/model directories, snapshots the config file, then
    shells out to train.py.
    """
    param_dic, param_desc, args = input_args
    log_label = '[train_1job_%s] ' % param_desc
    logPrint(log_label + 'Start')

    # directory layout: <root>/<data_label>/train/{data,<model_label>/<desc>}
    root_dir = param_dic['root_dir']
    data_label = param_dic['data_label']
    model_label = param_dic['model_label']
    data_dir = '%s/%s/train/data/' % (root_dir, data_label)
    run_cmd('mkdir -p %s' % data_dir)
    logPrint('%s created' % data_dir)
    model_dir = '%s/%s/train/%s/%s/' % (root_dir, data_label, model_label,
                                        param_desc)
    run_cmd('mkdir -p %s' % model_dir)
    logPrint('%s created' % model_dir)

    # keep a copy of the config used for this model family
    dst_config = '%s/%s/train/%s/config.txt' % (root_dir, data_label,
                                                model_label)
    run_cmd('cp %s %s' % (args.config_file, dst_config))

    sample_fa = '%s/sample.fa' % data_dir
    sample_dist = '%s/sample.dist' % data_dir

    # assemble the train.py invocation from (flag, value) pairs
    train_opts = [
        ('--input_type', 1),
        ('--train_input1', sample_fa),
        ('--train_input2', sample_dist),
        ('--seq_type', param_dic['seq_type']),
        ('--train_output_dir', model_dir),
        ('--max_num_to_sample', param_dic['max_num_to_sample']),
        ('--batch_size', param_dic['batch_size']),
        ('--num_epochs', param_dic['num_epochs']),
        ('--load_model', param_dic['load_model']),
        ('--maxlen', param_dic['maxlen']),
        ('--blocklen', param_dic['blocklen']),
        ('--embedding_size', param_dic['embedding_size']),
        ('--num_layers', param_dic['num_layers']),
        ('--hidden_sz', param_dic['hidden_sz']),
        ('--learning_rate', param_dic['learning_rate']),
        ('--dropout', param_dic['dropout']),
    ]
    cmd = 'python train.py use_simulate_data ' + ''.join(
        '%s %s ' % (flag, val) for flag, val in train_opts)
    run_cmd(cmd)
    logPrint(log_label + 'End')
    return
def main(topic):
    """Bridge two Omegle clients on the same topic until both threads exit
    or the user hits Ctrl-C, then disconnect both."""
    util.initLogs()
    clients = [
        Client(event_delay=1, topics=[topic], name='Stranger 1'),
        Client(event_delay=1, topics=[topic], name='Stranger 2'),
    ]
    # each client relays to the other
    clients[0].register_other_client(clients[1])
    clients[1].register_other_client(clients[0])
    clients[0].start()
    # wait until the first client has found a partner before starting the second
    util.waitForClient(clients[0], topic)
    clients[1].start()
    while clients[0].isAlive() or clients[1].isAlive():
        try:
            clients[0].join(0.1)
            clients[1].join(0.1)
        except KeyboardInterrupt:
            break
    util.logPrint('Disconnecting... ')
    clients[0].stop()
    clients[1].stop()
def main(topic):
    """Connect two strangers on the given topic and relay messages between
    them; a KeyboardInterrupt (or both threads dying) triggers shutdown."""
    util.initLogs()
    first = Client(event_delay=1, topics=[topic], name='Stranger 1')
    second = Client(event_delay=1, topics=[topic], name='Stranger 2')
    first.register_other_client(second)
    second.register_other_client(first)
    first.start()
    # don't start the second client until the first one is matched
    util.waitForClient(first, topic)
    second.start()
    while first.isAlive() or second.isAlive():
        try:
            first.join(0.1)
            second.join(0.1)
        except KeyboardInterrupt:
            break
    util.logPrint('Disconnecting... ')
    first.stop()
    second.stop()
def batch_test_train(args):
    """Run one training job per parameter combination in the config file.

    args.a selects the architecture (0=siamese, 1=seq2seq, 2=siamese
    seq2seq); args.N selects serial (1) vs. multiprocessing-pool execution.
    """
    logPrint('[batch_test_train] Start')
    config_desc_dic_list = parse_config_file(args.config_file)
    logPrint('[batch_test_train] %s parsed' % args.config_file)

    # architecture id -> per-job worker function
    worker_by_arch = {
        0: batch_test_train_1job,               # siamese
        1: batch_test_train_1job_seq2seq,       # seq2seq
        2: batch_test_train_1job_siamese_seq2seq,  # siamese seq2seq
    }
    worker = worker_by_arch.get(args.a)
    packed_jobs = [(param_dic, param_desc, args)
                   for param_desc, param_dic in config_desc_dic_list]

    if args.N == 1:
        if worker is not None:
            for packed in packed_jobs:
                worker(packed)
    else:
        # NOTE: as in the original, the pool is created even when args.a is
        # not a known architecture id (in which case nothing is mapped).
        pool = multiprocessing.Pool(args.N)
        if worker is not None:
            pool.map(worker, packed_jobs)
    logPrint('[batch_test_train] End')
    return
def draw_roc(args):
    """Draw ROC-style curves (false positive vs mis-detection) per metric.

    For each selected distance column, sweeps args.n_thresholds thresholds
    over the observed distance range and, at each threshold, counts
    same-cluster pairs above it (mis-detections) and different-cluster
    pairs below it (false positives). Saves the figure to args.roc_fig.
    """
    logPrint('[draw_roc] starts')
    # load data
    dist_label_dic, pairwise_dist_dic = load_pairwise_dist(
        args.pairwise_dist_file)
    if args.dist_cols == 'na':
        dist_cols = [c for c in dist_label_dic.keys()]
    else:
        dist_cols = [
            int(col) for col in args.dist_cols.strip().split(',')
            if col != ''
        ]
    # draw roc
    N = args.n_thresholds
    fig, ax = plt.subplots()
    for i in dist_cols:
        i_min = min(min(pairwise_dist_dic[(0, i)]),
                    min(pairwise_dist_dic[(1, i)]))
        i_max = max(max(pairwise_dist_dic[(0, i)]),
                    max(pairwise_dist_dic[(1, i)]))
        # BUG FIX: thresholds must start at i_min. The original computed
        # i_min but never used it, so thresholds ran from 0 and missed the
        # real range whenever distances do not start near 0.
        T_list = [
            i_min + float(i_max - i_min) / N * t_idx for t_idx in range(N)
        ]
        md_list = [0] * N
        fp_list = [0] * N
        for t_idx in range(N):
            t_val = T_list[t_idx]
            # same-cluster pair with dist > threshold -> no edge -> mis-detection
            md_list[t_idx] = sum(
                [1 for v in pairwise_dist_dic[(0, i)] if v > t_val])
            # diff-cluster pair with dist < threshold -> spurious edge -> false positive
            fp_list[t_idx] = sum(
                [1 for v in pairwise_dist_dic[(1, i)] if v < t_val])
            if args.normalized == 1:
                md_list[t_idx] = float(md_list[t_idx]) / len(
                    pairwise_dist_dic[(0, i)])
                fp_list[t_idx] = float(fp_list[t_idx]) / len(
                    pairwise_dist_dic[(1, i)])
        ax.plot(fp_list, md_list, label=dist_label_dic[i], marker='.')
        logPrint('roc for %s done' % dist_label_dic[i])
    ax.set_xlabel('False Positive')
    ax.set_ylabel('Mis-detection')
    ax.legend()
    # save all
    plt.tight_layout()
    plt.savefig(args.roc_fig)
    logPrint('[draw_roc] finished. %s written' % args.roc_fig)
    return
def batch_test_train_1job_seq2seq(input_args):
    """Run one seq2seq training job for a single parameter combination.

    input_args is (param_dic, param_desc, args), packed into one argument
    for multiprocessing.Pool.map. Prepares directories, snapshots the
    config once per model family, stuffs the data/model paths into
    param_dic, and calls train_seq2seq.
    """
    param_dic, param_desc, args = input_args
    log_label = '[train_1job_seq2seq_%s] ' % param_desc
    logPrint(log_label + 'Start')

    root_dir = param_dic['root_dir']
    data_label = param_dic['data_label']
    model_label = param_dic['model_label']
    data_dir = '%s/%s/train/data/' % (root_dir, data_label)
    run_cmd('mkdir -p %s' % data_dir)
    logPrint('%s created' % data_dir)
    model_dir = '%s/%s/train/%s/%s/' % (root_dir, data_label, model_label,
                                        param_desc)
    run_cmd('mkdir -p %s' % model_dir)
    logPrint('%s created' % model_dir)

    # copy the config only once (shared by all param combinations)
    dst_config = '%s/%s/train/%s/config.txt' % (root_dir, data_label,
                                                model_label)
    if not os.path.exists(dst_config):
        run_cmd('cp %s %s' % (args.config_file, dst_config))

    # hand the file locations to train_seq2seq through param_dic
    param_dic['seq2seq_pair_path'] = '%s/seq2seq_cgk.txt' % data_dir
    param_dic['model_dir_path'] = model_dir
    if param_dic['n_clusters_validation'] > 0:
        param_dic['seq2seq_pair_path_validation'] = (
            '%s/seq2seq_cgk_validation.txt' % data_dir)
    train_seq2seq(param_dic)
    logPrint(log_label + 'End')
    return
def batch_test_eval(args):
    """Run the evaluation job serially for every configuration parsed from
    args.config_file (args.N is forwarded to each job via args)."""
    label = '[batch_test_eval]'
    logPrint('%s Start' % label)
    config_desc_dic_list = parse_config_file(args.config_file)
    logPrint('%s %s parsed' % (label, args.config_file))
    for desc, dic in config_desc_dic_list:
        batch_test_eval_1job((dic, desc, args))
    logPrint('%s End' % label)
    return
def handle_recaptcha_required(self, challenge):
    """Omegle demanded a captcha we cannot solve programmatically: tell the
    user to solve it in a browser, then shut this client down."""
    self.state = 'RECAPTCHA_REQUIRED'
    notices = (
        'Captcha required. Please go to www.omegle.com and enter the captcha manually.',
        'Disconnecting...',
    )
    for notice in notices:
        util.logPrint(notice)
    self.stop()
def load_pairwise_dist(pairwise_dist_file):
    """Load a pairwise-distance file.

    File format (whitespace-separated): col 0-1 are a sequence pair id,
    col 2 is the pair type (0 = same cluster, 1 = different cluster), and
    cols 3.. are one distance value per metric. An optional first line
    starting with '#' carries the metric labels in cols 3...

    Returns:
        dist_label_dic    -- {metric_index: label} (label is str(index) when
                             no '#' header is present)
        pairwise_dist_dic -- {(pair_type, metric_index): [distances]}
    Lines containing 'nan' in any metric column are skipped and counted.
    """
    logPrint('[load_pairwise_dist] starts')
    # --- read the header (or first data line) to learn metric count/labels
    dist_label_dic = {}
    with open(pairwise_dist_file, 'r') as fin:
        line = fin.readline()
        if line[0] == '#':
            tokens = line[1:].strip().split()
        else:
            tokens = line.strip().split()
        n_dist_metric = len(tokens) - 3
        for i in range(n_dist_metric):
            if line[0] == '#':
                dist_label_dic[i] = tokens[i + 3]
            else:
                dist_label_dic[i] = str(i)
    # --- count lines for the progress counter
    # BUG FIX: the original used sum([1 for line in open(...)]) which never
    # closes the file handle; use a with-block instead.
    n_lines = 0
    with open(pairwise_dist_file, 'r') as fin:
        for _ in fin:
            n_lines += 1
    iterCnt = iterCounter(n_lines, 'load_pairwise_dist')
    # --- load all distances, skipping lines with any 'nan' metric
    pairwise_dist_dic = {}
    cnt_nan = 0
    with open(pairwise_dist_file, 'r') as fin:
        for line in fin:
            iterCnt.inc()
            if line[0] == '#':
                continue
            tokens = line.strip().split()
            tp = int(tokens[2])
            has_nan = False
            for i in range(len(tokens) - 3):
                if tokens[i + 3] == 'nan':
                    has_nan = True
                    break
            if has_nan:
                cnt_nan += 1
                continue
            for i in range(len(tokens) - 3):
                k = (tp, i)
                pairwise_dist_dic.setdefault(k, []).append(float(tokens[i + 3]))
        iterCnt.finish()
    logPrint('[load_pairwise_dist] finished; %d lines contain nan' % cnt_nan)
    return dist_label_dic, pairwise_dist_dic
def batch_test_eval_1job(input_args):
    """Evaluate one trained model configuration.

    input_args is (param_dic, param_desc, args), packed into one argument
    for parallel use. Picks the best checkpoint from the training dir and
    runs the tasks selected by args.tasks (comma-separated ints):
        0 = pairwise distance computation
        1 = distance histogram figure
        2 = ROC figure
        3 = export embeddings
    Returns early (after logging) when no valid checkpoint exists.
    """
    param_dic, param_desc, args = input_args
    logLabel = '[eval_1job_%s] ' % param_desc
    logPrint(logLabel + 'Start')
    # ---------- locations
    root_dir = param_dic['root_dir']
    data_label = param_dic['data_label']
    model_label = param_dic['model_label']
    data_dir = '%s/%s/eval/data/' % (root_dir, data_label)
    run_cmd('mkdir -p %s' % data_dir)
    logPrint('%s created' % data_dir)
    model_train_dir = '%s/%s/train/%s/%s/' % (root_dir, data_label,
                                              model_label, param_desc)
    model_eval_dir = '%s/%s/eval/%s/%s/' % (root_dir, data_label,
                                            model_label, param_desc)
    run_cmd('mkdir -p %s' % model_eval_dir)
    logPrint('%s created' % model_eval_dir)
    sample_fa = '%s/sample.fa' % data_dir
    # ---------- select a trained model (i.e. a check point)
    ckpt_path, ckpt_name, step_loss_fn_list = select_ckpt(model_train_dir)
    if ckpt_path == '':
        logPrint(logLabel + 'End (no valid ckpt found)')
        return
    ckpt_dir = '%s/%s/' % (model_eval_dir, ckpt_name)
    run_cmd('mkdir -p %s' % ckpt_dir)
    logPrint('%s created' % ckpt_dir)
    sample_dist = '%s/sample.dist' % ckpt_dir
    # ---------- proc eval job
    tasks = [int(t) for t in args.tasks.split(',') if t != '']
    # task 0: pairwise distances
    cmd = 'python simulate_data.py calc_dist '+\
          '--distance_type_list %s '%param_dic['dist_tp_list_eval'] +\
          '--seq_type %s '%param_dic['seq_type'] +\
          '--seq_fa %s '%sample_fa +\
          '--dist_out %s '%sample_dist+\
          '--thread %d '%args.N +\
          '--addheader %s '%param_dic['add_hd_eval'] +\
          '--clear_intermediate %s '%param_dic['clear_interm_eval'] +\
          '--model_prefix %s '%ckpt_path +\
          '--max_num_dist_1thread %s '%param_dic['max_num_dist_1thread_eval']
    if 0 in tasks:
        run_cmd(cmd)
    # task 1: histogram figure
    hist_fig_path = '%s/hist.norm_%s.png' % (ckpt_dir,
                                             param_dic['normalized_hist'])
    cmd = 'python evaluation.py draw_histogram '+\
          '--pairwise_dist_file %s '%sample_dist +\
          '--histogram_fig %s '%hist_fig_path +\
          '--dist_cols %s '%param_dic['dist_cols'] +\
          '--normalized %s '%param_dic['normalized_hist']
    if 1 in tasks:
        run_cmd(cmd)
    # task 2: roc figure
    roc_fig_path = '%s/roc.norm_%s.png' % (ckpt_dir,
                                           param_dic['normalized_roc'])
    cmd = 'python evaluation.py draw_roc '+\
          '--pairwise_dist_file %s '%sample_dist +\
          '--roc_fig %s '%roc_fig_path +\
          '--n_thresholds %s '%param_dic['n_thresholds'] +\
          '--normalized %s '%param_dic['normalized_roc']
    if 2 in tasks:
        run_cmd(cmd)
    # task 3: export embeddings
    # BUG FIX: the original left the format string unapplied
    # (embed_output = '%s/sample.embed.fa'), so the literal '%s' path was
    # passed to the export command.
    embed_output = '%s/sample.embed.fa' % ckpt_dir
    cmd = 'python evaluation.py export_embedding '+\
          '--seq_type %s '%param_dic['seq_type'] +\
          '--input_fa %s '%sample_fa +\
          '--embed_output %s '%embed_output +\
          '--model_prefix %s '%ckpt_path
    if 3 in tasks:
        run_cmd(cmd)
    logPrint(logLabel + 'End')
    return
def batch_test_data(args):
    """Generate simulated data for training or evaluation.

    args.purpose selects the output subtree ('train' or 'eval');
    args.tasks (comma-separated ints) selects what to generate:
        0 = cluster-center sequences (cluster.fa)
        1 = noisy samples drawn from the clusters (sample.fa)
        2 = pairwise distances for the samples (sample.dist; train only)
        3 = seq2seq (cgk) pairs, plus a validation set when training
        4 = siamese-seq2seq pairs, plus a validation set when training
    Finally, runs the pretrain data generation when apply_pre_train is set.
    """
    logLabel = '[batch_test_data]'
    logPrint('%s Start' % (logLabel))
    config_desc_dic_list = parse_config_file(args.config_file)
    param_dic = config_desc_dic_list[0][
        1]  # we only need location info from param_dic; param combinations not important here
    #---------- locations
    root_dir = param_dic['root_dir']
    data_label = param_dic['data_label']
    data_dir = '%s/%s/%s/data/' % (root_dir, data_label, args.purpose)
    run_cmd('mkdir -p %s' % data_dir)
    logPrint('%s created' % data_dir)
    cluster_fa = '%s/cluster.fa' % data_dir
    sample_fa = '%s/sample.fa' % data_dir
    sample_dist = '%s/sample.dist' % data_dir
    seq2seq_pair = '%s/seq2seq_cgk.txt' % data_dir  # for siamese seq2seq
    #---------- what kind of data to generate
    tasks = [int(d) for d in args.tasks.split(',') if d != '']
    #---------- actual commands
    if 0 in tasks:
        # task 0: generate cluster-center sequences
        cmd = 'python simulate_data.py gen_cluster_center '+\
              '--output %s '%cluster_fa+\
              '--seq_type %s '%param_dic['seq_type']+\
              '--num %s '%param_dic['n_clusters']+\
              '--length %s '%param_dic['cluster_len']+\
              '--weight_distr %s '%param_dic['weight_distr']+\
              '--sid_pre %s '%param_dic['sid_pre']
        run_cmd(cmd)
    if 1 in tasks:
        # task 1: sample noisy copies (ins/del/sub errors) from the clusters
        cmd = 'python simulate_data.py sample_from_cluster '+\
              '--fa_input %s '%cluster_fa+\
              '--type %s '%param_dic['seq_type']+\
              '--fa_output %s '%sample_fa+\
              '--prefix %s '%param_dic['sample_prefix']+\
              '--total_samples %s '%param_dic['n_tot_samples']+\
              '--copy %s '%param_dic['n_copy']+\
              '--ins %s '%param_dic['rate_ins']+\
              '--dele %s '%param_dic['rate_del']+\
              '--sub %s '%param_dic['rate_sub']+\
              '--thread %d '%args.N+\
              '--clear_split_files %s '%param_dic['clear_split']
        run_cmd(cmd)
    if 2 in tasks and args.purpose == 'train':
        # task 2: pairwise distances (training labels); skipped for eval data
        cmd = 'python simulate_data.py calc_dist '+\
              '--distance_type_list %s '%param_dic['dist_tp_list_data']+\
              '--seq_type %s '%param_dic['seq_type']+\
              '--seq_fa %s '%sample_fa+\
              '--dist_out %s '%sample_dist+\
              '--thread %d '%args.N+\
              '--addheader %s '%param_dic['add_hd']+\
              '--clear_intermediate %s '%param_dic['clear_interm']+\
              '--max_num_dist_1thread %s '%param_dic['max_num_dist_1thread']
        run_cmd(cmd)
    if 3 in tasks:
        # task 3: seq2seq_cgk pairs for the seq2seq architecture
        cmd = 'python simulate_data.py gen_seq2seq '+\
              '--output %s '%seq2seq_pair+\
              '--seq_type %s '%param_dic['seq_type']+\
              '--seq2seq_type %s '%param_dic['seq2seq_type']+\
              '--num %s '%param_dic['n_clusters']+\
              '--length %s '%param_dic['cluster_len']
        run_cmd(cmd)
        if args.purpose == 'train' and param_dic['n_clusters_validation'] > 0:
            # generate validation data
            seq2seq_pair_validation = '%s/seq2seq_cgk_validation.txt' % data_dir
            cmd = 'python simulate_data.py gen_seq2seq '+\
                  '--output %s '%seq2seq_pair_validation+\
                  '--seq_type %s '%param_dic['seq_type']+\
                  '--seq2seq_type %s '%param_dic['seq2seq_type']+\
                  '--num %s '%param_dic['n_clusters_validation']+\
                  '--length %s '%param_dic['cluster_len']
            run_cmd(cmd)
    if 4 in tasks:
        siamese_seq2seq = '%s/siamese_seq2seq.txt' % data_dir
        # task 4: pairs for the siamese seq2seq architecture
        cmd = 'python simulate_data.py gen_siamese_seq2seq '+\
              '--output %s '%siamese_seq2seq+\
              '--seq_type %s '%param_dic['seq_type']+\
              '--si_correlation_type %s '%param_dic['si_correlation_type']+\
              '--num %s '%param_dic['n_clusters']+\
              '--length %s '%param_dic['cluster_len']+\
              '--length2 %s '%param_dic['cluster_len2']+\
              '--rate_ins %s '%param_dic['rate_ins']+\
              '--rate_del %s '%param_dic['rate_del']+\
              '--rate_sub %s '%param_dic['rate_sub']
        run_cmd(cmd)
        if args.purpose == 'train' and param_dic['n_clusters_validation'] > 0:
            # generate validation data
            siamese_seq2seq_validation = '%s/siamese_seq2seq_validation.txt' % data_dir
            cmd = 'python simulate_data.py gen_siamese_seq2seq '+\
                  '--output %s '%siamese_seq2seq_validation+\
                  '--seq_type %s '%param_dic['seq_type']+\
                  '--si_correlation_type %s '%param_dic['si_correlation_type']+\
                  '--num %s '%param_dic['n_clusters_validation']+\
                  '--length %s '%param_dic['cluster_len']+\
                  '--length2 %s '%param_dic['cluster_len2']+\
                  '--rate_ins %s '%param_dic['rate_ins']+\
                  '--rate_del %s '%param_dic['rate_del']+\
                  '--rate_sub %s '%param_dic['rate_sub']
            run_cmd(cmd)
    logPrint('%s End' % (logLabel))
    # optional pretraining data (separate pretrain/ subdirectory)
    if int(param_dic['apply_pre_train']) == 1:
        siamese_seq2seq_data_pretrain(param_dic, args, data_dir)
    return
def handle_stranger_disconnected(self):
    """The remote stranger left: record the disconnect, log it, and tear
    down both ends of the bridge."""
    self.connected = False
    notice = '[%s] Stranger has disconnected.' % self.name
    util.logPrint(notice)
    self.disconnect()
    self.other_client.disconnect()
def handle_got_message(self, message):
    """Log a message received from the stranger; never let a logging
    failure (e.g. an encoding error) kill the event loop."""
    try:
        util.logPrint('[%s] Stranger: %s' % (self.name, message))
    except Exception as e:
        util.logPrint(str(e))
        traceback.print_exc()
def handle_common_likes(self, likes):
    """Announce the interests this client shares with the stranger."""
    shared = ', '.join(likes)
    util.logPrint('[%s] You both like %s.' % (self.name, shared))
def handle_connected(self):
    """A stranger was matched: record the connected state and greet the user."""
    self.connected = True
    self.state = 'CONNECTED'
    util.logPrint("You're now chatting with a random stranger. Say hi!")
def handle_waiting(self):
    """The server queued us for a match: record the state and tell the user."""
    self.state = 'WAITING'
    notice = "Looking for someone you can chat with..."
    util.logPrint(notice)