def cdr3_length_precluster(self, waterer, preclusters=None):
    """Precluster query pairs by whether their inferred cdr3 lengths agree.

    Writes a temporary csv of pairwise scores (1 if the two cdr3 lengths are
    equal, 0 otherwise), clusters on that file, deletes it, and returns the
    Clusterer.

    waterer: object whose .info maps query name -> annotation dict containing 'cdr3_length'
    preclusters: optional previous clustering, forwarded to self.get_pairs()
    """
    cdr3lengthfname = self.args.workdir + '/cdr3lengths.csv'
    with opener('w')(cdr3lengthfname) as outfile:
        writer = csv.DictWriter(outfile, ('unique_id', 'second_unique_id', 'cdr3_length', 'second_cdr3_length', 'score'))
        writer.writeheader()
        for query_name, second_query_name in self.get_pairs(preclusters):
            cdr3_length = waterer.info[query_name]['cdr3_length']
            second_cdr3_length = waterer.info[second_query_name]['cdr3_length']
            same_length = cdr3_length == second_cdr3_length
            if not self.args.is_data:  # on simulation we know the truth, so cross-check the inferred lengths
                assert cdr3_length == int(self.reco_info[query_name]['cdr3_length'])
                if second_cdr3_length != int(self.reco_info[second_query_name]['cdr3_length']):
                    print 'WARNING did not infer correct cdr3 length'
                    assert False
            # score is 1 for same-length pairs, 0 otherwise
            writer.writerow({'unique_id': query_name, 'second_unique_id': second_query_name, 'cdr3_length': cdr3_length, 'second_cdr3_length': second_cdr3_length, 'score': int(same_length)})
    clust = Clusterer(0.5, greater_than=True)  # i.e. cluster together if same_length == True
    clust.cluster(cdr3lengthfname, debug=False)
    os.remove(cdr3lengthfname)  # temp file no longer needed once clustering is done
    return clust
def cluster(self, kmeans, hyper):
    """Cluster the relevant documents for the current search term and store the result."""
    relevant_docs = self.get_rel_docs()
    data_folder = APIAdapter.get_data_foldername(self.get_search_term())
    clusterer = Clusterer(relevant_docs, data_folder, kmeans, hyper)
    clusterer.cluster()
    self.clusterer = clusterer
def run_hmm(self, algorithm, sw_info, parameter_in_dir, parameter_out_dir='', preclusters=None, hmm_type='', stripped=False, prefix='', count_parameters=False, plotdir=None, make_clusters=False):  # @parameterfetishist
    """Write hmm input, run the hmm (possibly over several subprocesses), and read its output.

    algorithm: hmm algorithm name, forwarded to get_hmm_cmd_str/read_hmm_output
    sw_info: smith-waterman info used to build the hmm input csv
    parameter_in_dir / parameter_out_dir: where to read/write hmm parameters
    preclusters: optional preclustering; .singletons is used when make_clusters is set
    hmm_type, stripped, prefix: control input writing and file-name prefixing
    count_parameters, plotdir, make_clusters: forwarded to read_hmm_output
    Returns the Clusterer when make_clusters is set, otherwise None.
    """
    if prefix == '' and stripped:
        prefix = 'stripped'
    print '\n%shmm' % prefix
    csv_infname = self.args.workdir + '/' + prefix + '_hmm_input.csv'
    csv_outfname = self.args.workdir + '/' + prefix + '_hmm_output.csv'
    self.write_hmm_input(csv_infname, sw_info, preclusters=preclusters, hmm_type=hmm_type, stripped=stripped, parameter_dir=parameter_in_dir)
    print ' running'
    sys.stdout.flush()
    start = time.time()
    if self.args.n_procs > 1:
        # split the input csv and run one hmm subprocess per piece
        self.split_input(self.args.n_procs, infname=csv_infname, prefix='hmm')
        procs = []
        for iproc in range(self.args.n_procs):
            cmd_str = self.get_hmm_cmd_str(algorithm, csv_infname, csv_outfname, parameter_dir=parameter_in_dir, iproc=iproc)
            procs.append(Popen(cmd_str.split()))
            time.sleep(0.1)  # stagger the subprocess launches slightly
        for proc in procs:
            proc.wait()
        for iproc in range(self.args.n_procs):
            if not self.args.no_clean:
                # remove each per-process input file (they live in workdir/hmm-<iproc>/)
                os.remove(csv_infname.replace(self.args.workdir, self.args.workdir + '/hmm-' + str(iproc)))
        self.merge_hmm_outputs(csv_outfname)
    else:
        cmd_str = self.get_hmm_cmd_str(algorithm, csv_infname, csv_outfname, parameter_dir=parameter_in_dir)
        check_call(cmd_str.split())
    sys.stdout.flush()
    print ' hmm run time: %.3f' % (time.time()-start)
    hmminfo = self.read_hmm_output(algorithm, csv_outfname, make_clusters=make_clusters, count_parameters=count_parameters, parameter_out_dir=parameter_out_dir, plotdir=plotdir)
    if self.args.pants_seated_clustering:
        viterbicluster.cluster(hmminfo)
    clusters = None
    if make_clusters:
        if self.outfile is not None:
            self.outfile.write('hmm clusters\n')
        else:
            print '%shmm clusters' % prefix
        clusters = Clusterer(self.args.pair_hmm_cluster_cutoff, greater_than=True, singletons=preclusters.singletons)
        clusters.cluster(input_scores=hmminfo, debug=self.args.debug, reco_info=self.reco_info, outfile=self.outfile, plotdir=self.args.plotdir+'/pairscores')
    if self.args.outfname is not None:
        outpath = self.args.outfname
        if self.args.outfname[0] != '/':  # if full output path wasn't specified on the command line
            outpath = os.getcwd() + '/' + outpath
        shutil.copyfile(csv_outfname, outpath)
    if not self.args.no_clean:
        if os.path.exists(csv_infname):  # if only one proc, this will already be deleted
            os.remove(csv_infname)
        os.remove(csv_outfname)
    return clusters
def hamming_precluster(self, preclusters=None):
    """Precluster sequence pairs by hamming distance, optionally in parallel.

    Computes pairwise hamming distances (via a multiprocessing Pool when
    self.args.n_fewer_procs > 1), clusters pairs whose distance falls below
    self.args.hamming_cluster_cutoff, and returns the Clusterer.

    preclusters: optional previous clustering, forwarded to self.get_pairs()
    """
    assert self.args.truncate_pairs
    start = time.time()
    print 'hamming clustering'
    chopped_off_left_sides = False  # NOTE(review): never set True in this body — the warning below looks unreachable; confirm
    hamming_info = []
    all_pairs = self.get_pairs(preclusters)
    # print ' getting pairs: %.3f' % (time.time()-start); start = time.time()
    # all_pairs = itertools.combinations(self.input_info.keys(), 2)
    if self.args.n_fewer_procs > 1:
        pool = Pool(processes=self.args.n_fewer_procs)
        subqueries = self.split_input(self.args.n_fewer_procs, info=list(all_pairs), prefix='hamming')  # NOTE 'casting' to a list here makes me nervous!
        sublists = []
        # build one list of {id/seq} dicts per worker process
        for queries in subqueries:
            sublists.append([])
            for id_a, id_b in queries:
                sublists[-1].append({'id_a': id_a, 'id_b': id_b, 'seq_a': self.input_info[id_a]['seq'], 'seq_b': self.input_info[id_b]['seq']})
        # print ' preparing info: %.3f' % (time.time()-start); start = time.time()
        subinfos = pool.map(utils.get_hamming_distances, sublists)  # NOTE this starts the proper number of processes, but they seem to end up i/o blocking or something (wait % stays at zero, but they each only get 20 or 30 %cpu on stoat)
        pool.close()
        pool.join()
        # print ' starting pools: %.3f' % (time.time()-start); start = time.time()
        for isub in range(len(subinfos)):
            hamming_info += subinfos[isub]
        # print ' merging pools: %.3f' % (time.time()-start); start = time.time()
    else:
        hamming_info = self.get_hamming_distances(all_pairs)
    if self.outfile is not None:
        self.outfile.write('hamming clusters\n')
    clust = Clusterer(self.args.hamming_cluster_cutoff, greater_than=False)  # NOTE this 0.5 is reasonable but totally arbitrary
    clust.cluster(input_scores=hamming_info, debug=self.args.debug, outfile=self.outfile, reco_info=self.reco_info)
    # print ' clustering: %.3f' % (time.time()-start); start = time.time()
    if chopped_off_left_sides:
        print 'WARNING encountered unequal-length sequences, so chopped off the left-hand sides of each'
    print ' hamming time: %.3f' % (time.time() - start)
    return clust
def hamming_precluster(self, preclusters=None):
    """Cluster sequence pairs on hamming distance below self.args.hamming_cluster_cutoff.

    When self.args.n_fewer_procs > 1 the pairwise distance computation is
    farmed out over a multiprocessing Pool; otherwise it runs serially.
    Returns the resulting Clusterer.
    """
    assert self.args.truncate_pairs
    start = time.time()
    print 'hamming clustering'
    chopped_off_left_sides = False  # NOTE(review): this flag is never flipped here, so the warning below seems dead — verify
    hamming_info = []
    all_pairs = self.get_pairs(preclusters)
    # print ' getting pairs: %.3f' % (time.time()-start); start = time.time()
    # all_pairs = itertools.combinations(self.input_info.keys(), 2)
    if self.args.n_fewer_procs > 1:
        pool = Pool(processes=self.args.n_fewer_procs)
        subqueries = self.split_input(self.args.n_fewer_procs, info=list(all_pairs), prefix='hamming')  # NOTE 'casting' to a list here makes me nervous!
        sublists = []
        # one work list per subprocess, each entry carrying both ids and both seqs
        for queries in subqueries:
            sublists.append([])
            for id_a, id_b in queries:
                sublists[-1].append({'id_a':id_a, 'id_b':id_b, 'seq_a':self.input_info[id_a]['seq'], 'seq_b':self.input_info[id_b]['seq']})
        # print ' preparing info: %.3f' % (time.time()-start); start = time.time()
        subinfos = pool.map(utils.get_hamming_distances, sublists)  # NOTE this starts the proper number of processes, but they seem to end up i/o blocking or something (wait % stays at zero, but they each only get 20 or 30 %cpu on stoat)
        pool.close()
        pool.join()
        # print ' starting pools: %.3f' % (time.time()-start); start = time.time()
        for isub in range(len(subinfos)):
            hamming_info += subinfos[isub]
        # print ' merging pools: %.3f' % (time.time()-start); start = time.time()
    else:
        hamming_info = self.get_hamming_distances(all_pairs)
    if self.outfile is not None:
        self.outfile.write('hamming clusters\n')
    clust = Clusterer(self.args.hamming_cluster_cutoff, greater_than=False)  # NOTE this 0.5 is reasonable but totally arbitrary
    clust.cluster(input_scores=hamming_info, debug=self.args.debug, outfile=self.outfile, reco_info=self.reco_info)
    # print ' clustering: %.3f' % (time.time()-start); start = time.time()
    if chopped_off_left_sides:
        print 'WARNING encountered unequal-length sequences, so chopped off the left-hand sides of each'
    print ' hamming time: %.3f' % (time.time()-start)
    return clust
def cdr3_length_precluster(self, waterer, preclusters=None):
    """Group query pairs according to whether their cdr3 lengths match.

    Pairwise scores (1 = same length, 0 = different) go to a temporary csv in
    the workdir, which is clustered and then removed. Returns the Clusterer.

    waterer: object whose .info maps query name -> dict with a 'cdr3_length' entry
    preclusters: optional prior clustering handed to self.get_pairs()
    """
    cdr3lengthfname = self.args.workdir + '/cdr3lengths.csv'
    with opener('w')(cdr3lengthfname) as outfile:
        writer = csv.DictWriter(outfile, ('unique_id', 'second_unique_id', 'cdr3_length', 'second_cdr3_length', 'score'))
        writer.writeheader()
        for query_name, second_query_name in self.get_pairs(preclusters):
            cdr3_length = waterer.info[query_name]['cdr3_length']
            second_cdr3_length = waterer.info[second_query_name]['cdr3_length']
            same_length = cdr3_length == second_cdr3_length
            if not self.args.is_data:  # simulation: verify inferred lengths against the true values
                assert cdr3_length == int(self.reco_info[query_name]['cdr3_length'])
                if second_cdr3_length != int(self.reco_info[second_query_name]['cdr3_length']):
                    print 'WARNING did not infer correct cdr3 length'
                    assert False
            writer.writerow({'unique_id':query_name, 'second_unique_id':second_query_name, 'cdr3_length':cdr3_length, 'second_cdr3_length':second_cdr3_length, 'score':int(same_length)})
    clust = Clusterer(0.5, greater_than=True)  # i.e. cluster together if same_length == True
    clust.cluster(cdr3lengthfname, debug=False)
    os.remove(cdr3lengthfname)  # clean up the temporary score file
    return clust
def run_hmm(self, algorithm, sw_info, parameter_in_dir, parameter_out_dir='', preclusters=None, hmm_type='', stripped=False, prefix='', count_parameters=False, plotdir=None, make_clusters=False):  # @parameterfetishist
    """Drive one hmm pass: write input csv, execute (serially or via n_procs
    subprocesses), read back the output, and optionally cluster on it.

    algorithm: hmm algorithm identifier passed through to the command builder and reader
    sw_info: smith-waterman annotations used when writing hmm input
    parameter_in_dir / parameter_out_dir: parameter directories for reading/writing
    preclusters: prior clustering; its .singletons feed the Clusterer when make_clusters
    hmm_type, stripped, prefix: input-writing and file-naming knobs
    count_parameters, plotdir, make_clusters: forwarded to read_hmm_output
    Returns a Clusterer if make_clusters, else None.
    """
    if prefix == '' and stripped:
        prefix = 'stripped'
    print '\n%shmm' % prefix
    csv_infname = self.args.workdir + '/' + prefix + '_hmm_input.csv'
    csv_outfname = self.args.workdir + '/' + prefix + '_hmm_output.csv'
    self.write_hmm_input(csv_infname, sw_info, preclusters=preclusters, hmm_type=hmm_type, stripped=stripped, parameter_dir=parameter_in_dir)
    print ' running'
    sys.stdout.flush()
    start = time.time()
    if self.args.n_procs > 1:
        # parallel path: one subprocess per input chunk, then merge the outputs
        self.split_input(self.args.n_procs, infname=csv_infname, prefix='hmm')
        procs = []
        for iproc in range(self.args.n_procs):
            cmd_str = self.get_hmm_cmd_str(algorithm, csv_infname, csv_outfname, parameter_dir=parameter_in_dir, iproc=iproc)
            procs.append(Popen(cmd_str.split()))
            time.sleep(0.1)  # brief stagger between launches
        for proc in procs:
            proc.wait()
        for iproc in range(self.args.n_procs):
            if not self.args.no_clean:
                # per-process inputs live under workdir/hmm-<iproc>/
                os.remove(csv_infname.replace(self.args.workdir, self.args.workdir + '/hmm-' + str(iproc)))
        self.merge_hmm_outputs(csv_outfname)
    else:
        cmd_str = self.get_hmm_cmd_str(algorithm, csv_infname, csv_outfname, parameter_dir=parameter_in_dir)
        check_call(cmd_str.split())
    sys.stdout.flush()
    print ' hmm run time: %.3f' % (time.time() - start)
    hmminfo = self.read_hmm_output(algorithm, csv_outfname, make_clusters=make_clusters, count_parameters=count_parameters, parameter_out_dir=parameter_out_dir, plotdir=plotdir)
    if self.args.pants_seated_clustering:
        viterbicluster.cluster(hmminfo)
    clusters = None
    if make_clusters:
        if self.outfile is not None:
            self.outfile.write('hmm clusters\n')
        else:
            print '%shmm clusters' % prefix
        clusters = Clusterer(self.args.pair_hmm_cluster_cutoff, greater_than=True, singletons=preclusters.singletons)
        clusters.cluster(input_scores=hmminfo, debug=self.args.debug, reco_info=self.reco_info, outfile=self.outfile, plotdir=self.args.plotdir + '/pairscores')
    if self.args.outfname is not None:
        outpath = self.args.outfname
        if self.args.outfname[0] != '/':  # if full output path wasn't specified on the command line
            outpath = os.getcwd() + '/' + outpath
        shutil.copyfile(csv_outfname, outpath)
    if not self.args.no_clean:
        if os.path.exists(csv_infname):  # if only one proc, this will already be deleted
            os.remove(csv_infname)
        os.remove(csv_outfname)
    return clusters
# Script: cluster the comment topics of a single YouTube video and print the result.
from database import Database
from youtube import YouTube
from clusterer import Clusterer

env = 'desktop'
db_name = 'comment_sense_3'
db = Database(env, db_name)
yt = YouTube()
videoId = 'kQibkV_V8-c'  # hard-coded target video id
video_data = yt.video(videoId)
comment_topics = db.comment_topics(videoId)
cl = Clusterer(video_data, db)
topics = cl.cluster(comment_topics)
print(topics)
def search_click(self):
    """Handle the search-button click: record the search term and, when the gene
    option is active with a selected file, build the corpus, cluster it, and
    refresh the representative display."""
    term = self.searchbox.text()
    self._search_term = term
    if self.gene_button.isChecked() and self.fileselected:
        if not self.fileName:
            print("Error! getting file name")
            return
        corpus = GoldenCorpus(term, self.fileName)
        corpus.fetchData()
        self.rel_docs = corpus.get_rel_docs_pmid()
        self.mesh_terms = corpus.get_mesh_terms()
        mesh_data = DataForEachMeshTerm(self.mesh_terms, term)
        data_path = mesh_data.get_data_foldername(term)
        clusterer = Clusterer(self.rel_docs, data_path, True, 5)
        (self.representative_id, self.representative,
         self.best_mesh_terms_id, self.best_mesh_terms) = clusterer.cluster()
        if self.representative:
            self.updateRepresentativeInformation()
    elif self.pmid_button.isChecked():
        print("Golden corpus exists..")
    else:
        print("Please select related file..")