    def cdr3_length_precluster(self, waterer, preclusters=None):
        cdr3lengthfname = self.args.workdir + '/cdr3lengths.csv'
        with opener('w')(cdr3lengthfname) as outfile:
            writer = csv.DictWriter(
                outfile, ('unique_id', 'second_unique_id', 'cdr3_length',
                          'second_cdr3_length', 'score'))
            writer.writeheader()
            for query_name, second_query_name in self.get_pairs(preclusters):
                cdr3_length = waterer.info[query_name]['cdr3_length']
                second_cdr3_length = waterer.info[second_query_name][
                    'cdr3_length']
                same_length = cdr3_length == second_cdr3_length
                if not self.args.is_data:
                    assert cdr3_length == int(
                        self.reco_info[query_name]['cdr3_length'])
                    if second_cdr3_length != int(
                            self.reco_info[second_query_name]['cdr3_length']):
                        print 'WARNING did not infer correct cdr3 length'
                        assert False
                writer.writerow({
                    'unique_id': query_name,
                    'second_unique_id': second_query_name,
                    'cdr3_length': cdr3_length,
                    'second_cdr3_length': second_cdr3_length,
                    'score': int(same_length)
                })

        clust = Clusterer(
            0.5,
            greater_than=True)  # i.e. cluster together if same_length == True
        clust.cluster(cdr3lengthfname, debug=False)
        os.remove(cdr3lengthfname)
        return clust
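
The Clusterer used in these excerpts (apparently from the partis annotation tool) is score-driven: each row pairs two sequence ids with a score, and pairs whose score passes the cutoff end up in the same cluster. As a rough illustration of that idea only, here is a minimal hypothetical re-implementation; SimpleClusterer and its union-find bookkeeping are invented for this sketch, and the real Clusterer also handles csv input, debug printing, and singletons:

class SimpleClusterer(object):
    def __init__(self, threshold, greater_than=True):
        # pairs scoring above the threshold (below it, if greater_than is
        # False) get merged into the same cluster
        self.threshold = threshold
        self.greater_than = greater_than
        self.parent = {}  # union-find forest over sequence ids

    def find(self, uid):
        self.parent.setdefault(uid, uid)
        while self.parent[uid] != uid:
            self.parent[uid] = self.parent[self.parent[uid]]  # path halving
            uid = self.parent[uid]
        return uid

    def cluster(self, input_scores):
        for line in input_scores:
            root_a = self.find(line['unique_id'])
            root_b = self.find(line['second_unique_id'])
            passes = line['score'] > self.threshold if self.greater_than else line['score'] < self.threshold
            if passes and root_a != root_b:
                self.parent[root_a] = root_b  # single-link merge
        clusters = {}
        for uid in list(self.parent):
            clusters.setdefault(self.find(uid), []).append(uid)
        return list(clusters.values())

# a score of 1 means 'same cdr3 length', exactly as written by the loop above
pair_scores = [{'unique_id': 'a', 'second_unique_id': 'b', 'score': 1},
               {'unique_id': 'b', 'second_unique_id': 'c', 'score': 0}]
print(SimpleClusterer(0.5, greater_than=True).cluster(pair_scores))  # [['a', 'b'], ['c']]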
Example #2
    def cluster(self, kmeans, hyper):
        clus1 = Clusterer(
            self.get_rel_docs(),
            APIAdapter.get_data_foldername(self.get_search_term()), kmeans,
            hyper)
        clus1.cluster()
        self.clusterer = clus1
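
Note that this is a different Clusterer from the partis one in the surrounding examples: judging by its constructor arguments, it is the pubmed-project class that also appears in example #9 below, built from a list of relevant documents, a data folder, a kmeans flag, and a hyperparameter rather than from a score cutoff.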
Example #3
    def run_hmm(self, algorithm, sw_info, parameter_in_dir, parameter_out_dir='', preclusters=None, hmm_type='', stripped=False, prefix='', \
                count_parameters=False, plotdir=None, make_clusters=False):  # @parameterfetishist

        if prefix == '' and stripped:
            prefix = 'stripped'
        print '\n%shmm' % prefix
        csv_infname = self.args.workdir + '/' + prefix + '_hmm_input.csv'
        csv_outfname = self.args.workdir + '/' + prefix + '_hmm_output.csv'
        self.write_hmm_input(csv_infname, sw_info, preclusters=preclusters, hmm_type=hmm_type, stripped=stripped, parameter_dir=parameter_in_dir)
        print '    running'
        sys.stdout.flush()
        start = time.time()
        if self.args.n_procs > 1:
            self.split_input(self.args.n_procs, infname=csv_infname, prefix='hmm')
            procs = []
            for iproc in range(self.args.n_procs):
                cmd_str = self.get_hmm_cmd_str(algorithm, csv_infname, csv_outfname, parameter_dir=parameter_in_dir, iproc=iproc)
                procs.append(Popen(cmd_str.split()))
                time.sleep(0.1)
            for proc in procs:
                proc.wait()
            for iproc in range(self.args.n_procs):
                if not self.args.no_clean:
                    os.remove(csv_infname.replace(self.args.workdir, self.args.workdir + '/hmm-' + str(iproc)))
            self.merge_hmm_outputs(csv_outfname)
        else:
            cmd_str = self.get_hmm_cmd_str(algorithm, csv_infname, csv_outfname, parameter_dir=parameter_in_dir)
            check_call(cmd_str.split())

        sys.stdout.flush()
        print '      hmm run time: %.3f' % (time.time()-start)

        hmminfo = self.read_hmm_output(algorithm, csv_outfname, make_clusters=make_clusters, count_parameters=count_parameters, parameter_out_dir=parameter_out_dir, plotdir=plotdir)

        if self.args.pants_seated_clustering:
            viterbicluster.cluster(hmminfo)

        clusters = None
        if make_clusters:
            if self.outfile is not None:
                self.outfile.write('hmm clusters\n')
            else:
                print '%shmm clusters' % prefix
            clusters = Clusterer(self.args.pair_hmm_cluster_cutoff, greater_than=True, singletons=preclusters.singletons)
            clusters.cluster(input_scores=hmminfo, debug=self.args.debug, reco_info=self.reco_info, outfile=self.outfile, plotdir=self.args.plotdir+'/pairscores')

        if self.args.outfname is not None:
            outpath = self.args.outfname
            if self.args.outfname[0] != '/':  # if full output path wasn't specified on the command line
                outpath = os.getcwd() + '/' + outpath
            shutil.copyfile(csv_outfname, outpath)

        if not self.args.no_clean:
            if os.path.exists(csv_infname):  # if only one proc, this will already be deleted
                os.remove(csv_infname)
            os.remove(csv_outfname)

        return clusters
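
The multi-process branch above is a plain subprocess fan-out: split the input csv, start one hmm job per piece, wait for all of them, then merge the per-process outputs. Stripped of the partis specifics, the launch-and-wait core reduces to the sketch below; the sleep commands are placeholders, not the real hmm invocations built by get_hmm_cmd_str:

import time
from subprocess import Popen

def run_in_parallel(cmd_strs):
    procs = []
    for cmd_str in cmd_strs:
        procs.append(Popen(cmd_str.split()))  # start without blocking
        time.sleep(0.1)  # stagger launches slightly, as run_hmm does
    for proc in procs:
        proc.wait()  # block until every worker has finished

run_in_parallel(['sleep 1' for _ in range(4)])  # placeholder commands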
Example #4
    def hamming_precluster(self, preclusters=None):
        assert self.args.truncate_pairs
        start = time.time()
        print 'hamming clustering'
        chopped_off_left_sides = False
        hamming_info = []
        all_pairs = self.get_pairs(preclusters)
        # print '    getting pairs: %.3f' % (time.time()-start); start = time.time()
        # all_pairs = itertools.combinations(self.input_info.keys(), 2)
        if self.args.n_fewer_procs > 1:
            pool = Pool(processes=self.args.n_fewer_procs)
            subqueries = self.split_input(
                self.args.n_fewer_procs,
                info=list(all_pairs),
                prefix='hamming'
            )  # NOTE 'casting' to a list here makes me nervous!
            sublists = []
            for queries in subqueries:
                sublists.append([])
                for id_a, id_b in queries:
                    sublists[-1].append({
                        'id_a': id_a,
                        'id_b': id_b,
                        'seq_a': self.input_info[id_a]['seq'],
                        'seq_b': self.input_info[id_b]['seq']
                    })

            # print '    preparing info: %.3f' % (time.time()-start); start = time.time()
            subinfos = pool.map(utils.get_hamming_distances, sublists)
            # NOTE this starts the proper number of processes, but they seem to end up i/o blocking or something (wait % stays at zero, but they each only get 20 or 30 %cpu on stoat)
            pool.close()
            pool.join()
            # print '    starting pools: %.3f' % (time.time()-start); start = time.time()

            for isub in range(len(subinfos)):
                hamming_info += subinfos[isub]
            # print '    merging pools: %.3f' % (time.time()-start); start = time.time()
        else:
            hamming_info = self.get_hamming_distances(all_pairs)

        if self.outfile is not None:
            self.outfile.write('hamming clusters\n')

        clust = Clusterer(
            self.args.hamming_cluster_cutoff, greater_than=False
        )  # NOTE this 0.5 is reasonable but totally arbitrary
        clust.cluster(input_scores=hamming_info,
                      debug=self.args.debug,
                      outfile=self.outfile,
                      reco_info=self.reco_info)
        # print '    clustering: %.3f' % (time.time()-start); start = time.time()

        if chopped_off_left_sides:
            print 'WARNING encountered unequal-length sequences, so chopped off the left-hand sides of each'
        print '    hamming time: %.3f' % (time.time() - start)

        return clust
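
The Pool.map fan-out above is easy to reproduce in isolation. Here is a self-contained sketch with a stand-in worker; the dict keys and the mismatch-fraction score are assumptions about what utils.get_hamming_distances returns, not its actual implementation:

from multiprocessing import Pool

def get_hamming_distances(pairs):
    # stand-in worker: fraction of mismatched positions per equal-length pair
    results = []
    for info in pairs:
        seq_a, seq_b = info['seq_a'], info['seq_b']
        assert len(seq_a) == len(seq_b)
        n_diff = sum(ch_a != ch_b for ch_a, ch_b in zip(seq_a, seq_b))
        results.append({'id_a': info['id_a'], 'id_b': info['id_b'],
                        'score': n_diff / float(len(seq_a))})
    return results

if __name__ == '__main__':
    sublists = [[{'id_a': 'x', 'id_b': 'y', 'seq_a': 'ACGT', 'seq_b': 'ACGA'}],
                [{'id_a': 'x', 'id_b': 'z', 'seq_a': 'ACGT', 'seq_b': 'TCGA'}]]
    pool = Pool(processes=2)
    subinfos = pool.map(get_hamming_distances, sublists)  # one task per sublist
    pool.close()
    pool.join()
    hamming_info = [info for sub in subinfos for info in sub]  # merge results
    print(hamming_info)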
Example #5
    def hamming_precluster(self, preclusters=None):
        assert self.args.truncate_pairs
        start = time.time()
        print 'hamming clustering'
        chopped_off_left_sides = False
        hamming_info = []
        all_pairs = self.get_pairs(preclusters)
        # print '    getting pairs: %.3f' % (time.time()-start); start = time.time()
        # all_pairs = itertools.combinations(self.input_info.keys(), 2)
        if self.args.n_fewer_procs > 1:
            pool = Pool(processes=self.args.n_fewer_procs)
            subqueries = self.split_input(self.args.n_fewer_procs, info=list(all_pairs), prefix='hamming')  # NOTE 'casting' to a list here makes me nervous!
            sublists = []
            for queries in subqueries:
                sublists.append([])
                for id_a, id_b in queries:
                    sublists[-1].append({'id_a':id_a, 'id_b':id_b, 'seq_a':self.input_info[id_a]['seq'], 'seq_b':self.input_info[id_b]['seq']})
            
            # print '    preparing info: %.3f' % (time.time()-start); start = time.time()
            subinfos = pool.map(utils.get_hamming_distances, sublists)
            # NOTE this starts the proper number of processes, but they seem to end up i/o blocking or something (wait % stays at zero, but they each only get 20 or 30 %cpu on stoat)
            pool.close()
            pool.join()
            # print '    starting pools: %.3f' % (time.time()-start); start = time.time()
    
            for isub in range(len(subinfos)):
                hamming_info += subinfos[isub]
            # print '    merging pools: %.3f' % (time.time()-start); start = time.time()
        else:
            hamming_info = self.get_hamming_distances(all_pairs)

        if self.outfile is not None:
            self.outfile.write('hamming clusters\n')

        clust = Clusterer(self.args.hamming_cluster_cutoff, greater_than=False)  # NOTE this 0.5 is reasonable but totally arbitrary
        clust.cluster(input_scores=hamming_info, debug=self.args.debug, outfile=self.outfile, reco_info=self.reco_info)
        # print '    clustering: %.3f' % (time.time()-start); start = time.time()

        if chopped_off_left_sides:
            print 'WARNING encountered unequal-length sequences, so chopped off the left-hand sides of each'
        print '    hamming time: %.3f' % (time.time()-start)

        return clust
Example #6
    def cdr3_length_precluster(self, waterer, preclusters=None):
        cdr3lengthfname = self.args.workdir + '/cdr3lengths.csv'
        with opener('w')(cdr3lengthfname) as outfile:
            writer = csv.DictWriter(outfile, ('unique_id', 'second_unique_id', 'cdr3_length', 'second_cdr3_length', 'score'))
            writer.writeheader()
            for query_name, second_query_name in self.get_pairs(preclusters):
                cdr3_length = waterer.info[query_name]['cdr3_length']
                second_cdr3_length = waterer.info[second_query_name]['cdr3_length']
                same_length = cdr3_length == second_cdr3_length
                if not self.args.is_data:
                    assert cdr3_length == int(self.reco_info[query_name]['cdr3_length'])
                    if second_cdr3_length != int(self.reco_info[second_query_name]['cdr3_length']):
                        print 'WARNING did not infer correct cdr3 length'
                        assert False
                writer.writerow({'unique_id':query_name, 'second_unique_id':second_query_name, 'cdr3_length':cdr3_length, 'second_cdr3_length':second_cdr3_length, 'score':int(same_length)})

        clust = Clusterer(0.5, greater_than=True)  # i.e. cluster together if same_length == True
        clust.cluster(cdr3lengthfname, debug=False)
        os.remove(cdr3lengthfname)
        return clust
Example #7
    def run_hmm(self, algorithm, sw_info, parameter_in_dir, parameter_out_dir='', preclusters=None, hmm_type='', stripped=False, prefix='', \
                count_parameters=False, plotdir=None, make_clusters=False):  # @parameterfetishist

        if prefix == '' and stripped:
            prefix = 'stripped'
        print '\n%shmm' % prefix
        csv_infname = self.args.workdir + '/' + prefix + '_hmm_input.csv'
        csv_outfname = self.args.workdir + '/' + prefix + '_hmm_output.csv'
        self.write_hmm_input(csv_infname,
                             sw_info,
                             preclusters=preclusters,
                             hmm_type=hmm_type,
                             stripped=stripped,
                             parameter_dir=parameter_in_dir)
        print '    running'
        sys.stdout.flush()
        start = time.time()
        if self.args.n_procs > 1:
            self.split_input(self.args.n_procs,
                             infname=csv_infname,
                             prefix='hmm')
            procs = []
            for iproc in range(self.args.n_procs):
                cmd_str = self.get_hmm_cmd_str(algorithm,
                                               csv_infname,
                                               csv_outfname,
                                               parameter_dir=parameter_in_dir,
                                               iproc=iproc)
                procs.append(Popen(cmd_str.split()))
                time.sleep(0.1)
            for proc in procs:
                proc.wait()
            for iproc in range(self.args.n_procs):
                if not self.args.no_clean:
                    os.remove(
                        csv_infname.replace(
                            self.args.workdir,
                            self.args.workdir + '/hmm-' + str(iproc)))
            self.merge_hmm_outputs(csv_outfname)
        else:
            cmd_str = self.get_hmm_cmd_str(algorithm,
                                           csv_infname,
                                           csv_outfname,
                                           parameter_dir=parameter_in_dir)
            check_call(cmd_str.split())

        sys.stdout.flush()
        print '      hmm run time: %.3f' % (time.time() - start)

        hmminfo = self.read_hmm_output(algorithm,
                                       csv_outfname,
                                       make_clusters=make_clusters,
                                       count_parameters=count_parameters,
                                       parameter_out_dir=parameter_out_dir,
                                       plotdir=plotdir)

        if self.args.pants_seated_clustering:
            viterbicluster.cluster(hmminfo)

        clusters = None
        if make_clusters:
            if self.outfile is not None:
                self.outfile.write('hmm clusters\n')
            else:
                print '%shmm clusters' % prefix
            clusters = Clusterer(self.args.pair_hmm_cluster_cutoff,
                                 greater_than=True,
                                 singletons=preclusters.singletons)
            clusters.cluster(input_scores=hmminfo,
                             debug=self.args.debug,
                             reco_info=self.reco_info,
                             outfile=self.outfile,
                             plotdir=self.args.plotdir + '/pairscores')

        if self.args.outfname is not None:
            outpath = self.args.outfname
            if self.args.outfname[0] != '/':  # if full output path wasn't specified on the command line
                outpath = os.getcwd() + '/' + outpath
            shutil.copyfile(csv_outfname, outpath)

        if not self.args.no_clean:
            if os.path.exists(csv_infname):  # if only one proc, this will already be deleted
                os.remove(csv_infname)
            os.remove(csv_outfname)

        return clusters
Example #8
from database import Database
from youtube import YouTube
from clusterer import Clusterer

env = 'desktop'
db_name = 'comment_sense_3'
db = Database(env, db_name)
yt = YouTube()

videoId = 'kQibkV_V8-c'
video_data = yt.video(videoId)
comment_topics = db.comment_topics(videoId)

cl = Clusterer(video_data, db)
topics = cl.cluster(comment_topics)
print(topics)
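
This is yet another Clusterer variant (apparently from a comment_sense project, going by the database name): it is constructed from video metadata and a Database handle, and its cluster() call turns the stored comment topics into a topics list.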
Example #9
File: pyqt.py Project: tmondal/pubmed
	def search_click(self):
		_textval = self.searchbox.text()
		self._search_term = _textval
		if self.gene_button.isChecked() and self.fileselected:
			if self.fileName:
				goldencorpus = GoldenCorpus(_textval,self.fileName)
				goldencorpus.fetchData()
				self.rel_docs = goldencorpus.get_rel_docs_pmid()
				self.mesh_terms = goldencorpus.get_mesh_terms()
				mesh_explosion = DataForEachMeshTerm(self.mesh_terms,_textval)
				path = mesh_explosion.get_data_foldername(_textval)
				clus = Clusterer(self.rel_docs,path,True,5)
				self.representative_id,self.representative,self.best_mesh_terms_id, self.best_mesh_terms = clus.cluster()
				if self.representative:
					self.updateRepresentativeInformation()
			else:
				print("Error! getting file name")
		elif self.pmid_button.isChecked():
			print("Golden corpus exists..")
		else:
			print("Please select related file..")