def cdr3_length_precluster(self, waterer, preclusters=None):
    """Precluster query pairs by whether their inferred CDR3 lengths match.

    Writes one row per pair (score 1 if the two lengths agree, else 0) to a
    temporary CSV in the work dir, clusters on that score, removes the CSV,
    and returns the resulting Clusterer.

    waterer -- object whose .info maps query name -> {'cdr3_length': int}
               (presumably a Smith-Waterman annotation step -- TODO confirm)
    preclusters -- optional earlier clustering passed through to get_pairs()
    """
    cdr3lengthfname = self.args.workdir + '/cdr3lengths.csv'
    with opener('w')(cdr3lengthfname) as outfile:
        writer = csv.DictWriter(outfile, ('unique_id', 'second_unique_id', 'cdr3_length', 'second_cdr3_length', 'score'))
        writer.writeheader()
        for query_name, second_query_name in self.get_pairs(preclusters):
            cdr3_length = waterer.info[query_name]['cdr3_length']
            second_cdr3_length = waterer.info[second_query_name]['cdr3_length']
            same_length = cdr3_length == second_cdr3_length
            if not self.args.is_data:
                # On simulated data we know the truth, so cross-check the inference.
                assert cdr3_length == int(self.reco_info[query_name]['cdr3_length'])
                if second_cdr3_length != int(self.reco_info[second_query_name]['cdr3_length']):
                    print 'WARNING did not infer correct cdr3 length'
                    assert False
            # score is 1 for equal lengths, 0 otherwise
            writer.writerow({'unique_id': query_name, 'second_unique_id': second_query_name, 'cdr3_length': cdr3_length, 'second_cdr3_length': second_cdr3_length, 'score': int(same_length)})
    clust = Clusterer(0.5, greater_than=True)  # i.e. cluster together if same_length == True
    clust.cluster(cdr3lengthfname, debug=False)
    os.remove(cdr3lengthfname)  # the CSV was only a scratch file for clustering
    return clust
def cluster(self, kmeans, hyper):
    """Build a Clusterer over the relevant documents, run it, and keep it on self."""
    rel_docs = self.get_rel_docs()
    data_folder = APIAdapter.get_data_foldername(self.get_search_term())
    clusterer = Clusterer(rel_docs, data_folder, kmeans, hyper)
    clusterer.cluster()
    self.clusterer = clusterer
def map_segments_to_clusters(x):
    """Map-phase worker: cluster the lines of one file segment.

    x is a pair ((filename, start, end, size), clusterer_config); the result
    is a single-entry list keyed by FIXED_MAP_JOB_KEY so every mapper's
    output lands under the same reduce key.
    """
    # print('mapper: %s working on %s' % (os.getpid(), x))
    segment, config = x
    filename, start, end, size = segment
    segment_lines = FileSegmentReader.read(filename, start, end, size)
    found_clusters = Clusterer(**config).find(segment_lines)
    return [(FIXED_MAP_JOB_KEY, found_clusters)]
def run_hmm(self, algorithm, sw_info, parameter_in_dir, parameter_out_dir='', preclusters=None, hmm_type='', stripped=False, prefix='', \
            count_parameters=False, plotdir=None, make_clusters=False):  # @parameterfetishist
    """Write HMM input, run the HMM (possibly over several subprocesses), read
    its output, and optionally cluster the resulting pair scores.

    Returns the Clusterer when make_clusters is set, otherwise None.
    """
    if prefix == '' and stripped:
        prefix = 'stripped'
    print '\n%shmm' % prefix
    csv_infname = self.args.workdir + '/' + prefix + '_hmm_input.csv'
    csv_outfname = self.args.workdir + '/' + prefix + '_hmm_output.csv'
    self.write_hmm_input(csv_infname, sw_info, preclusters=preclusters, hmm_type=hmm_type, stripped=stripped, parameter_dir=parameter_in_dir)
    print ' running'
    sys.stdout.flush()
    start = time.time()
    if self.args.n_procs > 1:
        # Split the input into per-process files under workdir/hmm-<iproc>/,
        # launch one subprocess per chunk, then merge their outputs.
        self.split_input(self.args.n_procs, infname=csv_infname, prefix='hmm')
        procs = []
        for iproc in range(self.args.n_procs):
            cmd_str = self.get_hmm_cmd_str(algorithm, csv_infname, csv_outfname, parameter_dir=parameter_in_dir, iproc=iproc)
            procs.append(Popen(cmd_str.split()))
            time.sleep(0.1)  # stagger process launches slightly
        for proc in procs:
            proc.wait()
        for iproc in range(self.args.n_procs):
            if not self.args.no_clean:
                # remove the per-process input chunk (lives in workdir/hmm-<iproc>/)
                os.remove(csv_infname.replace(self.args.workdir, self.args.workdir + '/hmm-' + str(iproc)))
        self.merge_hmm_outputs(csv_outfname)
    else:
        cmd_str = self.get_hmm_cmd_str(algorithm, csv_infname, csv_outfname, parameter_dir=parameter_in_dir)
        check_call(cmd_str.split())
    sys.stdout.flush()
    print ' hmm run time: %.3f' % (time.time()-start)
    hmminfo = self.read_hmm_output(algorithm, csv_outfname, make_clusters=make_clusters, count_parameters=count_parameters, parameter_out_dir=parameter_out_dir, plotdir=plotdir)
    if self.args.pants_seated_clustering:
        viterbicluster.cluster(hmminfo)
    clusters = None
    if make_clusters:
        if self.outfile is not None:
            self.outfile.write('hmm clusters\n')
        else:
            print '%shmm clusters' % prefix
        clusters = Clusterer(self.args.pair_hmm_cluster_cutoff, greater_than=True, singletons=preclusters.singletons)
        clusters.cluster(input_scores=hmminfo, debug=self.args.debug, reco_info=self.reco_info, outfile=self.outfile, plotdir=self.args.plotdir+'/pairscores')
    if self.args.outfname is not None:
        outpath = self.args.outfname
        if self.args.outfname[0] != '/':  # if full output path wasn't specified on the command line
            outpath = os.getcwd() + '/' + outpath
        shutil.copyfile(csv_outfname, outpath)
    if not self.args.no_clean:
        if os.path.exists(csv_infname):  # if only one proc, this will already be deleted
            os.remove(csv_infname)
        os.remove(csv_outfname)
    return clusters
def hamming_precluster(self, preclusters=None):
    """Precluster sequence pairs by Hamming distance.

    Computes pairwise Hamming distances (fanned out over a process Pool when
    n_fewer_procs > 1), clusters pairs whose distance falls below
    hamming_cluster_cutoff, and returns the resulting Clusterer.
    """
    assert self.args.truncate_pairs
    start = time.time()
    print 'hamming clustering'
    chopped_off_left_sides = False  # never set True in this body; kept for the warning below
    hamming_info = []
    all_pairs = self.get_pairs(preclusters)
    # print ' getting pairs: %.3f' % (time.time()-start); start = time.time()
    # all_pairs = itertools.combinations(self.input_info.keys(), 2)
    if self.args.n_fewer_procs > 1:
        pool = Pool(processes=self.args.n_fewer_procs)
        subqueries = self.split_input(self.args.n_fewer_procs, info=list(all_pairs), prefix='hamming')  # NOTE 'casting' to a list here makes me nervous!
        # Build one list of {id_a, id_b, seq_a, seq_b} dicts per worker.
        sublists = []
        for queries in subqueries:
            sublists.append([])
            for id_a, id_b in queries:
                sublists[-1].append({'id_a': id_a, 'id_b': id_b, 'seq_a': self.input_info[id_a]['seq'], 'seq_b': self.input_info[id_b]['seq']})
        # print ' preparing info: %.3f' % (time.time()-start); start = time.time()
        subinfos = pool.map(utils.get_hamming_distances, sublists)  # NOTE this starts the proper number of processes, but they seem to end up i/o blocking or something (wait % stays at zero, but they each only get 20 or 30 %cpu on stoat)
        pool.close()
        pool.join()
        # print ' starting pools: %.3f' % (time.time()-start); start = time.time()
        for isub in range(len(subinfos)):
            hamming_info += subinfos[isub]
        # print ' merging pools: %.3f' % (time.time()-start); start = time.time()
    else:
        hamming_info = self.get_hamming_distances(all_pairs)
    if self.outfile is not None:
        self.outfile.write('hamming clusters\n')
    # greater_than=False: cluster together when the distance is BELOW the cutoff
    clust = Clusterer(self.args.hamming_cluster_cutoff, greater_than=False)  # NOTE this 0.5 is reasonable but totally arbitrary
    clust.cluster(input_scores=hamming_info, debug=self.args.debug, outfile=self.outfile, reco_info=self.reco_info)
    # print ' clustering: %.3f' % (time.time()-start); start = time.time()
    if chopped_off_left_sides:
        print 'WARNING encountered unequal-length sequences, so chopped off the left-hand sides of each'
    print ' hamming time: %.3f' % (time.time() - start)
    return clust
def process_single_core(self, filenames):
    """Feed every line of every file, in order, through one Clusterer.

    Files are processed sequentially on a single processor; returns the
    clusterer's accumulated result.
    """
    worker = Clusterer(**self.cluster_config)
    for path in filenames:
        with open(path, 'r') as handle:
            for record in handle:
                worker.process_line(record)
    return worker.result()
def process_pipe(self):
    """Cluster lines streamed continuously from stdin.

    Ctrl-C (KeyboardInterrupt) stops reading gracefully and the partial
    result accumulated so far is still returned.
    """
    clusterer = Clusterer(**self.cluster_config)
    try:
        for line in sys.stdin:
            clusterer.process_line(line)
    except KeyboardInterrupt:
        # Treat Ctrl-C as "stop reading": fall through and return what we have.
        pass
    # BUGFIX: the original returned from inside a ``finally`` block, which
    # silently swallowed *every* exception raised by process_line(), not just
    # KeyboardInterrupt.  Returning here preserves the Ctrl-C behavior while
    # letting genuine errors propagate to the caller.
    return clusterer.result()
def test(self):
    """Two near-identical lines merge into one cluster (variable column masked);
    the unrelated line stays a singleton."""
    input_lines = [
        'hello 1 y 3',
        'hello 1 x 3',
        'abc m n q',
    ]
    expected = [
        [['hello', '1', 'y', '3'], 2, ['hello', '1', '---', '3']],
        [['abc', 'm', 'n', 'q'], 1, ['abc', 'm', 'n', 'q']],
    ]
    clusterer = Clusterer(k1=1, k2=1, max_dist=0.5, variables=[])
    self.assertEqual(clusterer.find(input_lines), expected)
def test_min_members(self):
    """With min_members=2 the singleton cluster is filtered out of the result."""
    input_lines = [
        'hello 1 y 3',
        'hello 1 x 3',
        'abc m n q',
    ]
    expected = [
        [['hello', '1', 'y', '3'], 2, ['hello', '1', '---', '3']],
    ]
    clusterer = Clusterer(k1=1, k2=1, max_dist=0.5, variables=[], min_members=2)
    self.assertEqual(clusterer.find(input_lines), expected)
def __init__(self, parent=None): super(Window, self).__init__(parent) # Set Features List self.selectedFeatures = Clusterer.getDefaultFeatures() self.availableFeatures = Clusterer.getAllFeatures() self.availableFeatures &= ~self.selectedFeatures self.initUI() sys.stdout = EmittingStream(textWritten=self.normalOutputWritten) self.clustererThread = ClustererThread(self) self.clusterer = None self.clusters = None self.currentCluster = 0 self.backgroundColor = QtGui.QColor(0, 0, 0)
def get_clusterer(trainer, args, output_size, model):
    """Load active-learning artifacts and attach a pre-computed Clusterer to
    the trainer.

    Reads a_paths.pth / units.pth / a_hmaps.pth from args.active_learning_name,
    builds trainer.active_dict mapping each path to its mask, units, and index,
    then constructs a Clusterer that loads its datapoints/histogram/clustering
    from disk.  Returns the clusterer (also stored on trainer.clusterer).
    """
    assert len(trainer.layer_list_all) == 1, 'Active learning is only implemented for a single layer ablations'
    assert args.clustering, 'Active learning samples are associated with a specific clustering. The clustering flag ' \
                            'is necessary'
    active_paths = torch.load(os.path.join(args.active_learning_name, 'a_paths.pth'))
    active_units = torch.load(os.path.join(args.active_learning_name, 'units.pth'))
    active_binary_masks = torch.load(os.path.join(args.active_learning_name, 'a_hmaps.pth'))
    # The three tensors are parallel arrays indexed by sample; key them by path.
    trainer.active_dict = {}
    for i, path in enumerate(active_paths):
        trainer.active_dict[path] = {'mask': active_binary_masks[i], 'units': active_units[i], 'index': i}
    cluster_path = os.path.join(args.active_learning_name, 'cluster')
    # load_* flags make the Clusterer restore previous state instead of recomputing.
    trainer.clusterer = Clusterer(trainer.loaders['train'], model, path_store=cluster_path, model_dim=args.embedding_dim,
                                  load_datapoints=True, load_histogram=True, load_clustering=True, load_name_final=True,
                                  save_results=True, output_size=output_size, args=args)
    return trainer.clusterer
def createFeaturesList(self, features):
    """Create feature-name list from a feature bit mask.

    Each set bit in *features* is translated to its name via
    Clusterer.getFeatureName; names are returned lowest-bit first.
    """
    ql = []
    # PERF: only bit positions below the mask's highest set bit can contribute,
    # so iterate bit_length() positions (O(log mask)) instead of the original
    # xrange(features) (O(mask)).  Identical output for any non-negative mask.
    for i in xrange(features.bit_length()):
        flag = ((features >> i) & 1) << i
        if flag:
            ql.append(Clusterer.getFeatureName(flag))
    return ql
def __init__(self):
    """Wire up the naming helpers, database wrappers, feature extractor, and
    clusterer used by this controller."""
    self.actuNames = ActuatorNames()
    self.sensorNames = SensorNames()
    self.bdm = BDWrapper()
    self.expLogColl = CollectionWrapper('experience_log')  # experiment log storage
    # Full zone list is used; the partial list below is kept as an alternative.
    #self.zonelist = self.csv2list('metadata/partialzonelist.csv')
    self.zonelist = self.csv2list('metadata/zonelist.csv')
    self.feater = FeatureExtractor()
    self.clust = Clusterer()
def hamming_precluster(self, preclusters=None):
    """Cluster sequence pairs whose Hamming distance is below the configured
    cutoff; distances are computed in parallel when n_fewer_procs > 1.
    Returns the resulting Clusterer."""
    assert self.args.truncate_pairs
    start = time.time()
    print 'hamming clustering'
    chopped_off_left_sides = False  # NOTE(review): never set True here -- the warning below is dead code; confirm
    hamming_info = []
    all_pairs = self.get_pairs(preclusters)
    # print ' getting pairs: %.3f' % (time.time()-start); start = time.time()
    # all_pairs = itertools.combinations(self.input_info.keys(), 2)
    if self.args.n_fewer_procs > 1:
        pool = Pool(processes=self.args.n_fewer_procs)
        subqueries = self.split_input(self.args.n_fewer_procs, info=list(all_pairs), prefix='hamming')  # NOTE 'casting' to a list here makes me nervous!
        # One work list per pool worker, each entry carrying both ids and sequences.
        sublists = []
        for queries in subqueries:
            sublists.append([])
            for id_a, id_b in queries:
                sublists[-1].append({'id_a':id_a, 'id_b':id_b, 'seq_a':self.input_info[id_a]['seq'], 'seq_b':self.input_info[id_b]['seq']})
        # print ' preparing info: %.3f' % (time.time()-start); start = time.time()
        subinfos = pool.map(utils.get_hamming_distances, sublists)  # NOTE this starts the proper number of processes, but they seem to end up i/o blocking or something (wait % stays at zero, but they each only get 20 or 30 %cpu on stoat)
        pool.close()
        pool.join()
        # print ' starting pools: %.3f' % (time.time()-start); start = time.time()
        for isub in range(len(subinfos)):
            hamming_info += subinfos[isub]
        # print ' merging pools: %.3f' % (time.time()-start); start = time.time()
    else:
        hamming_info = self.get_hamming_distances(all_pairs)
    if self.outfile is not None:
        self.outfile.write('hamming clusters\n')
    clust = Clusterer(self.args.hamming_cluster_cutoff, greater_than=False)  # NOTE this 0.5 is reasonable but totally arbitrary
    clust.cluster(input_scores=hamming_info, debug=self.args.debug, outfile=self.outfile, reco_info=self.reco_info)
    # print ' clustering: %.3f' % (time.time()-start); start = time.time()
    if chopped_off_left_sides:
        print 'WARNING encountered unequal-length sequences, so chopped off the left-hand sides of each'
    print ' hamming time: %.3f' % (time.time()-start)
    return clust
def addFeature(self):
    """Move each item selected in the available list over to the selected list,
    transferring its feature bit between the two masks."""
    for entry in self.availableFeatureList.selectedItems():
        flag = Clusterer.getFeatureByName(entry.text())
        self.availableFeatures &= ~flag
        self.selectedFeatures |= flag
        row = self.availableFeatureList.row(entry)
        self.availableFeatureList.takeItem(row)
        self.selectedFeatureList.addItem(entry)
def run(self):
    """Worker-thread entry point: read the source image, gather UI options,
    run the clusterer, and push results back into the main window (self.mw)."""
    path = self.mw.sourcePathField.text()
    if not path:
        print "[Error] File path is empty"
        return
    try:
        img = Clusterer.readImage(path)
        imageBGRA = cv2.cvtColor(img, cv2.cv.CV_BGR2BGRA)
        self.mw.refreshSource(imageBGRA)
        features = self.mw.selectedFeatures
        if not features:
            return  # nothing selected, nothing to do
        self.mw.clusterer = Clusterer()
        # Qt gives RGB accessors; the clusterer expects a BGR tuple.
        backgroundColor = self.mw.backgroundColor
        backgroundColor = backgroundColor.blue(), backgroundColor.green(), backgroundColor.red()
        if self.mw.transparentBg.isChecked():
            backgroundColor = None  # None means transparent background
        mode = self.mw.modeCombo.itemText(self.mw.modeCombo.currentIndex())
        mode = Clusterer.getModeByName(mode)
        modeK = self.mw.modeK.itemText(self.mw.modeK.currentIndex())
        modeK = Clusterer.getKModeByName(modeK)
        k = self.mw.clusterCount.value()
        self.mw.runButton.setEnabled(False)  # block re-entry while running
        self.mw.clusters = self.mw.clusterer.getClusters(path, mode=mode, kmode=modeK, clusterCount=k, features=features, backgroundColor=backgroundColor, slider=self.mw.clusterSlider.value())
        self.mw.currentCluster = 0
        self.mw.refreshCluster()
        self.mw.saveButton.setEnabled(True)
        self.mw.clusterer.graph(self.mw.figure)
        self.mw.canvas.setMinimumSize(self.mw.canvas.size())
        self.mw.canvas.draw()
    except (OSError, cv2.error, urllib2.HTTPError) as err:
        # File, OpenCV, or download errors are reported but not fatal.
        print err
    self.mw.runButton.setEnabled(True)  # always re-enable the button
def addFeature(self):
    """Transfer the currently-selected available features into the selected set.

    Updates both bit masks and moves the list-widget items accordingly.
    """
    chosen = self.availableFeatureList.selectedItems()
    for widget_item in chosen:
        bit = Clusterer.getFeatureByName(widget_item.text())
        # Clear the bit from "available", set it in "selected".
        self.availableFeatures &= ~bit
        self.selectedFeatures |= bit
        self.availableFeatureList.takeItem(self.availableFeatureList.row(widget_item))
        self.selectedFeatureList.addItem(widget_item)
def cdr3_length_precluster(self, waterer, preclusters=None):
    """Group queries whose inferred CDR3 lengths agree.

    Emits one CSV row per query pair with score = 1 when the two lengths
    match, clusters on that score (cutoff 0.5, greater_than), deletes the
    scratch CSV, and returns the Clusterer.
    """
    cdr3lengthfname = self.args.workdir + '/cdr3lengths.csv'
    with opener('w')(cdr3lengthfname) as outfile:
        writer = csv.DictWriter(outfile, ('unique_id', 'second_unique_id', 'cdr3_length', 'second_cdr3_length', 'score'))
        writer.writeheader()
        for query_name, second_query_name in self.get_pairs(preclusters):
            cdr3_length = waterer.info[query_name]['cdr3_length']
            second_cdr3_length = waterer.info[second_query_name]['cdr3_length']
            same_length = cdr3_length == second_cdr3_length
            if not self.args.is_data:
                # Simulated input: verify the inferred lengths against the truth.
                assert cdr3_length == int(self.reco_info[query_name]['cdr3_length'])
                if second_cdr3_length != int(self.reco_info[second_query_name]['cdr3_length']):
                    print 'WARNING did not infer correct cdr3 length'
                    assert False
            writer.writerow({'unique_id':query_name, 'second_unique_id':second_query_name, 'cdr3_length':cdr3_length, 'second_cdr3_length':second_cdr3_length, 'score':int(same_length)})
    clust = Clusterer(0.5, greater_than=True)  # i.e. cluster together if same_length == True
    clust.cluster(cdr3lengthfname, debug=False)
    os.remove(cdr3lengthfname)  # scratch file only
    return clust
def search_click(self):
    """Search-button handler: build the golden corpus for the entered term,
    cluster its relevant documents, and refresh the representative view."""
    term = self.searchbox.text()
    self._search_term = term
    if self.gene_button.isChecked() and self.fileselected:
        if not self.fileName:
            print("Error! getting file name")
            return
        corpus = GoldenCorpus(term, self.fileName)
        corpus.fetchData()
        self.rel_docs = corpus.get_rel_docs_pmid()
        self.mesh_terms = corpus.get_mesh_terms()
        explosion = DataForEachMeshTerm(self.mesh_terms, term)
        data_folder = explosion.get_data_foldername(term)
        clus = Clusterer(self.rel_docs, data_folder, True, 5)
        (self.representative_id, self.representative,
         self.best_mesh_terms_id, self.best_mesh_terms) = clus.cluster()
        if self.representative:
            self.updateRepresentativeInformation()
    elif self.pmid_button.isChecked():
        print("Golden corpus exists..")
    else:
        print("Please select related file..")
def run(self):
    """Clustering-thread body: load the image at the path in the UI, collect
    the mode/k/background options, run getClusters, and update the window."""
    path = self.mw.sourcePathField.text()
    if not path:
        print "[Error] File path is empty"
        return
    try:
        img = Clusterer.readImage(path)
        imageBGRA = cv2.cvtColor(img, cv2.cv.CV_BGR2BGRA)
        self.mw.refreshSource(imageBGRA)
        features = self.mw.selectedFeatures
        if not features:
            return  # no features chosen -> nothing to cluster
        self.mw.clusterer = Clusterer()
        # Repack the QColor into a (b, g, r) tuple for OpenCV ordering.
        backgroundColor = self.mw.backgroundColor
        backgroundColor = backgroundColor.blue(), backgroundColor.green(), backgroundColor.red()
        if self.mw.transparentBg.isChecked():
            backgroundColor = None
        mode = self.mw.modeCombo.itemText(self.mw.modeCombo.currentIndex())
        mode = Clusterer.getModeByName(mode)
        modeK = self.mw.modeK.itemText(self.mw.modeK.currentIndex())
        modeK = Clusterer.getKModeByName(modeK)
        k = self.mw.clusterCount.value()
        self.mw.runButton.setEnabled(False)  # prevent concurrent runs
        self.mw.clusters = self.mw.clusterer.getClusters(path, mode=mode, kmode=modeK, clusterCount=k, features=features, backgroundColor=backgroundColor, slider=self.mw.clusterSlider.value())
        self.mw.currentCluster = 0
        self.mw.refreshCluster()
        self.mw.saveButton.setEnabled(True)
        self.mw.clusterer.graph(self.mw.figure)
        self.mw.canvas.setMinimumSize(self.mw.canvas.size())
        self.mw.canvas.draw()
    except (OSError, cv2.error, urllib2.HTTPError) as err:
        print err
    self.mw.runButton.setEnabled(True)  # re-enable even after an error
def toggleClusterCount(self, index):
    """Show or hide the k-count spinbox and the slider to match the k-mode."""
    current_name = self.modeK.itemText(self.modeK.currentIndex())
    current_mode = Clusterer.getKModeByName(current_name)
    # Spinbox only when the user picks k directly; slider only in slider mode.
    (self.clusterCount.show if current_mode == Clusterer.KMODE_USER
     else self.clusterCount.hide)()
    (self.clusterSliderWidget.show if current_mode == Clusterer.KMODE_SLIDER
     else self.clusterSliderWidget.hide)()
def toggleClusterCount(self, index):
    """Keep the cluster-count and slider widgets' visibility in sync with the
    currently selected k-mode."""
    selected_name = self.modeK.itemText(self.modeK.currentIndex())
    selected_mode = Clusterer.getKModeByName(selected_name)
    wants_count = selected_mode == Clusterer.KMODE_USER
    wants_slider = selected_mode == Clusterer.KMODE_SLIDER
    if wants_count:
        self.clusterCount.show()
    else:
        self.clusterCount.hide()
    if wants_slider:
        self.clusterSliderWidget.show()
    else:
        self.clusterSliderWidget.hide()
from fastapi import FastAPI from vector_space import VectorSpace from org_dataset import OrgDataset from org_recommender import OrgRecommender from clusterer import Clusterer from keyword_finder import KeywordFinder from keyword_matcher import KeywordMatcher from gcd_utils import get_account_liked_tags app = FastAPI() dataset = OrgDataset.load_instance('./orgs.pkl') vs = VectorSpace.load_instance('./test_vs.pkl') recommender = OrgRecommender(dataset, vs) c = Clusterer(dataset, vs, 20) kw_finder = KeywordFinder(dataset, vs) matcher = KeywordMatcher(c, kw_finder, vs.data_centroid) @app.get('/get_init_recs/') async def get_init_recs(userId: str, numOrgs: int): keywords = get_account_liked_tags(userId) centroid = matcher.get_kw_centroid(keywords) orgids = recommender.centroid_recommend(centroid, numOrgs) return_arr = [] for id in orgids: entry = {'orgId': id} return_arr.append(entry) return return_arr """Example get request for api on local host: http://127.0.0.1:8000/get_recommendations/?userId=334614c0-7f55-11ea-b1bc-2f9730f51173&numOrgs=2
# Grid of 100x100 # 3 circles of 15x15 with each 10 points import testgenerator from clusterer import Clusterer from clustervisualizer import ClusterVisualizer points = testgenerator.create_circle_points(200, 8, 15, 10) clusterer = Clusterer(1, 2, 2) clustervisualizer = ClusterVisualizer(clusterer) clusterer.set_points(points) clusterer.run()
# Fit a clusterer on the standardized "picked" rows and persist per-k labels.
# NOTE(review): `clean`, `drop_cols`, `model`, and `data` are defined elsewhere
# in this script -- confirm their definitions before modifying.
not_picked = clean[(clean['eligible'] == 1) & (clean['oz'] == 0)]
picked = clean[clean['oz'] == 1]
nonfeatures = drop_columns(picked, drop_cols)
features = picked.columns
## standardize
standardize = StandardScaler()
X, features = picked.values, picked.columns.values
X = standardize.fit_transform(X)
## build model
cluster_labels = pd.DataFrame()
for k in range(6, 7):  # single k for now; widen the range to sweep k
    pax = Clusterer(model, n_clusters=k, random_state=24)
    centers = pax.fit(X)
    pax.store_features(features)
    print("{} grouped {} clusters.".format(model, np.shape(centers)[0]))
    ## update labels and scores for column k
    filepath = "{}/{}/labels.pkl".format(data, model)
    with open(filepath, "rb") as f:
        # re-read k from the fitted model's attributes (shadows the loop k)
        k = pax.attributes['n_clusters']
        model_labels_df = pickle.load(f)
        model_labels_df["k={}".format(k)] = pax.attributes['labels_']
        model_labels_df["k{}silhouette_score".format(k)] = pax.get_silhouette_samples()
    model_labels_df.to_pickle(filepath)
    print("Updated labels @ {}".format(filepath))
def __init__(self, model, optimizer, all_loaders, args, resume_epoch):
    """Set up the trainer: optimizer, per-layer metadata, GAN generator,
    latent samples, and (optionally) the clusterer.

    model -- the audio/image matching model being trained
    optimizer -- the optimizer actually used (see NOTE below)
    all_loaders -- dict of data loaders; 'train' is read here
    args -- parsed command-line namespace
    resume_epoch -- epoch to resume from
    """
    self.resume_epoch = resume_epoch
    self.args = args
    # NOTE(review): this SGD instance is immediately overwritten by the
    # ``optimizer`` argument a few lines below -- confirm which is intended.
    self.optimizer = torch.optim.SGD((model.parameters()), args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
    self.layer_list_all = args.layers
    # Static per-layer metadata: channel depth and spatial size of each GAN layer.
    self.layers_dict = {
        'layer2': {'name': 'layer2', 'depth': 512, 'size': 4},
        'layer3': {'name': 'layer3', 'depth': 512, 'size': 8},
        'layer4': {'name': 'layer4', 'depth': 512, 'size': 8},
        'layer5': {'name': 'layer5', 'depth': 256, 'size': 16},
        'layer6': {'name': 'layer6', 'depth': 256, 'size': 16},
    }
    self.generator = gantest.GanTester(args.path_model_gan, self.layer_list_all, device=torch.device('cuda'))
    self.z = self.generator.standard_z_sample(200000)  # fixed pool of latent vectors
    self.model = model
    self.optimizer = optimizer
    self.loaders = all_loaders
    self.loss_type = args.loss_type
    # Other parameters
    self.margin = args.margin
    self.clustering = args.clustering
    self.epoch = 0
    # ImageNet mean/std used to undo input normalization for visualization.
    self.unorm = utils.UnNormalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
    output_size = 32 if 'large' in args.audio_model else 256
    if args.active_learning:
        # Active learning loads a pre-computed clusterer from disk.
        active_learning.get_clusterer(self, args, output_size, model)
    else:
        if args.clustering:
            print('Creating cluster from scratch')
            cluster_path = os.path.join(self.args.results, 'clusters', args.name_checkpoint + '_' + str(time.time()))
            self.clusterer = Clusterer(self.loaders['train'], model, path_store=cluster_path, model_dim=args.embedding_dim,
                                       save_results=True, output_size=output_size, args=self.args,
                                       path_cluster_load=args.path_cluster_load)
    self.epochs_clustering = self.args.epochs_clustering
    # Cluster-derived state; populated by the clustering step each epoch.
    self.clusters = self.mean_clust = self.std_clust = self.cluster_counts = self.clusters_unit = None
class Trainer: def __init__(self, model, optimizer, all_loaders, args, resume_epoch): self.resume_epoch = resume_epoch self.args = args self.optimizer = torch.optim.SGD((model.parameters()), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) self.layer_list_all = args.layers self.layers_dict = { 'layer2': { 'name': 'layer2', 'depth': 512, 'size': 4 }, 'layer3': { 'name': 'layer3', 'depth': 512, 'size': 8 }, 'layer4': { 'name': 'layer4', 'depth': 512, 'size': 8 }, 'layer5': { 'name': 'layer5', 'depth': 256, 'size': 16 }, 'layer6': { 'name': 'layer6', 'depth': 256, 'size': 16 }, } self.generator = gantest.GanTester(args.path_model_gan, self.layer_list_all, device=torch.device('cuda')) self.z = self.generator.standard_z_sample(200000) self.model = model self.optimizer = optimizer self.loaders = all_loaders self.loss_type = args.loss_type # Other parameters self.margin = args.margin self.clustering = args.clustering self.epoch = 0 self.unorm = utils.UnNormalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)) output_size = 32 if 'large' in args.audio_model else 256 if args.active_learning: active_learning.get_clusterer(self, args, output_size, model) else: if args.clustering: print('Creating cluster from scratch') cluster_path = os.path.join( self.args.results, 'clusters', args.name_checkpoint + '_' + str(time.time())) self.clusterer = Clusterer( self.loaders['train'], model, path_store=cluster_path, model_dim=args.embedding_dim, save_results=True, output_size=output_size, args=self.args, path_cluster_load=args.path_cluster_load) self.epochs_clustering = self.args.epochs_clustering self.clusters = self.mean_clust = self.std_clust = self.cluster_counts = self.clusters_unit = None def train(self): """ Main training loop. For each epoch train the model and save checkpoint if the results are good. 
Cluster every epochs_clustering epochs """ best_eval = 0 try: for epoch in range(self.resume_epoch, self.args.epochs): self.epoch = epoch # Clustering if self.clustering and \ ((epoch % self.epochs_clustering == 0) or (self.args.resume and epoch == self.resume_epoch)): self.clusterer.save_results = True clus, mean_clust, std_clust = self.clusterer.create_clusters( iteration=0) self.clusters = torch.FloatTensor(clus).cuda() self.mean_clust = torch.FloatTensor(mean_clust) self.std_clust = torch.FloatTensor(std_clust) self.cluster_counts = 1 / self.clusters.max(1)[0] self.clusters_unit = self.cluster_counts.view(self.clusters.size(0), 1).expand_as(self.clusters) * \ self.clusters self.clusterer.name_with_images_clusters() self.clusterer.name_clusters() self.optimize_neurons() # This is for visualization: # self.clusterer.segment_images() # self.clusterer.create_web_images() # segment_images has to be uncommented before self.clusterer.create_web_clusters(with_images=True) utils.adjust_learning_rate(self.args, self.optimizer, epoch) # Train for one epoch print('Starting training epoch ' + str(epoch)) self.train_epoch(epoch) # Evaluate on validation set print('Starting evaluation epoch ' + str(epoch)) eval_score, recalls = self.eval() self.args.writer.add_scalar('eval_score', eval_score, epoch) # Remember best eval score and save checkpoint is_best = eval_score > best_eval best_eval = max(eval_score, best_eval) utils.save_checkpoint( { 'epoch': epoch + 1, 'model_state_dict': self.model.state_dict(), 'best_eval': best_eval, 'recall_now': recalls, 'optimizer': self.optimizer.state_dict(), }, is_best, self.args, name_checkpoint=self.args.name_checkpoint) except KeyboardInterrupt: print('You decided to finish the training at epoch ' + str(epoch + 1)) def train_epoch(self, epoch): """ Train one epoch. 
It consists of 5 steps Step 1: Compute the output of the positive image Step 2: Compute the mask for the positive image features Step 3: Generate the negative image from this mask Step 4: Compute the output of this negative Step 5: Compute all the losses And after that, do the backpropagation and weight updates """ if not self.args.use_cpu: torch.cuda.synchronize() batch_time = utils.AverageMeter() data_time = utils.AverageMeter() losses_meter = utils.AverageMeter() # Switch to train mode self.model.train() end = time.time() N_examples = self.loaders['train'].dataset.__len__() loss_list_total = { 'loss_regular': 0, 'loss_neg': 0, 'loss_hardneg': 0, 'loss_total': 0 } for batch_id, (image_input, audio_input, neg_images, nframes, path, image_raw) in enumerate(self.loaders['train']): loss_list = { 'loss_regular': 0, 'loss_neg': 0, 'loss_hardneg': 0, 'loss_total': 0 } # Measure data loading time data_time.update(time.time() - end) if not self.args.use_cpu: audio_input = audio_input.cuda(async=True) if not self.args.loading_image: path_ints = [p.split('/')[-1] for p in path ] # in case the audio is inside a subfolder v_init = self.z[int(path_ints[0])] z_img = torch.FloatTensor(image_input.size(0), v_init.shape[0]) for k in range(image_input.size(0)): z_img[k, :] = self.z[int(path_ints[k])] image_input = self.generator.generate_images(z_img, intervention=None) image_input = utils.transform(image_input).detach() else: image_input = image_input.cuda() neg_images = neg_images.cuda() # STEP 1: Compute output positive model_output = self.model(image_input, audio_input, []) image_output = model_output[0] audio_output = model_output[1] neg_images = [] pooling_ratio = round(audio_input.size(3) / audio_output.size(3)) nframes.div_(pooling_ratio) binary_mask_0 = None # Only do steps 2-4 if we want to train with semantic negatives if self.loss_type == 'negatives_edited' or self.loss_type == 'negatives_both': # STEP 2: Compute mask from image features limits = 
np.zeros((image_input.size(0), 2)) for i in range(image_input.size(0)): pos_image = image_input[i, :, :, :] nF = nframes[i] matchmap = utils.compute_matchmap( image_output[i], audio_output[i][:, :, :nF]) matchmap = matchmap.data.cpu().numpy().copy() matchmap = matchmap.transpose(2, 0, 1) # l, h, w matchmap = matchmap / (matchmap.max() + 1e-10) matchmap_image = matchmap.max(axis=0) threshold = 0.95 # ind_max = np.argmax(matchmap_image) ind_max = np.argmax(matchmap) ind_t = ind_max // (matchmap.shape[2] * matchmap.shape[1]) ind_h = (ind_max % (matchmap.shape[2] * matchmap.shape[1]) ) // matchmap.shape[1] ind_w = (ind_max % (matchmap.shape[2] * matchmap.shape[1]) ) % matchmap.shape[1] limits[i, 0] = ind_t limits[i, 1] = ind_t + 1 if self.clustering: if self.args.active_learning and 'active' in path[i]: neg_img = active_learning.get_negatives( self, path_ints[i]) else: v = (image_output[i][:, ind_h, ind_w] - self.mean_clust.cuda()) / ( self.std_clust.cuda() + 1e-8) normalized_clusters = np.matmul( self.clusters.cpu(), v.detach().cpu().numpy().transpose()) sorted_val = -np.sort(-normalized_clusters[:]) sorted_val = np.clip(sorted_val, 0, 4) if np.sum(sorted_val) <= 0: print( "None of the clusters was close to the image feature. If this happens regularly, " "it probably means they were low quality clusters. 
Did you pretrain with a " "regular loss before clustering?") prob_samples = sorted_val / np.sum(sorted_val) sorted_id = np.argsort(-normalized_clusters[:]) cluster_id = sorted_id[0] norm = 0 threshold_random = 0.95 # The number of units to be ablated grows if we cannot generate a good (changed) negative # The following numbers are the starting number of units to change num_units_dict = { 'layer2': 30, 'layer3': 30, 'layer4': 140, 'layer5': 30, 'layer6': 30 } thresold_heatmap = threshold count = 0 binary_mask_eval = matchmap_image > ( thresold_heatmap * matchmap_image.max()) binary_mask_eval = utils.geodesic_dilation( binary_mask_eval, (ind_h, ind_w)) binary_mask_eval = cv2.resize( binary_mask_eval, (128, 128)) bmask = torch.Tensor(binary_mask_eval).cuda() bmask = bmask.view(1, 128, 128).expand(3, 128, 128) while norm < threshold_random: with torch.no_grad(): binary_mask = matchmap_image > ( thresold_heatmap * matchmap_image.max()) binary_mask = utils.geodesic_dilation( binary_mask, (ind_h, ind_w)) if binary_mask_0 is None: binary_mask_0 = cv2.resize( binary_mask, (224, 224)) # STEP 3: Generate new image z_img = self.z[int(path_ints[i])] z_img = z_img[np.newaxis, :] _ = self.generator.generate_images(z_img) intervention = {} for layer_n in self.layer_list_all: units_ids = self.layers_units[layer_n][ cluster_id][:num_units_dict[ layer_n]] layer_size = self.layers_dict[layer_n][ 'size'] layer_dim = self.layers_dict[layer_n][ 'depth'] ablation, replacement = self.get_ablation_replacement( params=[layer_dim, units_ids], option='specific') ablation_final = cv2.resize( binary_mask, (layer_size, layer_size)) ablation_final = np.tile( ablation_final, (layer_dim, 1, 1)).astype( np.float32) ablation_final = torch.cuda.FloatTensor( ablation_final) ablation_final = ablation.view( layer_dim, 1, 1).expand_as(ablation_final ) * ablation_final intervention[layer_n] = ( ablation_final, replacement) neg_img = self.generator.generate_images( z_img, intervention=intervention).detach() 
neg_img_t = utils.transform( neg_img).detach() norm = (neg_img_t[0, :, :, :] - pos_image.detach()) norm = norm * bmask norm = torch.norm(torch.norm(torch.norm( norm, dim=2), dim=1), dim=0) norm_normalized = norm / torch.norm( torch.norm(torch.norm( pos_image.detach() * bmask, dim=2), dim=1), dim=0) norm = norm_normalized.item() for layer_n in self.layer_list_all: num_units_dict[layer_n] = num_units_dict[ layer_n] + 40 # increase units to change thresold_heatmap = thresold_heatmap - 0.1 threshold_random = threshold_random - 0.05 cluster_id = np.random.choice( sorted_id, size=1, p=prob_samples)[0] count = count + 1 else: # random edited negatives binary_mask = matchmap_image > (threshold * matchmap_image.max()) binary_mask = utils.geodesic_dilation( binary_mask, (ind_h, ind_w)) if binary_mask_0 is None: binary_mask_0 = cv2.resize(binary_mask, (224, 224)) norm = 0 threshold_random = 0.95 p = 0.4 while norm < threshold_random: with torch.no_grad(): intervention = {} for layer_n in self.layer_list_all: layer_size = self.layers_dict[layer_n][ 'size'] layer_dim = self.layers_dict[layer_n][ 'depth'] ablation, replacement = self.get_ablation_replacement( params=[layer_dim, True, 0.5], option='random') ablation_final = cv2.resize( binary_mask, (layer_size, layer_size)) ablation_final = np.tile( ablation_final, (layer_dim, 1, 1)).astype(np.float32) ablation_final = torch.cuda.FloatTensor( ablation_final) ablation_final = ablation.view( layer_dim, 1, 1).expand_as( ablation_final) * ablation_final intervention[layer_n] = (ablation_final, replacement) # STEP 3: Generate new image z_img = self.z[int(path_ints[i])] z_img = z_img[np.newaxis, :].detach() neg_img = self.generator.generate_images( z_img, intervention=intervention).detach() neg_img_t = utils.transform(neg_img).detach() binary_mask = cv2.resize( binary_mask, (128, 128)) bmask = torch.Tensor(binary_mask).cuda() bmask = bmask.view(1, 128, 128).expand(3, 128, 128) norm = (neg_img_t[0, :, :, :] - pos_image.detach()) norm = 
norm * bmask norm = torch.norm(torch.norm(torch.norm(norm, dim=2), dim=1), dim=0) norm_normalized = norm / torch.norm(torch.norm( torch.norm(pos_image.detach() * bmask, dim=2), dim=1), dim=0) norm = norm_normalized.item() if random.random() > 0.2: p = p + 0.05 else: threshold_random = threshold_random - 0.01 neg_images.append(neg_img) neg_images = torch.cat(neg_images) neg_images_t = utils.transform(neg_images) # print(neg_images_t.size()) # STEP 4: Compute output negative image_output_neg, _, _ = self.model(neg_images_t, None, []) # STEP 5: Compute losses if self.args.active_learning: image_output, image_output_neg = active_learning.switch_pos_neg( self, image_input, image_output, image_output_neg, path) if self.loss_type == 'regular': loss = losses.sampled_margin_rank_loss(image_output, audio_output, nframes, self.margin, self.args.symfun) loss_list['loss_regular'] = loss.item() loss_list['loss_total'] = loss.item() elif self.loss_type == 'negatives_edited': # train with semantic negatives loss_regular = losses.sampled_margin_rank_loss( image_output, audio_output, nframes, self.margin, self.args.symfun) loss_neg = losses.negatives_loss(image_output, audio_output, image_output_neg, nframes, self.margin, self.args.symfun) loss = loss_regular + loss_neg loss_list['loss_regular'] = loss_regular.item() loss_list['loss_neg'] = loss_neg.item() loss_list['loss_total'] = loss.item() elif self.loss_type == 'negatives_hard': # train with hard negatives loss_regular = losses.sampled_margin_rank_loss( image_output, audio_output, nframes, self.margin, self.args.symfun) loss_neg = losses.hard_negative_loss(image_output, audio_output, nframes, self.margin, self.args.symfun) loss = loss_regular + loss_neg loss_list['loss_regular'] = loss_regular.item() loss_list['loss_neg'] = loss_neg.item() loss_list['loss_total'] = loss.item() elif self.loss_type == 'negatives_both': # combine hard negatives with semantic negatives loss_hardneg = losses.combined_random_hard_negative_loss( 
image_output, audio_output, image_output_neg, nframes, self.margin, self.args.symfun) loss_regular = losses.sampled_margin_rank_loss( image_output, audio_output, nframes, self.margin, self.args.symfun) loss_regular = torch.clamp(loss_regular, min=0, max=5) loss_hardneg = torch.clamp(loss_hardneg, min=0, max=5) loss = loss_regular + loss_hardneg loss_list['loss_regular'] = loss_regular.item() loss_list['loss_hardneg'] = loss_hardneg.item() loss_list['loss_total'] = loss.item() else: raise Exception( f'The loss function {self.loss_type} is not implemented.') last_sample = N_examples * epoch + batch_id * self.args.batch_size + image_input.size( 0) # Record loss losses_meter.update(loss.item(), image_input.size(0)) # Backward pass and update self.optimizer.zero_grad() loss.backward() self.optimizer.step() # Measure elapsed time batch_time.update(time.time() - end) end = time.time() # Print results if (batch_id + 1) % self.args.print_freq == 0: for name in loss_list: loss_list_total[name] += loss_list[name] for name in loss_list: loss_list_total[ name] = loss_list_total[name] / self.args.print_freq for loss_name in loss_list: self.args.writer.add_scalar(f'losses/{loss_name}', loss_list_total[loss_name], last_sample) print( f'Epoch: [{epoch}][{batch_id+1}/{len(self.loaders["train"])}]\t' f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' f'Data {data_time.val:.3f} ({data_time.avg:.3f})\t' f'Loss {losses_meter.val:.4f} ({losses_meter.avg:.4f})\t', flush=True) image_raw = self.unorm(image_input[0].data.cpu()) self.args.writer.add_image('positive', image_raw, last_sample) if self.loss_type == 'negatives_edited' or self.loss_type == 'negatives_both': image_raw_neg = self.unorm(neg_images[0].data.cpu()) image_neg = image_raw_neg / torch.max(image_raw_neg) self.args.writer.add_image('negative', image_neg, last_sample) self.args.writer.add_image( 'Images/region', 255 * np.array([binary_mask_0, binary_mask_0, binary_mask_0 ]).swapaxes(0, 1).swapaxes(1, 2), last_sample) 
                # (tail of the training loop's logging block — the enclosing
                # method begins above this chunk)
                # Reset the running loss accumulators after they were averaged
                # and written to tensorboard.
                loss_list_total = {k: 0 for k, v in loss_list_total.items()}
            else:
                # Not a print step: keep accumulating per-loss totals.
                for loss_name in loss_list:
                    loss_list_total[loss_name] += loss_list[loss_name]

    def optimize_neurons(self):
        """Run GAN dissection to rank generator units per cluster.

        Populates ``self.layers_units``: for each layer in
        ``self.layer_list_all`` and each cluster, a unit ordering ranked by
        the cluster's IoU score (noise units zeroed out first).
        Side effects: writes a dissection report directory under
        ``self.args.results`` and sets ``self.cluster_segmenter``.
        """
        # Set up console output
        verbose_progress(True)
        gan_model = self.generator.model
        annotate_model_shapes(gan_model, gen=True)
        # One output directory per run, timestamped to avoid collisions.
        outdir = os.path.join(
            self.args.results, 'dissect',
            self.args.name_checkpoint + '_' + str(time.time()))
        os.makedirs(outdir, exist_ok=True)
        size = 1000
        sample = z_sample_for_model(gan_model, size)
        train_sample = z_sample_for_model(gan_model, size, seed=2)
        dataset = TensorDataset(sample)
        train_dataset = TensorDataset(train_sample)
        self.cluster_segmenter = ClusterSegmenter(self.model, self.clusters,
                                                  self.mean_clust,
                                                  self.std_clust)
        segrunner = GeneratorSegRunner(self.cluster_segmenter)
        netname = outdir
        # Run dissect
        with torch.no_grad():
            dissect(
                outdir,
                gan_model,
                dataset,
                train_dataset=train_dataset,
                segrunner=segrunner,
                examples_per_unit=20,
                netname=netname,
                quantile_threshold='iqr',
                meta=None,
                make_images=False,  # True,
                make_labels=True,
                make_maxiou=False,
                make_covariance=False,
                make_report=True,
                make_row_images=True,
                make_single_images=True,
                batch_size=8,
                num_workers=8,
                rank_all_labels=True)
        sample_ablate = z_sample_for_model(gan_model, 16)
        dataset_ablate = TensorDataset(sample_ablate)
        # NOTE(review): data_loader is built but never used below — presumably
        # leftover from an ablation experiment; confirm before removing.
        data_loader = torch.utils.data.DataLoader(dataset_ablate,
                                                  batch_size=8,
                                                  shuffle=False,
                                                  num_workers=8,
                                                  pin_memory=True,
                                                  sampler=None)
        # Parse the dissection report produced above.
        with open(os.path.join(outdir, 'dissect.json')) as f:
            data = EasyDict(json.load(f))
        dissect_layer = {lrec.layer: lrec for lrec in data.layers}
        self.layers_units = {
            'layer2': [],
            'layer3': [],
            'layer4': [],
            'layer5': [],
            'layer6': [],
        }
        # Units considered noise; their ids are overwritten with 0 so they sort
        # to the front regardless of score.
        noise_units = np.array([35, 221, 496, 280])
        # Cluster ranking names are 1-indexed from 2 ('c_2-iou', 'c_3-iou', ...).
        for i in range(2, len(self.clusters) + 2):
            print('Cluster', i)
            rank_name = 'c_{0}-iou'.format(i)
            for l in range(len(self.layer_list_all)):
                ranking = next(
                    r for r in dissect_layer[self.layer_list_all[l]].rankings
                    if r.name == rank_name)
                # assumes 512 units per layer — TODO confirm for all layers
                unit_list = np.array(range(512))
                unit_list[noise_units] = 0
                ordering = np.argsort(ranking.score)
                units_list = unit_list[ordering]
                self.layers_units[self.layer_list_all[l]].append(units_list)
        # Mark the directory so that it's not done again.
        mark_job_done(outdir)

    def get_ablation_replacement(self, params=(), option='random'):
        """Build (ablation, replacement) CUDA tensors for an intervention.

        option='random': params = [dim_mask, binary, prob_ones]; a random
            (optionally binarised with probability ``prob_ones``) mask and a
            zero replacement.
        option='specific': params = [dim_mask, units_ids]; a zero mask with
            ones at ``units_ids`` and a zero replacement.
        Raises Exception for any other option.
        """
        if option == 'random':
            import random
            dim_mask = params[0]
            binary = params[1]
            values = np.random.rand(dim_mask)
            if binary:
                prob_ones = params[2]
                # NOTE(review): np.float is deprecated/removed in modern numpy;
                # would need np.float64 (or float) on upgrade.
                ablation = torch.cuda.FloatTensor(
                    (np.random.rand(dim_mask) < prob_ones).astype(
                        np.float)) if False else torch.FloatTensor(
                            (np.random.rand(dim_mask) < prob_ones).astype(
                                np.float)).cuda()
            else:
                ablation = torch.FloatTensor(values).cuda()
            replacement = torch.zeros(dim_mask).cuda()
        elif option == 'specific':
            units_ids = params[1]
            dim_mask = params[0]
            ablation, replacement = torch.zeros(dim_mask).cuda(), torch.zeros(
                dim_mask).cuda()
            ablation[units_ids] = 1  # color
        else:
            raise Exception('Please introduce a valid option')
        return ablation, replacement

    def eval(self):
        """Collect features for ``number_recall`` image/audio pairs and compute
        recall @{1, 5, 10} of predicting one modality from the other. It does
        not involve any hard or edited negative.

        Returns (eval_score, recalls) where eval_score = (A_r5 + I_r5) / 2.
        """
        number_recall = 500
        if not self.args.use_cpu:
            torch.cuda.synchronize()
        batch_time = utils.AverageMeter()
        # Switch to evaluate mode
        self.model.eval()
        end = time.time()
        N_examples = self.loaders['val'].dataset.__len__()
        image_embeddings = []  # torch.FloatTensor(N_examples, embedding_dim)
        audio_embeddings = []  # torch.FloatTensor(N_examples, embedding_dim)
        frame_counts = []
        with torch.no_grad():
            for i, (image_input, audio_input, negatives, nframes, path,
                    _) in enumerate(self.loaders['val']):
                # NOTE(review): hard-coded 500 duplicates number_recall —
                # presumably they are meant to stay in sync.
                if len(image_embeddings) * image_input.size(0) > 500:
                    break
                if not self.args.loading_image:
                    # Regenerate the images from stored latent vectors instead
                    # of loading them from disk.
                    path_ints = [p.split('/')[-1] for p in path
                                 ]  # in case the audio is inside a subfolder
                    v_init = self.z[int(path_ints[0])]
                    z_img = torch.FloatTensor(image_input.size(0),
                                              v_init.shape[0])
                    for k in range(image_input.size(0)):
                        z_img[k, :] = self.z[int(path_ints[k])]
                    image_input = self.generator.generate_images(
                        z_img, intervention=None)
                    image_input = utils.transform(image_input)
                    negatives = []
                else:
                    image_input = image_input.cuda()
                    negatives = [negatives.cuda()]
                # compute output
                model_output = self.model(image_input, audio_input, negatives)
                image_output = model_output[0]
                audio_output = model_output[1]
                image_embeddings.append(image_output.data.cpu())
                audio_embeddings.append(audio_output.data.cpu())
                # find pooling ratio
                # audio_input is (B, D, 40, T)
                # audio_output is (B, D, 1, T/p)
                pooling_ratio = round(
                    audio_input.size(3) / audio_output.size(3))
                # Rescale frame counts to the pooled time resolution.
                nframes.div_(pooling_ratio)
                frame_counts.append(nframes.cpu())
                batch_time.update(time.time() - end)
                end = time.time()
                if i % self.args.print_freq == 0:
                    print('Eval: [{0}/{1}]\t'.format(i + 1,
                                                     len(self.loaders['val'])),
                          flush=True)
        image_outputs = torch.cat(image_embeddings)
        audio_outputs = torch.cat(audio_embeddings)
        frame_counts_tensor = torch.cat(frame_counts)
        # Keep only the last N_examples pairs for the recall computation.
        N_examples = np.minimum(number_recall, N_examples)
        image_outputs = image_outputs[-N_examples:, :, :, :]
        audio_outputs = audio_outputs[-N_examples:, :, :, :]
        frame_counts_tensor = frame_counts_tensor[-N_examples:]
        # measure accuracy and record loss
        print('Computing recalls...')
        recalls = utils.calc_recalls(image_outputs,
                                     audio_outputs,
                                     frame_counts_tensor,
                                     loss_type=self.loss_type)
        A_r10 = recalls['A_r10']
        I_r10 = recalls['I_r10']
        A_r5 = recalls['A_r5']
        I_r5 = recalls['I_r5']
        A_r1 = recalls['A_r1']
        I_r1 = recalls['I_r1']
        print(
            ' * Audio R@10 {A_r10:.3f} Image R@10 {I_r10:.3f} over {N:d} validation pairs'
            .format(A_r10=A_r10, I_r10=I_r10, N=N_examples),
            flush=True)
        print(
            ' * Audio R@5 {A_r5:.3f} Image R@5 {I_r5:.3f} over {N:d} validation pairs'
            .format(A_r5=A_r5, I_r5=I_r5, N=N_examples),
            flush=True)
        print(
            ' * Audio R@1 {A_r1:.3f} Image R@1 {I_r1:.3f} over {N:d} validation pairs'
            .format(A_r1=A_r1, I_r1=I_r1, N=N_examples),
            flush=True)
        eval_score = (A_r5 + I_r5) / 2
        return eval_score, recalls
def gen_window_model(window_event, proc_events, clusterer=None):
    """Build a WindowModel (event n-gram + clustered event centroids) for one window.

    window_event: wm_name key of the window to model; returns None for the
        default window.
    proc_events: mapping window -> {EventType -> [event, ...]}.
    clusterer: optional Clusterer; a fresh instance is created per call when
        omitted. (The previous default of ``clusterer=Clusterer()`` was a
        mutable default argument: a single Clusterer was constructed at import
        time and silently shared — together with its accumulated data — across
        every call.)

    Returns a WindowModel wrapping a 5-gram over the window's event sequence,
    with per-event-type cluster centroids.
    """
    global default_window_event
    if window_event == default_window_event.wm_name:
        return None
    if clusterer is None:
        clusterer = Clusterer()
    assignments = {}
    clustered_events = {}
    for et in proc_events[window_event]:
        clusterer.clear_data()
        if et is EventType.NONE:
            continue
        try:
            for e in proc_events[window_event][et]:
                f = e.get_features()
                if len(f) == 0:
                    break
                clusterer.append_data(f)
            if clusterer.shape[1] == 0:
                continue
            centroids, assigns = clusterer.cluster(
                clusterer.recommend_clusters(), 10)
            clustered_events[str(et)] = centroids
            for i in range(len(proc_events[window_event][et])):
                assignments[proc_events[window_event][et][i]] = assigns[i]
        except NotImplementedError as e:
            # Best-effort: skip event types the clusterer cannot featurize.
            print(e)
    ngram = Ngram("")
    clustered_windowed_events = windowed_events[window_event][:]
    for i in range(len(clustered_windowed_events)):
        we = clustered_windowed_events[i]
        name = str(we.event_type)
        # Renamed from `id`, which shadowed the builtin.
        identifier = we.get_identifier()
        if identifier is not None:
            name += "[" + identifier + "]"
        if we in assignments:
            assignment = "{" + str(assignments[we]) + "}"
            # Substitute the cluster placeholder if present, else append.
            if "{cluster}" in name:
                name = name.replace("{cluster}", assignment)
            else:
                name += "[" + assignment + "]"
        clustered_windowed_events[i] = name
    # EventType.NONE acts as a sequence delimiter for the n-gram.
    sequence = " ".join(clustered_windowed_events).replace("EventType.NONE", ngram.delimiter)
    ngram.construct(sequence, 5)
    ngram.calculate_probabilities()
    window_model = WindowModel(ngram, clustered_events)
    return window_model
# Manual driver: pull a single video's data and its stored comment topics,
# cluster the topics, and print the result.
from database import Database
from youtube import YouTube
from clusterer import Clusterer

env = 'desktop'
db_name = 'comment_sense_3'
db = Database(env, db_name)
yt = YouTube()

# Hard-coded video id for this test run.
videoId = 'kQibkV_V8-c'
video_data = yt.video(videoId)
comment_topics = db.comment_topics(videoId)

cl = Clusterer(video_data, db)
topics = cl.cluster(comment_topics)
print(topics)
# Interactive driver: cluster tracks similar to a user-supplied song and
# print/save recommendations.
from clusterer import Clusterer
import webbrowser

# Get the user input track
track_name = input(
    "Enter the name (artist optional) of a song: ") or 'Give it up Knife Party'

# Run the clustering on the track
c = Clusterer(track_name=track_name, alg_type='affprop')
results = c.get_target_cluster()
c.plot_clusters()
print('Graph saved to ./Database/clusters.png')

# convert the track ids returned from clustering back into track data
# (item[1] presumably holds the Spotify track id — TODO confirm)
print('Loading 20 of', len(results), 'track recommendations, please wait...')
print()
shift_tracks = []
for i, item in enumerate(results):
    shift_tracks += [c.ret.sp.track(item[1])]


# output and save the recommended tracks to a file
# NOTE(review): this chunk is truncated mid-statement below — the final
# print call continues outside the visible source.
def output_recommendations(source, filename, tracks):
    print(source + ' Recommendations:')
    fout = open(filename, 'w')
    for track in tracks[:20]:
        print('track:', track['name'], '-',
              track['album']['artists'][0]['name'])
        print('track:', track['name'], '-',
    def widgetParameters(self):
        """Create the clustering parameter widgets.

        Builds the cluster-count controls, compactness slider, algorithm
        selector, background-colour picker, and feature list, and returns
        them assembled in a QVBoxLayout. Stores several widgets on self
        (autoK, clusterCount, modeK, clusterSlider*, modeCombo, colorPicker,
        transparentBg) for later reads by the run/slider handlers.
        """
        # Cluster Count
        self.autoK = QCheckBox(self.tr('Auto'))
        self.clusterCount = QSpinBox(self)
        self.clusterCount.setValue(2)
        self.clusterCount.setMinimum(1)
        self.modeK = QComboBox(self)
        hcluster = QHBoxLayout()
        hcluster.addWidget(QLabel(self.tr('Cluster count:')))
        hcluster.addWidget(self.modeK)
        hcluster.addWidget(self.clusterCount)
        # Slider
        hslider = QHBoxLayout()
        clusterLabel = QLabel(self.tr('Cluster count'))
        self.clusterSliderLabel = QLabel()
        compactnessLabel = QLabel(self.tr('Compactness'))
        self.compactnessSliderLabel = QLabel()
        self.clusterSlider = QSlider(QtCore.Qt.Horizontal)
        self.clusterSlider.valueChanged[int].connect(self.sliderMoved)
        self.clusterSlider.setMinimumWidth(100)
        # Slider trades cluster count against compactness; starts centred.
        self.clusterSlider.setValue(50)
        self.clusterSlider.setMaximum(100)
        hslider.addWidget(clusterLabel)
        hslider.addWidget(self.clusterSliderLabel)
        hslider.addWidget(self.clusterSlider)
        hslider.addWidget(compactnessLabel)
        hslider.addWidget(self.compactnessSliderLabel)
        self.clusterSliderWidget = QWidget()
        self.clusterSliderWidget.setLayout(hslider)
        # Set default mode: populate the K-mode combo and preselect the
        # Clusterer's default.
        self.modeK.currentIndexChanged.connect(self.toggleClusterCount)
        default = Clusterer.getDefaultKMode()
        defaultIndex = 0
        for i, (mode, name) in enumerate(Clusterer.getAllKModes()):
            if mode == default:
                defaultIndex = i
            self.modeK.addItem(name)
        self.modeK.setCurrentIndex(defaultIndex)
        # Algo: same pattern for the clustering algorithm combo.
        combo = QComboBox(self)
        default = Clusterer.getDefaultMode()
        defaultIndex = 0
        for i, (mode, name) in enumerate(Clusterer.getAllModes()):
            if mode == default:
                defaultIndex = i
            combo.addItem(name)
        combo.setCurrentIndex(defaultIndex)
        halgo = QHBoxLayout()
        halgo.addWidget(QLabel(self.tr('Algorithm:')))
        halgo.addWidget(combo)
        self.modeCombo = combo
        # BG color: small colour swatch button opening a colour dialog;
        # defaults to black with transparency enabled.
        color = QtGui.QColor(0, 0, 0)
        self.colorPicker = QPushButton('')
        self.colorPicker.setMaximumSize(QtCore.QSize(16, 16))
        self.colorPicker.clicked.connect(self.colorDialog)
        self.setPickerColor(color, self.colorPicker)
        self.transparentBg = QCheckBox(self.tr('Transparent'))
        self.transparentBg.setChecked(1)
        hbg = QHBoxLayout()
        hbg.addWidget(QLabel(self.tr('Background color:')))
        hbg.addWidget(self.colorPicker)
        hbg.addWidget(self.transparentBg)
        hbg.addStretch(1)
        # Features
        featureBox = QGroupBox(self.tr('Features'))
        features = self.widgetFeatureList()
        featureBox.setLayout(features)
        # Param Box
        paramBox = QGroupBox(self.tr('Parameters'))
        paramLayout = QVBoxLayout()
        paramLayout.addLayout(hcluster)
        paramLayout.addWidget(self.clusterSliderWidget)
        paramLayout.addLayout(halgo)
        paramLayout.addLayout(hbg)
        paramBox.setLayout(paramLayout)
        runButton = self.widgetRun()
        vbox = QVBoxLayout()
        vbox.addWidget(paramBox)
        vbox.addWidget(featureBox)
        vbox.addLayout(runButton)
        vbox.addStretch(1)
        return vbox
'''
Find the household with the lowest carbon emissions from a single group
'''


def find_greenest(cluster):
    """Return (lowest_total, index) of the greenest household in *cluster*.

    Each element of *cluster* is assumed to be a household row whose first
    and last entries are non-emission fields (presumably an id and a label —
    TODO confirm against the preprocessor's output schema); the emission
    total is the sum of the columns in between.

    Bug fixed: the original compared ``sum(cluster[1:len(cluster) - 1])`` —
    a slice of the whole group, invariant across the loop — against ``min``,
    and then stored ``cluster[i]`` (a row, not a total) as the minimum, so it
    could never identify the greenest household. It also shadowed the
    builtin ``min``. Returns (inf, -1) for an empty group.
    """
    lowest = float('inf')
    lowest_index = -1
    for i, household in enumerate(cluster):
        total = sum(household[1:len(household) - 1])
        if total < lowest:
            lowest = total
            lowest_index = i
    return lowest, lowest_index


# Preprocessing
preprocessor = Preprocessor()
preprocessor.run_preprocessor()

# Elbow method
elbow = Elbow()
elbow.run_elbow()

# Clustering
clusterer = Clusterer()
clusterer.run_clusterer()

# Regression
regressor = Regressor()
regressor.run_regressor()
    def run_hmm(self, algorithm, sw_info, parameter_in_dir, parameter_out_dir='', preclusters=None, hmm_type='', stripped=False, prefix='', \
                count_parameters=False, plotdir=None, make_clusters=False):  # @parameterfetishist
        """Write hmm input, run the hmm (possibly over several subprocesses),
        read its output, and optionally cluster the results.

        Returns a Clusterer when make_clusters is set, otherwise None.
        Side effects: writes/removes csv files under self.args.workdir and
        optionally copies the hmm output to self.args.outfname.
        (Python 2 module — print statements.)
        """
        if prefix == '' and stripped:
            prefix = 'stripped'
        print '\n%shmm' % prefix
        csv_infname = self.args.workdir + '/' + prefix + '_hmm_input.csv'
        csv_outfname = self.args.workdir + '/' + prefix + '_hmm_output.csv'
        self.write_hmm_input(csv_infname, sw_info, preclusters=preclusters, hmm_type=hmm_type, stripped=stripped, parameter_dir=parameter_in_dir)
        print '    running'
        sys.stdout.flush()
        start = time.time()
        if self.args.n_procs > 1:
            # Split the input into per-process files, launch one subprocess
            # each, then merge their outputs.
            self.split_input(self.args.n_procs, infname=csv_infname, prefix='hmm')
            procs = []
            for iproc in range(self.args.n_procs):
                cmd_str = self.get_hmm_cmd_str(algorithm, csv_infname, csv_outfname, parameter_dir=parameter_in_dir, iproc=iproc)
                procs.append(Popen(cmd_str.split()))
                time.sleep(0.1)  # stagger launches slightly
            for proc in procs:
                proc.wait()
            for iproc in range(self.args.n_procs):
                if not self.args.no_clean:
                    # remove each per-process input file (in workdir/hmm-<iproc>/)
                    os.remove(
                        csv_infname.replace(
                            self.args.workdir,
                            self.args.workdir + '/hmm-' + str(iproc)))
            self.merge_hmm_outputs(csv_outfname)
        else:
            cmd_str = self.get_hmm_cmd_str(algorithm, csv_infname, csv_outfname, parameter_dir=parameter_in_dir)
            check_call(cmd_str.split())

        sys.stdout.flush()
        print '    hmm run time: %.3f' % (time.time() - start)

        hmminfo = self.read_hmm_output(algorithm, csv_outfname, make_clusters=make_clusters, count_parameters=count_parameters, parameter_out_dir=parameter_out_dir, plotdir=plotdir)

        if self.args.pants_seated_clustering:
            viterbicluster.cluster(hmminfo)

        clusters = None
        if make_clusters:
            if self.outfile is not None:
                self.outfile.write('hmm clusters\n')
            else:
                print '%shmm clusters' % prefix
            # Cluster pairs whose hmm score exceeds the cutoff.
            clusters = Clusterer(self.args.pair_hmm_cluster_cutoff,
                                 greater_than=True,
                                 singletons=preclusters.singletons)
            clusters.cluster(input_scores=hmminfo,
                             debug=self.args.debug,
                             reco_info=self.reco_info,
                             outfile=self.outfile,
                             plotdir=self.args.plotdir + '/pairscores')

        if self.args.outfname is not None:
            outpath = self.args.outfname
            if self.args.outfname[0] != '/':  # if full output path wasn't specified on the command line
                outpath = os.getcwd() + '/' + outpath
            shutil.copyfile(csv_outfname, outpath)

        if not self.args.no_clean:
            if os.path.exists(csv_infname):  # if only one proc, this will already be deleted
                os.remove(csv_infname)
            os.remove(csv_outfname)

        return clusters
# Grid of 100x100 # 3 circles of 15x15 with each 10 points import testgenerator from clusterer import Clusterer from clustervisualizer import ClusterVisualizer points = testgenerator.create_circle_points(1000, 50, 50, 20, point_mass=10) clusterer = Clusterer(5, 10, 2) clustervisualizer = ClusterVisualizer(clusterer) clusterer.set_points(points) clusterer.run()
class Analyzer:
    """Fetch, normalize, and cluster building sensor/actuator time series
    via a BD (building data) wrapper. (Python 2 module — print statements
    and dict.iteritems.)
    """
    # Class-level placeholders; real values are assigned in __init__.
    bdm = None          # BDWrapper: building-data API client
    expLogColl = None   # CollectionWrapper over the 'experience_log' collection
    #timeGran = timedelta(minutes=5)
    timeGran = timedelta(minutes=2)  # resampling granularity
    actuNames = None    # ActuatorNames catalog
    sensorNames = None  # SensorNames catalog
    zonelist = None     # list of zone ids loaded from csv
    feater = None       # FeatureExtractor
    clust = None        # Clusterer

    def __init__(self):
        self.actuNames = ActuatorNames()
        self.sensorNames = SensorNames()
        self.bdm = BDWrapper()
        self.expLogColl = CollectionWrapper('experience_log')
        #self.zonelist = self.csv2list('metadata/partialzonelist.csv')
        self.zonelist = self.csv2list('metadata/zonelist.csv')
        self.feater = FeatureExtractor()
        self.clust = Clusterer()

    def csv2list(self, filename):
        # Read the first column of a csv file into a list.
        outputList = list()
        with open(filename, 'r') as fp:
            reader = csv.reader(fp, delimiter=',')
            for row in reader:
                outputList.append(row[0])
        return outputList

    def get_actuator_uuid(self, zone=None, actuType=None):
        # Resolve exactly one sensor uuid for (zone, actuType);
        # raises QRError when zero or multiple uuids match.
        context = dict()
        if zone != None:
            context['room'] = zone
        if actuType != None:
            context['template'] = actuType
        uuids = self.bdm.get_sensor_uuids(context)
        if len(uuids) > 1:
            raise QRError('Many uuids are found', context)
        elif len(uuids) == 0:
            raise QRError('No uuid is found', context)
        else:
            return uuids[0]

    def normalize_data_avg(self, rawData, beginTime, endTime):
        # Resample rawData onto a regular timeGran grid, interpolating each
        # grid point by distance-weighted average of its nearest neighbors.
        procData = pd.Series({beginTime: float(rawData[0])})
        tp = beginTime
        while tp <= endTime:
            tp = tp + self.timeGran
            # nearest sample at or before tp
            leftSeries = rawData[:tp]
            if len(leftSeries) > 0:
                idx = len(leftSeries) - 1
                leftVal = leftSeries[idx]
                leftIdx = leftSeries.index[idx]
            else:
                leftVal = None
            # nearest sample at or after tp
            rightSeries = rawData[tp:]
            if len(rightSeries) > 0:
                rightVal = rightSeries[0]
                rightIdx = rightSeries.index[0]
            else:
                rightVal = None
            if rightVal == None and leftVal != None:
                newVal = leftVal
            elif rightVal != None and leftVal == None:
                newVal = rightVal
            elif tp == leftIdx:
                newVal = leftVal
            elif tp == rightIdx:
                newVal = rightVal
            elif rightVal != None and leftVal != None:
                # inverse-distance weighting between the two neighbors
                leftDist = (tp - leftIdx).total_seconds()
                rightDist = (rightIdx - tp).total_seconds()
                newVal = (leftVal * rightDist + rightVal * leftDist) / (rightDist + leftDist)
            else:
                print "ERROR: no data found in raw data"
                newVal = None
            newData = pd.Series({tp: newVal})
            # NOTE(review): Series.append is removed in modern pandas
            # (use pd.concat on upgrade).
            procData = procData.append(newData)
        return procData

    def normalize_data_nextval_deprecated(self, rawData, beginTime, endTime):
        # Deprecated variant: fill each grid point with the next raw value,
        # falling back to the previous one.
        procData = pd.Series({beginTime: float(rawData[0])})
        tp = beginTime
        while tp <= endTime:
            tp = tp + self.timeGran
            leftSeries = rawData[:tp]
            if len(leftSeries) > 0:
                idx = len(leftSeries) - 1
                leftVal = leftSeries[idx]
                leftIdx = leftSeries.index[idx]
            else:
                leftVal = None
            rightSeries = rawData[tp:]
            if len(rightSeries) > 0:
                rightVal = rightSeries[0]
                rightIdx = rightSeries.index[0]
            else:
                rightVal = None
            if rightVal != None:
                newVal = rightVal
            else:
                newVal = leftVal
            newData = pd.Series({tp: newVal})
            procData = procData.append(newData)
        return procData

    def normalize_data(self, rawData, beginTime, endTime, normType):
        # Clip to [beginTime, endTime], pin endpoint values, and resample to
        # a 2-minute grid. normType selects pad-fill ('nextval') or mean
        # aggregation ('avg'); anything else returns None.
        rawData = rawData[beginTime:endTime]
        if not beginTime in rawData.index:
            rawData[beginTime] = rawData.head(1)[0]
            rawData = rawData.sort_index()
        if not endTime in rawData.index:
            rawData[endTime] = rawData.tail(1)[0]
            rawData = rawData.sort_index()
        # NOTE(review): fill_method=/how= resample keywords are legacy pandas.
        if normType == 'nextval':
            procData = rawData.resample('2Min', fill_method='pad')
        elif normType == 'avg':
            procData = rawData.resample('2Min', how='mean')
        else:
            procData = None
        return procData

    def receive_a_sensor(self, zone, actuType, beginTime, endTime, normType):
        # Fetch one point's time series, clean sentinel -1 values (except for
        # damper command), and normalize it.
        print zone, actuType
        uuid = self.get_actuator_uuid(zone, actuType)
        rawData = self.bdm.get_sensor_ts(uuid, 'PresentValue', beginTime, endTime)
        if actuType != self.actuNames.damperCommand:
            rawData = self.remove_negativeone(rawData)
        procData = self.normalize_data(rawData, beginTime, endTime, normType)
        return procData

    def receive_entire_sensors_notstore(self, beginTime, endTime, normType, exceptZoneList=[]):
        # NOTE(review): mutable default argument (exceptZoneList=[]) — only
        # read here, but fragile if ever mutated.
        #TODO: Should be parallelized here
        dataDict = dict()
        for zone in self.zonelist:
            if not zone in exceptZoneList:
                dataDict[zone] = self.receive_zone_sensors(zone, beginTime, endTime, normType)
        return dataDict

    def receive_entire_sensors(self, beginTime, endTime, filename, normType, exceptZoneList=[]):
        # Same as receive_entire_sensors_notstore, but pickles the result.
        # filename='data/'+beginTime.isoformat()[0:-7].replace(':','_') + '.pkl'
        dataDict = self.receive_entire_sensors_notstore(beginTime, endTime, normType, exceptZoneList=exceptZoneList)
        with open(filename, 'wb') as fp:
            pickle.dump(dataDict, fp)
            # json.dump(dataDict,fp)

    def clustering(self, inputData, dataDict):
        # Build per-zone feature vectors (fft, minmax, dtw; freq computed but
        # unused) and k-means cluster them.
        fftFeat = self.feater.get_fft_features(inputData, dataDict)
        minmaxFeat = self.feater.get_minmax_features(dataDict)
        dtwFeat = self.feater.get_dtw_features(inputData, dataDict)
        freqFeat = self.feater.get_freq_features(inputData, dataDict)
        featDict = dict()
        for zone in self.zonelist:
            featList = list()
            featList.append(fftFeat[zone])
            featList.append(minmaxFeat[zone])
            featList.append(dtwFeat[zone])
            #featList.append(freqFeat[zone])
            featDict[zone] = featList
        print featDict['RM-4132']  # debug output for one known zone
        return self.clust.cluster_kmeans(featDict)

    def remove_negativeone(self, data):
        # Replace -1 sentinel readings with the preceding value.
        if -1 in data.values:
            indices = np.where(data == -1)
            for idx in indices:
                data[idx] = data[idx - 1]
        return data

    def receive_zone_sensors(self, zone, beginTime, endTime, normType):
        # Fetch every known actuator/sensor type for one zone; types without
        # a resolvable uuid are skipped.
        zoneDict = dict()
        for actuType in self.actuNames.nameList + self.sensorNames.nameList:
            if actuType == 'Actual Supply Flow':
                pass  # NOTE(review): no-op branch, presumably a debug hook
            try:
                uuid = self.get_actuator_uuid(zone, actuType)
            except QRError:
                continue
            # if actuType == self.actuNames.commonSetpoint:
            #     wcad = self.receive_a_sensor(zone, 'Warm Cool Adjust', beginTime, endTime, normType)
            #     data = self.receive_a_sensor(zone, actuType, beginTime, endTime, normType)
            #     data = data + wcad
            #     pass
            # NOTE(review): both branches below are identical; the split is
            # presumably left over from special-casing damper command.
            if actuType != self.actuNames.damperCommand:
                if actuType == self.actuNames.occupiedCommand:
                    pass
                data = self.receive_a_sensor(zone, actuType, beginTime, endTime, normType)
            else:
                data = self.receive_a_sensor(zone, actuType, beginTime, endTime, normType)
            zoneDict[actuType] = data
        return zoneDict

    def store_zone_sensors(self, zone, beginTime, endTime, normType, filename):
        # Append each point type's series to a csv file.
        # NOTE(review): writes to hard-coded 'rm4132.csv', ignoring `filename`.
        data = self.receive_zone_sensors(zone, beginTime, endTime, normType)
        # with open(filename, 'wb') as fp:
        #     w = csv.DictWriter(fp, data.keys())
        #     w.writeheader()
        #     w.writerow(data)
        for key, val in data.iteritems():
            val.to_csv('rm4132.csv', header=key, mode='a')

    def store_minmax_dict(self):
        # Compute per-zone, per-point-type value ranges (hard-coded for known
        # command types, measured from data otherwise) and pickle them.
        minDict = defaultdict(dict)
        maxDict = defaultdict(dict)
        beginTime = datetime(2015, 2, 1)
        endTime = datetime(2015, 9, 1)
        shortBeginTime = datetime(2015, 8, 1)
        shortEndTime = datetime(2015, 8, 2)
        for zone in self.zonelist:
            for pointType in self.actuNames.nameList + self.sensorNames.nameList:
                try:
                    if pointType == 'Occupied Command':
                        minDict[zone][pointType] = 1
                        maxDict[zone][pointType] = 3
                    elif pointType == 'Cooling Command':
                        minDict[zone][pointType] = 0
                        maxDict[zone][pointType] = 100
                    # NOTE(review): 'Cooling Command' is repeated below, so the
                    # next branch effectively only handles 'Heating Command'.
                    elif pointType == 'Cooling Command' or pointType == 'Heating Command':
                        minDict[zone][pointType] = 0
                        maxDict[zone][pointType] = 100
                    elif pointType == 'Occupied Clg Min' or pointType == 'Occupied Htg Flow' or pointType == 'Cooling Max Flow':
                        uuid = self.get_actuator_uuid(zone, pointType)
                        data = self.bdm.get_sensor_ts(uuid, 'Presentvalue', shortBeginTime, shortEndTime)
                        minDict[zone][pointType] = min(data)
                        maxDict[zone][pointType] = max(data)
                    elif pointType == 'Temp Occ Sts':
                        minDict[zone][pointType] = 0
                        maxDict[zone][pointType] = 1
                    elif pointType == 'Reheat Valve Command':
                        minDict[zone][pointType] = 0
                        maxDict[zone][pointType] = 100
                    elif pointType == 'Actual Supply Flow' or pointType == 'Actual Sup Flow SP':
                        uuid = self.get_actuator_uuid(zone, pointType)
                        data = self.bdm.get_sensor_ts(uuid, 'Presentvalue', shortBeginTime, shortEndTime)
                        maxFlow = data[0]
                        minDict[zone][pointType] = 0
                        maxDict[zone][pointType] = maxFlow
                    elif pointType == 'Damper Position':
                        minDict[zone][pointType] = 0
                        maxDict[zone][pointType] = 100
                    elif pointType == 'Damper Command':
                        uuid = self.get_actuator_uuid(zone, pointType)
                        data = self.bdm.get_sensor_ts(uuid, 'Presentvalue', shortBeginTime, shortEndTime)
                        # range = mean +/- 2 std (meanAgain computed but unused)
                        meanData = np.mean(data)
                        stdData = np.std(data)
                        meanAgain = np.mean(data[np.logical_and(data <= meanData + 2 * stdData, data >= meanData - 2 * stdData)])
                        minDict[zone][pointType] = meanData - 2 * stdData
                        maxDict[zone][pointType] = meanData + 2 * stdData
                    else:
                        uuid = self.get_actuator_uuid(zone, pointType)
                        data = self.bdm.get_sensor_ts(uuid, 'Presentvalue', beginTime, endTime)
                        minDict[zone][pointType] = min(data)
                        maxDict[zone][pointType] = max(data)
                except:
                    # NOTE(review): bare except silently swallows all errors
                    # (including KeyboardInterrupt) — should be narrowed.
                    print "Something is wrong"
                    pass
        with open('metadata/mindict.pkl', 'wb') as fp:
            pickle.dump(minDict, fp)
        with open('metadata/maxdict.pkl', 'wb') as fp:
            pickle.dump(maxDict, fp)
    def __init__(self, appraisal, cluster_identity, marker, appraisal_colours):
        '''Precompute plotting info for one marker across all samples.

        appraisal: Appraisal
        cluster_identity: float, as in Clusterer
        marker: str
            the marker being plotted
        appraisal_colours: colour palette stored on self for later use

        Sets self._sequence_to_cluster, self._sorted_cluster_rep_and_count,
        self._cluster_sequence_to_order and self.max_count.
        '''
        self.appraisal_colours = appraisal_colours
        logging.debug("Generating plot info for %s" % marker)
        # Collect all OTUs from all samples so that they can be processed
        # together.
        all_binned_otus = []
        all_assembled_not_binned_otus = []
        all_not_found_otus = []
        max_count = 0

        # yuck. Sloppy scope in Python, but not in lambdas when I need it..
        # Append this marker's OTUs to `totality` and return the running
        # maximum of per-sample counts.
        def add_to_totality(otus, totality, max_count):
            count = 0
            for otu in otus:
                if otu.marker == marker:
                    totality.append(otu)
                    count += otu.count
            if count > max_count:
                return count
            else:
                return max_count

        for sample_appraisal in appraisal.appraisal_results:
            max_count = add_to_totality(sample_appraisal.binned_otus,
                                        all_binned_otus, max_count)
            max_count = add_to_totality(
                sample_appraisal.assembled_not_binned_otus(),
                all_assembled_not_binned_otus, max_count)
            max_count = add_to_totality(sample_appraisal.not_found_otus,
                                        all_not_found_otus, max_count)
        logging.debug("Found maximal count of seqs as %i" % max_count)

        # Cluster the pooled OTUs and map each member sequence to its cluster.
        sequence_to_cluster = {}
        cluster_rep_and_count = []
        collection = OtuTableCollection()
        collection.otu_table_objects = [
            all_not_found_otus, all_assembled_not_binned_otus, all_binned_otus
        ]
        for cotu in Clusterer().cluster(collection, cluster_identity):
            cluster_rep_and_count.append([cotu.sequence, cotu.count])
            for otu in cotu.otus:
                sequence_to_cluster[otu.sequence] = cotu

        # Sort the OTUs by descending order of counts, so that more abundant
        # OTUs get colour.
        sorted_cluster_rep_and_count = sorted(cluster_rep_and_count,
                                              key=lambda x: x[1],
                                              reverse=True)
        # Rank (0-based) of each cluster representative sequence by abundance.
        cluster_sequence_to_order = {}
        i = 0
        for pair in sorted_cluster_rep_and_count:
            cluster_sequence_to_order[pair[0]] = i
            i += 1

        self._sequence_to_cluster = sequence_to_cluster
        self._sorted_cluster_rep_and_count = sorted_cluster_rep_and_count
        self._cluster_sequence_to_order = cluster_sequence_to_order
        self.max_count = max_count
def train(cfg, model, dataset, optimizer, scheduler=None, logger=None, is_continue=False, use_pretrained=False, cluster_vis_path=None):
    """Cluster-and-train loop for unsupervised re-ID.

    Each epoch: extract global features, cluster them into pseudo labels,
    refine the dataset to non-outliers, initialize a memory bank from the
    cluster centroids, and train on the refined set.

    cfg: config with TRAIN.{CHECKPOINT_PATH, EPOCHS, BATCHSIZE,
        SAVE_INTERVAL, PRETRAINED_PATH}
    is_continue: resume from the latest *.pth checkpoint in CHECKPOINT_PATH
    use_pretrained: load only model weights from PRETRAINED_PATH
    cluster_vis_path: optional path for cluster visualizations
    Side effects: prints progress, optionally writes tensorboard scalars and
    checkpoint files.
    """
    save_to = cfg.TRAIN.CHECKPOINT_PATH
    epochs = cfg.TRAIN.EPOCHS
    batch_size = cfg.TRAIN.BATCHSIZE
    if logger is None:
        print('>>> No tensorboard logger used in training.')
    else:
        print('>>> Logger is used in training.')
        counter = 0  # global tensorboard step
    if len(save_to) == 0:
        print('>>> No checkpoints will be saved.')
    start_ep = 0  # initiate start epoch number

    # Continue training from the latest checkpoint until all scheduled
    # epochs are done.  (translated from Chinese comment)
    if is_continue:
        print('>>> Continue training from the latest checkpoint.')
        if save_to is None:
            print('>>> Without checkpoint folder, cannot continue training!')
            exit(0)
        ckpts = glob.glob(os.path.join(save_to, '*.pth'))
        if len(ckpts) == 0:
            print('>>> No earlier checkpoints, train from the beginning.')
        else:
            start_ckpt = find_latest_checkpoint(ckpts)
            print('>>> Found earlier checkpoints, continue training with {}.'.
                  format(start_ckpt))
            # load latest model
            start_ep = torch.load(os.path.join(save_to, start_ckpt))['epoch']
            model_state = torch.load(os.path.join(
                save_to, start_ckpt))['model_state_dict']
            # Load weights, optimizer, scheduler state, etc.
            # (translated from Chinese comment)
            opt_state = torch.load(os.path.join(
                save_to, start_ckpt))['optimizer_state_dict']
            model.load_state_dict(model_state)
            optimizer.load_state_dict(opt_state)
            optimizer = opt_to_gpu(optimizer, torch.cuda.is_available())
            if scheduler is not None:
                scheduler_state = torch.load(os.path.join(
                    save_to, start_ckpt))['scheduler_state_dict']
                scheduler.load_state_dict(scheduler_state)
            if logger is not None:
                counter = torch.load(os.path.join(
                    save_to, start_ckpt))['logger_counter']

    # Start a fresh training run using only pretrained weights.
    # (translated from Chinese comment)
    if use_pretrained:
        print('>>> Use pretrained model weights to start a new training.')
        model_state = torch.load(
            cfg.TRAIN.PRETRAINED_PATH)['model_state_dict']
        # load model weights only  (translated from Chinese comment)
        model.load_state_dict(model_state)

    if torch.cuda.is_available():
        model = model.cuda()

    # training loop
    for epoch in range(start_ep, epochs):
        # extract global features
        print('>>> Extracting global features ...')
        features, v_labels, cam_labels = extract_global_features(
            img_shape=(256, 256),
            batch_size=batch_size,
            workers=8,
            model=model,
            dataset=dataset,
            mode='train',
            is_cuda=torch.cuda.is_available())
        # clustering: DBSCAN-style pseudo labels (eps=0.5)
        print('>>> Start clustering ...')
        features = merge_features_from_dict(features)
        pseudo_labels, num_ids, centroids = Clusterer(
            features, eps=0.5, is_cuda=torch.cuda.is_available()).cluster(
                visualize_path=cluster_vis_path, epoch=epoch + 1)
        # create non-outlier refined dataset
        print('>>> Refining dataset ...')
        good_dataset = refine_dataset((256, 256), dataset, pseudo_labels)
        sampler = ClusterSampler(good_dataset)
        sampler = torch.utils.data.BatchSampler(sampler,
                                                batch_size=cfg.TRAIN.BATCHSIZE,
                                                drop_last=False)
        # good_dataloader = DataLoader(good_dataset, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=4)
        good_dataloader = DataLoader(good_dataset,
                                     shuffle=False,
                                     batch_sampler=sampler,
                                     num_workers=8)
        # memory bank initialization (one slot per pseudo identity)
        memory = MemoryBank(num_feature_dims=2048,
                            num_samples=num_ids,
                            temp=0.07,
                            momentum=0.02)
        memory = init_memory_bank(memory, centroids)
        # training step
        for i, (imgs, pids, fnames, vids,
                camids) in enumerate(good_dataloader):
            if torch.cuda.is_available():
                imgs = imgs.cuda()
                memory = memory.cuda()
            optimizer.zero_grad()
            features = model(imgs)
            loss = memory(features, pids)  # update memory bank and compute loss
            loss.backward()
            optimizer.step()
            if (i + 1) % 50 == 0:
                # print loss each 50 iters
                print('[epoch: {}/{}][iter: {}/{}] loss: {}'.format(
                    epoch + 1, epochs, i + 1, len(good_dataloader), loss))
            # update logger
            if logger is not None:
                logger.add_scalar('loss', loss.item(), global_step=counter)
                logger.add_scalar('cluster_centroids',
                                  memory.num_samples,
                                  global_step=counter)
                logger.add_scalar(
                    'lr',
                    optimizer.state_dict()['param_groups'][0]['lr'],
                    global_step=counter)
                counter += 1
        # update scheduler
        if scheduler is not None:
            scheduler.step()
        # save checkpoint
        if len(save_to) != 0 and (epoch + 1) % cfg.TRAIN.SAVE_INTERVAL == 0:
            save_name = os.path.join(
                save_to, 'backbone-epoch-{}.pth'.format(epoch + 1))
            state_dict = {
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict':
                scheduler.state_dict() if scheduler is not None else None,
                'logger_counter': counter if logger is not None else None
            }
            torch.save(state_dict, save_name)
            print('>>> Checkpoint is saved as {}.'.format(save_name))
def __init__(self, config):
    """Build the wrapped Clusterer from *config* and expose its pattern generator.

    config: mapping of keyword arguments forwarded verbatim to Clusterer.
    """
    clusterer = Clusterer(**config)
    self.clusterer = clusterer
    # Convenience alias so callers need not reach through self.clusterer.
    self.pattern_generator = clusterer.pattern_generator