def cdr3_length_precluster(self, waterer, preclusters=None):
        """Precluster sequence pairs on whether their inferred cdr3 lengths agree.

        Writes a temporary csv scoring each pair 1 (same length) or 0
        (different), clusters on those scores, deletes the csv, and returns
        the resulting Clusterer.

        waterer -- object whose .info maps query name -> dict with 'cdr3_length'
        preclusters -- optional previous clustering passed to self.get_pairs()
        """
        cdr3lengthfname = self.args.workdir + '/cdr3lengths.csv'
        with opener('w')(cdr3lengthfname) as outfile:
            writer = csv.DictWriter(
                outfile, ('unique_id', 'second_unique_id', 'cdr3_length',
                          'second_cdr3_length', 'score'))
            writer.writeheader()
            for query_name, second_query_name in self.get_pairs(preclusters):
                cdr3_length = waterer.info[query_name]['cdr3_length']
                second_cdr3_length = waterer.info[second_query_name][
                    'cdr3_length']
                same_length = cdr3_length == second_cdr3_length
                if not self.args.is_data:
                    # On simulated data, check the inferred lengths against truth.
                    assert cdr3_length == int(
                        self.reco_info[query_name]['cdr3_length'])
                    if second_cdr3_length != int(
                            self.reco_info[second_query_name]['cdr3_length']):
                        print 'WARNING did not infer correct cdr3 length'
                        assert False
                writer.writerow({
                    'unique_id': query_name,
                    'second_unique_id': second_query_name,
                    'cdr3_length': cdr3_length,
                    'second_cdr3_length': second_cdr3_length,
                    'score': int(same_length)
                })

        clust = Clusterer(
            0.5,
            greater_than=True)  # i.e. cluster together if same_length == True
        clust.cluster(cdr3lengthfname, debug=False)
        os.remove(cdr3lengthfname)  # scratch file, no longer needed
        return clust
Example #2
0
 def cluster(self, kmeans, hyper):
     """Build a Clusterer over the relevant documents and run it.

     The finished clusterer is kept on ``self.clusterer``.
     """
     rel_docs = self.get_rel_docs()
     data_folder = APIAdapter.get_data_foldername(self.get_search_term())
     clusterer = Clusterer(rel_docs, data_folder, kmeans, hyper)
     clusterer.cluster()
     self.clusterer = clusterer
Example #3
0
def map_segments_to_clusters(x):
    """Map job: cluster the lines of one file segment.

    ``x`` is ``((filename, start, end, size), config)``.  Returns a
    one-element list of ``(FIXED_MAP_JOB_KEY, clusters)`` so every map
    result reduces under the same key.
    """
    segment, config = x
    filename, start, end, size = segment
    clusterer = Clusterer(**config)
    segment_lines = FileSegmentReader.read(filename, start, end, size)
    return [(FIXED_MAP_JOB_KEY, clusterer.find(segment_lines))]
    def run_hmm(self, algorithm, sw_info, parameter_in_dir, parameter_out_dir='', preclusters=None, hmm_type='', stripped=False, prefix='', \
                count_parameters=False, plotdir=None, make_clusters=False):  # @parameterfetishist
        """Write hmm input, run the hmm binary (fanning out over processes), read its output.

        Returns a Clusterer over the hmm pair scores if ``make_clusters`` is
        set, otherwise None.  Input/output csvs live under
        ``self.args.workdir`` and are removed afterwards unless
        ``self.args.no_clean`` is set.
        """

        if prefix == '' and stripped:
            prefix = 'stripped'
        print '\n%shmm' % prefix
        csv_infname = self.args.workdir + '/' + prefix + '_hmm_input.csv'
        csv_outfname = self.args.workdir + '/' + prefix + '_hmm_output.csv'
        self.write_hmm_input(csv_infname, sw_info, preclusters=preclusters, hmm_type=hmm_type, stripped=stripped, parameter_dir=parameter_in_dir)
        print '    running'
        sys.stdout.flush()
        start = time.time()
        if self.args.n_procs > 1:
            # Split the input csv into per-process chunks, run one hmm process
            # per chunk, then merge the per-process outputs back together.
            self.split_input(self.args.n_procs, infname=csv_infname, prefix='hmm')
            procs = []
            for iproc in range(self.args.n_procs):
                cmd_str = self.get_hmm_cmd_str(algorithm, csv_infname, csv_outfname, parameter_dir=parameter_in_dir, iproc=iproc)
                procs.append(Popen(cmd_str.split()))
                time.sleep(0.1)  # brief pause between launches (presumably to stagger startup -- TODO confirm)
            for proc in procs:
                proc.wait()
            for iproc in range(self.args.n_procs):
                if not self.args.no_clean:
                    # per-process input chunk lives in workdir/hmm-<iproc>/
                    os.remove(csv_infname.replace(self.args.workdir, self.args.workdir + '/hmm-' + str(iproc)))
            self.merge_hmm_outputs(csv_outfname)
        else:
            cmd_str = self.get_hmm_cmd_str(algorithm, csv_infname, csv_outfname, parameter_dir=parameter_in_dir)
            check_call(cmd_str.split())

        sys.stdout.flush()
        print '      hmm run time: %.3f' % (time.time()-start)

        hmminfo = self.read_hmm_output(algorithm, csv_outfname, make_clusters=make_clusters, count_parameters=count_parameters, parameter_out_dir=parameter_out_dir, plotdir=plotdir)

        if self.args.pants_seated_clustering:
            viterbicluster.cluster(hmminfo)

        clusters = None
        if make_clusters:
            if self.outfile is not None:
                self.outfile.write('hmm clusters\n')
            else:
                print '%shmm clusters' % prefix
            # cluster together when pair score exceeds the cutoff
            clusters = Clusterer(self.args.pair_hmm_cluster_cutoff, greater_than=True, singletons=preclusters.singletons)
            clusters.cluster(input_scores=hmminfo, debug=self.args.debug, reco_info=self.reco_info, outfile=self.outfile, plotdir=self.args.plotdir+'/pairscores')

        if self.args.outfname is not None:
            outpath = self.args.outfname
            if self.args.outfname[0] != '/':  # if full output path wasn't specified on the command line
                outpath = os.getcwd() + '/' + outpath
            shutil.copyfile(csv_outfname, outpath)

        if not self.args.no_clean:
            if os.path.exists(csv_infname):  # if only one proc, this will already be deleted
                os.remove(csv_infname)
            os.remove(csv_outfname)

        return clusters
    def hamming_precluster(self, preclusters=None):
        """Precluster sequence pairs by hamming distance.

        Computes pairwise hamming distances (optionally fanned out over a
        multiprocessing Pool), clusters pairs whose distance is *below*
        ``self.args.hamming_cluster_cutoff``, and returns the Clusterer.
        """
        assert self.args.truncate_pairs
        start = time.time()
        print 'hamming clustering'
        # NOTE(review): this flag is never set anywhere in this method, so the
        # warning at the bottom is dead code in this version.
        chopped_off_left_sides = False
        hamming_info = []
        all_pairs = self.get_pairs(preclusters)
        # print '    getting pairs: %.3f' % (time.time()-start); start = time.time()
        # all_pairs = itertools.combinations(self.input_info.keys(), 2)
        if self.args.n_fewer_procs > 1:
            pool = Pool(processes=self.args.n_fewer_procs)
            subqueries = self.split_input(
                self.args.n_fewer_procs,
                info=list(all_pairs),
                prefix='hamming'
            )  # NOTE 'casting' to a list here makes me nervous!
            # one sublist of {id/seq} dicts per worker process
            sublists = []
            for queries in subqueries:
                sublists.append([])
                for id_a, id_b in queries:
                    sublists[-1].append({
                        'id_a': id_a,
                        'id_b': id_b,
                        'seq_a': self.input_info[id_a]['seq'],
                        'seq_b': self.input_info[id_b]['seq']
                    })

            # print '    preparing info: %.3f' % (time.time()-start); start = time.time()
            subinfos = pool.map(utils.get_hamming_distances, sublists)
            # NOTE this starts the proper number of processes, but they seem to end up i/o blocking or something (wait % stays at zero, but they each only get 20 or 30 %cpu on stoat)
            pool.close()
            pool.join()
            # print '    starting pools: %.3f' % (time.time()-start); start = time.time()

            for isub in range(len(subinfos)):
                hamming_info += subinfos[isub]
            # print '    merging pools: %.3f' % (time.time()-start); start = time.time()
        else:
            hamming_info = self.get_hamming_distances(all_pairs)

        if self.outfile is not None:
            self.outfile.write('hamming clusters\n')

        # greater_than=False: cluster together when distance is *below* cutoff
        clust = Clusterer(
            self.args.hamming_cluster_cutoff, greater_than=False
        )  # NOTE this 0.5 is reasonable but totally arbitrary
        clust.cluster(input_scores=hamming_info,
                      debug=self.args.debug,
                      outfile=self.outfile,
                      reco_info=self.reco_info)
        # print '    clustering: %.3f' % (time.time()-start); start = time.time()

        if chopped_off_left_sides:
            print 'WARNING encountered unequal-length sequences, so chopped off the left-hand sides of each'
        print '    hamming time: %.3f' % (time.time() - start)

        return clust
Example #6
0
 def process_single_core(self, filenames):
     """Feed every line of each named file to a single Clusterer, in order.

     Returns the clusterer's accumulated result.
     """
     clusterer = Clusterer(**self.cluster_config)
     for fname in filenames:
         with open(fname, 'r') as handle:
             for text_line in handle:
                 clusterer.process_line(text_line)
     return clusterer.result()
Example #7
0
 def process_pipe(self):
     """Cluster lines arriving on stdin until EOF or Ctrl-C.

     Always returns the (possibly partial) result.
     """
     clusterer = Clusterer(**self.cluster_config)
     try:
         for text_line in sys.stdin:
             clusterer.process_line(text_line)
     except KeyboardInterrupt:
         pass
     finally:
         # NOTE: returning from ``finally`` also suppresses any other
         # in-flight exception; kept as-is to preserve existing behaviour.
         return clusterer.result()
Example #8
0
 def test(self):
     """Each cluster keeps its first line, member count, and masked template."""
     under_test = Clusterer(k1=1, k2=1, max_dist=0.5, variables=[])
     found = under_test.find([
         'hello 1 y 3',
         'hello 1 x 3',
         'abc m n q',
     ])
     expected = [
         [['hello', '1', 'y', '3'], 2, ['hello', '1', '---', '3']],
         [['abc', 'm', 'n', 'q'], 1, ['abc', 'm', 'n', 'q']],
     ]
     self.assertEqual(found, expected)
Example #9
0
 def test_min_members(self):
     """Clusters smaller than ``min_members`` are dropped from the result."""
     under_test = Clusterer(
         k1=1, k2=1, max_dist=0.5, variables=[], min_members=2)
     found = under_test.find([
         'hello 1 y 3',
         'hello 1 x 3',
         'abc m n q',
     ])
     expected = [
         [['hello', '1', 'y', '3'], 2, ['hello', '1', '---', '3']],
     ]
     self.assertEqual(found, expected)
Example #10
0
    def __init__(self, parent=None):
        """Set up the window: feature bit-flags, UI, stdout redirect, run state."""
        super(Window, self).__init__(parent)

        # Set Features List
        # Features are bit flags from Clusterer: start with the defaults
        # selected and everything else available (bitwise disjoint sets).
        self.selectedFeatures = Clusterer.getDefaultFeatures()
        self.availableFeatures = Clusterer.getAllFeatures()
        self.availableFeatures &= ~self.selectedFeatures

        self.initUI()

        # Redirect stdout into the widget (presumably so prints show in the GUI).
        sys.stdout = EmittingStream(textWritten=self.normalOutputWritten)
        self.clustererThread = ClustererThread(self)
        # Clustering state, populated after a run.
        self.clusterer = None
        self.clusters = None
        self.currentCluster = 0
        self.backgroundColor = QtGui.QColor(0, 0, 0)
def get_clusterer(trainer, args, output_size, model):
    """Attach an active-learning Clusterer and its sample dict to ``trainer``.

    Loads previously saved active-learning artifacts (paths, units, binary
    masks) from ``args.active_learning_name``, indexes them by path on
    ``trainer.active_dict``, and builds a Clusterer that reloads all stored
    clustering results (all ``load_*`` flags set).  Returns the clusterer.
    """
    assert len(
        trainer.layer_list_all
    ) == 1, 'Active learning is only implemented for a single layer ablations'
    assert args.clustering, 'Active learning samples are associated with a specific clustering. The clustering flag ' \
                            'is necessary'
    active_paths = torch.load(
        os.path.join(args.active_learning_name, 'a_paths.pth'))
    active_units = torch.load(
        os.path.join(args.active_learning_name, 'units.pth'))
    active_binary_masks = torch.load(
        os.path.join(args.active_learning_name, 'a_hmaps.pth'))

    # One entry per sample path; the three loaded files are assumed to be
    # index-aligned -- TODO confirm.
    trainer.active_dict = {}
    for i, path in enumerate(active_paths):
        trainer.active_dict[path] = {
            'mask': active_binary_masks[i],
            'units': active_units[i],
            'index': i
        }

    cluster_path = os.path.join(args.active_learning_name, 'cluster')
    trainer.clusterer = Clusterer(trainer.loaders['train'],
                                  model,
                                  path_store=cluster_path,
                                  model_dim=args.embedding_dim,
                                  load_datapoints=True,
                                  load_histogram=True,
                                  load_clustering=True,
                                  load_name_final=True,
                                  save_results=True,
                                  output_size=output_size,
                                  args=args)
    return trainer.clusterer
Example #12
0
    def __init__(self, parent=None):
        """Set up the window: feature bit-flags, UI, stdout redirect, run state."""
        super(Window, self).__init__(parent)

        # Set Features List
        # Features are bit flags from Clusterer: start with the defaults
        # selected and everything else available (bitwise disjoint sets).
        self.selectedFeatures = Clusterer.getDefaultFeatures()
        self.availableFeatures = Clusterer.getAllFeatures()
        self.availableFeatures &= ~self.selectedFeatures

        self.initUI()

        # Redirect stdout into the widget (presumably so prints show in the GUI).
        sys.stdout = EmittingStream(textWritten=self.normalOutputWritten)
        self.clustererThread = ClustererThread(self)
        # Clustering state, populated after a run.
        self.clusterer = None
        self.clusters = None
        self.currentCluster = 0
        self.backgroundColor = QtGui.QColor(0, 0, 0)
Example #13
0
 def createFeaturesList(self, features):
     """Create feature list from features flags.

     ``features`` is a non-negative bitmask; returns the Clusterer feature
     name for each set bit, lowest bit first.
     """
     ql = []
     # Only bits below bit_length() can be set, so scanning that far is
     # equivalent to the old xrange(features) scan but O(log n) instead of
     # O(n).  max(..., 0) keeps the old no-op behaviour for negative input.
     for i in xrange(max(features, 0).bit_length()):
         flag = features & (1 << i)
         if flag:
             ql.append(Clusterer.getFeatureName(flag))
     return ql
Example #14
0
	def __init__(self):
		"""Set up metadata wrappers, the zone list, and clustering helpers."""
		self.actuNames = ActuatorNames()
		self.sensorNames = SensorNames()
		self.bdm = BDWrapper()
		self.expLogColl = CollectionWrapper('experience_log')
		#self.zonelist = self.csv2list('metadata/partialzonelist.csv')
		self.zonelist = self.csv2list('metadata/zonelist.csv')
		# NOTE(review): 'feater' looks like a typo for 'feature extractor';
		# renaming the attribute would break external users, so it is kept.
		self.feater = FeatureExtractor()
		self.clust = Clusterer()
Example #15
0
 def createFeaturesList(self, features):
     """Create feature list from features flags.

     ``features`` is a non-negative bitmask; returns the Clusterer feature
     name for each set bit, lowest bit first.
     """
     ql = []
     # Only bits below bit_length() can be set, so scanning that far is
     # equivalent to the old xrange(features) scan but O(log n) instead of
     # O(n).  max(..., 0) keeps the old no-op behaviour for negative input.
     for i in xrange(max(features, 0).bit_length()):
         flag = features & (1 << i)
         if flag:
             ql.append(Clusterer.getFeatureName(flag))
     return ql
Example #16
0
    def hamming_precluster(self, preclusters=None):
        """Precluster sequence pairs by hamming distance.

        Computes pairwise hamming distances (optionally fanned out over a
        multiprocessing Pool), clusters pairs whose distance is *below*
        ``self.args.hamming_cluster_cutoff``, and returns the Clusterer.
        """
        assert self.args.truncate_pairs
        start = time.time()
        print 'hamming clustering'
        # NOTE(review): this flag is never set anywhere in this method, so the
        # warning at the bottom is dead code in this version.
        chopped_off_left_sides = False
        hamming_info = []
        all_pairs = self.get_pairs(preclusters)
        # print '    getting pairs: %.3f' % (time.time()-start); start = time.time()
        # all_pairs = itertools.combinations(self.input_info.keys(), 2)
        if self.args.n_fewer_procs > 1:
            pool = Pool(processes=self.args.n_fewer_procs)
            subqueries = self.split_input(self.args.n_fewer_procs, info=list(all_pairs), prefix='hamming')  # NOTE 'casting' to a list here makes me nervous!
            # one sublist of {id/seq} dicts per worker process
            sublists = []
            for queries in subqueries:
                sublists.append([])
                for id_a, id_b in queries:
                    sublists[-1].append({'id_a':id_a, 'id_b':id_b, 'seq_a':self.input_info[id_a]['seq'], 'seq_b':self.input_info[id_b]['seq']})

            # print '    preparing info: %.3f' % (time.time()-start); start = time.time()
            subinfos = pool.map(utils.get_hamming_distances, sublists)
            # NOTE this starts the proper number of processes, but they seem to end up i/o blocking or something (wait % stays at zero, but they each only get 20 or 30 %cpu on stoat)
            pool.close()
            pool.join()
            # print '    starting pools: %.3f' % (time.time()-start); start = time.time()

            for isub in range(len(subinfos)):
                hamming_info += subinfos[isub]
            # print '    merging pools: %.3f' % (time.time()-start); start = time.time()
        else:
            hamming_info = self.get_hamming_distances(all_pairs)

        if self.outfile is not None:
            self.outfile.write('hamming clusters\n')

        # greater_than=False: cluster together when distance is *below* cutoff
        clust = Clusterer(self.args.hamming_cluster_cutoff, greater_than=False)  # NOTE this 0.5 is reasonable but totally arbitrary
        clust.cluster(input_scores=hamming_info, debug=self.args.debug, outfile=self.outfile, reco_info=self.reco_info)
        # print '    clustering: %.3f' % (time.time()-start); start = time.time()

        if chopped_off_left_sides:
            print 'WARNING encountered unequal-length sequences, so chopped off the left-hand sides of each'
        print '    hamming time: %.3f' % (time.time()-start)

        return clust
Example #17
0
 def addFeature(self):
     """Move every highlighted available feature into the selected set.

     Updates both bitmasks and moves the corresponding list-widget items.
     """
     for entry in self.availableFeatureList.selectedItems():
         flag = Clusterer.getFeatureByName(entry.text())
         self.availableFeatures &= ~flag
         self.selectedFeatures |= flag
         self.availableFeatureList.takeItem(self.availableFeatureList.row(entry))
         self.selectedFeatureList.addItem(entry)
Example #18
0
    def run(self):
        """Thread body: read the source image, run clustering, refresh the GUI.

        Reads all settings (features, mode, k, colors) from the main window
        ``self.mw``; stores results back on it.  Errors from file reading,
        OpenCV, or HTTP fetches are printed rather than raised.
        """
        path = self.mw.sourcePathField.text()
        if not path:
            print "[Error] File path is empty"
            return
        try:
            img = Clusterer.readImage(path)
            # add an alpha channel for display
            imageBGRA = cv2.cvtColor(img, cv2.cv.CV_BGR2BGRA)
            self.mw.refreshSource(imageBGRA)
            features = self.mw.selectedFeatures
            if not features:
                return
            self.mw.clusterer = Clusterer()
            # Qt gives RGB accessors; reorder to BGR for OpenCV.
            backgroundColor = self.mw.backgroundColor
            backgroundColor = backgroundColor.blue(), backgroundColor.green(
            ), backgroundColor.red()
            if self.mw.transparentBg.isChecked():
                backgroundColor = None
            mode = self.mw.modeCombo.itemText(self.mw.modeCombo.currentIndex())
            mode = Clusterer.getModeByName(mode)
            modeK = self.mw.modeK.itemText(self.mw.modeK.currentIndex())
            modeK = Clusterer.getKModeByName(modeK)
            k = self.mw.clusterCount.value()
            # disable the run button while clustering is in progress
            self.mw.runButton.setEnabled(False)
            self.mw.clusters = self.mw.clusterer.getClusters(
                path,
                mode=mode,
                kmode=modeK,
                clusterCount=k,
                features=features,
                backgroundColor=backgroundColor,
                slider=self.mw.clusterSlider.value())
            self.mw.currentCluster = 0
            self.mw.refreshCluster()
            self.mw.saveButton.setEnabled(True)

            self.mw.clusterer.graph(self.mw.figure)
            self.mw.canvas.setMinimumSize(self.mw.canvas.size())
            self.mw.canvas.draw()

        except (OSError, cv2.error, urllib2.HTTPError) as err:
            print err
        self.mw.runButton.setEnabled(True)
Example #19
0
 def addFeature(self):
     """Add selected features to selected features list.

     For each highlighted item in the available list, flips the feature's
     bit from the available mask to the selected mask and moves the item
     between the two list widgets.
     """
     selected = self.availableFeatureList.selectedItems()
     for item in selected:
         feature = Clusterer.getFeatureByName(item.text())
         self.availableFeatures &= ~feature
         self.selectedFeatures |= feature
         self.availableFeatureList.takeItem(
             self.availableFeatureList.row(item))
         self.selectedFeatureList.addItem(item)
Example #20
0
    def cdr3_length_precluster(self, waterer, preclusters=None):
        """Precluster sequence pairs on whether their inferred cdr3 lengths agree.

        Writes a temporary csv scoring each pair 1 (same length) or 0
        (different), clusters on those scores, deletes the csv, and returns
        the resulting Clusterer.
        """
        cdr3lengthfname = self.args.workdir + '/cdr3lengths.csv'
        with opener('w')(cdr3lengthfname) as outfile:
            writer = csv.DictWriter(outfile, ('unique_id', 'second_unique_id', 'cdr3_length', 'second_cdr3_length', 'score'))
            writer.writeheader()
            for query_name, second_query_name in self.get_pairs(preclusters):
                cdr3_length = waterer.info[query_name]['cdr3_length']
                second_cdr3_length = waterer.info[second_query_name]['cdr3_length']
                same_length = cdr3_length == second_cdr3_length
                if not self.args.is_data:
                    # On simulated data, check the inferred lengths against truth.
                    assert cdr3_length == int(self.reco_info[query_name]['cdr3_length'])
                    if second_cdr3_length != int(self.reco_info[second_query_name]['cdr3_length']):
                        print 'WARNING did not infer correct cdr3 length'
                        assert False
                writer.writerow({'unique_id':query_name, 'second_unique_id':second_query_name, 'cdr3_length':cdr3_length, 'second_cdr3_length':second_cdr3_length, 'score':int(same_length)})

        clust = Clusterer(0.5, greater_than=True)  # i.e. cluster together if same_length == True
        clust.cluster(cdr3lengthfname, debug=False)
        os.remove(cdr3lengthfname)  # scratch file, no longer needed
        return clust
Example #21
0
	def search_click(self):
		"""Handle the search button: fetch the golden corpus and cluster it.

		Runs the full fetch + cluster pipeline only when the gene radio
		button is checked and a file has been selected; otherwise just
		prints a status message.
		"""
		_textval = self.searchbox.text()
		self._search_term = _textval
		if self.gene_button.isChecked() and self.fileselected:
			if self.fileName:
				goldencorpus = GoldenCorpus(_textval,self.fileName)
				goldencorpus.fetchData()
				self.rel_docs = goldencorpus.get_rel_docs_pmid()
				self.mesh_terms = goldencorpus.get_mesh_terms()
				mesh_explosion = DataForEachMeshTerm(self.mesh_terms,_textval)
				path = mesh_explosion.get_data_foldername(_textval)
				# NOTE(review): the meaning of the True/5 Clusterer args is not
				# visible here -- confirm against the Clusterer signature.
				clus = Clusterer(self.rel_docs,path,True,5)
				self.representative_id,self.representative,self.best_mesh_terms_id, self.best_mesh_terms = clus.cluster()
				if self.representative:
					self.updateRepresentativeInformation()
			else:
				print("Error! getting file name")
		elif self.pmid_button.isChecked():
			print("Golden corpus exists..")
		else:
			print("Please select related file..")
Example #22
0
    def run(self):
        """Thread body: read the source image, run clustering, refresh the GUI.

        Reads all settings (features, mode, k, colors) from the main window
        ``self.mw``; stores results back on it.  Errors from file reading,
        OpenCV, or HTTP fetches are printed rather than raised.
        """
        path = self.mw.sourcePathField.text()
        if not path:
            print "[Error] File path is empty"
            return
        try:
            img = Clusterer.readImage(path)
            # add an alpha channel for display
            imageBGRA = cv2.cvtColor(img, cv2.cv.CV_BGR2BGRA)
            self.mw.refreshSource(imageBGRA)
            features = self.mw.selectedFeatures
            if not features:
                return
            self.mw.clusterer = Clusterer()
            # Qt gives RGB accessors; reorder to BGR for OpenCV.
            backgroundColor = self.mw.backgroundColor
            backgroundColor = backgroundColor.blue(), backgroundColor.green(), backgroundColor.red()
            if self.mw.transparentBg.isChecked():
                backgroundColor = None
            mode = self.mw.modeCombo.itemText(self.mw.modeCombo.currentIndex())
            mode = Clusterer.getModeByName(mode)
            modeK = self.mw.modeK.itemText(self.mw.modeK.currentIndex())
            modeK = Clusterer.getKModeByName(modeK)
            k = self.mw.clusterCount.value()
            # disable the run button while clustering is in progress
            self.mw.runButton.setEnabled(False)
            self.mw.clusters = self.mw.clusterer.getClusters(path, mode=mode,
                                                             kmode=modeK,
                                                             clusterCount=k,
                                                             features=features,
                                                             backgroundColor=backgroundColor,
                                                             slider=self.mw.clusterSlider.value())
            self.mw.currentCluster = 0
            self.mw.refreshCluster()
            self.mw.saveButton.setEnabled(True)

            self.mw.clusterer.graph(self.mw.figure)
            self.mw.canvas.setMinimumSize(self.mw.canvas.size())
            self.mw.canvas.draw()

        except (OSError, cv2.error, urllib2.HTTPError) as err:
            print err
        self.mw.runButton.setEnabled(True)
Example #23
0
    def toggleClusterCount(self, index):
        """Show only the k-selection widgets matching the current k mode.

        The cluster-count spin box is visible only in user mode and the
        slider widget only in slider mode; any other mode hides both.
        """
        kmode = Clusterer.getKModeByName(self.modeK.itemText(self.modeK.currentIndex()))
        for widget, wanted_mode in ((self.clusterCount, Clusterer.KMODE_USER),
                                    (self.clusterSliderWidget, Clusterer.KMODE_SLIDER)):
            if kmode == wanted_mode:
                widget.show()
            else:
                widget.hide()
Example #24
0
    def toggleClusterCount(self, index):
        """Disable cluster count when 'auto' is checked.

        Shows the cluster-count spin box only in user mode and the slider
        widget only in slider mode; any other mode hides both.
        """
        mode = Clusterer.getKModeByName(
            self.modeK.itemText(self.modeK.currentIndex()))
        if mode == Clusterer.KMODE_USER:
            self.clusterCount.show()
        else:
            self.clusterCount.hide()

        if mode == Clusterer.KMODE_SLIDER:
            self.clusterSliderWidget.show()
        else:
            self.clusterSliderWidget.hide()
Example #25
0
from fastapi import FastAPI
from vector_space import VectorSpace
from org_dataset import OrgDataset
from org_recommender import OrgRecommender
from clusterer import Clusterer
from keyword_finder import KeywordFinder
from keyword_matcher import KeywordMatcher
from gcd_utils import get_account_liked_tags

# Application wiring: load the persisted dataset and vector space, then build
# the recommender / clusterer / keyword pipeline used by the endpoints.
app = FastAPI()
dataset = OrgDataset.load_instance('./orgs.pkl')
vs = VectorSpace.load_instance('./test_vs.pkl')
recommender = OrgRecommender(dataset, vs)

c = Clusterer(dataset, vs, 20)  # 20 presumably = cluster count -- TODO confirm
kw_finder = KeywordFinder(dataset, vs)
matcher = KeywordMatcher(c, kw_finder, vs.data_centroid)

@app.get('/get_init_recs/')
async def get_init_recs(userId: str, numOrgs: int):
    """Return initial org recommendations for a user.

    Looks up the account's liked tags, maps them to a keyword centroid in
    the vector space, and returns the ``numOrgs`` orgs nearest that
    centroid as ``[{'orgId': ...}, ...]``.
    """
    keywords = get_account_liked_tags(userId)
    centroid = matcher.get_kw_centroid(keywords)
    org_ids = recommender.centroid_recommend(centroid, numOrgs)
    # comprehension replaces the manual append loop and avoids shadowing
    # the builtin ``id`` that the original loop variable clobbered
    return [{'orgId': org_id} for org_id in org_ids]
"""Example GET request for the API on localhost (matches the route above):

http://127.0.0.1:8000/get_init_recs/?userId=334614c0-7f55-11ea-b1bc-2f9730f51173&numOrgs=2
"""
Example #26
0
# Demo script: generate synthetic circle clusters, cluster them, and attach a
# visualizer before running.
# Grid of 100x100
# 3 circles of 15x15 with each 10 points
import testgenerator
from clusterer import Clusterer
from clustervisualizer import ClusterVisualizer

# args per testgenerator.create_circle_points -- presumably (n_points, n_circles,
# radius, points_per_circle); TODO confirm against testgenerator
points = testgenerator.create_circle_points(200, 8, 15, 10)
clusterer = Clusterer(1, 2, 2)
clustervisualizer = ClusterVisualizer(clusterer)
clusterer.set_points(points)
clusterer.run()

    not_picked = clean[(clean['eligible'] == 1) & (clean['oz'] == 0)]
    picked = clean[clean['oz'] == 1]

    nonfeatures = drop_columns(picked, drop_cols)
    features = picked.columns

    ## standardize
    standardize = StandardScaler()
    X, features = picked.values, picked.columns.values
    X = standardize.fit_transform(X)

    ## build model
    cluster_labels = pd.DataFrame()
    for k in range(6, 7):
        pax = Clusterer(model, n_clusters=k, random_state=24)
        centers = pax.fit(X)
        pax.store_features(features)
        print("{} grouped {} clusters.".format(model, np.shape(centers)[0]))

        ## update labels and scores for column k
        filepath = "{}/{}/labels.pkl".format(data, model)
        with open(filepath, "rb") as f:
            k = pax.attributes['n_clusters']
            model_labels_df = pickle.load(f)
            model_labels_df["k={}".format(k)] = pax.attributes['labels_']
            model_labels_df["k{}silhouette_score".format(
                k)] = pax.get_silhouette_samples()
        model_labels_df.to_pickle(filepath)
        print("Updated labels @ {}".format(filepath))
Example #28
0
    def __init__(self, model, optimizer, all_loaders, args, resume_epoch):
        """Set up training state: optimizer, GAN generator, loaders, and clustering.

        Depending on ``args``, either attaches an active-learning clusterer
        (via ``active_learning.get_clusterer``) or builds a fresh Clusterer
        from the training loader.
        """

        self.resume_epoch = resume_epoch
        self.args = args

        # NOTE(review): this SGD optimizer is overwritten below by the
        # ``optimizer`` argument (self.optimizer = optimizer) -- confirm which
        # one is intended.
        self.optimizer = torch.optim.SGD((model.parameters()),
                                         args.lr,
                                         momentum=args.momentum,
                                         weight_decay=args.weight_decay)

        # Static per-layer metadata (feature depth and spatial size) for the
        # layers that may be ablated.
        self.layer_list_all = args.layers
        self.layers_dict = {
            'layer2': {
                'name': 'layer2',
                'depth': 512,
                'size': 4
            },
            'layer3': {
                'name': 'layer3',
                'depth': 512,
                'size': 8
            },
            'layer4': {
                'name': 'layer4',
                'depth': 512,
                'size': 8
            },
            'layer5': {
                'name': 'layer5',
                'depth': 256,
                'size': 16
            },
            'layer6': {
                'name': 'layer6',
                'depth': 256,
                'size': 16
            },
        }

        # GAN generator used to sample inputs; z is a fixed bank of 200k
        # latent samples.
        self.generator = gantest.GanTester(args.path_model_gan,
                                           self.layer_list_all,
                                           device=torch.device('cuda'))
        self.z = self.generator.standard_z_sample(200000)

        self.model = model
        self.optimizer = optimizer
        self.loaders = all_loaders
        self.loss_type = args.loss_type

        # Other parameters
        self.margin = args.margin
        self.clustering = args.clustering

        self.epoch = 0
        # ImageNet mean/std un-normalization for visualization
        self.unorm = utils.UnNormalize(mean=(0.485, 0.456, 0.406),
                                       std=(0.229, 0.224, 0.225))

        output_size = 32 if 'large' in args.audio_model else 256

        if args.active_learning:
            # reuse a stored clustering tied to the active-learning samples
            active_learning.get_clusterer(self, args, output_size, model)
        else:
            if args.clustering:
                print('Creating cluster from scratch')
                cluster_path = os.path.join(
                    self.args.results, 'clusters',
                    args.name_checkpoint + '_' + str(time.time()))
                self.clusterer = Clusterer(
                    self.loaders['train'],
                    model,
                    path_store=cluster_path,
                    model_dim=args.embedding_dim,
                    save_results=True,
                    output_size=output_size,
                    args=self.args,
                    path_cluster_load=args.path_cluster_load)

        self.epochs_clustering = self.args.epochs_clustering
        # clustering results, filled in later
        self.clusters = self.mean_clust = self.std_clust = self.cluster_counts = self.clusters_unit = None
Example #29
0
class Trainer:
    def __init__(self, model, optimizer, all_loaders, args, resume_epoch):
        """
        Set up the trainer state: the GAN generator used to synthesize
        (positive and edited-negative) images, per-layer metadata, the model,
        optimizer and data loaders, and — unless active learning is enabled —
        an optional Clusterer built from the training loader.

        Parameters
        ----------
        model : the audio/image embedding model to train.
        optimizer : optimizer instance used for training (stored as-is).
        all_loaders : dict of data loaders; must contain a 'train' entry.
        args : parsed command-line arguments / config namespace.
        resume_epoch : epoch index to resume training from.
        """
        self.resume_epoch = resume_epoch
        self.args = args

        # NOTE(review): the original code constructed a fresh SGD optimizer
        # here and assigned it to self.optimizer, but that assignment was
        # unconditionally overwritten by the `optimizer` argument below, so
        # the SGD instance was dead code and has been removed.

        self.layer_list_all = args.layers
        # Static description of the generator layers we can intervene on:
        # per-layer channel depth and spatial size of the feature map.
        self.layers_dict = {
            'layer2': {
                'name': 'layer2',
                'depth': 512,
                'size': 4
            },
            'layer3': {
                'name': 'layer3',
                'depth': 512,
                'size': 8
            },
            'layer4': {
                'name': 'layer4',
                'depth': 512,
                'size': 8
            },
            'layer5': {
                'name': 'layer5',
                'depth': 256,
                'size': 16
            },
            'layer6': {
                'name': 'layer6',
                'depth': 256,
                'size': 16
            },
        }

        self.generator = gantest.GanTester(args.path_model_gan,
                                           self.layer_list_all,
                                           device=torch.device('cuda'))
        # Pre-sampled latent vectors; indexed by the integer encoded in each
        # sample's file name (see train_epoch / eval).
        self.z = self.generator.standard_z_sample(200000)

        self.model = model
        self.optimizer = optimizer
        self.loaders = all_loaders
        self.loss_type = args.loss_type

        # Other parameters
        self.margin = args.margin
        self.clustering = args.clustering

        self.epoch = 0
        # ImageNet normalization constants, inverted for visualization.
        self.unorm = utils.UnNormalize(mean=(0.485, 0.456, 0.406),
                                       std=(0.229, 0.224, 0.225))

        output_size = 32 if 'large' in args.audio_model else 256

        if args.active_learning:
            active_learning.get_clusterer(self, args, output_size, model)
        else:
            if args.clustering:
                print('Creating cluster from scratch')
                cluster_path = os.path.join(
                    self.args.results, 'clusters',
                    args.name_checkpoint + '_' + str(time.time()))
                self.clusterer = Clusterer(
                    self.loaders['train'],
                    model,
                    path_store=cluster_path,
                    model_dim=args.embedding_dim,
                    save_results=True,
                    output_size=output_size,
                    args=self.args,
                    path_cluster_load=args.path_cluster_load)

        self.epochs_clustering = self.args.epochs_clustering
        # Cluster-related state, populated lazily in train().
        self.clusters = self.mean_clust = self.std_clust = self.cluster_counts = self.clusters_unit = None

    def train(self):
        """
        Main training loop. For each epoch train the model and save checkpoint if the results are good.
        Cluster every epochs_clustering epochs.

        Can be interrupted with Ctrl-C; the handler reports the epoch reached.
        """
        best_eval = 0

        # FIX: if KeyboardInterrupt fires before the first loop iteration,
        # `epoch` would be unbound in the except-handler below (NameError).
        epoch = self.resume_epoch

        try:
            for epoch in range(self.resume_epoch, self.args.epochs):
                self.epoch = epoch

                # Clustering: (re)build clusters every epochs_clustering
                # epochs, and also right after resuming from a checkpoint.
                if self.clustering and \
                        ((epoch % self.epochs_clustering == 0) or (self.args.resume and epoch == self.resume_epoch)):
                    self.clusterer.save_results = True
                    clus, mean_clust, std_clust = self.clusterer.create_clusters(
                        iteration=0)
                    self.clusters = torch.FloatTensor(clus).cuda()
                    self.mean_clust = torch.FloatTensor(mean_clust)
                    self.std_clust = torch.FloatTensor(std_clust)
                    # Per-cluster normalization: scale each cluster vector so
                    # its maximum component becomes 1.
                    self.cluster_counts = 1 / self.clusters.max(1)[0]
                    self.clusters_unit = self.cluster_counts.view(self.clusters.size(0), 1).expand_as(self.clusters) * \
                                         self.clusters

                    self.clusterer.name_with_images_clusters()
                    self.clusterer.name_clusters()
                    # Find which generator units correspond to each cluster.
                    self.optimize_neurons()

                    # This is for visualization:
                    # self.clusterer.segment_images()
                    # self.clusterer.create_web_images()  # segment_images has to be uncommented before
                    self.clusterer.create_web_clusters(with_images=True)

                utils.adjust_learning_rate(self.args, self.optimizer, epoch)

                # Train for one epoch
                print('Starting training epoch ' + str(epoch))
                self.train_epoch(epoch)

                # Evaluate on validation set
                print('Starting evaluation epoch ' + str(epoch))
                eval_score, recalls = self.eval()
                self.args.writer.add_scalar('eval_score', eval_score, epoch)

                # Remember best eval score and save checkpoint
                is_best = eval_score > best_eval
                best_eval = max(eval_score, best_eval)
                utils.save_checkpoint(
                    {
                        'epoch': epoch + 1,
                        'model_state_dict': self.model.state_dict(),
                        'best_eval': best_eval,
                        'recall_now': recalls,
                        'optimizer': self.optimizer.state_dict(),
                    },
                    is_best,
                    self.args,
                    name_checkpoint=self.args.name_checkpoint)

        except KeyboardInterrupt:
            print('You decided to finish the training at epoch ' +
                  str(epoch + 1))

    def train_epoch(self, epoch):
        """
        Train one epoch. It consists of 5 steps
        Step 1: Compute the output of the positive image
        Step 2: Compute the mask for the positive image features
        Step 3: Generate the negative image from this mask
        Step 4: Compute the output of this negative
        Step 5: Compute all the losses
        And after that, do the backpropagation and weight updates
        """
        if not self.args.use_cpu:
            torch.cuda.synchronize()
        batch_time = utils.AverageMeter()
        data_time = utils.AverageMeter()
        losses_meter = utils.AverageMeter()

        # Switch to train mode
        self.model.train()

        end = time.time()
        N_examples = self.loaders['train'].dataset.__len__()

        # Running sums of each loss component, averaged every print_freq
        # batches before being written to TensorBoard.
        loss_list_total = {
            'loss_regular': 0,
            'loss_neg': 0,
            'loss_hardneg': 0,
            'loss_total': 0
        }
        for batch_id, (image_input, audio_input, neg_images, nframes, path,
                       image_raw) in enumerate(self.loaders['train']):
            loss_list = {
                'loss_regular': 0,
                'loss_neg': 0,
                'loss_hardneg': 0,
                'loss_total': 0
            }

            # Measure data loading time
            data_time.update(time.time() - end)

            if not self.args.use_cpu:
                # FIX: `.cuda(async=True)` is a SyntaxError on Python >= 3.7
                # (`async` became a keyword); PyTorch renamed the argument to
                # `non_blocking` in 0.4.
                audio_input = audio_input.cuda(non_blocking=True)

            if not self.args.loading_image:
                path_ints = [p.split('/')[-1] for p in path
                             ]  # in case the audio is inside a subfolder

                # Gather the pre-sampled latent vector for each item in the
                # batch and synthesize the positive images with the GAN.
                v_init = self.z[int(path_ints[0])]
                z_img = torch.FloatTensor(image_input.size(0), v_init.shape[0])

                for k in range(image_input.size(0)):
                    z_img[k, :] = self.z[int(path_ints[k])]

                image_input = self.generator.generate_images(z_img,
                                                             intervention=None)
                image_input = utils.transform(image_input).detach()

            else:
                image_input = image_input.cuda()
                neg_images = neg_images.cuda()

            # STEP 1: Compute output positive
            model_output = self.model(image_input, audio_input, [])
            image_output = model_output[0]
            audio_output = model_output[1]

            # NOTE(review): the loader-provided neg_images (moved to the GPU
            # above when loading_image is set) are discarded here; negatives
            # are regenerated below when the loss type needs them. Confirm
            # this is intentional.
            neg_images = []

            # Convert audio frame counts to the model's temporal resolution.
            pooling_ratio = round(audio_input.size(3) / audio_output.size(3))
            nframes.div_(pooling_ratio)

            binary_mask_0 = None

            # Only do steps 2-4 if we want to train with semantic negatives
            if self.loss_type == 'negatives_edited' or self.loss_type == 'negatives_both':
                # STEP 2: Compute mask from image features
                # NOTE(review): `limits` is filled below but never read
                # afterwards — presumably leftover debugging state.
                limits = np.zeros((image_input.size(0), 2))

                for i in range(image_input.size(0)):
                    pos_image = image_input[i, :, :, :]

                    nF = nframes[i]

                    matchmap = utils.compute_matchmap(
                        image_output[i], audio_output[i][:, :, :nF])

                    matchmap = matchmap.data.cpu().numpy().copy()

                    matchmap = matchmap.transpose(2, 0, 1)  # l, h, w
                    matchmap = matchmap / (matchmap.max() + 1e-10)
                    matchmap_image = matchmap.max(axis=0)
                    threshold = 0.95

                    # Locate the (time, h, w) position of the strongest
                    # audio-visual match.
                    # ind_max = np.argmax(matchmap_image)
                    ind_max = np.argmax(matchmap)
                    ind_t = ind_max // (matchmap.shape[2] * matchmap.shape[1])
                    ind_h = (ind_max % (matchmap.shape[2] * matchmap.shape[1])
                             ) // matchmap.shape[1]
                    ind_w = (ind_max % (matchmap.shape[2] * matchmap.shape[1])
                             ) % matchmap.shape[1]

                    limits[i, 0] = ind_t
                    limits[i, 1] = ind_t + 1

                    if self.clustering:
                        if self.args.active_learning and 'active' in path[i]:
                            neg_img = active_learning.get_negatives(
                                self, path_ints[i])

                        else:
                            # Standardize the feature at the match location
                            # and score it against every cluster.
                            v = (image_output[i][:, ind_h, ind_w] -
                                 self.mean_clust.cuda()) / (
                                     self.std_clust.cuda() + 1e-8)

                            normalized_clusters = np.matmul(
                                self.clusters.cpu(),
                                v.detach().cpu().numpy().transpose())
                            sorted_val = -np.sort(-normalized_clusters[:])
                            sorted_val = np.clip(sorted_val, 0, 4)
                            if np.sum(sorted_val) <= 0:
                                print(
                                    "None of the clusters was close to the image feature. If this happens regularly, "
                                    "it probably means they were low quality clusters. Did you pretrain with a "
                                    "regular loss before clustering?")
                            prob_samples = sorted_val / np.sum(sorted_val)
                            sorted_id = np.argsort(-normalized_clusters[:])
                            cluster_id = sorted_id[0]

                            norm = 0
                            threshold_random = 0.95

                            # The number of units to be ablated grows if we cannot generate a good (changed) negative
                            # The following numbers are the starting number of units to change
                            num_units_dict = {
                                'layer2': 30,
                                'layer3': 30,
                                'layer4': 140,
                                'layer5': 30,
                                'layer6': 30
                            }
                            thresold_heatmap = threshold

                            count = 0
                            # Fixed evaluation mask (at the initial threshold)
                            # used to measure how much the negative differs
                            # from the positive inside the matched region.
                            binary_mask_eval = matchmap_image > (
                                thresold_heatmap * matchmap_image.max())
                            binary_mask_eval = utils.geodesic_dilation(
                                binary_mask_eval, (ind_h, ind_w))
                            binary_mask_eval = cv2.resize(
                                binary_mask_eval, (128, 128))
                            bmask = torch.Tensor(binary_mask_eval).cuda()
                            bmask = bmask.view(1, 128, 128).expand(3, 128, 128)

                            # Retry loop: keep ablating more units / relaxing
                            # thresholds until the edited image differs enough.
                            while norm < threshold_random:
                                with torch.no_grad():
                                    binary_mask = matchmap_image > (
                                        thresold_heatmap *
                                        matchmap_image.max())
                                    binary_mask = utils.geodesic_dilation(
                                        binary_mask, (ind_h, ind_w))

                                    if binary_mask_0 is None:
                                        binary_mask_0 = cv2.resize(
                                            binary_mask, (224, 224))

                                    # STEP 3: Generate new image
                                    z_img = self.z[int(path_ints[i])]
                                    z_img = z_img[np.newaxis, :]

                                    _ = self.generator.generate_images(z_img)
                                    intervention = {}
                                    for layer_n in self.layer_list_all:
                                        units_ids = self.layers_units[layer_n][
                                            cluster_id][:num_units_dict[
                                                layer_n]]
                                        layer_size = self.layers_dict[layer_n][
                                            'size']
                                        layer_dim = self.layers_dict[layer_n][
                                            'depth']

                                        ablation, replacement = self.get_ablation_replacement(
                                            params=[layer_dim, units_ids],
                                            option='specific')
                                        ablation_final = cv2.resize(
                                            binary_mask,
                                            (layer_size, layer_size))
                                        ablation_final = np.tile(
                                            ablation_final,
                                            (layer_dim, 1, 1)).astype(
                                                np.float32)
                                        ablation_final = torch.cuda.FloatTensor(
                                            ablation_final)
                                        ablation_final = ablation.view(
                                            layer_dim, 1,
                                            1).expand_as(ablation_final
                                                         ) * ablation_final
                                        intervention[layer_n] = (
                                            ablation_final, replacement)

                                    neg_img = self.generator.generate_images(
                                        z_img,
                                        intervention=intervention).detach()
                                    neg_img_t = utils.transform(
                                        neg_img).detach()

                                    # Relative L2 difference between negative
                                    # and positive, restricted to the mask.
                                    norm = (neg_img_t[0, :, :, :] -
                                            pos_image.detach())
                                    norm = norm * bmask
                                    norm = torch.norm(torch.norm(torch.norm(
                                        norm, dim=2),
                                                                 dim=1),
                                                      dim=0)
                                    norm_normalized = norm / torch.norm(
                                        torch.norm(torch.norm(
                                            pos_image.detach() * bmask, dim=2),
                                                   dim=1),
                                        dim=0)
                                    norm = norm_normalized.item()
                                    for layer_n in self.layer_list_all:
                                        num_units_dict[layer_n] = num_units_dict[
                                            layer_n] + 40  # increase units to change
                                    thresold_heatmap = thresold_heatmap - 0.1
                                    threshold_random = threshold_random - 0.05

                                    # Resample the target cluster for the next
                                    # attempt, weighted by cluster similarity.
                                    cluster_id = np.random.choice(
                                        sorted_id, size=1, p=prob_samples)[0]

                                    count = count + 1

                    else:  # random edited negatives
                        binary_mask = matchmap_image > (threshold *
                                                        matchmap_image.max())
                        binary_mask = utils.geodesic_dilation(
                            binary_mask, (ind_h, ind_w))
                        if binary_mask_0 is None:
                            binary_mask_0 = cv2.resize(binary_mask, (224, 224))
                        norm = 0
                        threshold_random = 0.95
                        # NOTE(review): `p` is updated below but never used
                        # for sampling — presumably vestigial.
                        p = 0.4

                        while norm < threshold_random:
                            with torch.no_grad():
                                intervention = {}

                                for layer_n in self.layer_list_all:
                                    layer_size = self.layers_dict[layer_n][
                                        'size']
                                    layer_dim = self.layers_dict[layer_n][
                                        'depth']

                                    ablation, replacement = self.get_ablation_replacement(
                                        params=[layer_dim, True, 0.5],
                                        option='random')
                                    ablation_final = cv2.resize(
                                        binary_mask, (layer_size, layer_size))
                                    ablation_final = np.tile(
                                        ablation_final,
                                        (layer_dim, 1, 1)).astype(np.float32)
                                    ablation_final = torch.cuda.FloatTensor(
                                        ablation_final)
                                    ablation_final = ablation.view(
                                        layer_dim, 1, 1).expand_as(
                                            ablation_final) * ablation_final
                                    intervention[layer_n] = (ablation_final,
                                                             replacement)

                                # STEP 3: Generate new image
                                z_img = self.z[int(path_ints[i])]
                                z_img = z_img[np.newaxis, :].detach()
                                neg_img = self.generator.generate_images(
                                    z_img, intervention=intervention).detach()
                                neg_img_t = utils.transform(neg_img).detach()

                                binary_mask = cv2.resize(
                                    binary_mask, (128, 128))

                                bmask = torch.Tensor(binary_mask).cuda()

                                bmask = bmask.view(1, 128,
                                                   128).expand(3, 128, 128)
                                norm = (neg_img_t[0, :, :, :] -
                                        pos_image.detach())

                                norm = norm * bmask
                                norm = torch.norm(torch.norm(torch.norm(norm,
                                                                        dim=2),
                                                             dim=1),
                                                  dim=0)
                                norm_normalized = norm / torch.norm(torch.norm(
                                    torch.norm(pos_image.detach() * bmask,
                                               dim=2),
                                    dim=1),
                                                                    dim=0)
                                norm = norm_normalized.item()

                                if random.random() > 0.2:
                                    p = p + 0.05
                                else:
                                    threshold_random = threshold_random - 0.01

                    neg_images.append(neg_img)

                neg_images = torch.cat(neg_images)
                neg_images_t = utils.transform(neg_images)
                # print(neg_images_t.size())

                # STEP 4: Compute output negative
                image_output_neg, _, _ = self.model(neg_images_t, None, [])

            # STEP 5: Compute losses
            if self.args.active_learning:
                image_output, image_output_neg = active_learning.switch_pos_neg(
                    self, image_input, image_output, image_output_neg, path)

            if self.loss_type == 'regular':
                loss = losses.sampled_margin_rank_loss(image_output,
                                                       audio_output, nframes,
                                                       self.margin,
                                                       self.args.symfun)
                loss_list['loss_regular'] = loss.item()
                loss_list['loss_total'] = loss.item()

            elif self.loss_type == 'negatives_edited':  # train with semantic negatives
                loss_regular = losses.sampled_margin_rank_loss(
                    image_output, audio_output, nframes, self.margin,
                    self.args.symfun)
                loss_neg = losses.negatives_loss(image_output, audio_output,
                                                 image_output_neg, nframes,
                                                 self.margin, self.args.symfun)
                loss = loss_regular + loss_neg
                loss_list['loss_regular'] = loss_regular.item()
                loss_list['loss_neg'] = loss_neg.item()
                loss_list['loss_total'] = loss.item()

            elif self.loss_type == 'negatives_hard':  # train with hard negatives
                loss_regular = losses.sampled_margin_rank_loss(
                    image_output, audio_output, nframes, self.margin,
                    self.args.symfun)
                loss_neg = losses.hard_negative_loss(image_output,
                                                     audio_output, nframes,
                                                     self.margin,
                                                     self.args.symfun)
                loss = loss_regular + loss_neg
                loss_list['loss_regular'] = loss_regular.item()
                loss_list['loss_neg'] = loss_neg.item()
                loss_list['loss_total'] = loss.item()

            elif self.loss_type == 'negatives_both':  # combine hard negatives with semantic negatives
                loss_hardneg = losses.combined_random_hard_negative_loss(
                    image_output, audio_output, image_output_neg, nframes,
                    self.margin, self.args.symfun)
                loss_regular = losses.sampled_margin_rank_loss(
                    image_output, audio_output, nframes, self.margin,
                    self.args.symfun)
                loss_regular = torch.clamp(loss_regular, min=0, max=5)
                loss_hardneg = torch.clamp(loss_hardneg, min=0, max=5)
                loss = loss_regular + loss_hardneg
                loss_list['loss_regular'] = loss_regular.item()
                loss_list['loss_hardneg'] = loss_hardneg.item()
                loss_list['loss_total'] = loss.item()

            else:
                raise Exception(
                    f'The loss function {self.loss_type} is not implemented.')

            # Global sample counter used as the TensorBoard x-axis.
            last_sample = N_examples * epoch + batch_id * self.args.batch_size + image_input.size(
                0)

            # Record loss
            losses_meter.update(loss.item(), image_input.size(0))

            # Backward pass and update
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

            # Measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()

            # Print results
            if (batch_id + 1) % self.args.print_freq == 0:
                for name in loss_list:
                    loss_list_total[name] += loss_list[name]
                for name in loss_list:
                    loss_list_total[
                        name] = loss_list_total[name] / self.args.print_freq

                for loss_name in loss_list:
                    self.args.writer.add_scalar(f'losses/{loss_name}',
                                                loss_list_total[loss_name],
                                                last_sample)

                print(
                    f'Epoch: [{epoch}][{batch_id+1}/{len(self.loaders["train"])}]\t'
                    f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                    f'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                    f'Loss {losses_meter.val:.4f} ({losses_meter.avg:.4f})\t',
                    flush=True)

                image_raw = self.unorm(image_input[0].data.cpu())
                self.args.writer.add_image('positive', image_raw, last_sample)
                if self.loss_type == 'negatives_edited' or self.loss_type == 'negatives_both':
                    image_raw_neg = self.unorm(neg_images[0].data.cpu())
                    image_neg = image_raw_neg / torch.max(image_raw_neg)
                    self.args.writer.add_image('negative', image_neg,
                                               last_sample)
                    self.args.writer.add_image(
                        'Images/region', 255 *
                        np.array([binary_mask_0, binary_mask_0, binary_mask_0
                                  ]).swapaxes(0, 1).swapaxes(1, 2),
                        last_sample)
                loss_list_total = {k: 0 for k, v in loss_list_total.items()}

            else:
                for loss_name in loss_list:
                    loss_list_total[loss_name] += loss_list[loss_name]

    def optimize_neurons(self):
        """
        Dissect the GAN generator to rank, for every cluster, the units in
        each intervention layer by how strongly they associate with that
        cluster's concept. Populates ``self.layers_units`` with one ranked
        unit-id array per (layer, cluster) and writes the dissection report
        to a timestamped directory under ``<results>/dissect``.
        """

        # Set up console output
        verbose_progress(True)

        gan_model = self.generator.model
        annotate_model_shapes(gan_model, gen=True)

        outdir = os.path.join(
            self.args.results, 'dissect',
            self.args.name_checkpoint + '_' + str(time.time()))
        os.makedirs(outdir, exist_ok=True)

        # Number of latent samples used for the dissection statistics.
        size = 1000

        sample = z_sample_for_model(gan_model, size)

        train_sample = z_sample_for_model(gan_model, size, seed=2)

        dataset = TensorDataset(sample)
        train_dataset = TensorDataset(train_sample)
        # Segmenter that labels generated images with cluster assignments,
        # using the current clusters and their normalization statistics.
        self.cluster_segmenter = ClusterSegmenter(self.model, self.clusters,
                                                  self.mean_clust,
                                                  self.std_clust)

        segrunner = GeneratorSegRunner(self.cluster_segmenter)

        netname = outdir
        # Run dissect
        with torch.no_grad():
            dissect(
                outdir,
                gan_model,
                dataset,
                train_dataset=train_dataset,
                segrunner=segrunner,
                examples_per_unit=20,
                netname=netname,
                quantile_threshold='iqr',
                meta=None,
                make_images=False,  # True,
                make_labels=True,
                make_maxiou=False,
                make_covariance=False,
                make_report=True,
                make_row_images=True,
                make_single_images=True,
                batch_size=8,
                num_workers=8,
                rank_all_labels=True)

            # NOTE(review): `sample_ablate` / `data_loader` are built but not
            # consumed in this method — presumably leftover from an ablation
            # experiment; verify before removing.
            sample_ablate = z_sample_for_model(gan_model, 16)

            dataset_ablate = TensorDataset(sample_ablate)
            data_loader = torch.utils.data.DataLoader(dataset_ablate,
                                                      batch_size=8,
                                                      shuffle=False,
                                                      num_workers=8,
                                                      pin_memory=True,
                                                      sampler=None)

            # Parse the dissection report and index layer records by name.
            with open(os.path.join(outdir, 'dissect.json')) as f:
                data = EasyDict(json.load(f))
            dissect_layer = {lrec.layer: lrec for lrec in data.layers}

            self.layers_units = {
                'layer2': [],
                'layer3': [],
                'layer4': [],
                'layer5': [],
                'layer6': [],
            }

            # Unit ids considered noise; their entries are zeroed out below.
            noise_units = np.array([35, 221, 496, 280])

            # Cluster ranking names appear to be offset by 2 ('c_2-iou' is the
            # first cluster) — TODO confirm against the dissect report format.
            for i in range(2, len(self.clusters) + 2):
                print('Cluster', i)
                rank_name = 'c_{0}-iou'.format(i)
                for l in range(len(self.layer_list_all)):
                    ranking = next(
                        r
                        for r in dissect_layer[self.layer_list_all[l]].rankings
                        if r.name == rank_name)
                    unit_list = np.array(range(512))
                    # NOTE(review): this maps noise units to id 0 rather than
                    # removing them from the ranking — confirm intended.
                    unit_list[noise_units] = 0
                    ordering = np.argsort(ranking.score)
                    units_list = unit_list[ordering]
                    self.layers_units[self.layer_list_all[l]].append(
                        units_list)

        # Mark the directory so that it's not done again.
        mark_job_done(outdir)

    def get_ablation_replacement(self, params=(), option='random'):
        """
        Build a per-unit (ablation, replacement) mask pair for a generator
        intervention.

        Parameters
        ----------
        params : sequence whose meaning depends on `option`:
            'random'   -> (dim_mask, binary[, prob_ones]): random mask over
                          dim_mask units; Bernoulli(prob_ones) 0/1 values when
                          binary, uniform [0, 1) values otherwise.
            'specific' -> (dim_mask, units_ids): 1 at the given unit ids,
                          0 elsewhere.
        option : 'random' or 'specific'.

        Returns
        -------
        (ablation, replacement) : two CUDA FloatTensors of length dim_mask;
        replacement is always all zeros (units are zeroed out, not replaced).

        Raises
        ------
        Exception if `option` is not one of the two supported values.
        """
        if option == 'random':
            dim_mask = params[0]
            binary = params[1]

            if binary:
                prob_ones = params[2]
                # FIX: `np.float` was removed in NumPy 1.24; the builtin
                # `float` is the documented replacement.
                ablation = torch.FloatTensor(
                    (np.random.rand(dim_mask) < prob_ones).astype(
                        float)).cuda()
            else:
                # Continuous per-unit weights in [0, 1).
                ablation = torch.FloatTensor(np.random.rand(dim_mask)).cuda()
            replacement = torch.zeros(dim_mask).cuda()

        elif option == 'specific':
            dim_mask = params[0]
            units_ids = params[1]
            ablation, replacement = torch.zeros(dim_mask).cuda(), torch.zeros(
                dim_mask).cuda()
            ablation[units_ids] = 1  # select exactly the requested units

        else:
            raise Exception('Please introduce a valid option')

        return ablation, replacement

    def eval(self):
        """
        Collects features for number_recall images and audios and computes the recall @{1, 5, 10} of predicting one from
        the other. It does not involve any hard or edited negative.

        Returns
        -------
        (eval_score, recalls): eval_score is the mean of audio and image R@5;
        recalls is the full dict returned by utils.calc_recalls.
        """
        number_recall = 500
        if not self.args.use_cpu:
            torch.cuda.synchronize()
        batch_time = utils.AverageMeter()

        # Switch to evaluate mode
        self.model.eval()

        end = time.time()
        N_examples = self.loaders['val'].dataset.__len__()
        image_embeddings = []  # torch.FloatTensor(N_examples, embedding_dim)
        audio_embeddings = []  # torch.FloatTensor(N_examples, embedding_dim)
        frame_counts = []

        with torch.no_grad():
            for i, (image_input, audio_input, negatives, nframes, path,
                    _) in enumerate(self.loaders['val']):
                # Stop once roughly number_recall samples are collected.
                # NOTE(review): the hard-coded 500 duplicates number_recall,
                # and len(image_embeddings) counts batches, so this is only an
                # approximate cap — confirm intended.
                if len(image_embeddings) * image_input.size(0) > 500:
                    break

                if not self.args.loading_image:
                    path_ints = [p.split('/')[-1] for p in path
                                 ]  # in case the audio is inside a subfolder

                    # Synthesize evaluation images from the stored latents,
                    # mirroring the procedure in train_epoch.
                    v_init = self.z[int(path_ints[0])]
                    z_img = torch.FloatTensor(image_input.size(0),
                                              v_init.shape[0])

                    for k in range(image_input.size(0)):
                        z_img[k, :] = self.z[int(path_ints[k])]

                    image_input = self.generator.generate_images(
                        z_img, intervention=None)
                    image_input = utils.transform(image_input)
                    negatives = []
                else:
                    image_input = image_input.cuda()
                    negatives = [negatives.cuda()]

                # compute output
                model_output = self.model(image_input, audio_input, negatives)
                image_output = model_output[0]
                audio_output = model_output[1]

                image_embeddings.append(image_output.data.cpu())
                audio_embeddings.append(audio_output.data.cpu())

                # find pooling ratio
                # audio_input is (B, D, 40, T)
                # audio_output is (B, D, 1, T/p)
                pooling_ratio = round(
                    audio_input.size(3) / audio_output.size(3))
                nframes.div_(pooling_ratio)
                frame_counts.append(nframes.cpu())

                batch_time.update(time.time() - end)
                end = time.time()

                if i % self.args.print_freq == 0:
                    print('Eval: [{0}/{1}]\t'.format(i + 1,
                                                     len(self.loaders['val'])),
                          flush=True)

            # Concatenate all batches, then keep the last N_examples entries.
            image_outputs = torch.cat(image_embeddings)
            audio_outputs = torch.cat(audio_embeddings)
            frame_counts_tensor = torch.cat(frame_counts)

            N_examples = np.minimum(number_recall, N_examples)

            image_outputs = image_outputs[-N_examples:, :, :, :]
            audio_outputs = audio_outputs[-N_examples:, :, :, :]
            frame_counts_tensor = frame_counts_tensor[-N_examples:]
            # measure accuracy and record loss
            print('Computing recalls...')
            recalls = utils.calc_recalls(image_outputs,
                                         audio_outputs,
                                         frame_counts_tensor,
                                         loss_type=self.loss_type)
            A_r10 = recalls['A_r10']
            I_r10 = recalls['I_r10']
            A_r5 = recalls['A_r5']
            I_r5 = recalls['I_r5']
            A_r1 = recalls['A_r1']
            I_r1 = recalls['I_r1']

            print(
                ' * Audio R@10 {A_r10:.3f} Image R@10 {I_r10:.3f} over {N:d} validation pairs'
                .format(A_r10=A_r10, I_r10=I_r10, N=N_examples),
                flush=True)
            print(
                ' * Audio R@5 {A_r5:.3f} Image R@5 {I_r5:.3f} over {N:d} validation pairs'
                .format(A_r5=A_r5, I_r5=I_r5, N=N_examples),
                flush=True)
            print(
                ' * Audio R@1 {A_r1:.3f} Image R@1 {I_r1:.3f} over {N:d} validation pairs'
                .format(A_r1=A_r1, I_r1=I_r1, N=N_examples),
                flush=True)

            # Checkpointing score: mean of the two R@5 values.
            eval_score = (A_r5 + I_r5) / 2

        return eval_score, recalls
def gen_window_model(window_event, proc_events, clusterer=None):
    """Build a WindowModel (an n-gram over clustered events) for one window.

    window_event: window name; key into ``proc_events`` and the module-level
        ``windowed_events``.
    proc_events: mapping window -> EventType -> list of events.
    clusterer: optional Clusterer to reuse.  When omitted a fresh instance is
        created per call.  (Bug fix: the old default ``clusterer=Clusterer()``
        was evaluated once at definition time and shared across every call.)

    Returns the WindowModel, or None for the default (root) window.
    """
    global default_window_event
    # The default window carries no meaningful event model.
    if window_event == default_window_event.wm_name:
        return None

    if clusterer is None:
        clusterer = Clusterer()

    assignments = {}       # event -> cluster index it was assigned to
    clustered_events = {}  # str(event type) -> cluster centroids

    for et in proc_events[window_event]:
        clusterer.clear_data()

        if et is EventType.NONE:
            continue

        try:
            for e in proc_events[window_event][et]:
                f = e.get_features()
                if len(f) == 0:
                    # This event type yields no features; skip clustering it.
                    break
                clusterer.append_data(f)
            if clusterer.shape[1] == 0:
                continue
            centroids, assigns = clusterer.cluster(clusterer.recommend_clusters(), 10)

            clustered_events[str(et)] = centroids

            for i in range(len(proc_events[window_event][et])):
                assignments[proc_events[window_event][et][i]] = assigns[i]

        except NotImplementedError as err:
            # Event types without feature extraction are simply skipped.
            print(err)

    ngram = Ngram("")

    # Work on a copy so the global per-window sequence stays untouched.
    clustered_windowed_events = windowed_events[window_event][:]

    for i in range(len(clustered_windowed_events)):
        we = clustered_windowed_events[i]
        name = str(we.event_type)

        ident = we.get_identifier()
        if ident is not None:
            name += "[" + ident + "]"

        if we in assignments:
            assignment = "{" + str(assignments[we]) + "}"
            if "{cluster}" in name:
                name = name.replace("{cluster}", assignment)
            else:
                name += "[" + assignment + "]"

        clustered_windowed_events[i] = name

    sequence = " ".join(clustered_windowed_events).replace("EventType.NONE", ngram.delimiter)
    ngram.construct(sequence, 5)

    ngram.calculate_probabilities()

    window_model = WindowModel(ngram, clustered_events)

    return window_model
Example #31
0
from database import Database
from youtube import YouTube
from clusterer import Clusterer

# Demo: cluster the stored comment topics of a single YouTube video.
env, db_name = 'desktop', 'comment_sense_3'
db = Database(env, db_name)
yt = YouTube()

videoId = 'kQibkV_V8-c'

# Pull the video metadata and its previously extracted comment topics.
video_data = yt.video(videoId)
comment_topics = db.comment_topics(videoId)

# Group related topics and show the result.
cl = Clusterer(video_data, db)
topics = cl.cluster(comment_topics)
print(topics)
Example #32
0
from clusterer import Clusterer
import webbrowser

# Get the user input track (falls back to a default query when blank).
track_name = input(
    "Enter the name (artist optional) of a song: ") or 'Give it up Knife Party'

# Run the clustering on the track
c = Clusterer(track_name=track_name, alg_type='affprop')
results = c.get_target_cluster()
c.plot_clusters()
print('Graph saved to ./Database/clusters.png')

# Convert the track ids returned from clustering back into track data.
print('Loading 20 of', len(results), 'track recommendations, please wait...')
print()
# Idiom fix: the original used `enumerate` with an unused index and
# appended via `+= [x]`; a comprehension says the same thing directly.
shift_tracks = [c.ret.sp.track(item[1]) for item in results]


# output and save the recommended tracks to a file
def output_recommendations(source, filename, tracks):
    print(source + ' Recommendations:')
    fout = open(filename, 'w')
    for track in tracks[:20]:
        print('track:', track['name'], '-',
              track['album']['artists'][0]['name'])
        print('track:',
              track['name'],
              '-',
Example #33
0
    def widgetParameters(self):
        """Create parameters widgets.

        Builds the cluster-count controls, the compactness slider, the
        algorithm selector, the background-color picker, the feature list
        and the run button, and returns them stacked in a QVBoxLayout.
        """
        # Cluster Count: spin box plus an "Auto" checkbox and a mode combo.
        self.autoK = QCheckBox(self.tr('Auto'))
        self.clusterCount = QSpinBox(self)
        self.clusterCount.setValue(2)
        self.clusterCount.setMinimum(1)
        self.modeK = QComboBox(self)

        hcluster = QHBoxLayout()
        hcluster.addWidget(QLabel(self.tr('Cluster count:')))
        hcluster.addWidget(self.modeK)
        hcluster.addWidget(self.clusterCount)

        # Slider trading off cluster count vs. compactness.
        hslider = QHBoxLayout()
        clusterLabel = QLabel(self.tr('Cluster count'))
        self.clusterSliderLabel = QLabel()
        compactnessLabel = QLabel(self.tr('Compactness'))
        self.compactnessSliderLabel = QLabel()
        self.clusterSlider = QSlider(QtCore.Qt.Horizontal)
        # NOTE(review): the handler is connected *before* setValue(50), so
        # sliderMoved fires once here — presumably to initialise the two
        # labels; confirm before reordering these lines.
        self.clusterSlider.valueChanged[int].connect(self.sliderMoved)
        self.clusterSlider.setMinimumWidth(100)
        self.clusterSlider.setValue(50)
        self.clusterSlider.setMaximum(100)
        hslider.addWidget(clusterLabel)
        hslider.addWidget(self.clusterSliderLabel)
        hslider.addWidget(self.clusterSlider)
        hslider.addWidget(compactnessLabel)
        hslider.addWidget(self.compactnessSliderLabel)
        self.clusterSliderWidget = QWidget()
        self.clusterSliderWidget.setLayout(hslider)

        # Set default K mode: pick the Clusterer's default in the combo.
        self.modeK.currentIndexChanged.connect(self.toggleClusterCount)
        default = Clusterer.getDefaultKMode()
        defaultIndex = 0
        for i, (mode, name) in enumerate(Clusterer.getAllKModes()):
            if mode == default:
                defaultIndex = i
            self.modeK.addItem(name)
        self.modeK.setCurrentIndex(defaultIndex)

        # Algorithm selector, pre-set to the Clusterer's default mode.
        combo = QComboBox(self)
        default = Clusterer.getDefaultMode()
        defaultIndex = 0
        for i, (mode, name) in enumerate(Clusterer.getAllModes()):
            if mode == default:
                defaultIndex = i
            combo.addItem(name)
        combo.setCurrentIndex(defaultIndex)
        halgo = QHBoxLayout()
        halgo.addWidget(QLabel(self.tr('Algorithm:')))
        halgo.addWidget(combo)
        self.modeCombo = combo

        # BG color: picker button (black by default) + transparency toggle.
        color = QtGui.QColor(0, 0, 0)
        self.colorPicker = QPushButton('')
        self.colorPicker.setMaximumSize(QtCore.QSize(16, 16))
        self.colorPicker.clicked.connect(self.colorDialog)
        self.setPickerColor(color, self.colorPicker)
        self.transparentBg = QCheckBox(self.tr('Transparent'))
        self.transparentBg.setChecked(1)
        hbg = QHBoxLayout()
        hbg.addWidget(QLabel(self.tr('Background color:')))
        hbg.addWidget(self.colorPicker)
        hbg.addWidget(self.transparentBg)
        hbg.addStretch(1)

        # Features group box.
        featureBox = QGroupBox(self.tr('Features'))
        features = self.widgetFeatureList()
        featureBox.setLayout(features)

        # Param Box: aggregate all parameter rows.
        paramBox = QGroupBox(self.tr('Parameters'))
        paramLayout = QVBoxLayout()
        paramLayout.addLayout(hcluster)
        paramLayout.addWidget(self.clusterSliderWidget)
        paramLayout.addLayout(halgo)
        paramLayout.addLayout(hbg)
        paramBox.setLayout(paramLayout)

        runButton = self.widgetRun()

        vbox = QVBoxLayout()
        vbox.addWidget(paramBox)
        vbox.addWidget(featureBox)
        vbox.addLayout(runButton)
        vbox.addStretch(1)

        return vbox
Example #34
0
'''
Find the household with the lowest carbon emissions from a single group.
'''


def find_greenest(cluster):
    """Return the lowest emission value in *cluster* and its index.

    cluster: sequence of per-household emission totals.
    Returns (min_value, min_index); (100000000, -1) when *cluster* is empty,
    matching the original sentinel behaviour.

    Bug fix: the original compared ``sum(cluster[1:len(cluster)-1])`` — an
    expression independent of the loop index — against the running minimum,
    so it never actually searched for the smallest element.  It also
    shadowed the builtin ``min``.
    """
    min_val = 100000000
    min_index = -1
    for i, emissions in enumerate(cluster):
        if emissions < min_val:
            min_val = emissions
            min_index = i
    return min_val, min_index


# Pipeline: preprocess the data, pick k via the elbow method, cluster,
# then fit the regression model.  Each stage runs end-to-end on its own.

# Preprocessing
preprocessor = Preprocessor()
preprocessor.run_preprocessor()

# Elbow method
elbow = Elbow()
elbow.run_elbow()

# Clustering
clusterer = Clusterer()
clusterer.run_clusterer()

# Regression
regressor = Regressor()
regressor.run_regressor()
    def run_hmm(self, algorithm, sw_info, parameter_in_dir, parameter_out_dir='', preclusters=None, hmm_type='', stripped=False, prefix='', \
                count_parameters=False, plotdir=None, make_clusters=False):  # @parameterfetishist
        """Write hmm input, run the hmm binary, read back its output.

        When self.args.n_procs > 1 the input csv is split into chunks and
        one subprocess runs per chunk, with outputs merged afterwards.
        Optionally clusters the hmm results and copies the output csv to
        self.args.outfname.  Returns the Clusterer when make_clusters is
        set, otherwise None.
        """
        if prefix == '' and stripped:
            prefix = 'stripped'
        print '\n%shmm' % prefix
        csv_infname = self.args.workdir + '/' + prefix + '_hmm_input.csv'
        csv_outfname = self.args.workdir + '/' + prefix + '_hmm_output.csv'
        self.write_hmm_input(csv_infname,
                             sw_info,
                             preclusters=preclusters,
                             hmm_type=hmm_type,
                             stripped=stripped,
                             parameter_dir=parameter_in_dir)
        print '    running'
        sys.stdout.flush()
        start = time.time()
        if self.args.n_procs > 1:
            # Parallel path: one hmm subprocess per input chunk.
            self.split_input(self.args.n_procs,
                             infname=csv_infname,
                             prefix='hmm')
            procs = []
            for iproc in range(self.args.n_procs):
                cmd_str = self.get_hmm_cmd_str(algorithm,
                                               csv_infname,
                                               csv_outfname,
                                               parameter_dir=parameter_in_dir,
                                               iproc=iproc)
                procs.append(Popen(cmd_str.split()))
                time.sleep(0.1)  # stagger subprocess start-up
            for proc in procs:
                proc.wait()
            for iproc in range(self.args.n_procs):
                if not self.args.no_clean:
                    # Remove each per-process input chunk from its subdir.
                    os.remove(
                        csv_infname.replace(
                            self.args.workdir,
                            self.args.workdir + '/hmm-' + str(iproc)))
            self.merge_hmm_outputs(csv_outfname)
        else:
            # Serial path: single blocking subprocess.
            cmd_str = self.get_hmm_cmd_str(algorithm,
                                           csv_infname,
                                           csv_outfname,
                                           parameter_dir=parameter_in_dir)
            check_call(cmd_str.split())

        sys.stdout.flush()
        print '      hmm run time: %.3f' % (time.time() - start)

        hmminfo = self.read_hmm_output(algorithm,
                                       csv_outfname,
                                       make_clusters=make_clusters,
                                       count_parameters=count_parameters,
                                       parameter_out_dir=parameter_out_dir,
                                       plotdir=plotdir)

        if self.args.pants_seated_clustering:
            viterbicluster.cluster(hmminfo)

        clusters = None
        if make_clusters:
            if self.outfile is not None:
                self.outfile.write('hmm clusters\n')
            else:
                print '%shmm clusters' % prefix
            clusters = Clusterer(self.args.pair_hmm_cluster_cutoff,
                                 greater_than=True,
                                 singletons=preclusters.singletons)
            clusters.cluster(input_scores=hmminfo,
                             debug=self.args.debug,
                             reco_info=self.reco_info,
                             outfile=self.outfile,
                             plotdir=self.args.plotdir + '/pairscores')

        if self.args.outfname is not None:
            outpath = self.args.outfname
            if self.args.outfname[
                    0] != '/':  # if full output path wasn't specified on the command line
                outpath = os.getcwd() + '/' + outpath
            shutil.copyfile(csv_outfname, outpath)

        if not self.args.no_clean:
            if os.path.exists(
                    csv_infname
            ):  # if only one proc, this will already be deleted
                os.remove(csv_infname)
            os.remove(csv_outfname)

        return clusters
Example #36
0
    def widgetParameters(self):
        """Create parameters widgets.

        Builds the cluster-count controls, the compactness slider, the
        algorithm selector, the background-color picker, the feature list
        and the run button, and returns them stacked in a QVBoxLayout.
        """
        # Cluster Count: spin box plus an "Auto" checkbox and a mode combo.
        self.autoK = QCheckBox(self.tr('Auto'))
        self.clusterCount = QSpinBox(self)
        self.clusterCount.setValue(2)
        self.clusterCount.setMinimum(1)
        self.modeK = QComboBox(self)

        hcluster = QHBoxLayout()
        hcluster.addWidget(QLabel(self.tr('Cluster count:')))
        hcluster.addWidget(self.modeK)
        hcluster.addWidget(self.clusterCount)

        # Slider trading off cluster count vs. compactness.
        hslider = QHBoxLayout()
        clusterLabel = QLabel(self.tr('Cluster count'))
        self.clusterSliderLabel = QLabel()
        compactnessLabel = QLabel(self.tr('Compactness'))
        self.compactnessSliderLabel = QLabel()
        self.clusterSlider = QSlider(QtCore.Qt.Horizontal)
        # NOTE(review): the handler is connected *before* setValue(50), so
        # sliderMoved fires once here — presumably to initialise the two
        # labels; confirm before reordering these lines.
        self.clusterSlider.valueChanged[int].connect(self.sliderMoved)
        self.clusterSlider.setMinimumWidth(100)
        self.clusterSlider.setValue(50)
        self.clusterSlider.setMaximum(100)
        hslider.addWidget(clusterLabel)
        hslider.addWidget(self.clusterSliderLabel)
        hslider.addWidget(self.clusterSlider)
        hslider.addWidget(compactnessLabel)
        hslider.addWidget(self.compactnessSliderLabel)
        self.clusterSliderWidget = QWidget()
        self.clusterSliderWidget.setLayout(hslider)

        # Set default K mode: pick the Clusterer's default in the combo.
        self.modeK.currentIndexChanged.connect(self.toggleClusterCount)
        default = Clusterer.getDefaultKMode()
        defaultIndex = 0
        for i, (mode, name) in enumerate(Clusterer.getAllKModes()):
            if mode == default:
                defaultIndex = i
            self.modeK.addItem(name)
        self.modeK.setCurrentIndex(defaultIndex)

        # Algorithm selector, pre-set to the Clusterer's default mode.
        combo = QComboBox(self)
        default = Clusterer.getDefaultMode()
        defaultIndex = 0
        for i, (mode, name) in enumerate(Clusterer.getAllModes()):
            if mode == default:
                defaultIndex = i
            combo.addItem(name)
        combo.setCurrentIndex(defaultIndex)
        halgo = QHBoxLayout()
        halgo.addWidget(QLabel(self.tr('Algorithm:')))
        halgo.addWidget(combo)
        self.modeCombo = combo

        # BG color: picker button (black by default) + transparency toggle.
        color = QtGui.QColor(0, 0, 0)
        self.colorPicker = QPushButton('')
        self.colorPicker.setMaximumSize(QtCore.QSize(16, 16))
        self.colorPicker.clicked.connect(self.colorDialog)
        self.setPickerColor(color, self.colorPicker)
        self.transparentBg = QCheckBox(self.tr('Transparent'))
        self.transparentBg.setChecked(1)
        hbg = QHBoxLayout()
        hbg.addWidget(QLabel(self.tr('Background color:')))
        hbg.addWidget(self.colorPicker)
        hbg.addWidget(self.transparentBg)
        hbg.addStretch(1)

        # Features group box.
        featureBox = QGroupBox(self.tr('Features'))
        features = self.widgetFeatureList()
        featureBox.setLayout(features)

        # Param Box: aggregate all parameter rows.
        paramBox = QGroupBox(self.tr('Parameters'))
        paramLayout = QVBoxLayout()
        paramLayout.addLayout(hcluster)
        paramLayout.addWidget(self.clusterSliderWidget)
        paramLayout.addLayout(halgo)
        paramLayout.addLayout(hbg)
        paramBox.setLayout(paramLayout)

        runButton = self.widgetRun()

        vbox = QVBoxLayout()
        vbox.addWidget(paramBox)
        vbox.addWidget(featureBox)
        vbox.addLayout(runButton)
        vbox.addStretch(1)

        return vbox
# Grid of 100x100
# 3 circles of 15x15 with each 10 points
import testgenerator
from clusterer import Clusterer
from clustervisualizer import ClusterVisualizer

# Generate synthetic circle points — arguments presumably
# (n_points, center_x, center_y, radius); confirm against testgenerator.
points = testgenerator.create_circle_points(1000, 50, 50, 20, point_mass=10)
clusterer = Clusterer(5, 10, 2)
# The visualizer observes the clusterer it is constructed with.
clustervisualizer = ClusterVisualizer(clusterer)
clusterer.set_points(points)
clusterer.run()

Example #38
0
class Analyzer:
	"""Fetch building sensor/actuator time series, normalize them onto a
	fixed time grid, extract features and cluster zones.

	The class-level names below are placeholders; __init__ installs the
	real helper instances.
	"""
	bdm = None  # BDWrapper: building-data access layer
	expLogColl = None  # CollectionWrapper over the 'experience_log' collection
	#timeGran = timedelta(minutes=5)
	timeGran = timedelta(minutes=2)  # resampling grid step
	actuNames = None  # ActuatorNames lookup helper
	sensorNames = None  # SensorNames lookup helper
	zonelist = None  # zone names loaded from metadata CSV
	feater = None  # FeatureExtractor
	clust = None  # Clusterer
	
	def __init__(self):
		self.actuNames = ActuatorNames()
		self.sensorNames = SensorNames()
		self.bdm = BDWrapper()
		self.expLogColl = CollectionWrapper('experience_log')
		#self.zonelist = self.csv2list('metadata/partialzonelist.csv')
		self.zonelist = self.csv2list('metadata/zonelist.csv')
		self.feater = FeatureExtractor()
		self.clust = Clusterer()
	
	def csv2list(self, filename):
		# Read the first column of a CSV file into a plain list.
		outputList = list()
		with open(filename, 'r') as fp:
			reader = csv.reader(fp, delimiter=',')
			for row in reader:
				outputList.append(row[0])
		return outputList

	def get_actuator_uuid(self, zone=None, actuType=None):
		# Resolve exactly one point uuid for a (zone, actuator-type) pair;
		# raises QRError when zero or more than one match is found.
		context = dict()
		if zone != None:
			context['room']=zone
		if actuType != None:
			context['template']=actuType
		uuids = self.bdm.get_sensor_uuids(context)
		if len(uuids)>1:
			raise QRError('Many uuids are found', context)
		elif len(uuids)==0:
			raise QRError('No uuid is found', context)
		else:
			return uuids[0]

	def normalize_data_avg(self, rawData, beginTime, endTime):
		# Resample rawData onto the timeGran grid; each grid point becomes
		# the distance-weighted average of its nearest left/right samples.
		procData = pd.Series({beginTime:float(rawData[0])})
		tp = beginTime
		while tp<=endTime:
			tp = tp+self.timeGran
			leftSeries = rawData[:tp]
			if len(leftSeries)>0:
				idx = len(leftSeries)-1
				leftVal = leftSeries[idx]
				leftIdx = leftSeries.index[idx]
			else:
				leftVal = None
			rightSeries = rawData[tp:]
			if len(rightSeries)>0:
				rightVal = rightSeries[0]
				rightIdx = rightSeries.index[0]
			else:
				rightVal = None
			if rightVal==None and leftVal!=None:
				newVal = leftVal
			elif rightVal!=None and leftVal==None:
				newVal = rightVal
			elif tp==leftIdx:
				newVal = leftVal
			elif tp==rightIdx:
				newVal = rightVal
			elif rightVal!=None and leftVal!=None:
				# Interpolate: weight each neighbour by the other's distance.
				leftDist = (tp - leftIdx).total_seconds()
				rightDist = (rightIdx - tp).total_seconds()
				newVal = (leftVal*rightDist+rightVal*leftDist)/(rightDist+leftDist)
			else:
				print "ERROR: no data found in raw data"
				newVal = None
			newData = pd.Series({tp:newVal})
			procData = procData.append(newData)
		return procData

	def normalize_data_nextval_deprecated(self, rawData, beginTime, endTime):
		# Deprecated: resample onto the grid taking the next sample's value,
		# falling back to the previous one past the last sample.
		procData = pd.Series({beginTime:float(rawData[0])})
		tp = beginTime
		while tp<=endTime:
			tp = tp+self.timeGran
			leftSeries = rawData[:tp]
			if len(leftSeries)>0:
				idx = len(leftSeries)-1
				leftVal = leftSeries[idx]
				leftIdx = leftSeries.index[idx]
			else:
				leftVal = None
			rightSeries = rawData[tp:]
			if len(rightSeries)>0:
				rightVal = rightSeries[0]
				rightIdx = rightSeries.index[0]
			else:
				rightVal = None

			if rightVal != None:
				newVal = rightVal
			else:
				newVal = leftVal

			newData = pd.Series({tp:newVal})
			procData = procData.append(newData)
		return procData

	def normalize_data(self, rawData, beginTime, endTime, normType):
		# Resample onto a 2-minute grid with pandas; normType selects
		# forward-fill ('nextval') or mean ('avg') aggregation.
		rawData = rawData[beginTime:endTime]
		if not beginTime in rawData.index:
			rawData[beginTime] = rawData.head(1)[0]
			rawData = rawData.sort_index()
		if not endTime in rawData.index:
			rawData[endTime] = rawData.tail(1)[0]
			rawData = rawData.sort_index()
		if normType=='nextval':
			# NOTE(review): fill_method/how were removed from resample() in
			# modern pandas — this code requires an old pandas version.
			procData = rawData.resample('2Min', fill_method='pad')
		elif normType=='avg':
			procData = rawData.resample('2Min', how='mean')
		else:
			procData = None

		return procData
		

	def receive_a_sensor(self, zone, actuType, beginTime, endTime, normType):
		# Fetch one point's raw series, scrub -1 markers (except for damper
		# commands) and normalize it onto the grid.
		print zone, actuType
		uuid = self.get_actuator_uuid(zone, actuType)
		rawData = self.bdm.get_sensor_ts(uuid, 'PresentValue', beginTime, endTime)
		if actuType!=self.actuNames.damperCommand:
			rawData = self.remove_negativeone(rawData)
		procData = self.normalize_data(rawData, beginTime, endTime, normType)
		return procData

	def receive_entire_sensors_notstore(self, beginTime, endTime, normType, exceptZoneList=[]):
		#TODO: Should be parallelized here
		# NOTE(review): exceptZoneList=[] is a mutable default — only read
		# here, but consider a None sentinel.
		dataDict = dict()
		for zone in self.zonelist:
			if not zone in exceptZoneList:
				dataDict[zone] = self.receive_zone_sensors(zone, beginTime, endTime, normType)
		return dataDict
	
	def receive_entire_sensors(self, beginTime, endTime, filename, normType, exceptZoneList=[]):
		# Fetch everything, then pickle the per-zone dict to `filename`.
#		filename='data/'+beginTime.isoformat()[0:-7].replace(':','_') + '.pkl'
		dataDict = self.receive_entire_sensors_notstore(beginTime, endTime, normType, exceptZoneList=exceptZoneList)
		with open(filename, 'wb') as fp:
			pickle.dump(dataDict, fp)
#			json.dump(dataDict,fp)

	def clustering(self, inputData, dataDict):
		# Build a per-zone feature vector (fft, min/max, dtw) and k-means it.
		fftFeat = self.feater.get_fft_features(inputData, dataDict)
		minmaxFeat = self.feater.get_minmax_features(dataDict)
		dtwFeat = self.feater.get_dtw_features(inputData, dataDict)
		freqFeat = self.feater.get_freq_features(inputData, dataDict)
		featDict = dict()
		for zone in self.zonelist:
			featList = list()
			featList.append(fftFeat[zone])
			featList.append(minmaxFeat[zone])
			featList.append(dtwFeat[zone])
			#featList.append(freqFeat[zone])
			featDict[zone] = featList
		print featDict['RM-4132']  # debug output for one sample zone
		return self.clust.cluster_kmeans(featDict)
	
	def remove_negativeone(self, data):
		# Replace -1 readings with the preceding value.
		# NOTE(review): np.where returns a tuple of index arrays and this
		# loop iterates that tuple, so data[idx] is fancy-indexed with the
		# whole array — confirm the intended per-element behaviour.
		if -1 in data.values:
			indices = np.where(data==-1)
			for idx in indices:
				data[idx] = data[idx-1]
		return data

	def receive_zone_sensors(self, zone, beginTime, endTime, normType):
		# Collect normalized series for every known point type in a zone;
		# point types with no resolvable uuid are skipped.
		zoneDict = dict()
		for actuType in self.actuNames.nameList+self.sensorNames.nameList:
			if actuType=='Actual Supply Flow':
				pass
			try:
				uuid = self.get_actuator_uuid(zone, actuType)
			except QRError:
				continue
#			if actuType == self.actuNames.commonSetpoint:
#				wcad = self.receive_a_sensor(zone, 'Warm Cool Adjust', beginTime, endTime, normType)
#				data = self.receive_a_sensor(zone, actuType, beginTime, endTime, normType)
#				data = data + wcad
#				pass
			# NOTE(review): the two branches below are identical — this
			# split looks like a leftover of the commented-out setpoint
			# logic above.
			if actuType != self.actuNames.damperCommand:
				if actuType==self.actuNames.occupiedCommand:
					pass
				data = self.receive_a_sensor(zone, actuType, beginTime, endTime, normType)
			else:
				data = self.receive_a_sensor(zone, actuType, beginTime, endTime, normType)
			zoneDict[actuType] = data
		return zoneDict


	def store_zone_sensors(self, zone, beginTime, endTime, normType, filename):
		# Append each point's series as a column block to a fixed CSV file.
		# NOTE(review): writes to hard-coded 'rm4132.csv', not `filename`.
		data = self.receive_zone_sensors(zone, beginTime, endTime, normType)
#		with open(filename, 'wb') as fp:
#			w = csv.DictWriter(fp, data.keys())
#			w.writeheader()
#			w.writerow(data)
		for key, val in data.iteritems():
			val.to_csv('rm4132.csv', header=key, mode='a')

	def store_minmax_dict(self):
		# Build per-zone/per-point min/max ranges (hard-coded for known
		# point types, measured from data otherwise) and pickle them.
		minDict = defaultdict(dict)
		maxDict = defaultdict(dict)
		beginTime = datetime(2015,2,1)
		endTime = datetime(2015,9,1)
		shortBeginTime = datetime(2015,8,1)
		shortEndTime = datetime(2015,8,2)

		for zone in self.zonelist:
			for pointType in self.actuNames.nameList+self.sensorNames.nameList:
				try:
					if pointType=='Occupied Command':
						minDict[zone][pointType] = 1
						maxDict[zone][pointType] = 3
					elif pointType=='Cooling Command':
						minDict[zone][pointType] = 0
						maxDict[zone][pointType] = 100
					elif pointType=='Cooling Command' or pointType=='Heating Command':
						# NOTE(review): 'Cooling Command' already matched
						# above, so only 'Heating Command' can reach here.
						minDict[zone][pointType] = 0
						maxDict[zone][pointType] = 100
					elif pointType=='Occupied Clg Min' or pointType=='Occupied Htg Flow' or pointType=='Cooling Max Flow':
						uuid = self.get_actuator_uuid(zone, pointType)
						data = self.bdm.get_sensor_ts(uuid, 'Presentvalue', shortBeginTime, shortEndTime)
						minDict[zone][pointType] = min(data)
						maxDict[zone][pointType] = max(data)
					elif pointType=='Temp Occ Sts':
						minDict[zone][pointType] = 0
						maxDict[zone][pointType] = 1
					elif pointType=='Reheat Valve Command':
						minDict[zone][pointType] = 0
						maxDict[zone][pointType] = 100
					elif pointType=='Actual Supply Flow' or pointType=='Actual Sup Flow SP':
						uuid = self.get_actuator_uuid(zone, pointType)
						data = self.bdm.get_sensor_ts(uuid, 'Presentvalue', shortBeginTime, shortEndTime)
						maxFlow = data[0]
						minDict[zone][pointType] = 0
						maxDict[zone][pointType] = maxFlow
					elif pointType=='Damper Position':
						minDict[zone][pointType] = 0
						maxDict[zone][pointType] = 100
					elif pointType=='Damper Command':
						# Range = mean +/- 2 std of the observed commands.
						uuid = self.get_actuator_uuid(zone, pointType)
						data = self.bdm.get_sensor_ts(uuid, 'Presentvalue', shortBeginTime, shortEndTime)
						meanData = np.mean(data)
						stdData = np.std(data)
						meanAgain = np.mean(data[np.logical_and(data<=meanData+2*stdData, data>=meanData-2*stdData)])
						minDict[zone][pointType] = meanData-2*stdData
						maxDict[zone][pointType] = meanData+2*stdData
					else:
						uuid = self.get_actuator_uuid(zone, pointType)
						data = self.bdm.get_sensor_ts(uuid, 'Presentvalue', beginTime, endTime)
						minDict[zone][pointType] = min(data)
						maxDict[zone][pointType] = max(data)

				except:
					# NOTE(review): bare except swallows every failure
					# (missing points, network errors) — consider narrowing.
					print "Something is wrong"
					pass
		with open('metadata/mindict.pkl', 'wb') as fp:
			pickle.dump(minDict, fp)
		with open('metadata/maxdict.pkl', 'wb') as fp:
			pickle.dump(maxDict, fp)
Example #39
0
    def __init__(self, appraisal, cluster_identity, marker, appraisal_colours):
        '''
        appraisal: Appraisal
        cluster_identity: float, as in Clusterer
        marker: str
            the marker being plotted
        '''
        self.appraisal_colours = appraisal_colours
        logging.debug("Generating plot info for %s" % marker)

        # Gather this marker's OTUs from every sample into three pools so
        # they can be clustered together, tracking the largest per-pool
        # count contribution seen along the way.
        all_binned_otus = []
        all_assembled_not_binned_otus = []
        all_not_found_otus = []
        max_count = 0

        def collect(source_otus, pool):
            # Append marker-matching OTUs to `pool`; return their summed count.
            total = 0
            for otu in source_otus:
                if otu.marker == marker:
                    pool.append(otu)
                    total += otu.count
            return total

        for sample_appraisal in appraisal.appraisal_results:
            max_count = max(
                max_count,
                collect(sample_appraisal.binned_otus, all_binned_otus))
            max_count = max(
                max_count,
                collect(sample_appraisal.assembled_not_binned_otus(),
                        all_assembled_not_binned_otus))
            max_count = max(
                max_count,
                collect(sample_appraisal.not_found_otus, all_not_found_otus))
        logging.debug("Found maximal count of seqs as %i" % max_count)

        # Cluster the pooled OTUs; remember each member sequence's cluster
        # and each cluster representative's total count.
        sequence_to_cluster = {}
        cluster_rep_and_count = []
        collection = OtuTableCollection()
        collection.otu_table_objects = [
            all_not_found_otus, all_assembled_not_binned_otus, all_binned_otus
        ]
        for cotu in Clusterer().cluster(collection, cluster_identity):
            cluster_rep_and_count.append([cotu.sequence, cotu.count])
            for member in cotu.otus:
                sequence_to_cluster[member.sequence] = cotu

        # More abundant clusters come first so they get colour priority.
        sorted_cluster_rep_and_count = sorted(cluster_rep_and_count,
                                              key=lambda pair: pair[1],
                                              reverse=True)
        cluster_sequence_to_order = {
            rep: order
            for order, (rep, _) in enumerate(sorted_cluster_rep_and_count)
        }

        self._sequence_to_cluster = sequence_to_cluster
        self._sorted_cluster_rep_and_count = sorted_cluster_rep_and_count
        self._cluster_sequence_to_order = cluster_sequence_to_order
        self.max_count = max_count
Example #40
0
def train(cfg,
          model,
          dataset,
          optimizer,
          scheduler=None,
          logger=None,
          is_continue=False,
          use_pretrained=False,
          cluster_vis_path=None):
    """Cluster-based (pseudo-label) training loop.

    Each epoch: extract global features for the whole training set, cluster
    them into pseudo identities, refine the dataset to non-outlier samples,
    rebuild a cluster-centroid memory bank, then optimize the model against
    the memory-bank loss.

    Args:
        cfg: config object; reads ``cfg.TRAIN.{CHECKPOINT_PATH, EPOCHS,
            BATCHSIZE, SAVE_INTERVAL, PRETRAINED_PATH}``.
        model: backbone network producing global features
            (memory bank assumes 2048-d output).
        dataset: training dataset handed to feature extraction / refinement.
        optimizer: torch optimizer for ``model``.
        scheduler: optional LR scheduler, stepped once per epoch.
        logger: optional tensorboard-style writer (``add_scalar`` interface).
        is_continue: resume from the latest ``*.pth`` in CHECKPOINT_PATH.
        use_pretrained: load model weights only from PRETRAINED_PATH and
            start a fresh run (no optimizer/scheduler state).
        cluster_vis_path: optional output path for cluster visualisations.
    """
    save_to = cfg.TRAIN.CHECKPOINT_PATH
    epochs = cfg.TRAIN.EPOCHS
    batch_size = cfg.TRAIN.BATCHSIZE

    # Global logging step; always bound so later references are safe even
    # when no logger is supplied (only persisted when logger is not None).
    counter = 0
    if logger is None:
        print('>>> No tensorboard logger used in training.')
    else:
        print('>>> Logger is used in training.')

    # Truthiness covers both None and '' (the original crashed with
    # TypeError on len(None) before its None-check could run).
    if not save_to:
        print('>>> No checkpoints will be saved.')

    start_ep = 0  # first epoch to run; advanced below when resuming

    # Resume training until the configured number of epochs is reached.
    if is_continue:
        print('>>> Continue training from the latest checkpoint.')
        if not save_to:
            print('>>> Without checkpoint folder, cannot continue training!')
            exit(1)  # error path: nonzero status (was exit(0))
        ckpts = glob.glob(os.path.join(save_to, '*.pth'))
        if len(ckpts) == 0:
            print('>>> No earlier checkpoints, train from the beginning.')
        else:
            start_ckpt = find_latest_checkpoint(ckpts)
            print('>>> Found earlier checkpoints, continue training with {}.'.
                  format(start_ckpt))

            # Deserialize the checkpoint ONCE; the original re-ran
            # torch.load on the same file for every field it read.
            state = torch.load(os.path.join(save_to, start_ckpt))
            start_ep = state['epoch']
            model.load_state_dict(state['model_state_dict'])
            optimizer.load_state_dict(state['optimizer_state_dict'])
            # Move restored optimizer tensors onto the GPU if one is present.
            optimizer = opt_to_gpu(optimizer, torch.cuda.is_available())
            if scheduler is not None:
                scheduler.load_state_dict(state['scheduler_state_dict'])
            if logger is not None:
                counter = state['logger_counter']

    # Start a new training run from pretrained weights only.
    if use_pretrained:
        print('>>> Use pretrained model weights to start a new training.')
        model_state = torch.load(
            cfg.TRAIN.PRETRAINED_PATH)['model_state_dict']
        model.load_state_dict(model_state)

    if torch.cuda.is_available():
        model = model.cuda()

    # ---- training loop ----
    for epoch in range(start_ep, epochs):
        # 1) Extract global features for the full training split.
        print('>>> Extracting global features ...')
        features, v_labels, cam_labels = extract_global_features(
            img_shape=(256, 256),
            batch_size=batch_size,
            workers=8,
            model=model,
            dataset=dataset,
            mode='train',
            is_cuda=torch.cuda.is_available())

        # 2) Cluster features into pseudo identities.
        print('>>> Start clustering ...')
        features = merge_features_from_dict(features)
        pseudo_labels, num_ids, centroids = Clusterer(
            features, eps=0.5, is_cuda=torch.cuda.is_available()).cluster(
                visualize_path=cluster_vis_path, epoch=epoch + 1)

        # 3) Keep only non-outlier samples and batch them cluster-aware.
        print('>>> Refining dataset ...')
        good_dataset = refine_dataset((256, 256), dataset, pseudo_labels)
        sampler = ClusterSampler(good_dataset)
        sampler = torch.utils.data.BatchSampler(sampler,
                                                batch_size=batch_size,
                                                drop_last=False)
        # shuffle must stay False: ordering is owned by the batch_sampler.
        good_dataloader = DataLoader(good_dataset,
                                     shuffle=False,
                                     batch_sampler=sampler,
                                     num_workers=8)

        # 4) Fresh memory bank seeded with this epoch's cluster centroids.
        memory = MemoryBank(num_feature_dims=2048,
                            num_samples=num_ids,
                            temp=0.07,
                            momentum=0.02)
        memory = init_memory_bank(memory, centroids)

        # 5) Optimization over the refined dataset.
        for i, (imgs, pids, fnames, vids,
                camids) in enumerate(good_dataloader):
            if torch.cuda.is_available():
                imgs = imgs.cuda()
                memory = memory.cuda()
            optimizer.zero_grad()
            batch_features = model(imgs)
            # Forward through the memory bank updates it and yields the loss.
            loss = memory(batch_features, pids)
            loss.backward()
            optimizer.step()

            if (i + 1) % 50 == 0:  # print loss every 50 iters
                print('[epoch: {}/{}][iter: {}/{}] loss: {}'.format(
                    epoch + 1, epochs, i + 1, len(good_dataloader), loss))

            if logger is not None:
                logger.add_scalar('loss', loss.item(), global_step=counter)
                logger.add_scalar('cluster_centroids',
                                  memory.num_samples,
                                  global_step=counter)
                logger.add_scalar(
                    'lr',
                    optimizer.state_dict()['param_groups'][0]['lr'],
                    global_step=counter)
                counter += 1

        # Step the LR schedule once per epoch.
        if scheduler is not None:
            scheduler.step()

        # Periodically persist full training state for resumption.
        if save_to and (epoch + 1) % cfg.TRAIN.SAVE_INTERVAL == 0:
            save_name = os.path.join(save_to,
                                     'backbone-epoch-{}.pth'.format(epoch + 1))
            state_dict = {
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict':
                scheduler.state_dict() if scheduler is not None else None,
                'logger_counter': counter if logger is not None else None,
            }
            torch.save(state_dict, save_name)
            print('>>> Checkpoint is saved as {}.'.format(save_name))
Example #41
0
 def __init__(self, config):
     # Build a Clusterer from the config mapping (config is unpacked as
     # keyword arguments, so presumably a dict of Clusterer kwargs —
     # TODO confirm against callers) and expose its pattern generator.
     self.clusterer = Clusterer(**config)
     self.pattern_generator = self.clusterer.pattern_generator