def get_clusterer(trainer, args, output_size, model):
    assert len(trainer.layer_list_all) == 1, 'Active learning is only implemented for a single layer ablations'
    assert args.clustering, 'Active learning samples are associated with a specific clustering. The clustering flag ' \
                            'is necessary'
    active_paths = torch.load(os.path.join(args.active_learning_name, 'a_paths.pth'))
    active_units = torch.load(os.path.join(args.active_learning_name, 'units.pth'))
    active_binary_masks = torch.load(os.path.join(args.active_learning_name, 'a_hmaps.pth'))
    trainer.active_dict = {}
    for i, path in enumerate(active_paths):
        trainer.active_dict[path] = {
            'mask': active_binary_masks[i],
            'units': active_units[i],
            'index': i
        }
    cluster_path = os.path.join(args.active_learning_name, 'cluster')
    trainer.clusterer = Clusterer(trainer.loaders['train'],
                                  model,
                                  path_store=cluster_path,
                                  model_dim=args.embedding_dim,
                                  load_datapoints=True,
                                  load_histogram=True,
                                  load_clustering=True,
                                  load_name_final=True,
                                  save_results=True,
                                  output_size=output_size,
                                  args=args)
    return trainer.clusterer
def cluster(self, kmeans, hyper):
    clus1 = Clusterer(self.get_rel_docs(),
                      APIAdapter.get_data_foldername(self.get_search_term()),
                      kmeans, hyper)
    clus1.cluster()
    self.clusterer = clus1
def cdr3_length_precluster(self, waterer, preclusters=None):
    cdr3lengthfname = self.args.workdir + '/cdr3lengths.csv'
    with opener('w')(cdr3lengthfname) as outfile:
        writer = csv.DictWriter(outfile, ('unique_id', 'second_unique_id', 'cdr3_length', 'second_cdr3_length', 'score'))
        writer.writeheader()
        for query_name, second_query_name in self.get_pairs(preclusters):
            cdr3_length = waterer.info[query_name]['cdr3_length']
            second_cdr3_length = waterer.info[second_query_name]['cdr3_length']
            same_length = cdr3_length == second_cdr3_length
            if not self.args.is_data:
                assert cdr3_length == int(self.reco_info[query_name]['cdr3_length'])
                if second_cdr3_length != int(self.reco_info[second_query_name]['cdr3_length']):
                    print 'WARNING did not infer correct cdr3 length'
                    assert False
            writer.writerow({
                'unique_id': query_name,
                'second_unique_id': second_query_name,
                'cdr3_length': cdr3_length,
                'second_cdr3_length': second_cdr3_length,
                'score': int(same_length)
            })

    clust = Clusterer(0.5, greater_than=True)  # i.e. cluster together if same_length == True
    clust.cluster(cdr3lengthfname, debug=False)
    os.remove(cdr3lengthfname)
    return clust
def map_segments_to_clusters(x):
    # print('mapper: %s working on %s' % (os.getpid(), x))
    ((filename, start, end, size), config) = x
    clusterer = Clusterer(**config)
    lines = FileSegmentReader.read(filename, start, end, size)
    clusters = clusterer.find(lines)
    return [(FIXED_MAP_JOB_KEY, clusters)]
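# A minimal reduce-side sketch for the mapper above. It assumes the framework
# collects the (key, clusters) tuples each mapper returns and hands them all to a
# single reducer; the `reduce_segment_clusters` name is hypothetical, not from the source.
def reduce_segment_clusters(mapped_outputs):
    # mapped_outputs: an iterable of the lists returned by map_segments_to_clusters
    all_clusters = []
    for pairs in mapped_outputs:
        for key, clusters in pairs:
            if key == FIXED_MAP_JOB_KEY:
                # every segment emits under the same fixed key, so everything
                # ends up in one combined list of clusters
                all_clusters.extend(clusters)
    return all_clusters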
def hamming_precluster(self, preclusters=None):
    assert self.args.truncate_pairs
    start = time.time()
    print 'hamming clustering'
    chopped_off_left_sides = False
    hamming_info = []
    all_pairs = self.get_pairs(preclusters)
    # print '    getting pairs: %.3f' % (time.time()-start); start = time.time()
    # all_pairs = itertools.combinations(self.input_info.keys(), 2)
    if self.args.n_fewer_procs > 1:
        pool = Pool(processes=self.args.n_fewer_procs)
        subqueries = self.split_input(self.args.n_fewer_procs, info=list(all_pairs), prefix='hamming')  # NOTE 'casting' to a list here makes me nervous!
        sublists = []
        for queries in subqueries:
            sublists.append([])
            for id_a, id_b in queries:
                sublists[-1].append({
                    'id_a': id_a,
                    'id_b': id_b,
                    'seq_a': self.input_info[id_a]['seq'],
                    'seq_b': self.input_info[id_b]['seq']
                })
        # print '    preparing info: %.3f' % (time.time()-start); start = time.time()
        subinfos = pool.map(utils.get_hamming_distances, sublists)  # NOTE this starts the proper number of processes, but they seem to end up i/o blocking or something (wait % stays at zero, but they each only get 20 or 30 %cpu on stoat)
        pool.close()
        pool.join()
        # print '    starting pools: %.3f' % (time.time()-start); start = time.time()
        for isub in range(len(subinfos)):
            hamming_info += subinfos[isub]
        # print '    merging pools: %.3f' % (time.time()-start); start = time.time()
    else:
        hamming_info = self.get_hamming_distances(all_pairs)

    if self.outfile is not None:
        self.outfile.write('hamming clusters\n')
    clust = Clusterer(self.args.hamming_cluster_cutoff, greater_than=False)  # NOTE this 0.5 is reasonable but totally arbitrary
    clust.cluster(input_scores=hamming_info, debug=self.args.debug, outfile=self.outfile, reco_info=self.reco_info)
    # print '    clustering: %.3f' % (time.time()-start); start = time.time()
    if chopped_off_left_sides:
        print 'WARNING encountered unequal-length sequences, so chopped off the left-hand sides of each'
    print '    hamming time: %.3f' % (time.time() - start)
    return clust
def process_single_core(self, filenames):
    """ Process multiple files sequentially using a single processor """
    clusterer = Clusterer(**self.cluster_config)
    for filename in filenames:
        with open(filename, 'r') as f:
            for line in f:
                clusterer.process_line(line)
    return clusterer.result()
def process_pipe(self):
    """ Process continuously from the stdin input stream """
    clusterer = Clusterer(**self.cluster_config)
    try:
        for line in sys.stdin:
            clusterer.process_line(line)
    except KeyboardInterrupt:
        pass
    finally:
        return clusterer.result()
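# A minimal driver sketch for the two entry points above, assuming the enclosing
# object (here called `processor`, a hypothetical name) was constructed with a
# cluster_config dict as in the __init__ snippets in this collection:
import sys

def run(processor, filenames):
    # filenames given: process the files sequentially on a single core;
    # otherwise consume lines from stdin until EOF or Ctrl-C
    if filenames:
        return processor.process_single_core(filenames)
    return processor.process_pipe()

# e.g. clusters = run(processor, sys.argv[1:])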
def test(self):
    clusterer = Clusterer(k1=1, k2=1, max_dist=0.5, variables=[])
    clusters = clusterer.find([
        'hello 1 y 3',
        'hello 1 x 3',
        'abc m n q',
    ])
    self.assertEqual(
        clusters,
        [
            [['hello', '1', 'y', '3'], 2, ['hello', '1', '---', '3']],
            [['abc', 'm', 'n', 'q'], 1, ['abc', 'm', 'n', 'q']]
        ]
    )
def test_min_members(self):
    clusterer = Clusterer(k1=1, k2=1, max_dist=0.5, variables=[], min_members=2)
    clusters = clusterer.find([
        'hello 1 y 3',
        'hello 1 x 3',
        'abc m n q',
    ])
    self.assertEqual(
        clusters,
        [
            [['hello', '1', 'y', '3'], 2, ['hello', '1', '---', '3']],
        ]
    )
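# Usage note on the two tests above: the only difference is min_members=2, which drops
# the single-member 'abc m n q' cluster from the result. Each returned cluster appears
# to be [representative line tokens, member count, pattern with differing tokens masked
# as '---'] (e.g. 'y' vs 'x' becomes '---').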
def run(self):
    path = self.mw.sourcePathField.text()
    if not path:
        print "[Error] File path is empty"
        return
    try:
        img = Clusterer.readImage(path)
        imageBGRA = cv2.cvtColor(img, cv2.cv.CV_BGR2BGRA)
        self.mw.refreshSource(imageBGRA)
        features = self.mw.selectedFeatures
        if not features:
            return
        self.mw.clusterer = Clusterer()
        backgroundColor = self.mw.backgroundColor
        backgroundColor = backgroundColor.blue(), backgroundColor.green(), backgroundColor.red()
        if self.mw.transparentBg.isChecked():
            backgroundColor = None
        mode = self.mw.modeCombo.itemText(self.mw.modeCombo.currentIndex())
        mode = Clusterer.getModeByName(mode)
        modeK = self.mw.modeK.itemText(self.mw.modeK.currentIndex())
        modeK = Clusterer.getKModeByName(modeK)
        k = self.mw.clusterCount.value()
        self.mw.runButton.setEnabled(False)
        self.mw.clusters = self.mw.clusterer.getClusters(
            path,
            mode=mode,
            kmode=modeK,
            clusterCount=k,
            features=features,
            backgroundColor=backgroundColor,
            slider=self.mw.clusterSlider.value())
        self.mw.currentCluster = 0
        self.mw.refreshCluster()
        self.mw.saveButton.setEnabled(True)
        self.mw.clusterer.graph(self.mw.figure)
        self.mw.canvas.setMinimumSize(self.mw.canvas.size())
        self.mw.canvas.draw()
    except (OSError, cv2.error, urllib2.HTTPError) as err:
        print err
    self.mw.runButton.setEnabled(True)
def search_click(self):
    _textval = self.searchbox.text()
    self._search_term = _textval
    if self.gene_button.isChecked() and self.fileselected:
        if self.fileName:
            goldencorpus = GoldenCorpus(_textval, self.fileName)
            goldencorpus.fetchData()
            self.rel_docs = goldencorpus.get_rel_docs_pmid()
            self.mesh_terms = goldencorpus.get_mesh_terms()
            mesh_explosion = DataForEachMeshTerm(self.mesh_terms, _textval)
            path = mesh_explosion.get_data_foldername(_textval)
            clus = Clusterer(self.rel_docs, path, True, 5)
            self.representative_id, self.representative, self.best_mesh_terms_id, self.best_mesh_terms = clus.cluster()
            if self.representative:
                self.updateRepresentativeInformation()
        else:
            print("Error! getting file name")
    elif self.pmid_button.isChecked():
        print("Golden corpus exists..")
    else:
        print("Please select related file..")
def gen_window_model(window_event, proc_events, clusterer=Clusterer()):
    global default_window_event
    if window_event == default_window_event.wm_name:
        return None
    assignments = {}
    clustered_events = {}
    for et in proc_events[window_event]:
        clusterer.clear_data()
        if et is EventType.NONE:
            continue
        try:
            for e in proc_events[window_event][et]:
                f = e.get_features()
                if len(f) == 0:
                    break
                clusterer.append_data(f)
            if clusterer.shape[1] == 0:
                continue
            centroids, assigns = clusterer.cluster(clusterer.recommend_clusters(), 10)
            clustered_events[str(et)] = centroids
            for i in range(len(proc_events[window_event][et])):
                assignments[proc_events[window_event][et][i]] = assigns[i]
        except NotImplementedError as e:
            print(e)
            pass

    ngram = Ngram("")
    clustered_windowed_events = windowed_events[window_event][:]
    for i in range(len(clustered_windowed_events)):
        we = clustered_windowed_events[i]
        name = str(we.event_type)
        id = we.get_identifier()
        if id is not None:
            name += "[" + id + "]"
        if we in assignments:
            assignment = "{" + str(assignments[we]) + "}"
            if "{cluster}" in name:
                name = name.replace("{cluster}", assignment)
            else:
                name += "[" + assignment + "]"
        clustered_windowed_events[i] = name
    sequence = " ".join(clustered_windowed_events).replace("EventType.NONE", ngram.delimiter)
    ngram.construct(sequence, 5)
    ngram.calculate_probabilities()
    window_model = WindowModel(ngram, clustered_events)
    return window_model
def __init__(self, appraisal, cluster_identity, marker, appraisal_colours):
    '''
    appraisal: Appraisal
    cluster_identity: float, as in Clusterer
    marker: str
        the marker being plotted
    '''
    self.appraisal_colours = appraisal_colours
    logging.debug("Generating plot info for %s" % marker)

    # Collect all OTUs from all samples so that they can be processed together.
    all_binned_otus = []
    all_assembled_not_binned_otus = []
    all_not_found_otus = []
    max_count = 0

    # yuck. Sloppy scope in Python, but not in lambdas when I need it..
    def add_to_totality(otus, totality, max_count):
        count = 0
        for otu in otus:
            if otu.marker == marker:
                totality.append(otu)
                count += otu.count
        if count > max_count:
            return count
        else:
            return max_count

    for sample_appraisal in appraisal.appraisal_results:
        max_count = add_to_totality(sample_appraisal.binned_otus, all_binned_otus, max_count)
        max_count = add_to_totality(sample_appraisal.assembled_not_binned_otus(), all_assembled_not_binned_otus, max_count)
        max_count = add_to_totality(sample_appraisal.not_found_otus, all_not_found_otus, max_count)
    logging.debug("Found maximal count of seqs as %i" % max_count)

    sequence_to_cluster = {}
    cluster_rep_and_count = []
    collection = OtuTableCollection()
    collection.otu_table_objects = [
        all_not_found_otus, all_assembled_not_binned_otus, all_binned_otus
    ]
    for cotu in Clusterer().cluster(collection, cluster_identity):
        cluster_rep_and_count.append([cotu.sequence, cotu.count])
        for otu in cotu.otus:
            sequence_to_cluster[otu.sequence] = cotu

    # Sort the OTUs by descending order of counts, so that more abundant OTUs get colour.
    sorted_cluster_rep_and_count = sorted(cluster_rep_and_count, key=lambda x: x[1], reverse=True)
    cluster_sequence_to_order = {}
    i = 0
    for pair in sorted_cluster_rep_and_count:
        cluster_sequence_to_order[pair[0]] = i
        i += 1

    self._sequence_to_cluster = sequence_to_cluster
    self._sorted_cluster_rep_and_count = sorted_cluster_rep_and_count
    self._cluster_sequence_to_order = cluster_sequence_to_order
    self.max_count = max_count
not_picked = clean[(clean['eligible'] == 1) & (clean['oz'] == 0)]
picked = clean[clean['oz'] == 1]
nonfeatures = drop_columns(picked, drop_cols)
features = picked.columns

## standardize
standardize = StandardScaler()
X, features = picked.values, picked.columns.values
X = standardize.fit_transform(X)

## build model
cluster_labels = pd.DataFrame()
for k in range(6, 7):
    pax = Clusterer(model, n_clusters=k, linkage=linkage, random_state=24)
    centers = pax.fit(X)
    pax.store_features(features)
    print("{} grouped {} clusters.".format(model, np.shape(centers)[0]))

    ## update labels and scores for column k
    filepath = "{}/{}/labels.pkl".format(data, model)
    with open(filepath, "rb") as f:
        k = pax.attributes['n_clusters']
        model_labels_df = pickle.load(f)
        model_labels_df["k={}".format(k)] = pax.attributes['labels_']
        model_labels_df["k{}silho_score".format(k)] = pax.get_silhouette_samples()
        model_labels_df.to_pickle(filepath)
        print("Updated labels @ {}".format(filepath))
def run_hmm(self, algorithm, sw_info, parameter_in_dir, parameter_out_dir='', preclusters=None, hmm_type='', stripped=False, prefix='', \
            count_parameters=False, plotdir=None, make_clusters=False):  # @parameterfetishist
    if prefix == '' and stripped:
        prefix = 'stripped'
    print '\n%shmm' % prefix
    csv_infname = self.args.workdir + '/' + prefix + '_hmm_input.csv'
    csv_outfname = self.args.workdir + '/' + prefix + '_hmm_output.csv'
    self.write_hmm_input(csv_infname, sw_info, preclusters=preclusters, hmm_type=hmm_type, stripped=stripped, parameter_dir=parameter_in_dir)
    print '    running'
    sys.stdout.flush()
    start = time.time()
    if self.args.n_procs > 1:
        self.split_input(self.args.n_procs, infname=csv_infname, prefix='hmm')
        procs = []
        for iproc in range(self.args.n_procs):
            cmd_str = self.get_hmm_cmd_str(algorithm, csv_infname, csv_outfname, parameter_dir=parameter_in_dir, iproc=iproc)
            procs.append(Popen(cmd_str.split()))
            time.sleep(0.1)
        for proc in procs:
            proc.wait()
        for iproc in range(self.args.n_procs):
            if not self.args.no_clean:
                os.remove(csv_infname.replace(self.args.workdir, self.args.workdir + '/hmm-' + str(iproc)))
        self.merge_hmm_outputs(csv_outfname)
    else:
        cmd_str = self.get_hmm_cmd_str(algorithm, csv_infname, csv_outfname, parameter_dir=parameter_in_dir)
        check_call(cmd_str.split())
    sys.stdout.flush()
    print '    hmm run time: %.3f' % (time.time() - start)

    hmminfo = self.read_hmm_output(algorithm, csv_outfname, make_clusters=make_clusters, count_parameters=count_parameters, parameter_out_dir=parameter_out_dir, plotdir=plotdir)

    if self.args.pants_seated_clustering:
        viterbicluster.cluster(hmminfo)

    clusters = None
    if make_clusters:
        if self.outfile is not None:
            self.outfile.write('hmm clusters\n')
        else:
            print '%shmm clusters' % prefix
        clusters = Clusterer(self.args.pair_hmm_cluster_cutoff, greater_than=True, singletons=preclusters.singletons)
        clusters.cluster(input_scores=hmminfo, debug=self.args.debug, reco_info=self.reco_info, outfile=self.outfile, plotdir=self.args.plotdir + '/pairscores')

    if self.args.outfname is not None:
        outpath = self.args.outfname
        if self.args.outfname[0] != '/':  # if full output path wasn't specified on the command line
            outpath = os.getcwd() + '/' + outpath
        shutil.copyfile(csv_outfname, outpath)

    if not self.args.no_clean:
        if os.path.exists(csv_infname):  # if only one proc, this will already be deleted
            os.remove(csv_infname)
        os.remove(csv_outfname)

    return clusters
'''
Find the household with the lowest carbon emissions from a single group
'''
def find_greenest(cluster):
    min = 100000000
    min_index = -1
    for i in range(len(cluster)):
        # total emissions for household i, excluding the first and last columns
        emissions = sum(cluster[i][1:len(cluster[i]) - 1])
        if emissions < min:
            min = emissions
            min_index = i
    return min, min_index


# Preprocessing
preprocessor = Preprocessor()
preprocessor.run_preprocessor()

# Elbow method
elbow = Elbow()
elbow.run_elbow()

# Clustering
clusterer = Clusterer()
clusterer.run_clusterer()

# Regression
regressor = Regressor()
regressor.run_regressor()
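# Illustrative call of the find_greenest helper above (toy rows, not from the source;
# the assumed row layout is [household_id, <emission columns...>, cluster_label]):
example_group = [
    [101, 2.0, 3.0, 0],
    [102, 1.0, 1.5, 0],
    [103, 4.0, 2.0, 0],
]
greenest_total, greenest_index = find_greenest(example_group)  # -> (2.5, 1)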
from clusterer import Clusterer
import webbrowser

# Get the user input track
track_name = input("Enter the name (artist optional) of a song: ") or 'Give it up Knife Party'

# Run the clustering on the track
c = Clusterer(track_name=track_name, alg_type='affprop')
results = c.get_target_cluster()
c.plot_clusters()
print('Graph saved to ./Database/clusters.png')

# convert the track ids returned from clustering back into track data
print('Loading 20 of', len(results), 'track recommendations, please wait...')
print()
shift_tracks = []
for i, item in enumerate(results):
    shift_tracks += [c.ret.sp.track(item[1])]


# output and save the recommended tracks to a file
def output_recommendations(source, filename, tracks):
    print(source + ' Recommendations:')
    fout = open(filename, 'w')
    for track in tracks[:20]:
        print('track:', track['name'], '-', track['album']['artists'][0]['name'])
        print('track:', track['name'], '-',
def __init__(self, config):
    self.clusterer = Clusterer(**config)
    self.pattern_generator = self.clusterer.pattern_generator
not_picked = clean[(clean['eligible'] == 1) & (clean['oz'] == 0)]
picked = clean[clean['oz'] == 1]
nonfeatures = drop_columns(picked, drop_cols)
features = picked.columns

## standardize
standardize = StandardScaler()
X, features = picked.values, picked.columns.values
X = standardize.fit_transform(X)

## build model
cluster_labels = pd.DataFrame()
for k in range(6, 7):
    pax = Clusterer(model, n_clusters=k, random_state=24)
    centers = pax.fit(X)
    pax.store_features(features)
    print("{} grouped {} clusters.".format(model, np.shape(centers)[0]))

    ## update labels and scores for column k
    filepath = "{}/{}/labels.pkl".format(data, model)
    with open(filepath, "rb") as f:
        k = pax.attributes['n_clusters']
        model_labels_df = pickle.load(f)
        model_labels_df["k={}".format(k)] = pax.attributes['labels_']
        model_labels_df["k{}silhouette_score".format(k)] = pax.get_silhouette_samples()
        model_labels_df.to_pickle(filepath)
        print("Updated labels @ {}".format(filepath))
# Grid of 100x100
# 3 circles of 15x15, each with 10 points
import testgenerator
from clusterer import Clusterer
from clustervisualizer import ClusterVisualizer

points = testgenerator.create_circle_points(1000, 50, 50, 20, point_mass=10)
clusterer = Clusterer(5, 10, 2)
clustervisualizer = ClusterVisualizer(clusterer)
clusterer.set_points(points)
clusterer.run()
from fastapi import FastAPI
from vector_space import VectorSpace
from org_dataset import OrgDataset
from org_recommender import OrgRecommender
from clusterer import Clusterer
from keyword_finder import KeywordFinder
from keyword_matcher import KeywordMatcher
from gcd_utils import get_account_liked_tags

app = FastAPI()
dataset = OrgDataset.load_instance('./orgs.pkl')
vs = VectorSpace.load_instance('./test_vs.pkl')
recommender = OrgRecommender(dataset, vs)
c = Clusterer(dataset, vs, 20)
kw_finder = KeywordFinder(dataset, vs)
matcher = KeywordMatcher(c, kw_finder, vs.data_centroid)


@app.get('/get_init_recs/')
async def get_init_recs(userId: str, numOrgs: int):
    keywords = get_account_liked_tags(userId)
    centroid = matcher.get_kw_centroid(keywords)
    orgids = recommender.centroid_recommend(centroid, numOrgs)
    return_arr = []
    for id in orgids:
        entry = {'orgId': id}
        return_arr.append(entry)
    return return_arr

"""Example get request for api on local host:
http://127.0.0.1:8000/get_recommendations/?userId=334614c0-7f55-11ea-b1bc-2f9730f51173&numOrgs=2
# Grid of 100x100
# 3 circles of 15x15, each with 10 points
import testgenerator
from clusterer import Clusterer
from clustervisualizer import ClusterVisualizer

points = testgenerator.create_circle_points(200, 8, 15, 10)
clusterer = Clusterer(1, 2, 2)
clustervisualizer = ClusterVisualizer(clusterer)
clusterer.set_points(points)
clusterer.run()
def __init__(self, model, optimizer, all_loaders, args, resume_epoch):
    self.resume_epoch = resume_epoch
    self.args = args
    self.optimizer = torch.optim.SGD((model.parameters()),
                                     args.lr,
                                     momentum=args.momentum,
                                     weight_decay=args.weight_decay)
    self.layer_list_all = args.layers
    self.layers_dict = {
        'layer2': {'name': 'layer2', 'depth': 512, 'size': 4},
        'layer3': {'name': 'layer3', 'depth': 512, 'size': 8},
        'layer4': {'name': 'layer4', 'depth': 512, 'size': 8},
        'layer5': {'name': 'layer5', 'depth': 256, 'size': 16},
        'layer6': {'name': 'layer6', 'depth': 256, 'size': 16},
    }
    self.generator = gantest.GanTester(args.path_model_gan, self.layer_list_all, device=torch.device('cuda'))
    self.z = self.generator.standard_z_sample(200000)
    self.model = model
    self.optimizer = optimizer
    self.loaders = all_loaders
    self.loss_type = args.loss_type

    # Other parameters
    self.margin = args.margin
    self.clustering = args.clustering
    self.epoch = 0
    self.unorm = utils.UnNormalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))

    output_size = 32 if 'large' in args.audio_model else 256
    if args.active_learning:
        active_learning.get_clusterer(self, args, output_size, model)
    else:
        if args.clustering:
            print('Creating cluster from scratch')
            cluster_path = os.path.join(self.args.results, 'clusters', args.name_checkpoint + '_' + str(time.time()))
            self.clusterer = Clusterer(self.loaders['train'],
                                       model,
                                       path_store=cluster_path,
                                       model_dim=args.embedding_dim,
                                       save_results=True,
                                       output_size=output_size,
                                       args=self.args,
                                       path_cluster_load=args.path_cluster_load)
    self.epochs_clustering = self.args.epochs_clustering
    self.clusters = self.mean_clust = self.std_clust = self.cluster_counts = self.clusters_unit = None
from database import Database
from youtube import YouTube
from clusterer import Clusterer

env = 'desktop'
db_name = 'comment_sense_3'
db = Database(env, db_name)
yt = YouTube()

videoId = 'kQibkV_V8-c'
video_data = yt.video(videoId)
comment_topics = db.comment_topics(videoId)
cl = Clusterer(video_data, db)
topics = cl.cluster(comment_topics)
print(topics)
def train(cfg, model, dataset, optimizer, scheduler=None, logger=None,
          is_continue=False, use_pretrained=False, cluster_vis_path=None):
    save_to = cfg.TRAIN.CHECKPOINT_PATH
    epochs = cfg.TRAIN.EPOCHS
    batch_size = cfg.TRAIN.BATCHSIZE
    if logger is None:
        print('>>> No tensorboard logger used in training.')
    else:
        print('>>> Logger is used in training.')
    counter = 0
    if len(save_to) == 0:
        print('>>> No checkpoints will be saved.')

    start_ep = 0  # initiate start epoch number

    # Continue training until the planned number of epochs is reached
    if is_continue:
        print('>>> Continue training from the latest checkpoint.')
        if save_to is None:
            print('>>> Without checkpoint folder, cannot continue training!')
            exit(0)
        ckpts = glob.glob(os.path.join(save_to, '*.pth'))
        if len(ckpts) == 0:
            print('>>> No earlier checkpoints, train from the beginning.')
        else:
            start_ckpt = find_latest_checkpoint(ckpts)
            print('>>> Found earlier checkpoints, continue training with {}.'.format(start_ckpt))
            # load latest model
            start_ep = torch.load(os.path.join(save_to, start_ckpt))['epoch']
            model_state = torch.load(os.path.join(save_to, start_ckpt))['model_state_dict']
            # load weights, optimizer, scheduler state, etc.
            opt_state = torch.load(os.path.join(save_to, start_ckpt))['optimizer_state_dict']
            model.load_state_dict(model_state)
            optimizer.load_state_dict(opt_state)
            optimizer = opt_to_gpu(optimizer, torch.cuda.is_available())
            if scheduler is not None:
                scheduler_state = torch.load(os.path.join(save_to, start_ckpt))['scheduler_state_dict']
                scheduler.load_state_dict(scheduler_state)
            if logger is not None:
                counter = torch.load(os.path.join(save_to, start_ckpt))['logger_counter']

    # Start a new training run from pretrained weights only
    if use_pretrained:
        print('>>> Use pretrained model weights to start a new training.')
        model_state = torch.load(cfg.TRAIN.PRETRAINED_PATH)['model_state_dict']
        # load model weights only
        model.load_state_dict(model_state)

    if torch.cuda.is_available():
        model = model.cuda()

    # training loop
    for epoch in range(start_ep, epochs):
        # extract global features
        print('>>> Extracting global features ...')
        features, v_labels, cam_labels = extract_global_features(
            img_shape=(256, 256),
            batch_size=batch_size,
            workers=8,
            model=model,
            dataset=dataset,
            mode='train',
            is_cuda=torch.cuda.is_available())

        # clustering
        print('>>> Start clustering ...')
        features = merge_features_from_dict(features)
        pseudo_labels, num_ids, centroids = Clusterer(
            features, eps=0.5, is_cuda=torch.cuda.is_available()).cluster(
                visualize_path=cluster_vis_path, epoch=epoch + 1)

        # create non-outlier refined dataset
        print('>>> Refining dataset ...')
        good_dataset = refine_dataset((256, 256), dataset, pseudo_labels)
        sampler = ClusterSampler(good_dataset)
        sampler = torch.utils.data.BatchSampler(sampler, batch_size=cfg.TRAIN.BATCHSIZE, drop_last=False)
        # good_dataloader = DataLoader(good_dataset, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=4)
        good_dataloader = DataLoader(good_dataset, shuffle=False, batch_sampler=sampler, num_workers=8)

        # memory bank initialization
        memory = MemoryBank(num_feature_dims=2048, num_samples=num_ids, temp=0.07, momentum=0.02)
        memory = init_memory_bank(memory, centroids)

        # training step
        for i, (imgs, pids, fnames, vids, camids) in enumerate(good_dataloader):
            if torch.cuda.is_available():
                imgs = imgs.cuda()
                memory = memory.cuda()
            optimizer.zero_grad()
            features = model(imgs)
            loss = memory(features, pids)  # update memory bank and compute loss
            loss.backward()
            optimizer.step()
            if (i + 1) % 50 == 0:  # print loss every 50 iters
                print('[epoch: {}/{}][iter: {}/{}] loss: {}'.format(
                    epoch + 1, epochs, i + 1, len(good_dataloader), loss))
            # update logger
            if logger is not None:
                logger.add_scalar('loss', loss.item(), global_step=counter)
                logger.add_scalar('cluster_centroids', memory.num_samples, global_step=counter)
                logger.add_scalar('lr', optimizer.state_dict()['param_groups'][0]['lr'], global_step=counter)
                counter += 1

        # update scheduler
        if scheduler is not None:
            scheduler.step()

        # save checkpoint
        if len(save_to) != 0 and (epoch + 1) % cfg.TRAIN.SAVE_INTERVAL == 0:
            save_name = os.path.join(save_to, 'backbone-epoch-{}.pth'.format(epoch + 1))
            state_dict = {
                'epoch': epoch + 1,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict() if scheduler is not None else None,
                'logger_counter': counter if logger is not None else None
            }
            torch.save(state_dict, save_name)
            print('>>> Checkpoint is saved as {}.'.format(save_name))
def main():
    # Read the arguments passed on the command line
    try:
        opts, args = getopt.getopt(sys.argv[1:], "et:rcn:s", ["enc=", "chemin=", "nc=", "mots="])
    except getopt.GetoptError as err:
        print(err)
        sys.exit()

    task = None
    clusterWordList = []

    # Store the arguments passed on the command line
    for opt, arg in opts:
        if opt == '-e':
            task = 'training'
        elif opt == '-t':
            if arg.isnumeric():
                windowSize = arg
            else:
                print("Erreur! Taille de la fenêtre!")
                sys.exit(1)
        elif opt == '-r':
            task = 'search'
        elif opt == '-c':
            task = 'clustering'
        elif opt == '-n':
            if arg.isnumeric():
                wordQty = arg
            else:
                print("Erreur! Nombre de mots à afficher par centroïdes!")
                sys.exit(1)
        elif opt == '-s':
            task = 'table'
        elif opt == '--enc':
            fileEncoding = arg
        elif opt == '--chemin':
            filePath = arg
        elif opt == '--nc':
            if arg.isnumeric():
                clusteringType = 'random'
                clusterQty = arg
            else:
                print("Erreur! Nombre de centroïdes à afficher!")
                sys.exit(1)
        elif opt == '--mots':
            clusteringType = 'words'
            clusterWordList = arg.split(" ")
            clusterQty = len(clusterWordList)

    if task == 'training':
        trainer = Trainer(filePath, fileEncoding, windowSize)
        trainer.execute()
    elif task == 'search':
        try:
            searcher = Searcher(windowSize)
            if searcher.isWindowValid:
                searcher.execute()
            else:
                print("Aucune donnée pour la taille de fenêtre {}".format(windowSize))
        except:
            print("Erreur! Base de données inexistante!")
    elif task == 'clustering':
        clusterer = Clusterer(windowSize, wordQty, clusterWordList, clusterQty, clusteringType)
        if clusterer.isWindowValid:
            clusterer.execute()
        else:
            print("Aucune donnée pour la taille de fenêtre {}".format(windowSize))
    elif task == 'table':
        db = DBManager()
        print("Création de Dictionnaire_Commun")
        db.createTableDict()
        print("Création de Cooccurrences")
        db.createTableCooc()
        print("Fermeture de la connexion")
        db.closeConnection()
    else:
        print("Erreur! Arguments -e, -r ou -c introuvables")
        sys.exit()

    print("\nFin du programme")
    return 0
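# A hypothetical invocation sketch based on the getopt spec above (the script name
# `main.py` is illustrative, not from the source): run clustering on a window size
# of 5, showing 10 words per centroid, with 3 randomly chosen centroids:
#
#   python main.py -c -t 5 -n 10 --nc 3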