def test_pers_load(self):
    for binary in [True, False]:
        src = StringIO()
        p = cPickle.Pickler(src)
        p.persistent_id = persistent_id
        p.binary = binary
        value = MyData('abc')
        p.dump(value)

        up = cPickle.Unpickler(StringIO(src.getvalue()))
        up.persistent_load = persistent_load
        res = up.load()
        self.assertEqual(res.value, value.value)

        # errors
        src = StringIO()
        p = cPickle.Pickler(src)
        p.persistent_id = persistent_id
        p.binary = binary
        value = MyData('abc')
        p.dump(value)

        up = cPickle.Unpickler(StringIO(src.getvalue()))
        # exceptions vary between cPickle & pickle
        try:
            up.load()
            self.assertUnreachable()
        except Exception:
            pass
def main(feats_path, n_comps):
    with open(feats_path, 'rb') as handle:
        unpickler = pickle.Unpickler(handle)
        labels = unpickler.load()

    labels = {
        name: vector
        for name, vector in labels.items() if vector is not None
    }
    names = list(labels.keys())
    vectors = list(labels.values())

    print('[INFO] Conducting PCA on ' + feats_path + ' with ' +
          str(n_comps) + ' components')
    pca = PCA(n_components=n_comps)
    vectors = pca.fit_transform(vectors)

    # save reduced vectors
    base = path.basename(feats_path)
    name = path.splitext(base)[0]
    output = name + '_ncomps' + str(n_comps) + '.pickle'
    print('[INFO] Saving reduced vectors to ' + output)
    with open(output, 'wb') as handle:
        pickle.dump(dict(zip(names, vectors)), handle)
def restore_model(self, path="tnt_pos_tagger.pic"):
    # open the given path (not a hardcoded name) in binary mode for unpickling
    object_path = open(path, 'rb')
    tag_object = cPickle.Unpickler(object_path)
    tagger = tag_object.load()
    object_path.close()
    return tagger
def main(feats_path):
    with open(feats_path, 'rb') as handle:
        unpickler = pickle.Unpickler(handle)
        labels = unpickler.load()

    labels = {
        name: vector
        for name, vector in labels.items() if vector is not None
    }
    features = np.asarray(list(labels.values()))

    print('[INFO] Conducting t-SNE on ' + feats_path)
    tsne = TSNE(metric='braycurtis',
                verbose=1,
                n_iter=5000,
                random_state=42,
                n_jobs=-1)
    projection = tsne.fit_transform(features)

    # save reduced vectors
    base = path.basename(feats_path)
    name = path.splitext(base)[0]
    output = name + '_tsne.pickle'
    print('[INFO] Saving reduced vectors to ' + output)
    with open(output, 'wb') as handle:
        pickle.dump(projection, handle)
def main(label_path):
    # connection to mongodb
    mongoConn = MongoClient(csh_db_cfg.DB_HOST + ":" + str(csh_db_cfg.DB_PORT))
    cshTransDB = mongoConn[csh_db_cfg.TRANSCRIPTION_DB_NAME]
    cshTransDB.authenticate(csh_db_cfg.TRANSCRIPTION_DB_USER,
                            csh_db_cfg.TRANSCRIPTION_DB_PASS)
    cshCollection = cshTransDB[csh_db_cfg.TRANS_DB_MeetingMinColl]

    print('[INFO] Loading saved cluster labels from ' + label_path)
    with open(label_path, 'rb') as handle:
        unpickler = pickle.Unpickler(handle)
        labels = unpickler.load()

    print('[INFO] Saving cluster labels to MongoDB')
    for name, label in labels.items():
        # set 0th cluster to noise
        if label == 0:
            label = -1
        searchQuery = {'anonymizedImageFile': name}
        updateQuery = {'$set': {'cluster': str(label)}}
        cshCollection.find_one_and_update(searchQuery, updateQuery)
def main(feat_path, dict_path):
    print('[INFO] Working...')

    # load image PHOCs
    print('[INFO] Loading image PHOCs from \'' + feat_path + '\'')
    with open(feat_path, 'rb') as handle:
        unpickler = pickle.Unpickler(handle)
        image_phocs = unpickler.load()

    # load dictionary PHOCs
    print('[INFO] Loading dictionary (word) PHOCs from \'' + dict_path + '\'')
    with open(dict_path, 'rb') as handle:
        unpickler = pickle.Unpickler(handle)
        dict_phocs = unpickler.load()

    words = list(dict_phocs.keys())
    word_phocs = np.array(list(dict_phocs.values()))

    # initialize NearestNeighbors learner
    print('[INFO] Fitting nearest neighbors learner')
    n_neighbors = 5
    nn = NearestNeighbors(n_neighbors=n_neighbors,
                          algorithm='auto',
                          metric=distance.braycurtis,
                          n_jobs=-1)
    nn = nn.fit(word_phocs)

    while True:
        # get user input
        image_q = input('[INPUT] Input an image to predict text for: ')
        if image_q not in image_phocs:
            print('[INFO] Query image not found, please try another.')
            continue
        phoc_q = image_phocs[image_q].reshape(1, -1)

        # calculate Bray-Curtis dissimilarities and find nearest neighbors
        print('[INFO] Finding top candidate predictions...')
        dist, ind = nn.kneighbors(phoc_q)
        dist, ind = dist[0][::-1], ind[0][::-1]

        # present results
        print('[INFO] Results:')
        for i in range(n_neighbors):
            print('{0}\t{1}'.format(words[ind[i]], dist[i]))
def test_persistent_load(self):
    class MyData(object):
        def __init__(self, value):
            self.value = value

    def persistent_id(obj):
        if hasattr(obj, 'value'):
            return 'MyData: %s' % obj.value
        return None

    def persistent_load(id):
        return MyData(id[8:])

    for binary in [True, False]:
        src = StringIO()
        p = cPickle.Pickler(src)
        p.persistent_id = persistent_id
        p.binary = binary
        value = MyData('abc')
        p.dump(value)

        up = cPickle.Unpickler(StringIO(src.getvalue()))
        up.persistent_load = persistent_load
        res = up.load()
        self.assertEqual(res.value, value.value)

        # errors
        src = StringIO()
        p = cPickle.Pickler(src)
        p.persistent_id = persistent_id
        p.binary = binary
        value = MyData('abc')
        p.dump(value)

        up = cPickle.Unpickler(StringIO(src.getvalue()))
        # exceptions vary between cPickle & pickle
        try:
            up.load()
            AssertUnreachable()
        except Exception:
            pass
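# A minimal Python 3 sketch of the same persistent-ID round trip as the test
# above, assuming a MyData class like the one defined there. cPickle and the
# 'binary' attribute no longer exist in Python 3, so the hooks are supplied by
# subclassing pickle.Pickler/Unpickler instead of assigning attributes.
import io
import pickle

class Py3DataPickler(pickle.Pickler):
    def persistent_id(self, obj):
        # route objects with a 'value' attribute through the persistent-ID channel
        if hasattr(obj, 'value'):
            return 'MyData: %s' % obj.value
        return None

class Py3DataUnpickler(pickle.Unpickler):
    def persistent_load(self, pid):
        # strip the 'MyData: ' prefix to rebuild the object
        return MyData(pid[8:])

# src = io.BytesIO()
# Py3DataPickler(src).dump(MyData('abc'))
# restored = Py3DataUnpickler(io.BytesIO(src.getvalue())).load()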
def __init__(self, save_file_name, gffutils_db, multimappers_dict):
    logger.info("Loading read assignments from " + save_file_name)
    assert os.path.exists(save_file_name)
    self.save_file_name = save_file_name
    self.unpickler = pickle.Unpickler(open(save_file_name, "rb"), fix_imports=False)
    self.current_gene_info_obj = None
    self.is_updated = False
    self.gffutils_db = gffutils_db
    self.multimapped_reads = multimappers_dict
def unpickleData(fname):
    f = gzip.open(fname)
    u = pickle.Unpickler(f)
    data = []
    try:
        while True:
            rec = u.load()
            data.append(rec)
    except EOFError:
        pass
    f.close()
    return data
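# Hedged usage sketch for unpickleData above: repeated pickle.dump calls append
# records to one gzipped stream, and the EOFError loop reads them all back.
# 'records.pkl.gz' is a hypothetical file name used only for illustration.
import gzip
import pickle

def write_records_example(fname='records.pkl.gz'):
    with gzip.open(fname, 'wb') as f:
        for rec in ({'id': 0}, {'id': 1}, {'id': 2}):
            pickle.dump(rec, f)

# write_records_example()
# print(unpickleData('records.pkl.gz'))  # [{'id': 0}, {'id': 1}, {'id': 2}]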
def load_read_info(self, dump_filename):
    gc.disable()
    if not self.multimapped_reads:
        multimap_unpickler = pickle.Unpickler(open(dump_filename + "_multimappers", "rb"),
                                              fix_imports=False)
        while True:
            try:
                obj = multimap_unpickler.load()
                if isinstance(obj, list):
                    assignment_list = obj
                    read_id = assignment_list[0].read_id
                    self.multimapped_reads[read_id] = assignment_list
                else:
                    raise ValueError("Multimap assignment file {} is corrupted!".format(dump_filename))
            except EOFError:
                break

    info_unpickler = pickle.Unpickler(open(dump_filename + "_info", "rb"),
                                      fix_imports=False)
    total_assignments = info_unpickler.load()
    polya_assignments = info_unpickler.load()
    gc.enable()
    return total_assignments, polya_assignments
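# Hedged sketch of the writer side implied by load_read_info: one pickle per
# assignment list in "<dump>_multimappers", then the two counters in
# "<dump>_info". The function name and call site are assumptions for
# illustration only; the real dump routine is not shown in the source.
import pickle

def dump_read_info(dump_filename, multimapped_reads, total_assignments, polya_assignments):
    with open(dump_filename + "_multimappers", "wb") as f:
        pickler = pickle.Pickler(f, fix_imports=False)
        for assignment_list in multimapped_reads.values():
            pickler.dump(assignment_list)
    with open(dump_filename + "_info", "wb") as f:
        pickler = pickle.Pickler(f, fix_imports=False)
        pickler.dump(total_assignments)
        pickler.dump(polya_assignments)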
def main(feat_path, images_path):
    print('[INFO] Working...')

    # similarity threshold
    threshold = 0.6

    # load feature vectors
    print('[INFO] Loading features from ' + feat_path)
    with open(feat_path, 'rb') as handle:
        unpickler = pickle.Unpickler(handle)
        index = unpickler.load()

    while True:
        results = {}

        # get user input
        image_q = input('[INPUT] Input an image to compare: ')
        if image_q not in index:
            print('[INFO] Query image not found, please try another.')
            continue
        feature_q = index[image_q]

        # calculate Bray-Curtis dissimilarities
        print('[INFO] Calculating Bray Curtis dissimilarities...')
        for image_n, feature_n in index.items():
            score = 1 - distance.braycurtis(feature_n, feature_q)
            if score > threshold:
                results[image_n] = score
        del results[image_q]
        results = [(image_n, results[image_n])
                   for image_n in sorted(results, key=results.get, reverse=True)]

        # present results
        print('[INFO] Results:')
        for image_n, score in results:
            print('{0}\t{1}'.format(image_n, score))
            img = cv2.imread(images_path + '/' + image_n, 0)
            cv2.imshow(image_n, img)
            k = cv2.waitKey(0) & 0xFF
            cv2.destroyAllWindows()
            # press ESC to exit
            if k == 27:
                break
def load_assigned_reads(save_file_name, gffutils_db, multimapped_reads):
    gc.disable()
    logger.info("Loading read assignments from " + save_file_name)
    assert os.path.exists(save_file_name)
    unpickler = pickle.Unpickler(open(save_file_name, "rb"), fix_imports=False)
    read_storage = []
    current_gene_info = None

    while True:
        try:
            obj = unpickler.load()
            if isinstance(obj, ReadAssignment):
                read_assignment = obj
                assert current_gene_info is not None
                read_assignment.gene_info = current_gene_info
                if read_assignment.read_id in multimapped_reads:
                    resolved_assignment = None
                    for a in multimapped_reads[read_assignment.read_id]:
                        if a.start == read_assignment.start() and a.end == read_assignment.end() and \
                                a.gene_id == current_gene_info.gene_db_list[0].id and \
                                a.chr_id == read_assignment.chr_id:
                            if resolved_assignment is not None:
                                logger.warning("Duplicate read: %s %s %s" %
                                               (read_assignment.read_id, a.gene_id, a.chr_id))
                            resolved_assignment = a
                    if not resolved_assignment:
                        logger.warning("Incomplete information on read %s" % read_assignment.read_id)
                    elif resolved_assignment.assignment_type == ReadAssignmentType.noninformative:
                        continue
                    else:
                        read_assignment.assignment_type = resolved_assignment.assignment_type
                        read_assignment.multimapper = resolved_assignment.multimapper
                read_storage.append(read_assignment)
            elif isinstance(obj, GeneInfo):
                if current_gene_info and read_storage:
                    yield current_gene_info, read_storage
                read_storage = []
                current_gene_info = obj
                current_gene_info.db = gffutils_db
            else:
                raise ValueError("Read assignment file {} is corrupted!".format(save_file_name))
        except EOFError:
            break
    gc.enable()
    if current_gene_info and read_storage:
        yield current_gene_info, read_storage
def get_score_information(xml_path, feature_path, feature_stats, start_tempo, composer, vel_pair):
    xml_object, xml_notes = read_xml_to_notes(xml_path)
    beats = xml_object.get_beat_positions()
    measure_positions = xml_object.get_measure_positions()

    with open(feature_path, "rb") as f:
        u = cPickle.Unpickler(f)
        feature_dict = u.load()

    # TODO: not an array I think..?
    test_x = feature_dict['input_data']
    note_locations = feature_dict['note_location']['data']
    edges = feature_dict['graph']

    return test_x, xml_notes, xml_object, edges, note_locations
def load(self, path):
    """
    Load the parameters of a saved-off memory file

    Parameters
    ----------
    path: str
        The path where the saved-off file exists
    """
    restore_path = path + "/memory.info"
    if os.path.exists(restore_path):
        with open(restore_path, "rb") as file:
            # info = pickle.load(file)
            p = pickle.Unpickler(file)
            # p.fast = True
            info = p.load()
            p.memo.clear()
            self._storage = info["storage"]
            self._maxsize = info["maxsize"]
            self._next_idx = info["next_idx"]
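# Hedged counterpart sketch (not part of the source): a writer for the dict
# that load() above reads back. 'save_memory' and the 'buffer' argument are
# hypothetical names standing in for the owning replay-memory object.
import pickle

def save_memory(buffer, path):
    with open(path + "/memory.info", "wb") as file:
        pickle.dump({
            "storage": buffer._storage,
            "maxsize": buffer._maxsize,
            "next_idx": buffer._next_idx,
        }, file)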
def loadData(self, fname, label_vector_size=0):
    f = gzip.open(fname)
    u = pickle.Unpickler(f)
    data = []
    label = []
    try:
        while True:
            rec = u.load()
            data.append(rec[0])
            if label_vector_size > 0:
                lbl_vector = np.zeros(label_vector_size)
                lbl_vector[rec[1]] = 1
                label.append(lbl_vector)
            else:
                label.append(rec[1])
    except EOFError:
        pass
    return [np.array(data, dtype=rec[0].dtype), np.array(label)]
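# Small standalone illustration of the one-hot step inside loadData above: an
# integer class label k becomes a vector of length label_vector_size with a 1
# at index k. The sizes below are arbitrary example values.
import numpy as np

def one_hot(label, size):
    vec = np.zeros(size)
    vec[label] = 1
    return vec

# one_hot(2, 5)  ->  array([0., 0., 1., 0., 0.])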
def save_data_pair_dataset_by_piece():
    data_path = Path('/home/yoojin/data/emotionDataset/feature_for_analysis')
    with open(data_path.joinpath('pairdataset.dat'), 'rb') as f:
        u = cPickle.Unpickler(f)
        emotion_pair_data = u.load()

    e1_pairs = []
    e2_pairs = []
    e3_pairs = []
    e4_pairs = []
    e5_pairs = []
    data_pair_set_by_piece = []
    for pair_set in tqdm(emotion_pair_data.data_pair_set_by_piece):
        set_list = []
        for pair_data in pair_set:
            data = DataPair(pair_data).make_dict()
            if pair_data.emotion == 1:
                e1_pairs.append(data)
            elif pair_data.emotion == 2:
                e2_pairs.append(data)
            elif pair_data.emotion == 3:
                e3_pairs.append(data)
            elif pair_data.emotion == 4:
                e4_pairs.append(data)
            else:
                e5_pairs.append(data)
            set_list.append(data)
        data_pair_set_by_piece.append(set_list)

    with open(data_path.joinpath("data_pair_set_by_piece.dat"), "wb") as f:
        pickle.dump(data_pair_set_by_piece, f, protocol=2)
    with open(data_path.joinpath("e1_pairs.dat"), "wb") as f:
        pickle.dump(e1_pairs, f, protocol=2)
    with open(data_path.joinpath("e2_pairs.dat"), "wb") as f:
        pickle.dump(e2_pairs, f, protocol=2)
    with open(data_path.joinpath("e3_pairs.dat"), "wb") as f:
        pickle.dump(e3_pairs, f, protocol=2)
    with open(data_path.joinpath("e4_pairs.dat"), "wb") as f:
        pickle.dump(e4_pairs, f, protocol=2)
    with open(data_path.joinpath("e5_pairs.dat"), "wb") as f:
        pickle.dump(e5_pairs, f, protocol=2)
def main(label_path):
    print('[INFO] Loading saved clustering labels from \'' + label_path + '\'')
    with open(label_path, 'rb') as handle:
        unpickler = pickle.Unpickler(handle)
        labels = unpickler.load()

    unique_labels = set(labels.values())
    print('[INFO] ' + str(len(unique_labels) - 1) + ' unique clusters and ' +
          str(list(labels.values()).count(-1)) + ' noise points found')
    print('[INFO] Press ESC to stop viewing a cluster, press any other key to continue')

    while True:
        # get user input
        label = input('[INFO] Input a number from ' + str(min(unique_labels)) +
                      ' to ' + str(max(unique_labels)) + ' to view a cluster: ')
        try:
            label = int(label)
        except ValueError:
            print('[ERROR] Expected an int')
            continue
        if label not in unique_labels:
            print('[ERROR] Input not in specified range')
            continue

        # view items in the cluster
        for im, l in labels.items():
            if l == label:
                img = cv2.imread('../r37/' + im, 0)
                cv2.imshow(im, img)
                k = cv2.waitKey(0) & 0xFF
                cv2.destroyAllWindows()
                # press ESC to exit
                if k == 27:
                    break
def load(self, path):
    """
    Load the parameters of a saved-off memory file

    Parameters
    ----------
    path: str
        The path where the saved-off file exists
    """
    restore_path = path + "/adaptive_memory.info"
    if os.path.exists(restore_path):
        with open(restore_path, "rb") as file:
            p = pickle.Unpickler(file)
            info = p.load()
            p.memo.clear()
            self._alpha = info["alpha"]
            self._it_sum = info["it_sum"]
            self._it_min = info["it_min"]
            self._max_priority = info["max_priority"]
            self._next_idx = info["next_idx"]
            self._storage = info["storage"]
            self._maxsize = info["maxsize"]
def load(self, path):
    """
    Load the parameters of a saved-off memory file

    Parameters
    ----------
    path: str
        The path where the saved-off file exists
    """
    restore_path = path + "/memory.info"
    if os.path.exists(restore_path):
        with open(restore_path, "rb") as file:
            # info = pickle.load(file)
            p = pickle.Unpickler(file)
            # p.fast = True
            info = p.load()
            p.memo.clear()
            self._storage = info["storage"]
            self._maxsize = info["maxsize"]
            self._next_idx = info["next_idx"]

        # sanity-check the restored transitions for NaN/inf values
        for obses_t, actions, rewards, obses_tp1, dones, rewards_decoms in self._storage:
            if np.sum(np.isnan(obses_t)) > 0 or np.sum(obses_t == float('inf')) > 0:
                print(obses_t)
                input()
            # note: `rewards == float('nan')` is always False, so use np.isnan
            if rewards == float('inf') or np.isnan(rewards):
                print(rewards)
                input()
            if np.sum(np.isnan(obses_tp1)) > 0 or np.sum(obses_tp1 == float('inf')) > 0:
                print(obses_tp1)
                input()
            if np.sum(np.isnan(rewards_decoms)) > 0 or np.sum(rewards_decoms == float('inf')) > 0:
                print(rewards_decoms)
                input()

        # self._storage = list(info["storage"][:30000])
        # self._maxsize = info["maxsize"]
        # self._next_idx = 30000
def main(feats_path, mcs, ms):
    print('[INFO] Preparing to cluster features from ' + feats_path)

    # load feature vectors
    with open(feats_path, 'rb') as handle:
        unpickler = pickle.Unpickler(handle)
        index = unpickler.load()
    index = {
        name: vector
        for name, vector in index.items() if vector is not None
    }

    # collect vectors into a 2d array
    dataset = list(index.values())
    dataset = np.asarray(dataset)

    # clusterer model
    print('[INFO] Using HDBSCAN to cluster features with mcs=' + str(mcs) +
          ', ms=' + str(ms))
    clusterer = hdbscan.HDBSCAN(min_cluster_size=mcs,
                                min_samples=ms,
                                metric='braycurtis',
                                algorithm='best',
                                core_dist_n_jobs=-1)

    # cluster
    clusterer.fit(dataset)

    # save labels
    base = path.basename(feats_path)
    name = path.splitext(base)[0]
    output = '../labels/hdbscan_' + name + '_mcs' + str(mcs) + '_ms' + str(ms) + '.pickle'
    print('[INFO] Saving predicted labels to ' + output)
    with open(output, 'wb') as handle:
        pickle.dump(dict(zip(index.keys(), clusterer.labels_)), handle, protocol=4)
def main(feats_path, n):
    with open(feats_path, 'rb') as handle:
        unpickler = pickle.Unpickler(handle)
        labels = unpickler.load()

    labels = {name: vector for name, vector in labels.items() if vector is not None}
    names = list(labels.keys())
    vectors = list(labels.values())

    print('[INFO] Sampling ' + str(n) + ' instances from ' + feats_path)
    random.seed(42)
    indices = random.sample(range(len(labels)), n)
    subset = {names[i]: vectors[i] for i in indices}

    # save subset
    base = path.basename(feats_path)
    name = path.splitext(base)[0]
    output = name + '_sub' + str(n) + '.pickle'
    print('[INFO] Saving subset to ' + output)
    with open(output, 'wb') as handle:
        pickle.dump(subset, handle)
def main():
    print('Working...')

    # similarity threshold
    threshold = 0.5

    # load feature vectors
    with open('features.pickle', 'rb') as handle:
        unpickler = pickle.Unpickler(handle)
        index = unpickler.load()

    while True:
        results = {}

        # get user input
        image_q = input('Input an image to compare: ')
        if image_q not in index:
            print('Query image not found, please try another.')
            continue
        feature_q = index[image_q]

        # calculate cosine similarities
        print('Calculating cosine similarities...')
        for image_n, feature_n in index.items():
            score = 1 - spatial.distance.cosine(feature_n, feature_q)
            if score > threshold:
                results[image_n] = score
        del results[image_q]
        results = [(image_n, results[image_n])
                   for image_n in sorted(results, key=results.get, reverse=True)]

        # present results
        print('Results:')
        for image_n, score in results:
            print('{0}\t{1}'.format(image_n, score))
def main():
    print('Working...')

    # load feature vectors
    with open('features.pickle', 'rb') as handle:
        unpickler = pickle.Unpickler(handle)
        index = unpickler.load()

    # reshape 3d vectors to 2d
    dataset = list(index.values())
    dataset = np.asarray(dataset)
    n_samples, n_x, n_y = dataset.shape
    dataset = dataset.reshape((n_samples, n_x * n_y))

    # train model
    model = DBSCAN(metric='cosine', n_jobs=-1).fit(dataset)

    # save model
    with open('dbscan_model.pickle', 'wb') as handle:
        pickle.dump(model, handle)
    print('DBSCAN model trained and saved to dbscan_model.pickle')
def load_pickle(file):
    with open(file, "rb") as f:
        data = pickle.Unpickler(f).load()
    return data
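# A possible companion writer to load_pickle above, sketched on the assumption
# that the same one-object-per-file convention is wanted; save_pickle is not
# in the source.
import pickle

def save_pickle(file, data):
    with open(file, "wb") as f:
        pickle.dump(data, f)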
def _load_file(self, file_name):
    with open(self.path.joinpath(file_name), 'rb') as f:
        u = cPickle.Unpickler(f)
        dataset = u.load()
    return dataset
def unpickle(name):
    with open(name, 'rb') as f:
        unpickler = cpickle.Unpickler(f)
        objects = unpickler.load()
    return objects
def main(feats_path, max_cluster_size):
    print('[INFO] Preparing to cluster features from ' + feats_path)

    # load feature vectors
    with open(feats_path, 'rb') as handle:
        unpickler = pickle.Unpickler(handle)
        index = unpickler.load()
    index = {
        name: vector
        for name, vector in index.items() if vector is not None
    }

    # labels
    labels = {}

    # start timer
    start = time.time()

    # cluster iteratively
    min_cluster_size = max_cluster_size
    while len(index) > 0 and min_cluster_size >= 2:
        print('[INFO] Clustering ' + str(len(index)) +
              ' points with min_cluster_size=' + str(min_cluster_size))
        clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
                                    min_samples=None,
                                    metric='braycurtis',
                                    algorithm='best',
                                    core_dist_n_jobs=-1)

        # collect vectors into a 2d array
        dataset = list(index.values())
        dataset = np.asarray(dataset)

        # fit clusterer
        clusterer.fit(dataset)

        # update labels (treat cluster 0 as noise)
        new_labels = clusterer.labels_
        current_max = max(labels.values()) if len(labels) > 0 else -1
        new_labels = [
            label + current_max if label > 0 else -1 for label in new_labels
        ]
        new_labels = dict(zip(index.keys(), new_labels))
        labels.update(new_labels)

        # get features of noise points to recluster
        index = {k: v for k, v in index.items() if new_labels[k] == -1}
        min_cluster_size -= 1

    # end timer and print
    end = time.time()
    print('[INFO] Clustering completed after ' + str(end - start) + ' seconds')

    # save labels to disk
    base = path.basename(feats_path)
    name = path.splitext(base)[0]
    output = '../labels/iterative_' + name + '_mcs' + str(max_cluster_size) + '_labels.pickle'
    print('[INFO] Saving labels to ' + output)
    with open(output, 'wb') as handle:
        pickle.dump(labels, handle, protocol=4)
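# Hedged inspection sketch: reload the labels file written above and summarize
# cluster sizes. 'summarize_labels' is a hypothetical helper; label_path stands
# for any pickle produced by the clustering scripts in this file.
import pickle
from collections import Counter

def summarize_labels(label_path):
    with open(label_path, 'rb') as handle:
        labels = pickle.Unpickler(handle).load()
    counts = Counter(labels.values())
    noise = counts.pop(-1, 0)   # -1 marks noise points
    print('[INFO] {} clusters, {} noise points'.format(len(counts), noise))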
def load_datafile(path, name):
    with open(path.joinpath(name), 'rb') as f:
        u = cPickle.Unpickler(f)
        data = u.load()
    return data
def get_data(filename):
    # fixed module name (was 'picles') and added a with-block to close the file
    with open('grantData_hw3/' + filename + '.pickle', 'rb') as f:
        return pickle.Unpickler(f).load()
features = data_set.get_average_by_perform(features)
data_set.save_features_as_csv(features, target_features, path='features_by_piecewise_average.csv')

features = data_set.features_to_list(target_features)
data_set.save_features_by_features_as_csv(features, target_features, path='note.csv')
'''

data_set = data_class.DataSet('pyScoreParser/chopin_cleaned/Haydn/', 'folder')
data_set.load_all_performances()
# score_extractor = feat_ext.ScoreExtractor(['composer_vec'])
for piece in data_set.pieces:
    piece.meta.composer = 'Haydn'
    # piece.score_features['composer_vec'] = score_extractor.get_composer_vec(piece)
data_set.extract_all_features()
data_set.save_dataset('HaydnTest.dat')

with open('HaydnTest.dat', "rb") as f:
    u = cPickle.Unpickler(f)
    data_set = u.load()

# perform_extractor = feat_ext.PerformExtractor(['beat_dynamics', 'measure_dynamics'])
# for piece in data_set.pieces:
#     for performance in piece.performances:
#         performance.perform_features = perform_extractor.extract_perform_features(piece, performance)
# data_set.save_dataset('HaydnTest.dat')

pair_data = dft.PairDataset(data_set)
pair_data.update_dataset_split_type()
pair_data.update_mean_stds_of_entire_dataset()
pair_data.save_features_for_virtuosoNet('HaydnTestFeature')