def main(_):
    print("Parameters: ")
    for k, v in FLAGS.__flags.items():
        print("{} = {}".format(k, v))

    if not os.path.exists("./prepro/"):
        os.makedirs("./prepro/")

    if FLAGS.prepro:
        img_feat, tags_idx, a_tags_idx, vocab_processor = data_utils.load_train_data(
            FLAGS.train_dir, FLAGS.tag_path, FLAGS.prepro_dir, FLAGS.vocab)
    else:
        img_feat = cPickle.load(open(os.path.join(FLAGS.prepro_dir, "img_feat.dat"), 'rb'))
        tags_idx = cPickle.load(open(os.path.join(FLAGS.prepro_dir, "tag_ids.dat"), 'rb'))
        a_tags_idx = cPickle.load(open(os.path.join(FLAGS.prepro_dir, "a_tag_ids.dat"), 'rb'))
        vocab_processor = VocabularyProcessor.restore(FLAGS.vocab)

    img_feat = np.array(img_feat, dtype='float32') / 127.5 - 1.
    test_tags_idx = data_utils.load_test(FLAGS.test_path, vocab_processor)

    print("Image feature shape: {}".format(img_feat.shape))
    print("Tags index shape: {}".format(tags_idx.shape))
    print("Attribute Tags index shape: {}".format(a_tags_idx.shape))
    print("Vocab size: {}".format(len(vocab_processor._reverse_mapping)))
    print("Vocab max length: {}".format(vocab_processor.max_document_length))

    data = Data(img_feat, tags_idx, a_tags_idx, test_tags_idx, FLAGS.z_dim, vocab_processor)

    Model = getattr(sys.modules[__name__], FLAGS.model)
    print(Model)

    model = Model(data, vocab_processor, FLAGS)
    model.build_model()
    model.train()
def loadlevel():
    dellevel()
    for sp_overlay in store.store['spo']:
        sp_overlay.delete()
    del store.store['spo'][:]
    del store.store['gp'][:]

    loadlev = open('saved_level', 'rb')
    store.store['gt'] = cPickle.load(loadlev)
    store.store['gp'] = cPickle.load(loadlev)

    for g_tile in store.store['gt']:
        sp_tile = Sp_Tile(x=clevel.ct(g_tile.coor[0]),
                          y=clevel.ct(g_tile.coor[1]),
                          img=getim(g_tile),
                          bt=store.map_batch,
                          id=g_tile.id)
        sp_tile.rotation = g_tile.rot
        store.store['spt'].append(sp_tile)
        if g_tile.overlays:
            for ol in g_tile.overlays:
                sp_overlay = Sp_Tile(x=clevel.ct(ol.x),
                                     y=clevel.ct(ol.y),
                                     img=getim(ol),
                                     bt=store.item_batch,
                                     id=ol.id,
                                     ol=True)
                store.store['spo'].append(sp_overlay)

    for g_player in store.store['gp']:
        sp_overlay = Sp_Tile(x=clevel.ct(g_player.coor[0]),
                             y=clevel.ct(g_player.coor[1]),
                             img=getim(g_player),
                             id=g_player.id,
                             bt=store.player_batch)
        store.store['spo'].append(sp_overlay)

    loadlev.close()
def get_camp_info(camp, src="ipinyou"):
    if src == "ipinyou":
        info = pickle.load(open(ipinyouPath + camp + "/info.txt", "rb"))
    elif src == "vlion":
        info = pickle.load(open(vlionPath + camp + "/info.txt", "rb"))
    elif src == "yoyi":
        info = pickle.load(open(yoyiPath + camp + "/info.txt", "rb"))
    return info
def load(self, filename):
    try:
        file = open(filename, 'rb')
        if len(self.buffer) == 0:
            self.buffer = pickle.loads(pickle.load(file))
        else:
            buf = pickle.loads(pickle.load(file))
            self.merge(buf)
        file.close()
        return True
    except Exception as e:
        return False
def preprocess(words_file="tools/word_data.pkl", authors_file="tools/email_authors.pkl"):
    """
    This function takes a pre-made list of email texts (by default word_data.pkl)
    and the corresponding authors (by default email_authors.pkl) and performs
    a number of preprocessing steps:
        -- splits into training/testing sets (10% testing)
        -- vectorizes into tfidf matrix
        -- selects/keeps most helpful features

    After this, the features and labels are put into numpy arrays, which play
    nice with sklearn functions.

    4 objects are returned:
        -- training/testing features
        -- training/testing labels
    """
    ### the words (features) and authors (labels), already largely preprocessed;
    ### this preprocessing will be repeated in the text learning mini-project
    authors_file_handler = open(authors_file, "rb")
    authors = cPickle.load(authors_file_handler)
    authors_file_handler.close()

    words_file_handler = open(words_file, "rb")
    word_data = cPickle.load(words_file_handler)
    words_file_handler.close()

    ### test_size is the percentage of events assigned to the test set
    ### (the remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(
        word_data, authors, test_size=0.1, random_state=42)

    ### text vectorization -- go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    ### feature selection, because text is super high dimensional and
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=1)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    ### info on the data
    print("no. of Chris training emails:", sum(labels_train))
    print("no. of Sara training emails:", len(labels_train) - sum(labels_train))

    return features_train_transformed, features_test_transformed, labels_train, labels_test
def __init__(self, master, main_window): ttk.Frame.__init__(self, master) self.window = main_window self.major_components = ["PrimaryWeapon", "PrimaryWeapon2", "SecondaryWeapon", "SecondaryWeapon2", "Systems"] self.middle_components = ["Engine", "ShieldProjector"] self.minor_components = ["Magazine", "Capacitor", "Reactor", "Armor", "Sensor", "Thruster"] # Open all required databases self.icons_path = path.abspath(path.join(path.dirname(path.realpath(__file__)), "..", "assets", "icons")) with open(path.join(get_assets_directory(), "ships.db"), "rb") as f: # Contains data on the components self.ships_data = pickle.load(f) with open(path.join(get_assets_directory(), "categories.db"), "rb") as f: # Contains data on the ships (specifically descriptions and the like) self.categories_data = pickle.load(f) with open(path.join(get_assets_directory(), "companions.db"), "rb") as f: # Contains data on the Crew members self.companions_data = pickle.load(f) # ScrollFrame to contain the component lists (ToggledFrames) and the CrewSelectFrame self.components_lists_frame = VerticalScrollFrame(self, canvaswidth=260, canvasheight=315) self.ship_select_frame = ShipSelectFrame(self, self.set_ship, self.set_faction) self.components_lists = OrderedDict() self.faction = "Imperial" self.category = "Scout" self.ship = Ship("Bloodmark") self.character = None self.ship_name = None # Header above the Components ToggledFrames self.components_lists_header_label = ttk.Label( self.components_lists_frame.interior, text="Components", justify=tk.LEFT, font=("Calibiri", 12)) for category in COMPONENTS: # Bloodmark is the default around which the widgets are created if category not in self.ships_data["Imperial_S-SC4_Bloodmark"]: continue self.components_lists[category] = \ ComponentListFrame( self.components_lists_frame.interior, category, self.ships_data["Imperial_S-SC4_Bloodmark"][category], self.set_component, self.toggle_callback) self.component_frame = ttk.Frame(self) self.current_component = ComponentWidget( self.component_frame, self.ships_data["Imperial_S-SC4_Bloodmark"]["PrimaryWeapon"][0], self.ship, "PrimaryWeapon") self.crew_select_frame = CrewListFrame( self.components_lists_frame.interior, self.faction, self.companions_data, self.set_crew_member) # Image for on the ShipStats button self.ship_stats_image = open_icon("spvp_targettracker", (49, 49)) self.ship_stats_button = ttk.Button( self, text="Show ship statistics", command=self.show_ship_stats, image=self.ship_stats_image, compound=tk.LEFT) self.reset()
def __init__(self, ship: Ship, ships_data: dict, companions_data: dict):
    """
    :param ship: Ship object
    """
    self.stats = dict()
    self.ship = ship
    if ships_data is None:
        with open(os.path.join(get_assets_directory(), "ships.db"), "rb") as fi:
            ships_data = pickle.load(fi)
    if companions_data is None:
        with open(os.path.join(get_assets_directory(), "companions.db"), "rb") as fi:
            companions_data = pickle.load(fi)
    self.ships_data = ships_data.copy()
    self.companions_data = companions_data.copy()
    self.calc_ship_stats()
def xgboost_pred(df_all_file, d_col_drops, n_estimators, learning_rate, max_depth):
    ### Load
    pickle_file = '%s/%s' % (Dir, df_all_file)
    with open(pickle_file, 'rb') as f:
        save = pickle.load(f)
        df_all = save['df_all']
        del save  # hint to help gc free up memory
        print('df_all', df_all.shape)

    ##########################
    df_train = df_all.iloc[:num_train]
    df_test = df_all.iloc[num_train:]
    id_test = df_test['id']
    y_train = df_train['relevance'].values
    # y_train = pd.DataFrame(df_train['relevance'].values, columns=['relevance'])
    X_train = df_train[:]
    X_test = df_test[:]
    print("--- Features Set: %s minutes ---" % round(((time.time() - start_time) / 60), 2))

    X_train2 = X_train.drop(d_col_drops, axis=1).values

    # Prediction
    xgb_model = xgb.XGBRegressor(n_estimators=n_estimators, learning_rate=learning_rate,
                                 max_depth=max_depth, seed=2016, silent=False, nthread=-1,
                                 gamma=0.000001, min_child_weight=1, max_delta_step=0,
                                 subsample=1, colsample_bytree=1, colsample_bylevel=1,
                                 reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                                 base_score=0.5, missing=None)
    xgb_model.fit(X_train2, y_train)
    X_test2 = X_test.drop(d_col_drops, axis=1).values
    y_pred = xgb_model.predict(X_test2)
    # y_pred = [max(1., min(x, 3.)) for x in y_pred]
    pd.DataFrame({"id": id_test, "relevance": y_pred}).to_csv(
        '%s/submission_v6_B_xgboost_same_query_score.csv' % (Dir), index=False)
    print("--- Training & Testing: %s minutes ---" % round(((time.time() - start_time) / 60), 2))
    return xgb_model, y_pred
def xgboost_test(df_all_file, d_col_drops, n_estimators, learning_rate, max_depth):
    ### Load
    pickle_file = '%s/%s' % (Dir, df_all_file)
    with open(pickle_file, 'rb') as f:
        save = pickle.load(f)
        df_all = save['df_all']
        del save  # hint to help gc free up memory
        print('df_all', df_all.shape)

    ##########################
    df_train = df_all.iloc[:num_train]
    df_test = df_all.iloc[num_train:]
    id_test = df_test['id']
    y_train = df_train['relevance'].values
    # y_train = pd.DataFrame(df_train['relevance'].values, columns=['relevance'])
    X_train = df_train[:]
    X_test = df_test[:]
    print("--- Features Set: %s minutes ---" % round(((time.time() - start_time) / 60), 2))

    X_train2 = X_train.drop(d_col_drops, axis=1).values

    ### Custom CV
    from sklearn.cross_validation import train_test_split
    X_train3, X_valid3, y_train3, y_valid3 = train_test_split(
        X_train2, y_train, test_size=0.2, random_state=2009)
    xgb_model = xgb.XGBRegressor(n_estimators=n_estimators, learning_rate=learning_rate,
                                 max_depth=max_depth, seed=2016)
    xgb_model.fit(X_train3, y_train3)
    y_pred = xgb_model.predict(X_valid3)
    y_pred = [max(1., min(x, 3.)) for x in y_pred]
    return xgb_model, y_pred, y_valid3
def method_pred(model, df_all_file, d_col_drops):
    ### Load
    pickle_file = '%s/%s' % (Dir, df_all_file)
    with open(pickle_file, 'rb') as f:
        save = pickle.load(f)
        df_all = save['df_all']
        del save  # hint to help gc free up memory
        print('df_all', df_all.shape)

    ##########################
    df_train = df_all.iloc[:num_train]
    df_test = df_all.iloc[num_train:]
    id_test = df_test['id']
    y_train = df_train['relevance'].values
    # y_train = pd.DataFrame(df_train['relevance'].values, columns=['relevance'])
    X_train = df_train[:]
    X_test = df_test[:]
    print("--- Features Set: %s minutes ---" % round(((time.time() - start_time) / 60), 2))

    X_train2 = X_train.drop(d_col_drops, axis=1).values

    # Prediction
    model.fit(X_train2, y_train)
    X_test2 = X_test.drop(d_col_drops, axis=1).values
    y_pred = model.predict(X_test2)
    return model, y_pred
def load_results(pickle_file):
    '''
    Load in a saved pickle file.

    Parameters
    ----------
    pickle_file : str
        Name of filename to load in.

    Returns
    -------
    self : Save statistic class
        Statistic instance with saved results.

    Examples
    --------
    Load saved results.
    >>> stat = Statistic.load_results("stat_saved.pkl")  # doctest: +SKIP
    '''
    with open(pickle_file, 'rb') as input:
        self = pickle.load(input)

    return self
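# load_results above restores a whole Statistic instance, so a matching save
# helper presumably exists elsewhere in the class. A minimal sketch of what such
# a method could look like (the name save_results and its signature are
# assumptions, not the library's confirmed API; it assumes the same pickle
# import used above):
def save_results(self, output_name="stat_saved.pkl"):
    # Serialize the entire instance so load_results can recover it unchanged.
    with open(output_name, 'wb') as output:
        pickle.dump(self, output)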
def load_data():
    """Return the MNIST data as a tuple containing the training data,
    the validation data, and the test data.

    The ``training_data`` is returned as a tuple with two entries.
    The first entry contains the actual training images.  This is a
    numpy ndarray with 50,000 entries.  Each entry is, in turn, a numpy
    ndarray with 784 values, representing the 28 * 28 = 784 pixels in a
    single MNIST image.

    The second entry in the ``training_data`` tuple is a numpy ndarray
    containing 50,000 entries.  Those entries are just the digit values
    (0...9) for the corresponding images contained in the first entry
    of the tuple.

    The ``validation_data`` and ``test_data`` are similar, except each
    contains only 10,000 images.

    This is a nice data format, but for use in neural networks it's
    helpful to modify the format of the ``training_data`` a little.
    That's done in the wrapper function ``load_data_wrapper()``, see
    below.
    """
    f = gzip.open('../data/mnist.pkl.gz', 'rb')
    training_data, validation_data, test_data = _pickle.load(f, encoding='bytes')
    f.close()
    return (training_data, validation_data, test_data)
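# The docstring above refers to ``load_data_wrapper()``, which is not included
# in this snippet. A minimal sketch of what such a wrapper typically does
# (reshape each image into a 784x1 column vector and one-hot encode the
# training labels); the details here are an assumption, not the original
# implementation:
def load_data_wrapper():
    import numpy as np
    tr_d, va_d, te_d = load_data()
    training_inputs = [np.reshape(x, (784, 1)) for x in tr_d[0]]
    training_results = [np.eye(10)[:, [y]] for y in tr_d[1]]  # one-hot column vectors
    training_data = list(zip(training_inputs, training_results))
    validation_data = list(zip([np.reshape(x, (784, 1)) for x in va_d[0]], va_d[1]))
    test_data = list(zip([np.reshape(x, (784, 1)) for x in te_d[0]], te_d[1]))
    return (training_data, validation_data, test_data)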
def region_filtered_flux(self, s_flux):
    """Finds the flux through the filtered region, accounts for materials"""
    if self.print_matrix_sums:
        print_sum_matrix(s_flux, 'region_filtered_flux s_flux')
    # Strong Feeling there is a big fat bug somewhere around here
    thickness = sum(self.thickness)  # in cm
    mat = self.mat
    ea = self.ea
    reg_flux = matchdim(s_flux)

    f = open("mu_data\\" + mat + ".pkl", "rb")
    edata = pickle.load(f)
    f.close()

    elem_energy = [i[0] for i in edata]
    elem_flux = [i[1] for i in edata]

    # you knew it, absurdly slow code below. Really no way to get around it
    # though. Filtering sucks, O(n^3)
    for i in range(0, len(s_flux)):
        si = s_flux[i]
        for j in range(0, len(s_flux[0])):
            sij = si[j]
            muexp = [-1 * thickness * mu3(elem_energy, elem_flux, ea[m])
                     for m in range(0, len(ea))]
            for m in range(0, len(ea)):
                reg_flux[i][j][m] = sij[m] * math.exp(muexp[m])

    if self.print_matrix_sums:
        print_sum_matrix(s_flux, 'region_filtered_flux reg_flux')
    return reg_flux
def load_models(models_dir):
    """
    Load saved models from disk. This will attempt to unpickle all files in a
    directory; any files that give errors on unpickling (such as README.txt)
    will be skipped.

    Inputs:
    - models_dir: String giving the path to a directory containing model files.
      Each model file is a pickled dictionary with a 'model' field.

    Returns:
    A dictionary mapping model file names to models.
    """
    models = {}
    for model_file in os.listdir(models_dir):
        with open(os.path.join(models_dir, model_file), 'rb') as f:
            try:
                models[model_file] = pickle.load(f, encoding='bytes')['model']
            except pickle.UnpicklingError:
                continue
    return models


# cifar10_dir = 'cs231n/datasets/cifar-10-batches-py'
# cifar10_dir = 'C:/Work/Deep Learning/assignment1/cs231n/datasets/cifar-10-batches-py'
# X_train, y_train, X_test, y_test = load_CIFAR10(cifar10_dir)
def load(self, path):
    """
    Load model parameters from path.
    """
    logger.info("Loading from %s ..." % path)
    file = open(path, 'rb')
    state = pickle.load(file)
    self.__setstate__(state)
    file.close()
def load_dictionary(loc='./data/book_dictionary_large.pkl'):
    """
    Load a dictionary
    """
    with open(loc, 'rb') as f:
        worddict = pkl.load(f)
    return worddict
def start():
    config = DataConfig()
    histories = sorted(glob.glob(config.history_location + "*.pickle"))
    data = {}
    for hist in histories:
        file = open(hist, 'rb')
        h = pickle.loads(pickle.load(file))
        for k, v in h.items():
            if k not in data.keys():
                data[k] = []
            for item in v:
                data[k].append(item)

    legend = []
    plt.subplot(2, 1, 1)
    for kv in data.items():
        legend.append(kv[0])
        plt.plot(kv[1])
    plt.legend(legend)

    for i, kv in enumerate(data.items()):
        plt.subplot(2, 3, i + 4)
        plt.title(kv[0])
        plt.plot(kv[1])

    plt.tight_layout()
    plt.show()
def get_features(self):
    features = []
    clips_path = self.get_clips_path()
    for clip, path in clips_path.items():
        with open(path, 'rb') as f:
            feature = cPickle.load(f, encoding='latin1')
            features.append(self.modify_nan(feature.reshape(-1)))
    return np.array(features)
def _load_samples(self, full_filepath):
    f = gzip.open(full_filepath, 'rb')
    train_set, valid_set, test_set = cPickle.load(f)
    f.close()

    images = test_set[0]
    labels = test_set[1]
    images = (images - 0.5) * 2
    return np.float32(images), labels
def load_CIFAR_batch(filename):
    """ load single batch of cifar """
    with open(filename, 'rb') as f:
        datadict = pickle.load(f)
        X = datadict['data']
        Y = datadict['labels']
        X = X.reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype("float")
        Y = np.array(Y)
        return X, Y
def load_preprocessed(self, vocab_file, tensor_file):
    with open(vocab_file, 'rb') as f:
        self.chars = cPickle.load(f)
    self.vocab_size = len(self.chars)
    self.vocab = dict(zip(self.chars, range(len(self.chars))))
    self.tensor = np.load(tensor_file)
    train_size = int(self.tensor.shape[0] * 0.9)
    self.valid = self.tensor[train_size:]
    self.train = self.tensor[:train_size]
def play_multiple_games(self):
    outfiles = Parallel(n_jobs=self.workers)(
        delayed(self.play_games)() for g in range(self.workers))
    results = []
    for filename in outfiles:
        f = open(filename, 'rb')
        results.extend(cPickle.load(f))
        f.close()
        os.remove(filename)
    return results
def unpickle(file):
    import _pickle as cPickle
    fo = open(file, 'rb')
    dict = cPickle.load(fo, encoding='latin1')
    fo.close()
    if 'data' in dict:
        dict['data'] = dict['data'].reshape((-1, 3, 32, 32)).swapaxes(1, 3).swapaxes(1, 2).reshape(-1, 32 * 32 * 3) / 256.
    return dict
def mergeStagedWTL(config): #only run if not already merging in another process if os.path.exists(config.data.performance_location+"win_matrix_temp.csv"): return None else: win_matrix_file = open(config.data.performance_location+"win_matrix_temp.csv", "w+") merged_data = [] files = glob.glob(config.data.performance_location+"staged_*.pickle") for file in files: try: data = pickle.load(open(file, "rb")) found = False for i in range(len(merged_data)): if merged_data[i]['player1'] == data['player1'] and merged_data[i]['player2'] == data['player2']: found = True merged_data[i]['wins'] += data['wins'] merged_data[i]['ties'] += data['ties'] merged_data[i]['losses'] += data['losses'] break elif merged_data[i]['player1'] == data['player2'] and merged_data[i]['player2'] == data['player1']: found = True merged_data[i]['wins'] += data['losses'] merged_data[i]['ties'] += data['ties'] merged_data[i]['losses'] += data['wins'] break if not found: merged_data.append(data) os.remove(file) except Exception as e: continue if os.path.exists(config.data.performance_location+"win_matrix.csv"): df = pd.read_csv(config.data.performance_location+"win_matrix.csv", index_col=0) else: df = pd.DataFrame() for elem in merged_data: if not elem["player1"] in list(df): df[elem["player1"]] = 0.0 df.loc[elem["player1"]] = 0.0 df = df.sort_index(axis=0).sort_index(axis=1) if not elem["player2"] in list(df): df[elem["player2"]] = 0.0 df.loc[elem["player2"]] = 0.0 df = df.sort_index(axis=0).sort_index(axis=1) df.at[elem["player1"], elem["player2"]] = df.at[elem["player1"], elem["player2"]] + elem["wins"] + 0.5*elem["ties"] df.at[elem["player2"], elem["player1"]] = df.at[elem["player2"], elem["player1"]] + elem["losses"] + 0.5*elem["ties"] df.to_csv(win_matrix_file) win_matrix_file.close() if checkFileOpen(config.data.performance_location+"win_matrix.csv"): print("Waiting for %s to be close."% os.path.normpath(config.data.performance_location+"win_matrix.csv")) while checkFileOpen(config.data.performance_location+"win_matrix.csv"): sleep(0.1) shutil.move(config.data.performance_location+"win_matrix_temp.csv", config.data.performance_location+"win_matrix.csv") return df
def load_CIFAR_batch(filename, astype='float'):
    """ load single batch of cifar """
    print(filename)
    with open(filename, 'rb') as f:
        datadict = pickle.load(f, encoding='latin1')
        X = datadict['data']
        Y = datadict['labels']
        X = X.reshape(10000, 3, 32, 32).transpose(0, 2, 3, 1).astype(astype)
        Y = np.array(Y)
        return X, Y
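# The batch loader above is usually combined into a full-dataset loader; a
# minimal sketch of such a function, assuming the standard CIFAR-10 directory
# layout and that os and numpy (as np) are imported as in the surrounding
# snippets. The name load_CIFAR10 mirrors the commented-out call seen earlier
# in this collection and is otherwise an assumption:
def load_CIFAR10(root):
    xs, ys = [], []
    for b in range(1, 6):
        # data_batch_1 ... data_batch_5 hold the training split
        X, Y = load_CIFAR_batch(os.path.join(root, 'data_batch_%d' % b))
        xs.append(X)
        ys.append(Y)
    Xtr = np.concatenate(xs)
    Ytr = np.concatenate(ys)
    Xte, Yte = load_CIFAR_batch(os.path.join(root, 'test_batch'))
    return Xtr, Ytr, Xte, Yte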
def __read_test_times_file(fd):
    try:
        with gzip.GzipFile(fileobj=fd, mode='rb') as gzf:
            times = cPickle.load(gzf)
    except Exception:
        # File doesn't exist, isn't readable, is malformed---whatever.
        # Just ignore it.
        return None
    else:
        return times
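# The writer side is not shown in this snippet; a minimal sketch of a matching
# helper using the same gzip + cPickle format (the name and signature are
# assumptions, not the original code):
def __write_test_times_file(fd, times):
    with gzip.GzipFile(fileobj=fd, mode='wb') as gzf:
        cPickle.dump(times, gzf)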
def get_data(name):
    """Load data from the given name"""
    gen_data = {}
    # new version
    if os.path.isfile(name + 'data.pickle'):
        curent_f = open(name + 'data.pickle', 'rb')
        d2 = cPickle.load(curent_f)
    # Old version
    else:
        curent_f = open(name, 'rb')
        d1 = cPickle.load(curent_f)
        data1 = d1[0]
        data = np.array([data1[:, :, :, :, :, 0], data1[:, :, :, :, :, 1]])
        # Convert log e to log2
        normalization_factor = 1 / np.log2(2.718281)
        epochsInds = np.arange(0, data.shape[4])
        d2 = {}
        d2['epochsInds'] = epochsInds
        d2['information'] = data / normalization_factor
    return d2
def loadPickle():
    # open file for reading
    pickl = open('data.pkl', 'rb')
    # load object from file
    unpickledObj = pickle.load(pickl)
    # close at the end
    pickl.close()
    return unpickledObj
def open_database(self):
    """Open the file database"""
    path = os.path.join(get_temp_directory(), "files.db")
    if not os.path.exists(path):
        self.db = {"version": settings["sharing"]["version"]}
        self.save_database()
    with open(path, "rb") as fi:
        self.db = pickle.load(fi)
    if self.db["version"] != settings["sharing"]["version"]:
        os.remove(path)
        self.open_database()
def unPickleIt(pickle_path):  # might throw the file not found exception
    '''
    function to unpickle the object from the given path
    @param pickle_path => the path where the pickle file is located
    @return => the object extracted from the saved path
    '''
    with open(pickle_path, 'rb') as dumped_pickle:
        obj = pickle.load(dumped_pickle)

    return obj  # return the unpickled object
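# The counterpart that produced the file is not part of this snippet; a minimal
# sketch of such a helper in the same style (the name pickleIt is an assumption,
# not taken from the original code):
def pickleIt(obj, save_path):
    '''
    function to pickle an object to the given path
    @param obj => the object to serialize
    @param save_path => where the pickle file should be written
    '''
    with open(save_path, 'wb') as dumped_pickle:
        pickle.dump(obj, dumped_pickle)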
def train(dim_word=100, # word vector dimensionality dim_dec=1000, dim_attention=512, dim_coverage=512, kernel_coverage=[5,5], kernel_Convenc=[3,1], dim_ConvBlock=[32,64,64,128], layersNum_block=[4,4,4,4], encoder='gru', decoder='gru_cond', patience=4, # early stopping patience max_epochs=5000, finish_after=10000000, # finish after this many updates dispFreq=100, decay_c=0., # L2 regularization penalty alpha_c=0., # alignment regularization clip_c=-1., # gradient clipping threshold lrate=1e-8, # learning rate dim_target=62, # source vocabulary size input_channels=123, # target vocabulary size maxlen=100, # maximum length of the description maxImagesize=1, # maximum size of the input image optimizer='rmsprop', batch_Imagesize=16, valid_batch_Imagesize=16, batch_size=16, valid_batch_size=16, saveto='model.npz', bn_saveto='bn_model.npz', validFreq=1000, saveFreq=1000, # save the parameters after every saveFreq updates sampleFreq=100, # generate some samples after every sampleFreq datasets=['feature.pkl', 'label.txt'], valid_datasets=['feature_valid.pkl', 'label_valid.txt'], dictionaries=['lexicon.txt'], valid_output=['decode.txt'], valid_result=['result.txt'], use_dropout=False, reload_=False): # Model options model_options = locals().copy() # load dictionaries and invert them worddicts = load_dict(dictionaries[0]) worddicts_r = [None] * len(worddicts) for kk, vv in worddicts.items(): worddicts_r[vv] = kk # reload options if reload_ and os.path.exists(saveto): with open('%s.pkl' % saveto, 'rb') as f: models_options = pkl.load(f) print('Loading data') train,train_uid_list = dataIterator(datasets[0], datasets[1], worddicts, batch_size=batch_size, batch_Imagesize=batch_Imagesize,maxlen=maxlen,maxImagesize=maxImagesize) valid,valid_uid_list = dataIterator(valid_datasets[0], valid_datasets[1], worddicts, batch_size=valid_batch_size, batch_Imagesize=valid_batch_Imagesize,maxlen=maxlen,maxImagesize=maxImagesize) print('Building model') params = init_params(model_options) bn_params = init_bn_params(model_options) # reload parameters if reload_ and os.path.exists(saveto): params = load_params(saveto, params) bn_params = load_params(bn_saveto, bn_params) tparams = init_tparams(params) bn_tparams = init_tparams(bn_params) trng, use_noise, \ x, x_mask, y, y_mask, \ opt_ret, \ cost = \ build_model(tparams, bn_tparams, model_options) inps = [x, x_mask, y, y_mask] print('Buliding sampler') f_init, f_next = build_sampler(tparams, bn_tparams, model_options, trng, use_noise) # before any regularizer print('Building f_log_probs...') f_log_probs = theano.function(inps, cost, profile=profile) print('Done') cost = cost.mean() # apply L2 regularization on weights if decay_c > 0.: decay_c = theano.shared(numpy.float32(decay_c), name='decay_c') weight_decay = 0. for kk, vv in tparams.iteritems(): tmp = kk.split('_') if tmp[-2] != 'bn': weight_decay += (vv ** 2).sum() weight_decay *= decay_c cost += weight_decay # regularize the alpha weights if alpha_c > 0. 
and not model_options['decoder'].endswith('simple'): alpha_c = theano.shared(numpy.float32(alpha_c), name='alpha_c') alpha_reg = alpha_c * ( (tensor.cast(y_mask.sum(0)//x_mask.sum(0), 'float32')[:, None] - opt_ret['dec_alphas'].sum(0))**2).sum(1).mean() cost += alpha_reg # after all regularizers - compile the computational graph for cost print('Building f_cost...') f_cost = theano.function(inps, cost, profile=profile) print('Done') print('Computing gradient...') grads = tensor.grad(cost, wrt=itemlist(tparams)) print('Done') # apply gradient clipping here if clip_c > 0.: g2 = 0. for g in grads: g2 += (g**2).sum() new_grads = [] for g in grads: new_grads.append(tensor.switch(g2 > (clip_c**2), g / tensor.sqrt(g2) * clip_c, g)) grads = new_grads # compile the optimizer, the actual computational graph is compiled here lr = tensor.scalar(name='lr') print('Building optimizers...') f_grad_shared, f_update = eval(optimizer)(lr, tparams, bn_tparams, opt_ret, grads, inps, cost) print('Done') # print model parameters print("Model params:\n{0}".format( pprint.pformat(sorted([p for p in params])))) # end print('Optimization') history_errs = [] # reload history if reload_ and os.path.exists(saveto): history_errs = list(numpy.load(saveto)['history_errs']) best_p = None best_bn_p = None bad_count = 0 if validFreq == -1: validFreq = len(train) if saveFreq == -1: saveFreq = len(train) if sampleFreq == -1: sampleFreq = len(train) uidx = 0 estop = False halfLrFlag = 0 bad_counter = 0 ud_s = 0 ud_epoch = 0 cost_s = 0. for eidx in xrange(max_epochs): n_samples = 0 ud_epoch = time.time() random.shuffle(train) # shuffle data for x, y in train: n_samples += len(x) uidx += 1 use_noise.set_value(1.) ud_start = time.time() x, x_mask, y, y_mask = prepare_data(model_options, x, y) if x is None: print('Minibatch with zero sample under length ', maxlen) uidx -= 1 continue # compute cost, grads and copy grads to shared variables cost = f_grad_shared(x, x_mask, y, y_mask) cost_s += cost # do the update on parameters f_update(lrate) ud = time.time() - ud_start ud_s += ud # check for bad numbers, usually we remove non-finite elements # and continue training - but not done here if numpy.isnan(cost) or numpy.isinf(cost): print('NaN detected') return 1., 1., 1. # verbose if numpy.mod(uidx, dispFreq) == 0: ud_s /= 60. cost_s /= dispFreq print('Epoch ', eidx, 'Update ', uidx, 'Cost ', cost_s, 'UD ', ud_s, 'lrate ',lrate, 'bad_counter', bad_counter) ud_s = 0 cost_s = 0. # save the best model so far if numpy.mod(uidx, saveFreq) == 0: print('Saving...') if best_p is not None: params = best_p bn_params = best_bn_p else: params = unzip(tparams) bn_params = unzip(bn_tparams) numpy.savez(saveto, history_errs=history_errs, **params) numpy.savez(bn_saveto, history_errs=history_errs, **bn_params) pkl.dump(model_options, open('%s.pkl' % saveto, 'wb')) print('Done') # generate some samples with the model and display them if numpy.mod(uidx, sampleFreq) == 0: # FIXME: random selection? use_noise.set_value(0.) fpp_sample=open(valid_output[0],'w') valid_count_idx=0 # FIXME: random selection? for x,y in valid: for xx in x: xx_pad = numpy.zeros((xx.shape[0],xx.shape[1],xx.shape[2]), dtype='float32') # input_channels * height * width xx_pad[:,:, :] = xx / 255. 
stochastic = False sample, score = gen_sample(tparams, f_init, f_next, xx_pad[None, :, :, :], model_options, trng=trng, k=10, maxlen=1000, stochastic=stochastic, argmax=False) if stochastic: ss = sample else: score = score / numpy.array([len(s) for s in sample]) ss = sample[score.argmin()] fpp_sample.write(valid_uid_list[valid_count_idx]) valid_count_idx=valid_count_idx+1 for vv in ss: if vv == 0: # <eol> break fpp_sample.write(' '+worddicts_r[vv]) fpp_sample.write('\n') fpp_sample.close() print('valid set decode done') ud_epoch = time.time() - ud_epoch ud_epoch /= 60. print('epoch cost time ... ', ud_epoch) # validate model on validation set and early stop if necessary if numpy.mod(uidx, validFreq) == 0: use_noise.set_value(0.) valid_errs = pred_probs(f_log_probs, prepare_data, model_options, valid) valid_err_cost = valid_errs.mean() # compute wer os.system('python compute-wer.py ' + valid_output[0] + ' ' + valid_datasets[1] + ' ' + valid_result[0]) fpp=open(valid_result[0]) stuff=fpp.readlines() fpp.close() m=re.search('WER (.*)\n',stuff[0]) valid_per=100. * float(m.group(1)) m=re.search('ExpRate (.*)\n',stuff[1]) valid_sacc=100. * float(m.group(1)) valid_err=valid_per #valid_err=0.7*valid_per-0.3*valid_sacc history_errs.append(valid_err) if uidx/validFreq == 0 or valid_err <= numpy.array(history_errs).min(): # the first time valid or worse model best_p = unzip(tparams) best_bn_p = unzip(bn_tparams) bad_counter = 0 if uidx/validFreq != 0 and valid_err > numpy.array(history_errs).min(): bad_counter += 1 if bad_counter > patience: if halfLrFlag==2: print('Early Stop!') estop = True break else: print('Lr decay and retrain!') bad_counter = 0 lrate = lrate / 2 params = best_p bn_params = best_bn_p halfLrFlag += 1 if numpy.isnan(valid_err): #ipdb.set_trace() print('valid_err nan') print('Valid WER: %.2f%%, ExpRate: %.2f%%, Cost: %f' % (valid_per,valid_sacc,valid_err_cost)) # finish after this many updates if uidx >= finish_after: print('Finishing after %d iterations!' % uidx) estop = True break print('Seen %d samples' % n_samples) if estop: break if best_p is not None: zipp(best_p, tparams) zipp(best_bn_p, bn_tparams) use_noise.set_value(0.) valid_err = pred_probs(f_log_probs, prepare_data, model_options, valid).mean() print('Valid ', valid_err) params = copy.copy(best_p) bn_params = copy.copy(best_bn_p) numpy.savez(saveto, zipped_params=best_p, history_errs=history_errs, **params) numpy.savez(bn_saveto, zipped_params=best_bn_p, history_errs=history_errs, **bn_params) return valid_err
def load_pickle(filename):
    f = open(filename, "rb")
    p = cPickle.load(f)
    f.close()
    return p
def load_bigrams(self, name, path='../../feature_groups/lda_pickles'):
    print("loading bigram: " + name)
    path = os.path.join(path, name)
    with open(path, "rb") as f:
        self.bigrams = _pickle.load(f)
def voc_eval( det_lines, #检测结果 annopath, #标记目录 imagesetfile, #检测图像文件名 classname, #筛选类别名称 cachedir, #缓存目录 ovthresh=0.5, #阈值 use_07_metric=False #AP计算方式 ): ''' @param det_splitlines [list ]某类检测结果文件 数据: [ [imagename1, confidence, xmin, ymin, xmax, ymax], #(图像1的第一个结果) [imagename1, confidence, xmin, ymin, xmax, ymax], #(图像1的第二个结果) [imagename2, confidence, xmin, ymin, xmax, ymax], #(图像2的第一个结果) ] @param annopath [str ]标注目录 annopath.format(imagename) should be the xml annotations file. #xml 标注文件。 annopath=>'annotations/{}.xml'=>annopath.format('2008_000001')=>'annotations/2008_000001.xml' @param imagesetfile [str ]检测图像集文件 文本文件,每行一个图像文件名,不含扩展名 该文件格式: 2008_000001 2008_000002 @param classname [str ]检测类别名称,用于筛选imagesetfile @param cachedir [str ]缓存目录,用于存放原始数据集的加载文件 @param ovthresh [float]IoU阈值 @param use_07_metric [bool ]AP计算模式 Whether to use VOC07's 11 point AP computation (default False) #是否使用VOC07的AP计算方法,voc07是11个点采样。 @return rec, prec, ap rec ---召回率,向量 prec---准确率,向量 ap-----平均准确率,标量 计算方法: 检测结果数为:N=5 按置信度由高到低排序 TP/FP计算: 筛选某类的检测结果及该类的gt_bbox TP[:],FP[:]初始化为False 遍历检测结果 如果检测bbox与该类gt_bbox的IoU大于阈值, 则 TP[i]=1 虚警处理(同一个gt_bbox在不同的检测结果中出现):FP[i]=1 否则 FP[i]=1 TP:[1, 0, 1, 1, 0],积分值=>TP_int=[1,1,2,3,3] FP:[0, 1, 0, 0, 1],积分值=>FP_int=[0,1,1,1,2] prec:TP_int/(TP_int+FP_int)=>[1, 1/2, 2/3, 3/4, 3/5] rec :TP_int/N=>[1/5, 1/5, 2/5, 3/5, 3/5] ap: if use_07_metric: # 11 point metric ap = 0. for t in np.arange(0., 1.1, 0.1): if np.sum(rec >= t) == 0: p = 0 else: p = np.max(prec[rec >= t]) ap = ap + p / 11. else: # correct AP calculation # first append sentinel values at the end mrec = np.concatenate(([0.], rec, [1.])) mpre = np.concatenate(([0.], prec, [0.])) # compute the precision envelope for i in range(mpre.size - 1, 0, -1): mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) # to calculate area under PR curve, look for points # where X axis (recall) changes value i = np.where(mrec[1:] != mrec[:-1])[0] # and sum (\Delta recall) * prec ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) ''' # assumes detections are in detpath.format(classname) # assumes annotations are in annopath.format(imagename) # assumes imagesetfile is a text file with each line an image name # cachedir caches the annotations in a pickle file #原始数据集缓存文件 =>[str ] cachefile # first load gt 加载ground truth。 if not os.path.isdir(cachedir): os.mkdir(cachedir) cachefile = os.path.join(cachedir, 'annots.pkl') #只读文件名称。 #读取所有测试图片名称 =>[list] imagenames # read list of images with open(imagesetfile, 'r') as f: lines = f.readlines() #读取所有待检测图片名。 imagenames = [x.strip() for x in lines] #待检测图像文件名字存于数组imagenames,长度1000。 #加载原始数据文件 =>[dict] recs{文件名:标注结构体数据} if not os.path.isfile(cachefile): #如果只读文件不存在,则只好从原始数据集中重新加载数据 # load annots recs = {} for i, imagename in enumerate(imagenames): recs[imagename] = parse_rec( annopath.format(imagename) ) #parse_rec函数读取当前图像标注文件,返回当前图像标注,存于recs字典(key是图像名,values是gt) if i % 100 == 0: print('Reading annotation for {:d}/{:d}'.format( i + 1, len(imagenames))) #进度条。 # save print('Saving cached annotations to {:s}'.format(cachefile)) print(type(recs)) with open(cachefile, 'wb') as f: cPickle.dump(recs, f) #recs字典c保存到只读文件。 else: # load with open(cachefile, 'rb') as f: recs = cPickle.load(f) #如果已经有了只读文件,加载到recs。 #提取类别为classname的原始数据集 # extract gt objects for this class #按类别获取标注文件,recall和precision都是针对不同类别而言的,AP也是对各个类别分别算的。 class_recs = {} #当前类别的标注 npos = 0 #npos标记的目标数量 for imagename in imagenames: #筛选类别为classname的原始数据集 => R R = [obj for obj in recs[imagename] if obj['name'] == classname] #过滤,只保留recs中指定类别的项,存为R。 #提取bbox,gt bbox = 
np.array([x['bbox'] for x in R]) #抽取bbox difficult = np.array([x['difficult'] for x in R ]).astype(np.bool) #如果数据集没有difficult,所有项都是0. #检测结果,默认为False det = [False] * len(R) #len(R)就是当前类别的gt目标个数,det表示是否检测到,初始化为false。 #gt目标数量(排除difficult为True的目标) npos = npos + sum( ~difficult) #自增,非difficult样本数量,如果数据集没有difficult,npos数量就是gt数量。 #当前类别标注(不含difficult为True的目标) class_recs[imagename] = { 'bbox': bbox, #检测边框 'difficult': difficult, #difficult属性 'det': det #检测结果 } #三个属性值长度相同 # read dets 读取检测结果 #detfile = detpath.format(classname) #with open(detfile, 'r') as f: # lines = f.readlines() #splitlines = [x.strip().split(' ') for x in lines] #假设检测结果有20000个,则splitlines长度20000 splitlines = det_lines image_ids = [x[0] for x in splitlines ] #检测结果中的图像名,image_ids长度20000,但实际图像只有1000张,因为一张图像上可以有多个目标检测结果 confidence = np.array([float(x[1]) for x in splitlines]) #检测结果置信度 BB = np.array([[float(z) for z in x[2:]] for x in splitlines]) #变为浮点型的bbox。 # sort by confidence 将20000各检测结果按置信度排序 sorted_ind = np.argsort(-confidence) #对confidence的index根据值大小进行降序排列。 sorted_scores = np.sort(-confidence) #降序排列。 print('BB.shape:', BB.shape) print('sorted_ind.shape:', sorted_ind.shape) BB = BB[sorted_ind, :] #重排bbox,由大概率到小概率。 image_ids = [image_ids[x] for x in sorted_ind] #对image_ids相应地进行重排。 # go down dets and mark TPs and FPs nd = len(image_ids) #注意这里是20000,不是1000 tp = np.zeros(nd) # true positive,长度20000 fp = np.zeros(nd) # false positive,长度20000 for d in range(nd): #遍历所有检测结果,因为已经排序,所以这里是从置信度最高到最低遍历 R = class_recs[image_ids[d]] #当前检测结果所在图像的所有同类别gt bb = BB[d, :].astype(float) #当前检测结果bbox坐标,1个bbox ovmax = -np.inf BBGT = R['bbox'].astype(float) #当前检测结果所在图像的所有同类别gt的bbox坐标,含有N个 if BBGT.size > 0: # compute overlaps 计算当前检测结果,与该检测结果所在图像的标注重合率,一对多用到python的broadcast机制 # intersection ixmin = np.maximum(BBGT[:, 0], bb[0]) iymin = np.maximum(BBGT[:, 1], bb[1]) ixmax = np.minimum(BBGT[:, 2], bb[2]) iymax = np.minimum(BBGT[:, 3], bb[3]) iw = np.maximum(ixmax - ixmin + 1., 0.) ih = np.maximum(iymax - iymin + 1., 0.) inters = iw * ih # union uni = ((bb[2] - bb[0] + 1.) * (bb[3] - bb[1] + 1.) + (BBGT[:, 2] - BBGT[:, 0] + 1.) * (BBGT[:, 3] - BBGT[:, 1] + 1.) - inters) overlaps = inters / uni ovmax = np.max(overlaps) #最大重合率 jmax = np.argmax(overlaps) #最大重合率对应的gt if ovmax > ovthresh: #如果当前检测结果与真实标注最大重合率满足阈值 if not R['difficult'][jmax]: if not R['det'][jmax]: tp[d] = 1. #正检数目+1 R['det'][ jmax] = 1 #该gt被置为已检测到,下一次若还有另一个检测结果与之重合率满足阈值,则不能认为多检测到一个目标 else: #相反,认为检测到一个虚警 fp[d] = 1. else: #不满足阈值,肯定是虚警 fp[d] = 1. # compute precision recall fp = np.cumsum(fp) #积分图,在当前节点前的虚警数量,fp长度 tp = np.cumsum(tp) #积分图,在当前节点前的正检数量 rec = tp / float(npos) #召回率,长度20000,从0到1 # avoid divide by zero in case the first detection matches a difficult # ground truth 准确率,长度20000,长度20000,从1到0 prec = tp / np.maximum(tp + fp, np.finfo(np.float64).eps) ap = voc_ap(rec, prec, use_07_metric) return rec, prec, ap
npop = 100  # number of episodes
sigma = 0.1
alpha = 0.03
iter_num = 300
aver_reward = None
allow_writing = True
reload = False
# reload = True basically loads a pre-made (and pretty good) model - it's
# supposed to be kind of a demo
iterations = 1000

# graphing set up
areward = []

print(hl_size, version, npop, sigma, alpha, iter_num)

if reload:
    # loads pre-made model
    model = pickle.load(open('model-pedal%d.p' % version, 'rb'))
else:
    # creates new, random model
    model = {}
    # np.random.randn fills with random samples from the standardized normal distribution
    model['W1'] = np.random.randn(24, hl_size) / np.sqrt(24)      # input-hiddenlayer ... 24 x hl_size
    model['W2'] = np.random.randn(hl_size, 4) / np.sqrt(hl_size)  # hiddenlayer-output


def afunction(x):
    return x / (1 + np.absolute(x))


def get_action(state, model):
    # print(state)
import _pickle as pickle  # cPickle
from collections import Counter  # tallies the total count of words in a list
import keras
import postprocessing as pr
# possibly from https://github.com/steerapi/seq2seq-show-att-tell/blob/master/generate_pretrained_embedding.py

# load data
with open('../data/tokens.pkl', 'rb') as fp:  # use create_picle_file.py
    # heads, descs, and keywords as separate arrays
    heads, descs, keywords = pickle.load(fp)

# headings tuple
i = 0
heads[i]  # Remainders : Super wi-fi edition

# Articles
descs[i]


# tokenize text, return vocab in order of usage (most frequent words, e.g. "the", come first)
def get_vocab(combinedText):
    # TODO: try to get vocab and count in another way
    words = combinedText.split()
    vocab = [word for word, word_count in Counter(words).most_common()]
    return vocab


vocab = get_vocab(heads[i] + descs[i])
print(vocab[:50])
print('...', len(vocab))
from neoStructures import *
import matplotlib.pyplot as plt
from os.path import isdir, join
import _pickle
import seaborn as sns

sns.set()
sns.set_context('paper')

import sys
print(sys.version)

#######################
# Load neoEpoch data with _pickle
data_dir = join('pySpikeAnalysis', 'sample_data') if isdir('pySpikeAnalysis') else join(
    '..', '..', 'pySpikeAnalysis', 'sample_data')
neo_epoch_filename = r'neoepoch_071118_1132.p'
with open(join(data_dir, neo_epoch_filename), 'rb') as f:
    neo_epoch = _pickle.load(f)
neo_epoch.save_fig = 0

##############################
# See information about NeoAll
print(neo_epoch)

##############################
# Plot the raster plot for unit 4
neo_epoch.plot_rasterplot(4)
## BPE args
parser.add_argument('--bpe_codes', type=str, default='data/bpe.codes')
parser.add_argument('--bpe_vocab', type=str, default='data/vocab.txt')
parser.add_argument('--bpe_vocab_thresh', type=int, default=50)

args = parser.parse_args()

os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu

# load saved models
pp_model = torch.load(args.pp_model)
parse_model = torch.load(args.parse_model)

# load vocab
pp_vocab, rev_pp_vocab = cPickle.load(open(args.vocab, 'rb'))

tag_file = codecs.open(args.parse_vocab, 'r', 'utf-8')
parse_gen_voc = {}
for idx, line in enumerate(tag_file):
    line = line.strip()
    parse_gen_voc[line] = idx
rev_label_voc = dict((v, k) for (k, v) in parse_gen_voc.items())

# load paraphrase network
pp_args = pp_model['config_args']
net = SCPN(pp_args.d_word, pp_args.d_hid, pp_args.d_nt, pp_args.d_trans,
           len(pp_vocab), len(parse_gen_voc) - 1, pp_args.use_input_parse)
net.cuda()
net.load_state_dict(pp_model['state_dict'])
net.eval()
def main(): parser = argparse.ArgumentParser(description='Mask R-CNN') parser.add_argument('--gpu', '-g', type=int, default=0) parser.add_argument('--lr', '-l', type=float, default=1e-3) parser.add_argument('--out', '-o', default='result', help='Output directory') parser.add_argument('--iteration', '-i', type=int, default=200000) parser.add_argument('--weight', '-w', type=str, default='') parser.add_argument('--label_file', '-f', type=str, default='data/label_coco.txt') parser.add_argument('--backbone', type=str, default='fpn') parser.add_argument('--head_arch', '-a', type=str, default='fpn') parser.add_argument('--multi_gpu', '-m', type=int, default=0) parser.add_argument('--batch_size', '-b', type=int, default=1) args = parser.parse_args() print('lr:{}'.format(args.lr)) print('output:{}'.format(args.out)) print('weight:{}'.format(args.weight)) print('label file:{}'.format(args.label_file)) print('iteration::{}'.format(args.iteration)) print('backbone architecture:{}'.format(args.backbone)) print('head architecture:{}'.format(args.head_arch)) if args.multi_gpu: print( 'try to use chainer.training.updaters.MultiprocessParallelUpdater') if not chainer.training.updaters.MultiprocessParallelUpdater.available( ): print('MultiprocessParallelUpdater is not available') args.multi_gpu = 0 with open(args.label_file, "r") as f: labels = f.read().strip().split("\n") faster_rcnn = MaskRCNNResnet50(n_fg_class=len(labels), backbone=args.backbone, head_arch=args.head_arch) faster_rcnn.use_preset('evaluate') model = FPNMaskRCNNTrainChain(faster_rcnn, mask_loss_fun=calc_mask_loss) if exists(args.weight): chainer.serializers.load_npz(args.weight, model.faster_rcnn, strict=False) if args.gpu >= 0: chainer.cuda.get_device_from_id(args.gpu).use() model.to_gpu() optimizer = chainer.optimizers.MomentumSGD(lr=args.lr, momentum=0.9) optimizer.setup(model) optimizer.add_hook(chainer.optimizer.WeightDecay(rate=0.0005)) pkl_file = 'train_data.pkl' if isfile(pkl_file): print('pklから読み込みます') dataload_start = time.time() with open(pkl_file, 'rb') as f: coco_train_data = pickle.load(f) dataload_end = time.time() print('pklからの読み込み {}'.format(dataload_end - dataload_start)) else: dataload_start = time.time() coco_train_data = COCOMaskLoader(category_filter=labels) dataload_end = time.time() print('普通の読み込み {}'.format(dataload_end - dataload_start)) print('次回のために保存します') with open(pkl_file, 'wb') as f: pickle.dump(coco_train_data, f) train_data = TransformDataset(coco_train_data, Transform(faster_rcnn)) if args.multi_gpu: train_iters = [ chainer.iterators.SerialIterator(train_data, 1, repeat=True, shuffle=True) for i in range(8) ] updater = chainer.training.updater.MultiprocessParallelUpdater( train_iters, optimizer, device=range(8)) else: train_iter = chainer.iterators.SerialIterator( train_data, batch_size=args.batch_size, repeat=True, shuffle=False) updater = chainer.training.updater.StandardUpdater(train_iter, optimizer, device=args.gpu) trainer = chainer.training.Trainer(updater, (args.iteration, 'iteration'), args.out) trainer.extend(extensions.snapshot_object( model.faster_rcnn, 'model_{.updater.iteration}.npz'), trigger=(5000, 'iteration')) trainer.extend(extensions.ExponentialShift('lr', 0.1), trigger=(2, 'epoch')) log_interval = 100, 'iteration' trainer.extend(chainer.training.extensions.observe_lr(), trigger=log_interval) trainer.extend(extensions.LogReport(trigger=log_interval)) trainer.extend(extensions.PrintReport([ 'iteration', 'epoch', 'elapsed_time', 'lr', 'main/loss', 'main/mask_loss', 'main/roi_loc_loss', 
'main/roi_cls_loss', 'main/rpn_loc_loss', 'main/rpn_cls_loss', ]), trigger=(100, 'iteration')) trainer.extend(extensions.ProgressBar(update_interval=200)) trainer.extend(extensions.dump_graph('main/loss')) save_args(args, args.out) trainer.extend(CommandsExtension(), trigger=(100, 'iteration')) trainer.run()
f.write('abc\n')
f.write('def')
f.write('feg')

l1 = [[3, 6, 9], [6, 8, 5], [9, 7, 2]]
out_file = 'out_list.txt'
with open(out_file, 'w') as f:
    f.write(str(l1))

print('------ PICKLE -------')
import _pickle

out_file = 'out_list.dat'
with open(out_file, 'wb') as f:
    _pickle.dump(l1, f)

with open(out_file, 'rb') as f:
    out_obj = _pickle.load(f)
print(out_obj)
print(sum(out_obj[0]))

print('------ JSON -------')
import json

out_file = 'out_list.json'
with open(out_file, 'w') as f:
    json.dump(l1, f)

with open(out_file, 'r') as f:
    out_obj = json.load(f)
print(out_obj)
print(sum(out_obj[0]))
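# A brief follow-up to the pickle/json comparison above: json only handles
# basic types (lists, dicts, strings, numbers), while pickle can serialize
# arbitrary Python objects. A minimal illustration; the Point class here is a
# made-up example, not part of the script above:
import _pickle
import json


class Point:
    def __init__(self, x, y):
        self.x, self.y = x, y


p = Point(1, 2)
with open('out_point.dat', 'wb') as f:
    _pickle.dump(p, f)   # works: pickle stores the instance state plus a class reference
try:
    json.dumps(p)        # fails: Point is not JSON serializable
except TypeError as e:
    print('json refused:', e)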
def read(num_steps, binaresed=True): ### root = './data/' data_set = pd.read_table(root + 'train.tsv', sep='\t', header=None) data_set = data_set.drop(columns=[0, 8, 9, 10, 11, 12]) data_set = data_set.rename( columns={ 1: "label", 2: "statement", 3: "subject", 4: "speaker", 5: "job", 6: "state", 7: "party", 13: "venue" }) embeddings_index = {} with open(root + 'glove.6B.100d.txt', encoding="utf8") as fp: for line in fp: values = line.split() vectors = np.asarray(values[1:], dtype='float32') embeddings_index[values[0].lower()] = vectors val_set = pd.read_table(root + 'valid.tsv', sep='\t', header=None) val_set = val_set.drop(columns=[0, 8, 9, 10, 11, 12]) val_set = val_set.rename( columns={ 1: "label", 2: "statement", 3: "subject", 4: "speaker", 5: "job", 6: "state", 7: "party", 13: "venue" }) ### test_set = pd.read_table(root + 'test.tsv', sep='\t', header=None) test_set = test_set.drop(columns=[0, 8, 9, 10, 11, 12]) test_set = test_set.rename( columns={ 1: "label", 2: "statement", 3: "subject", 4: "speaker", 5: "job", 6: "state", 7: "party", 13: "venue" }) ### if binaresed == True: dim_class = 2 label_dict = { 'pants-fire': 0, 'false': 0, 'barely-true': 0, 'half-true': 1, 'mostly-true': 1, 'true': 1 } label_reverse_arr = [ 'pants-fire', 'false', 'barely-true', 'half-true', 'mostly-true', 'true' ] else: dim_class = 6 label_dict = { 'pants-fire': 0, 'false': 1, 'barely-true': 2, 'half-true': 3, 'mostly-true': 4, 'true': 5 } label_reverse_arr = [ 'pants-fire', 'false', 'barely-true', 'half-true', 'mostly-true', 'true' ] ### __Transform the label into real scalars__ def create_one_hot(x): return keras.utils.to_categorical(label_dict[x], num_classes=6) data_set['label_id'] = data_set['label'].apply(lambda x: label_dict[x]) val_set['label_id'] = val_set['label'].apply(lambda x: label_dict[x]) test_set['label_id'] = test_set['label'].apply(lambda x: label_dict[x]) ### __Transform speakers as real scalars__ speakers = [ 'barack-obama', 'donald-trump', 'hillary-clinton', 'mitt-romney', 'scott-walker', 'john-mccain', 'rick-perry', 'chain-email', 'marco-rubio', 'rick-scott', 'ted-cruz', 'bernie-s', 'chris-christie', 'facebook-posts', 'charlie-crist', 'newt-gingrich', 'jeb-bush', 'joe-biden', 'blog-posting', 'paul-ryan' ] speaker_dict = {} for cnt, speaker in enumerate(speakers): speaker_dict[speaker] = cnt def map_speaker(speaker): if isinstance(speaker, str): speaker = speaker.lower() matches = [s for s in speakers if s in speaker] if len(matches) > 0: return speaker_dict[matches[0]] #Return index of first match else: return len(speakers) else: return len(speakers) #Nans or un-string data goes here. data_set['speaker_id'] = data_set['speaker'].apply(map_speaker) val_set['speaker_id'] = val_set['speaker'].apply(map_speaker) ### __Transform job as real scalar__ data_set['job'].value_counts()[:10] job_list = [ 'president', 'u.s. senator', 'governor', 'president-elect', 'presidential candidate', 'u.s. representative', 'state senator', 'attorney', 'state representative', 'congress' ] job_dict = { 'president': 0, 'u.s. senator': 1, 'governor': 2, 'president-elect': 3, 'presidential candidate': 4, 'u.s. representative': 5, 'state senator': 6, 'attorney': 7, 'state representative': 8, 'congress': 9 } def map_job(job): if isinstance(job, str): job = job.lower() matches = [s for s in job_list if s in job] if len(matches) > 0: return job_dict[matches[0]] #Return index of first match else: return 10 #This maps any other job to index 10 else: return 10 #Nans or un-string data goes here. 
data_set['job_id'] = data_set['job'].apply(map_job) val_set['job_id'] = val_set['job'].apply(map_job) ### __Transform party as real scalar__ data_set['party'].value_counts() party_dict = { 'republican': 0, 'democrat': 1, 'none': 2, 'organization': 3, 'newsmaker': 4 } #default index for rest party is 5 def map_party(party): if party in party_dict: return party_dict[party] else: return 5 data_set['party_id'] = data_set['party'].apply(map_party) val_set['party_id'] = val_set['party'].apply(map_party) ### __Transform states as real scalar__ #print data_set['state'].value_counts()[0:50] #Possible groupings (50 groups + 1 for rest) states = [ 'Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine' 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin', 'Wyoming' ] #states_dict = {} #i = 0 #for state in states: # state_key = state.lower() # states_dict[state_key] = i # i += 1 #print len(states_dict.keys()) states_dict = { 'wyoming': 48, 'colorado': 5, 'washington': 45, 'hawaii': 10, 'tennessee': 40, 'wisconsin': 47, 'nevada': 26, 'north dakota': 32, 'mississippi': 22, 'south dakota': 39, 'new jersey': 28, 'oklahoma': 34, 'delaware': 7, 'minnesota': 21, 'north carolina': 31, 'illinois': 12, 'new york': 30, 'arkansas': 3, 'west virginia': 46, 'indiana': 13, 'louisiana': 17, 'idaho': 11, 'south carolina': 38, 'arizona': 2, 'iowa': 14, 'mainemaryland': 18, 'michigan': 20, 'kansas': 15, 'utah': 42, 'virginia': 44, 'oregon': 35, 'connecticut': 6, 'montana': 24, 'california': 4, 'massachusetts': 19, 'rhode island': 37, 'vermont': 43, 'georgia': 9, 'pennsylvania': 36, 'florida': 8, 'alaska': 1, 'kentucky': 16, 'nebraska': 25, 'new hampshire': 27, 'texas': 41, 'missouri': 23, 'ohio': 33, 'alabama': 0, 'new mexico': 29 } def map_state(state): if isinstance(state, str): state = state.lower() if state in states_dict: return states_dict[state] else: if 'washington' in state: return states_dict['washington'] else: return 50 #This maps any other location to index 50 else: return 50 #Nans or un-string data goes here. 
data_set['state_id'] = data_set['state'].apply(map_state) val_set['state_id'] = val_set['state'].apply(map_state) ### __Transform subject as real scalar__ data_set['subject'].value_counts()[0:5] #Possible groups (14) subject_list = [ 'health', 'tax', 'immigration', 'election', 'education', 'candidates-biography', 'economy', 'gun', 'jobs', 'federal-budget', 'energy', 'abortion', 'foreign-policy' ] subject_dict = { 'health': 0, 'tax': 1, 'immigration': 2, 'election': 3, 'education': 4, 'candidates-biography': 5, 'economy': 6, 'gun': 7, 'jobs': 8, 'federal-budget': 9, 'energy': 10, 'abortion': 11, 'foreign-policy': 12 } #health-care,taxes,immigration,elections,education,candidates-biography,guns, #economy&jobs ,federal-budget,energy,abortion,foreign-policy,state-budget, rest #Economy & Jobs is bundled together, because it occurs together def map_subject(subject): if isinstance(subject, str): subject = subject.lower() matches = [s for s in subject_list if s in subject] if len(matches) > 0: return subject_dict[matches[0]] #Return index of first match else: return 13 #This maps any other subject to index 13 else: return 13 #Nans or un-string data goes here. data_set['subject_id'] = data_set['subject'].apply(map_subject) val_set['subject_id'] = val_set['subject'].apply(map_subject) ### __Transform venue as real scalar__ data_set['venue'].value_counts()[0:15] venue_list = [ 'news release', 'interview', 'tv', 'radio', 'campaign', 'news conference', 'press conference', 'press release', 'tweet', 'facebook', 'email' ] venue_dict = { 'news release': 0, 'interview': 1, 'tv': 2, 'radio': 3, 'campaign': 4, 'news conference': 5, 'press conference': 6, 'press release': 7, 'tweet': 8, 'facebook': 9, 'email': 10 } def map_venue(venue): if isinstance(venue, str): venue = venue.lower() matches = [s for s in venue_list if s in venue] if len(matches) > 0: return venue_dict[matches[0]] #Return index of first match else: return 11 #This maps any other venue to index 11 else: return 11 #Nans or un-string data goes here. #possibe groups (12) #news release, interview, tv (television), radio, campaign, news conference, press conference, press release, #tweet, facebook, email, rest data_set['venue_id'] = data_set['venue'].apply(map_venue) val_set['venue_id'] = val_set['venue'].apply(map_venue) ### #Tokenize statement and vocab test vocab_dict = {} from keras.preprocessing.text import Tokenizer if not os.path.exists('vocab.p'): t = Tokenizer() t.fit_on_texts(data_set['statement']) vocab_dict = t.word_index cpickle.dump(t.word_index, open("vocab.p", "wb")) print('Vocab dict is created') print('Saved vocab dict to pickle file') else: print('Loading vocab dict from pickle file') vocab_dict = cpickle.load(open("vocab.p", "rb")) ## #Get all preprocessing done for test data test_set['job_id'] = test_set['job'].apply(map_job) #Job test_set['party_id'] = test_set['party'].apply(map_party) #Party test_set['state_id'] = test_set['state'].apply(map_state) #State test_set['subject_id'] = test_set['subject'].apply(map_subject) #Subject test_set['venue_id'] = test_set['venue'].apply(map_venue) #Venue test_set['speaker_id'] = test_set['speaker'].apply(map_speaker) #Speaker #To access particular word_index. Just load these. 
#To map the words in a statement back to indices, use the Keras tokenizer utilities.
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing import sequence
#text = text_to_word_sequence(data_set['statement'][0])
#print text
#val = [vocab_dict[t] for t in text]
#print val


def pre_process_statement(statement):
    text = text_to_word_sequence(statement)
    #Words that are not in vocab_dict are dropped
    val = [vocab_dict[t] for t in text if t in vocab_dict]
    return val


#Create the embedding matrix so the pre-trained embeddings can be fed in directly
num_words = len(vocab_dict) + 1
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in vocab_dict.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
#Reset embeddings_index since it takes a lot of memory
embeddings_index = None

####
#Hyper parameter definitions
vocab_length = len(vocab_dict.keys())
data_set['word_ids'] = data_set['statement'].apply(pre_process_statement)
val_set['word_ids'] = val_set['statement'].apply(pre_process_statement)
test_set['word_ids'] = test_set['statement'].apply(pre_process_statement)
X_train = data_set['word_ids']
Y_train = data_set['label_id']
X_val = val_set['word_ids']
Y_val = val_set['label_id']
X_test = test_set['word_ids']
Y_test = test_set['label_id']
X_train = sequence.pad_sequences(X_train, maxlen=num_steps, padding='post', truncating='post')
Y_train = keras.utils.to_categorical(Y_train, num_classes=dim_class)
X_val = sequence.pad_sequences(X_val, maxlen=num_steps, padding='post', truncating='post')
Y_val = keras.utils.to_categorical(Y_val, num_classes=dim_class)
X_test = sequence.pad_sequences(X_test, maxlen=num_steps, padding='post', truncating='post')
Y_test = keras.utils.to_categorical(Y_test, num_classes=dim_class)

###
#Meta data preparation
a = keras.utils.to_categorical(data_set['party_id'], num_classes=num_party)
b = keras.utils.to_categorical(data_set['state_id'], num_classes=num_state)
c = keras.utils.to_categorical(data_set['venue_id'], num_classes=num_venue)
d = keras.utils.to_categorical(data_set['job_id'], num_classes=num_job)
e = keras.utils.to_categorical(data_set['subject_id'], num_classes=num_sub)
f = keras.utils.to_categorical(data_set['speaker_id'], num_classes=num_speaker)
X_train_meta = np.hstack((a, b, c, d, e, f))  #concatenate all train metadata
a_val = keras.utils.to_categorical(val_set['party_id'], num_classes=num_party)
b_val = keras.utils.to_categorical(val_set['state_id'], num_classes=num_state)
c_val = keras.utils.to_categorical(val_set['venue_id'], num_classes=num_venue)
d_val = keras.utils.to_categorical(val_set['job_id'], num_classes=num_job)
e_val = keras.utils.to_categorical(val_set['subject_id'], num_classes=num_sub)
f_val = keras.utils.to_categorical(val_set['speaker_id'], num_classes=num_speaker)
X_val_meta = np.hstack((a_val, b_val, c_val, d_val, e_val, f_val))  #concatenate all validation metadata
a_test = keras.utils.to_categorical(test_set['party_id'], num_classes=num_party)
b_test = keras.utils.to_categorical(test_set['state_id'], num_classes=num_state)
c_test = keras.utils.to_categorical(test_set['venue_id'], num_classes=num_venue)
d_test = keras.utils.to_categorical(test_set['job_id'], num_classes=num_job)
e_test = keras.utils.to_categorical(test_set['subject_id'], num_classes=num_sub)
f_test = keras.utils.to_categorical(test_set['speaker_id'], num_classes=num_speaker)
X_test_meta = np.hstack((a_test, b_test, c_test, d_test, e_test, f_test))  #concatenate all test metadata

return (X_train_meta, X_val_meta, X_test_meta), (X_train, Y_train), (X_val, Y_val), (
    X_test, Y_test), vocab_length, EMBEDDING_DIM, embedding_matrix, label_reverse_arr
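# Aside (not from the original project): a minimal, self-contained sketch of the two
# Keras helpers the block above relies on, pad_sequences and to_categorical, using
# hypothetical toy inputs so their output shapes are easy to check in isolation.
from keras.preprocessing import sequence
from keras.utils import to_categorical

toy_word_ids = [[4, 12, 7], [9, 2]]  # two tokenized statements as word-id lists
padded = sequence.pad_sequences(toy_word_ids, maxlen=5, padding='post', truncating='post')
onehot = to_categorical([0, 3], num_classes=6)  # two labels one-hot encoded over 6 classes
print(padded.shape, onehot.shape)  # (2, 5) (2, 6)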
# -*- coding: utf-8 -*- """ Created on Mon May 6 16:27:02 2019 @author: Turing """ from _pickle import load from nltk.stem.snowball import SpanishStemmer diccionario_polaridad = {} inputt = open('diccionario_polaridades.pk1', 'rb') diccionario_polaridad = load(inputt) inputt.close() resenhas_categoria = {} inputt = open('resenhas_por_categoria_dict.pk1', 'rb') resenhas_categoria = load(inputt) inputt.close() categoria_polaridad = [] palabras_no_encontradas = 0 total_palabras_encontradas = 0 ss = SpanishStemmer() for categoria in resenhas_categoria: valor_categoria = 0 palabras_encontradas = 0 for resenha in resenhas_categoria[categoria]: for word in resenha[0].split(): if diccionario_polaridad.get(ss.stem(word).lower()):
'SrTEMP_FLUX_ADJ','SrTEMP_FLUX_FB',\ 'SrWV_FLUX_ADJ','SrWV_FLUX_FB',\ 'TrWV_FLUX_ADJ','TrWV_FLUX_FB',\ 'TrALB_FLUX_ADJ','TrALB_FLUX_FB'] Kernels = ['CAM3', 'CAM5', 'ECHAM6_ctr', 'ERA', 'GFDL', 'HadGEM2'] print('read in data') start_time = time.time() Variables = dict() for v in range(len(Vari)): Variables[Vari[v]] = dict() for k in range(len(Kernels)): if Vari[v] == 'DIR_FLUX' or Vari[v] == 'DIR_FLUXCS': Variables[Vari[v]][Kernels[k]] = pk.load(open(\ Source+models+'_'+Vari[v]+'_Grid.pi','rb'))[Kernels[k]] else: Variables[Vari[v]][Kernels[k]] = np.expand_dims(pk.load(open(\ Source+models+'_'+Vari[v]+'_Grid.pi','rb'))[Kernels[k]],axis=0) end_time = time.time() - start_time print('time to read in =', end_time / 60, 'minutes') start_time = time.time() print('area averaging') Variables_AA = dict() Variables_Flatten = dict() for v in range(len(Vari)): Variables_AA[Vari[v]] = dict() Variables_Flatten[Vari[v]] = dict() print('on variable', Vari[v])
model_bin.compile(loss='binary_crossentropy', optimizer=nadam_opt) model_bin.fit(training_LSTM_8_pad, y_LSTM_8, epochs=4) #early_stopping = EarlyStopping(patience=0,mode="min",monitor='val_loss') final = time.time() model_bin.save("./binaryKeras.model") model_bin = tensorflow.keras.models.load_model("./binaryKeras.model") explainer_bin = shap.DeepExplainer(model_bin, training_LSTM_8_pad) training_LSTM_8_pad_B = explainer_bin.shap_values(training_LSTM_8_pad) pickle.dump(training_LSTM_8_pad_B, open("./fitxerShapleyLSTM_8_B", "wb")) training_LSTM_8_pad_B = pickle.load(open("./fitxerShapleyLSTM_8_B", "rb")) nameVariables = list(proyectos.columns) nameVariables[0] = "UN LDCs" nameVariables[1] = "GDP per capita" nameVariables[2] = "Public Grant" nameVariables[3] = "Budget Previous Year" nameVariables[4] = "Donor Aid Budget" nameVariables[5] = "Latin America Mission" nameVariables[6] = "Africa Mission" shap_Specific = [] shap_SpecificValues = [] shap_Specific_P = [] shap_SpecificValues_P = [] len(training_LSTM_8_pad_B[0][0])
from utilities import label_img_to_color from model import ENet_model from config import output_dir, run_dir, demo_dir # environ['CUDA_VISIBLE_DEVICES'] = "2" @cli_parse class G: model_id = "demo_sequence" data_path = "../../datasets/miniscapes-processed/demoVideo/stuttgart_00" results_dir = "../../runs/image-segmentation/stuttgart_02" # load the mean color channels of the train imgs: train_mean_channels = cPickle.load(open(path.join(output_dir, "mean_channels.pkl"), "rb")) # load the sequence data: seq_frame_paths = [] frame_names = sorted(listdir(G.data_path)) for step, frame_name in enumerate(tqdm(frame_names)): frame_path = path.join(G.data_path, frame_name) seq_frame_paths.append(frame_path) # validate_files(seq_frame_paths) # exit() # define where to place the resulting images: try: makedirs(G.results_dir) except FileExistsError as e:
def load(self): if self.cv is None: cl_train, cl_val = None, None else: cl_train, cl_val = self.filter_cl() with open(self.pkl_path, 'rb') as pkl_file: # read response res = pkl.load(pkl_file) res = res.loc[res['SOURCE'] == self.source] if cl_train is not None: res_train = res[res['ccl_name'].isin(cl_train)] res_val = res[res['ccl_name'].isin(cl_val)] else: res_train = res res_val = None # load cl properties and filter by geneGE genomics = pkl.load(pkl_file) cols = [ x if x.startswith('geneGE_') else None for x in genomics.columns.tolist() ] cols = list(filter(lambda x: x is not None, cols)) genomics = genomics[cols] # load drug descriptors drug = pkl.load(pkl_file) df_y_train = res_train.reset_index(drop=True) df_x_train_cl = df_y_train.merge(genomics, left_on='ccl_name', how='left', right_index=True) df_x_train_dr = df_y_train.merge(drug, left_on='ctrpDrugID', how='left', right_index=True) df_x_train_cl.drop(columns=[ 'SOURCE', 'ccl_name', 'ctrpDrugID', 'area_under_curve', 'groupID' ], inplace=True) df_x_train_dr.drop(columns=[ 'SOURCE', 'ccl_name', 'ctrpDrugID', 'area_under_curve', 'groupID' ], inplace=True) df_y_val = res_val.reset_index(drop=True) df_x_val_cl = df_y_val.merge(genomics, left_on='ccl_name', how='left', right_index=True) df_x_val_dr = df_y_val.merge(drug, left_on='ctrpDrugID', how='left', right_index=True) df_x_val_cl.drop(columns=[ 'SOURCE', 'ccl_name', 'ctrpDrugID', 'area_under_curve', 'groupID' ], inplace=True) df_x_val_dr.drop(columns=[ 'SOURCE', 'ccl_name', 'ctrpDrugID', 'area_under_curve', 'groupID' ], inplace=True) return (df_y_train, df_x_train_cl, df_x_train_dr), (df_y_val, df_x_val_cl, df_x_val_dr)
def load_model_pickle(self, name, path ='../../feature_groups/lda_pickles'): print("loading model: "+name) path = os.path.join(path, name) with open(path, "rb") as f: self.model = _pickle.load(f)
import _pickle
import sklearn
import numpy
import sys

try:
    project = sys.argv[1]
    input_path = sys.argv[2]
    model_path = 'trained_model/' + sys.argv[1] + '.pkl'
except IndexError:
    print('No argument, default model: mesos')
    project = 'mesos'
    input_path = 'input/mesos.pkl'
    model_path = 'trained_model/' + 'mesos' + '_porru.pkl'

# Load the pickled feature matrix and the trained classifier from file objects,
# since _pickle.load expects an open file rather than a path string.
with open(input_path, 'rb') as f:
    input_data = numpy.array(_pickle.load(f))
with open(model_path, 'rb') as f:
    clf = _pickle.load(f)

# Predict on the loaded features.
predict = clf.predict(input_data)
print(predict)
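# Example invocation for the script above (the script name predict_porru.py is
# hypothetical; the arguments follow the sys.argv handling shown):
#   python predict_porru.py mesos input/mesos.pkl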
def Load_Data(data_name): if data_name == 'IMDB': data = cPickle.load(open('./imdb.pkl', 'rb'), encoding='iso-8859-1') if data_name == 'ELEC': data = cPickle.load(open('./ELEC_30k_cwc.pkl', 'rb'), encoding='iso-8859-1') if data_name == 'IMDB_10': new_data = cPickle.load( open('/home/s/CNN-BiLSTM2/hedwig/BCPGDS_decoder/imdb_data.pkl', 'rb')) if data_name == 'Reuters': new_data = cPickle.load(open('./Reuters_data.pkl', 'rb')) new_data_bow = cPickle.load(open('./Reuters_data_bow.pkl', 'rb')) if data_name == 'IMDB_10' or data_name == 'Reuters': #doc_labels = data['labels'] #word_freq = data['word_freq'] word2index = new_data.stoi #word2index = {key: idx for key, idx in word2index1.items() if idx < 3000} index2word = new_data.itos #train_doc_word = data['train_doc_word'] train_doc_split = new_data.data['train'] if data_name == 'IMDB_10': train_doc_split = train_doc_split train_doc_label = np.array(new_data.label['train']) else: train_doc_split = train_doc_split train_doc_label = np.array(new_data_bow.label['train']) if data_name == 'IMDB_10': train_doc_label = train_doc_label else: train_doc_label = train_doc_label #test_doc_word = data['test_doc_word'] test_doc_split = new_data.data['test'] #test_doc_label = np.array(new_data_bow.label['test']) if data_name == 'IMDB_10': test_doc_split = test_doc_split test_doc_label = np.array(new_data.label['test']) else: test_doc_split = test_doc_split test_doc_label = np.array(new_data_bow.label['test']) seq_max_len = 0 # seq_min_len = 999 train_doc_len = [] for i in range(len(train_doc_split)): train_doc_len.append(len(train_doc_split[i])) test_doc_len = [] for i in range(len(test_doc_split)): test_doc_len.append(len(test_doc_split[i])) Data_save = {} Data_save['word2index'] = word2index Data_save['index2word'] = index2word Data_save['train_doc_split'] = train_doc_split Data_save['train_doc_label'] = train_doc_label Data_save['test_doc_split'] = test_doc_split Data_save['test_doc_label'] = test_doc_label cPickle.dump(Data_save, open('./Reuters_new.pkl', 'wb')) else: if data_name == 'ELEC': doc_labels = data['Label'] else: doc_labels = data['labels'] word_freq = data['word_freq'] word2index = data['word2index'] index2word = data['index2word'] train_doc_word = data['train_doc_word'] train_doc_index = data['train_doc_index'] train_doc_label = np.array(data['train_doc_label']) test_doc_word = data['test_doc_word'] test_doc_index = data['test_doc_index'] test_doc_label = np.array(data['test_doc_label']) #================================================== #preprocess num_words = len(index2word) index2word[num_words] = '<pad_zero>' word2index['<pad_zero>'] = num_words num_words = num_words + 1 # num_words = len(index2word) # index2word[1] = '<pad_zero>' # word2index['<pad_zero>'] = 1 # #num_words = num_words + 1 seq_max_len = 0 # seq_min_len = 999 train_doc_split = [] train_doc_split_len = [] train_doc_len = [] split_index = [ word2index['.'], word2index['!'], word2index['?'], word2index['..'], word2index[';'] ] for i in range(len(train_doc_index)): [seqs_len, seqs] = Seq_Split(train_doc_index[i], split_index, word2index['<pad_zero>']) train_doc_split.append(seqs) train_doc_split_len.append(seqs_len) # tmp_min = Seq_Min_Len(seqs) # if tmp_min < seq_min_len: # seq_min_len = tmp_min tmp_max = Seq_Max_Len(seqs) if tmp_max > seq_max_len: seq_max_len = tmp_max train_doc_len.append(len(seqs)) test_doc_split = [] test_doc_split_len = [] test_doc_len = [] for i in range(len(test_doc_index)): [seqs_len, seqs] = Seq_Split(test_doc_index[i], split_index, 
word2index['<pad_zero>']) test_doc_split.append(seqs) test_doc_split_len.append(seqs_len) # tmp_min = Seq_Min_Len(seqs) # if tmp_min < seq_min_len: # seq_min_len = tmp_min tmp_max = Seq_Max_Len(seqs) if tmp_max > seq_max_len: seq_max_len = tmp_max test_doc_len.append(len(seqs)) doc_max_len = max(Seq_Max_Len(train_doc_split), Seq_Max_Len(test_doc_split)) doc_min_len = min(Seq_Min_Len(train_doc_split), Seq_Min_Len(test_doc_split)) doc_max_len_word = max(Word_Max_len(train_doc_split), Word_Max_len(test_doc_split)) doc_ave_len = (Seq_Ave_Len(train_doc_split) + Seq_Ave_Len(test_doc_split)) / 2 return word2index, train_doc_split, train_doc_label, train_doc_len, test_doc_split, test_doc_label, test_doc_len
def simple_search_dag( criteria, db=None, nbblocks=[64], min_seg_len=15, parallel=False, verbosity=0, timing=0, modbbs=None, make_edges=True, merge_bblock=None, merge_segment=None, precache_splices=False, precache_only=False, bbs=None, bblock_ranges=[], only_seg=None, source=None, print_edge_summary=False, no_duplicate_bases=False, shuffle_bblocks=False, use_saved_bblocks=False, output_prefix="./worms", only_ivertex=[], **kw, ): bbdb, spdb = db queries, directions = zip(*criteria.bbspec) tdb = time() if bbs is None: bbs = list() savename = output_prefix + "_bblocks.pickle" if use_saved_bblocks and os.path.exists(savename): with open(savename, "rb") as inp: bbnames_list = _pickle.load(inp) # for i, l in enumerate(bbnames_list) # if len(l) >= nbblocks[i]: # assert 0, f"too many bblocks in {savename}" for i, bbnames in enumerate(bbnames_list): bbs.append([bbdb.bblock(n) for n in bbnames[:nbblocks[i]]]) else: for iquery, query in enumerate(queries): if hasattr(criteria, "cloned_segments"): msegs = [ i + len(queries) if i < 0 else i for i in criteria.cloned_segments() ] if iquery in msegs[1:]: print("seg", iquery, "repeating bblocks from", msegs[0]) bbs.append(bbs[msegs[0]]) continue bbs0 = bbdb.query( query, max_bblocks=nbblocks[iquery], shuffle_bblocks=shuffle_bblocks, parallel=parallel, ) bbs.append(bbs0) if bblock_ranges: bbs_sliced = list() assert len(bblock_ranges) == 2 * len(bbs) for ibb, bb in enumerate(bbs): lb, ub = bblock_ranges[2 * ibb:2 * ibb + 2] bbs_sliced.append(bb[lb:ub]) bbs = bbs_sliced for ibb, bb in enumerate(bbs): print("bblocks", ibb) for b in bb: print(" ", bytes(b.file).decode("utf-8")) bases = [ Counter(bytes(b.base).decode("utf-8") for b in bbs0) for bbs0 in bbs ] assert len(bbs) == len(queries) for i, v in enumerate(bbs): assert len(v) > 0, 'no bblocks for query: "' + queries[i] + '"' print("bblock queries:", str(queries)) print("bblock numbers:", [len(b) for b in bbs]) print("bblocks id:", [id(b) for b in bbs]) print("bblock0 id ", [id(b[0]) for b in bbs]) print("base_counts:") for query, basecount in zip(queries, bases): counts = " ".join(f"{k}: {c}" for k, c in basecount.items()) print(f" {query:10}", counts) if criteria.is_cyclic: # for a, b in zip(bbs[criteria.from_seg], bbs[criteria.to_seg]): # assert a is b bbs[criteria.to_seg] = bbs[criteria.from_seg] if use_saved_bblocks and not os.path.exists(savename): bbnames = [[bytes(b.file).decode("utf-8") for b in bb] for bb in bbs] with open(savename, "wb") as out: _pickle.dump(bbnames, out) else: bbs = bbs.copy() assert len(bbs) == len(criteria.bbspec) if modbbs: modbbs(bbs) if merge_bblock is not None and merge_bblock >= 0: # print('cloned_segments', criteria.bbspec, criteria.cloned_segments()) if hasattr(criteria, "cloned_segments") and merge_segment is None: for i in criteria.cloned_segments(): # print(' ', 'merge seg', i, 'merge_bblock', merge_bblock) bbs[i] = (bbs[i][merge_bblock], ) else: if merge_segment is None: merge_segment = 0 # print(' ', 'merge_segment not None') # print(' ', [len(b) for b in bbs]) # print(' ', 'merge_segment', merge_segment) # print(' ', 'merge_bblock', merge_bblock, len(bbs[merge_segment])) bbs[merge_segment] = (bbs[merge_segment][merge_bblock], ) tdb = time() - tdb # info( # f'bblock creation time {tdb:7.3f} num bbs: ' + # str([len(x) for x in bbs]) # ) if precache_splices: bbnames = [[bytes(bb.file) for bb in bbtup] for bbtup in bbs] bbpairs = set() # for bb1, bb2, dirn1 in zip(bbnames, bbnames[1:], directions): for i in range(len(bbnames) - 1): bb1 = bbnames[i] bb2 = bbnames[i + 1] 
dirn1 = directions[i] rev = dirn1[1] == "N" if bbs[i] is bbs[i + 1]: bbpairs.update((a, a) for a in bb1) else: bbpairs.update( (b, a) if rev else (a, b) for a in bb1 for b in bb2) precompute_splicedb(db, bbpairs, verbosity=verbosity, parallel=parallel, **kw) if precache_only: return bbs verts = [None] * len(queries) edges = [None] * len(queries[1:]) if source: srcdirn = [ "".join("NC_"[d] for d in source.verts[i].dirn) for i in range(len(source.verts)) ] # yapf: disable srcverts, srcedges = list(), list() for i, bb in enumerate(bbs): for isrc, bbsrc in enumerate(source.bbs): # fragile code... detecting this way can be wrong # print(i, isrc, directions[i], srcdirn[isrc]) if directions[i] != srcdirn[isrc]: continue if [b.filehash for b in bb] == [b.filehash for b in bbsrc]: # super hacky fix, really need to be passed info on what's what if srcverts and srcverts[-1] + 1 != isrc: continue verts[i] = source.verts[isrc] srcverts.append(isrc) for i, bb in enumerate(zip(bbs, bbs[1:])): bb0, bb1 = bb for isrc, bbsrc in enumerate(zip(source.bbs, source.bbs[1:])): bbsrc0, bbsrc1 = bbsrc if directions[i] != srcdirn[isrc]: continue if directions[i + 1] != srcdirn[isrc + 1]: continue he = [b.filehash for b in bb0] == [b.filehash for b in bbsrc0] he &= [b.filehash for b in bb1] == [b.filehash for b in bbsrc1] if not he: continue edges[i] = source.edges[isrc] srcedges.append(isrc) if not make_edges: edges = [] tvertex = time() exe = InProcessExecutor() if parallel: exe = cf.ThreadPoolExecutor(max_workers=parallel) with exe as pool: if only_seg is not None: save = bbs, directions bbs = [bbs[only_seg]] directions = [directions[only_seg]] verts = [verts[only_seg]] futures = list() for i, bb in enumerate(bbs): dirn = directions[i] if verts[i] is None: futures.append( pool.submit(Vertex, bb, dirn, min_seg_len=min_seg_len)) verts_new = [f.result() for f in futures] isnone = [i for i in range(len(verts)) if verts[i] is None] for i, inone in enumerate(isnone): verts[inone] = verts_new[i] if source: print('use new vertex', inone) if only_ivertex: # raise NotImplementedError print("!!!!!!! 
using one ivertex !!!!!", only_ivertex, len(verts), [v.len for v in verts]) if len(only_ivertex) != len(verts): print( "NOT altering verts, len(only_ivertex)!=len(verts) continuing...", "this is ok if part of a sub-protocol") else: for i, v in enumerate(verts): if v.len > 1: # could already have been "trimmed" assert only_ivertex[i] < v.len v.reduce_to_only_one_inplace(only_ivertex[i]) # print('x2exit', v.x2exit.shape) # print('x2orig', v.x2orig.shape) # print('ires', v.ires.shape) # print('isite', v.isite.shape) # print('ichain', v.ichain.shape) # print('ibblock', v.ibblock.shape) # print('inout', v.inout.shape, v.inout[10:]) # print('inbreaks', v.inbreaks.shape, v.inbreaks[10:]) # print('dirn', v.dirn.shape) # # assert 0 # print(i, len(verts_new), len(verts)) if isnone: assert i + 1 == len(verts_new) assert all(v for v in verts) if only_seg is not None: verts = [None] * only_seg + verts + [None] * (len(queries) - only_seg - 1) bbs, directions = save tvertex = time() - tvertex # info( # f'vertex creation time {tvertex:7.3f} num verts ' + # str([v.len if v else 0 for v in verts]) # ) if make_edges: tedge = time() for i, e in enumerate(edges): if e is not None: continue edges[i], edge_analysis = Edge( verts[i], bbs[i], verts[i + 1], bbs[i + 1], splicedb=spdb, verbosity=verbosity, precache_splices=precache_splices, **kw, ) allok = all(x[6] for x in edge_analysis) if allok: continue print("=" * 80) print("info for edges with no valid splices", edges[i].total_allowed_splices()) for tup in edge_analysis: iblk0, iblk1, ofst0, ofst1, ires0, ires1 = tup[:6] ok, f_clash, f_rms, f_ncontact, f_ncnh, f_nhc = tup[6:12] m_rms, m_ncontact, m_ncnh, m_nhc = tup[12:] if ok: continue assert len(bbs[i + 0]) > iblk0 assert len(bbs[i + 1]) > iblk1 print("=" * 80) print("egde Bblock A", bytes(bbs[i][iblk0].file)) print("egde Bblock B", bytes(bbs[i + 1][iblk1].file)) print( f"bb {iblk0:3} {iblk1:3}", f"ofst {ofst0:4} {ofst1:4}", f"resi {ires0.shape} {ires1.shape}", ) print( f"clash_ok {int(f_clash*100):3}%", f"rms_ok {int(f_rms*100):3}%", f"ncontact_ok {int(f_ncontact*100):3}%", f"ncnh_ok {int(f_ncnh*100):3}%", f"nhc_ok {int(f_nhc*100):3}%", ) print( f"min_rms {m_rms:7.3f}", f"max_ncontact {m_ncontact:7.3f}", f"max_ncnh {m_ncnh:7.3f}", f"max_nhc {m_nhc:7.3f}", ) print("=" * 80) fok = np.stack([x[7:12] for x in edge_analysis]).mean(axis=0) rmsmin = np.array([x[12] for x in edge_analysis]).min() fmx = np.stack([x[13:] for x in edge_analysis]).max(axis=0) print(f"{' SPLICE FAIL SUMMARY ':=^80}") print(f"splice clash ok {int(fok[0]*100):3}%") print(f"splice rms ok {int(fok[1]*100):3}%") print(f"splice ncontacts ok {int(fok[2]*100):3}%") print(f"splice ncontacts_no_helix ok {int(fok[3]*100):3}%") print(f"splice nhelixcontacted ok {int(fok[4]*100):3}%") print(f"min rms of any failing {rmsmin}") print( f"max ncontact of any failing {fmx[0]} (maybe large for non-5-helix splice)" ) print( f"max ncontact_no_helix {fmx[1]} (will be 999 for non-5-helix splice)" ) print( f"max nhelix_contacted {fmx[2]} (will be 999 for non-5-helix splice)" ) print("=" * 80) assert edges[i].total_allowed_splices() > 0, "invalid splice" tedge = time() - tedge if print_edge_summary: _print_edge_summary(edges) # info( # f'edge creation time {tedge:7.3f} num splices ' + # str([e.total_allowed_splices() # for e in edges]) + ' num exits ' + str([e.len for e in edges]) # ) spdb.sync_to_disk() toret = SearchSpaceDag(criteria.bbspec, bbs, verts, edges) if timing: toret = toret, tdb, tvertex, tedge return toret
def get_group_cached_ts(group_file):
    ret_dict = {}
    have_axis = None
    have_dist = None
    have_ms = None
    for each_group_file in group_file:
        print('Loading group file %s' % each_group_file)
        # pickled group files must be opened in binary mode
        all_data = cPickle.load(open(each_group_file, 'rb'))
        max_kNN_len = all_data['max_kNN_len']
        ret_dict = update_cached_dict(ret_dict, 'max_kNN_len', max_kNN_len,
                                      update_type='update')
        kNN_list = all_data['kNN_list']
        kNN_list = np.asarray(kNN_list)
        ret_dict = update_cached_dict(ret_dict, 'kNN_list', kNN_list)
        kNN_valid_flag = all_data['kNN_valid_flag']
        kNN_valid_flag = np.asarray(kNN_valid_flag)
        ret_dict = update_cached_dict(ret_dict, 'kNN_valid_flag', kNN_valid_flag)
        ret_dict = update_cached_dict(ret_dict, 'mult_mat', all_data['mult_mat'])
        ret_dict = update_cached_dict(ret_dict, 'mult_mat_space', all_data['mult_mat_space'])
        ret_dict = update_cached_dict(ret_dict, 'mult_mat_rev', all_data['mult_mat_rev'])
        ret_dict = update_cached_dict(ret_dict, 'num_p_rep', all_data['num_p_rep'])
        ret_dict = update_cached_dict(ret_dict, 'grav_flag', all_data['grav_flag'])
        depth_list, all_father_list = get_depth_father_list(all_data)
        max_depth = np.max(depth_list)
        ret_dict = update_cached_dict(ret_dict, 'max_depth', max_depth, update_type='update')
        ret_dict = update_cached_dict(ret_dict, 'depth_list', depth_list)
        ret_dict = update_cached_dict(ret_dict, 'father_list', all_father_list)
        drt_father_list = get_drt_father_list(all_data)
        ret_dict = update_cached_dict(ret_dict, 'drt_father_list', drt_father_list)
        no, no_wo_group = all_data['mult_mat'].shape
        no_wo_group_flag = np.arange(no) < no_wo_group
        ret_dict = update_cached_dict(ret_dict, 'no_wo_group_flag', no_wo_group_flag)
        if have_axis:
            assert 'all_axis' in all_data, \
                "All group files should include all_axis! %s" % each_group_file
        if 'all_axis' in all_data:
            have_axis = True
            ret_dict = update_cached_dict(ret_dict, 'all_axis', all_data['all_axis'])
        else:
            have_axis = False
        if have_dist:
            assert 'all_dist' in all_data, \
                "All group files should include all_dist! %s" % each_group_file
        if 'all_dist' in all_data:
            have_dist = True
            ret_dict = update_cached_dict(ret_dict, 'all_dist', all_data['all_dist'])
        else:
            have_dist = False
        if have_ms:
            assert 'L2H_attribute' in all_data, \
                "All group files should include super node! %s" % each_group_file
        if 'L2H_attribute' in all_data:
            have_ms = True
            for key_now in [
                    'L2H_attribute', 'WG_attribute', 'H2L_attribute',
                    'L2H_division', 'WG_division'
            ]:
                ret_dict = update_cached_dict(ret_dict, key_now, all_data[key_now])
        else:
            # track the super-node flag the same way as have_axis and have_dist
            have_ms = False
    ret_dict = pad_to_tensors(ret_dict)
    return ret_dict
def prepared(self):
    self.trainset = cPickle.load(
        open(self.root + 'item_user_recrods.pkl', 'rb'))  # user u -> items the user has viewed -> rating
    self.news_sim_mat = cPickle.load(
        open(self.root + 'item_sim_mat.pkl', 'rb'))  # news m1 -> news m2 -> similarity
def _load_pickled(filepath): with open(filepath, 'rb') as f: data = cPickle.load(f, encoding='latin-1') return data
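# Aside (my addition, not part of the snippets above): encoding='latin-1' is the usual
# way to read pickles produced under Python 2 (e.g. containing numpy arrays or byte
# strings) from Python 3. A hypothetical round-trip, with an assumed file name example.pkl:
import pickle

with open('example.pkl', 'wb') as fo:
    pickle.dump({'a': 1}, fo, protocol=2)  # protocol 2 keeps the file Python-2 readable
with open('example.pkl', 'rb') as fo:
    restored = pickle.load(fo, encoding='latin-1')
print(restored)  # {'a': 1}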
def unpickled(filename):
    #assert os.path.isdir(filename)
    assert os.path.isfile(filename)
    with open(filename, 'rb') as fo:
        data = cPickle.load(fo)
    return data
import os def display(loc, dpname): top, right, bottom, left = loc cv2.rectangle(frame, (left * 4, top * 4), (right * 4, bottom * 4), (0, 0, 255), 2) cv2.rectangle(frame, (left * 4, bottom * 4 - 35), (right * 4, bottom * 4), (0, 0, 255), cv2.FILLED) font = cv2.FONT_HERSHEY_DUPLEX cv2.putText(frame, dpname, (left * 4 + 6, bottom * 4 - 6), font, 1.0, (255, 255, 255), 1) faces = {} for face in os.listdir("faces/"): if not face.startswith("."): with open("faces/" + face, 'rb') as fp: face_info = c.load(fp) faces[face] = {} faces[face]["info"] = face_info faces[face]["name"] = face cam = cv2.VideoCapture(0) while True: _, frame = cam.read() sframe = cv2.resize(frame, (0, 0), fx=0.25, fy=0.25) sframe = sframe[:, :, ::-1] face_locations = f.face_locations(sframe) for loc in face_locations: dpname = "unknown" face_enc = f.face_encodings(sframe, [loc])[0] for face in faces: match = f.compare_faces([faces[face]["info"]], face_enc, tolerance=0.5)
def main(args): def simulate_sorted_idxes(test_batch_num): if (test_batch_num + 1) * batch_size <= len(validation): end_num = (test_batch_num + 1) * batch_size else: end_num = len(validation) #batch_size = end_num - batch_num*batch_size sorted_idxes = sorted( validation[test_batch_num * batch_size:end_num], key=lambda idx: len( sent2idxtensor(tokenized_eng_sentences[idx], idx)), reverse=True) return sorted_idxes def syntax_bleu_acc(pairs_dict, sorted_idexes_dict): acc_list = [] bleu_list = [] for k, pairs_list in pairs_dict.items(): acc = 0 for idx, tup in enumerate(pairs_list): tp1, tp2 = tup[0], tup[1] idx_of_binary = sorted_idexes_dict[k][idx] assert len(tp1) == len(tp2), k assert len([0] + lf_binary_entsRAW[idx_of_binary] + [0]) == len(tp1), "tp1: " + str( tp1) + " , " + "binary : " + str( lf_binary_entsRAW[idx_of_binary]) np_binary = -( np.array([0] + lf_binary_entsRAW[idx_of_binary] + [0]) - 1) tp1, tp2 = np.array(tp1) * np_binary, np.array(tp2) * np_binary acc += list(tp1) == list(tp2) bleu = sentence_bleu([list(tp2)], tp1) bleu_list.append(bleu) acc = acc / len(pairs_list) acc_list.append(acc) return acc_list, bleu_list global split_num global shuffle_scheme lf_binary_entsRAW = cPickle.load(open("data/raw_lf_binary_ent.p", "rb")) split_num = args.split_num shuffle_scheme = args.shuffle_scheme batch_size = 32 exec( open('data_prep/data_prepRAW_Shuffle.py').read(), globals(), globals()) sorted_idexes_dict = {} test_batch_num = 0 while (test_batch_num) * batch_size < len(validation): sorted_idexes_dict[test_batch_num + 1] = simulate_sorted_idxes(test_batch_num) test_batch_num += 1 batch_size = 32 directory = "outputs/" + args.loading_dir + "/validation_results" file_name = directory + "/validation_result.p" dict_pairs = cPickle.load(open(file_name, "rb")) try: tr_pairs = dict_pairs['translation_pairs'] except: tr_pairs = dict_pairs['pairs_dict']['translation_pairs'] tr_pairs = clean_pairs(tr_pairs) syntax_acc_list = syntax_bleu_acc(tr_pairs, sorted_idexes_dict) print("syntax acc is : ", np.mean(syntax_acc_list[0])) print("bleu mean is : ", np.mean(syntax_acc_list[1])) cPickle.dump(syntax_acc_list[1], open(directory + "/bleu_list.p", "wb"))
# for student, points in student_points.items(): # if points >= limit: # passed.append(student) # set(passed) solution_student.append(None) # Assignment 20: # a = [1, 2, 3, 4] # b = a # b.append(80) # a solution_student.append(None) if __name__ == '__main__': import _pickle with open('./solution/solution_2.pkl', 'rb') as solution_file: solution_tutors = _pickle.load(solution_file) if solution_tutors == solution_student: print('Solved!') else: false_answers = [] for index, answer in enumerate(solution_student): if answer != solution_tutors[index]: false_answers.append(str(index + 1)) else: pass print('Try Again! Answer(s) for the assignment(s) {} are wrong'.format(', '.join(false_answers)))
def load(file_path): f = open(file_path, 'rb') model = pickle.load(f) f.close() return model
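# Style note (not from the original): the same loader written with a context manager,
# which closes the file even if pickle.load raises; behaviour is otherwise identical.
import pickle

def load_with_context_manager(file_path):
    with open(file_path, 'rb') as f:
        return pickle.load(f)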
def unpickle(file): with open(file, "rb") as fo: return cPickle.load(fo, encoding="latin1")
'filters'), help='path to the output filters directory', metavar='FILTERS_PATH') parser.add_option( '-c', '--count0', dest='count0', default='4', help=('number of chunks to extract from the first convolutional ' + 'layer, this number is halved for each next layer'), metavar='COUNT0') options, args = parser.parse_args() model = build_weighted_model(options.weights_path) pickle_data_0 = pickle.load(open('../ai-data/data_part0.pkl', 'rb')) pickle_data_1 = pickle.load(open('../ai-data/data_part1.pkl', 'rb')) pickle_data_2 = pickle.load(open('../ai-data/data_part2.pkl', 'rb')) pickle_data_3 = pickle.load(open('../ai-data/data_part3.pkl', 'rb')) pickle_data_concat = { 'x': np.concatenate((pickle_data_0['x'], pickle_data_1['x'], pickle_data_2['x'], pickle_data_3['x'])), 'y': np.concatenate((pickle_data_0['y'], pickle_data_1['y'], pickle_data_2['y'], pickle_data_3['y'])) } extract_filters(model, pickle_data_concat, options.filters_path, int(options.count0))