def _get_normalized_density(data, tag_groups, discretizing_divisor, begin_time, end_time):

    def data_producer():
        l = len(tag_groups)
        for i, tag_group in enumerate(tag_groups):
            if type(tag_group) is str:
                tag_group = {tag_group}
            print("{}/{} ({})".format(i + 1, l, str(tag_group)))
            try:
                T, R = extract_rating_by_time(
                    data, lambda v: any([tag in v["tags"] for tag in tag_group]))
                yield T, R, tag_group
            except RuntimeError as re:
                warnings.warn("No such tags: {}".format(str(tag_group)),
                              RuntimeWarning)

    def frequency_sorter_function(X):
        return np.max(X) - np.min(X)

    tag_time_rating = []
    for T, R, tag_group in data_producer():
        T = discretize(T,
                       discretizing_divisor=discretizing_divisor,
                       begin_x=begin_time,
                       end_x=end_time,
                       normalize=True)
        swing = frequency_sorter_function(T)
        tag_time_rating.append([swing, tag_group, T])
    return sorted(tag_time_rating, key=lambda x: -x[0])

def hit_test(self, position, direction, max_distance=8):
    """Tests whether a block is hit.

    We draw a line from the position in the given direction for at most
    max_distance blocks. If at any point we hit a block, we return that
    block together with the previous block, i.e. the block the line
    passes through just before the hit.

    Args:
        position: The position from which we draw the line.
        direction: The direction in which we draw the line.
        max_distance: The maximum length of the line, in blocks.

    Returns:
        A tuple (prev, curr) with the previous and current block if a
        block has been hit, (None, None) otherwise.
    """
    x, y, z = position
    x_dir, y_dir, z_dir = direction
    num_steps = 10
    x_step = x_dir / num_steps
    y_step = y_dir / num_steps
    z_step = z_dir / num_steps
    prev_pos = None
    for step in xrange(num_steps * max_distance):
        block_pos = discretize((x, y, z))
        if prev_pos != block_pos and self.world.occupied(block_pos):
            return prev_pos, block_pos
        prev_pos = block_pos
        x, y, z = x + x_step, y + y_step, z + z_step
    return None, None

def position_intersects_object(self, position, obj):
    """Checks whether a position intersects with an object."""
    x, y, z = discretize(obj.position)
    for dy in xrange(obj.height):
        if position == (x, y - dy, z):
            return True
    return False

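# Both methods above call a `discretize` helper that is not shown in this
# section. A minimal sketch, assuming block coordinates are obtained by
# rounding each component of the continuous position (the rounding
# convention is an assumption, not necessarily the original one):
def discretize(position):
    """Snap a continuous (x, y, z) position to integer block coordinates."""
    x, y, z = position
    return int(round(x)), int(round(y)), int(round(z))
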
def action(self, state, train=True):
    if train and (random.random() < self.exploration_p):
        return self.action_space.sample()
    self.exploration_p -= EXPLORATION_DEC
    if self.exploration_p < MIN_EXPLORATION:
        self.exploration_p = MIN_EXPLORATION
    discrete = utils.discretize(state, self.observation_space, QUANTA)
    return np.argmax(self.q[tuple(discrete)])

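# `utils.discretize(state, self.observation_space, QUANTA)` is used here and
# in the reward() update further below but is not defined in this section.
# A minimal sketch, assuming each dimension is rescaled into QUANTA equal
# buckets using the space's bounds (an assumed implementation, not
# necessarily the project's):
import numpy as np

def discretize(state, observation_space, quanta):
    """Map a continuous Gym-style state to integer bucket indices.

    Each dimension is rescaled into [0, quanta) using the space's low/high
    bounds, truncated to int, and clipped so boundary values stay in range.
    """
    low = np.asarray(observation_space.low, dtype=float)
    high = np.asarray(observation_space.high, dtype=float)
    ratio = (np.asarray(state, dtype=float) - low) / (high - low)
    return np.clip((ratio * quanta).astype(int), 0, quanta - 1)
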
def get_wheel_bbox(points, shifted_threshold, dim=500):
    heightmap = np.zeros((dim, dim)).astype('uint8')
    x_co, y_co, z_co = utils.discretize(points)
    x_co = x_co[z_co > shifted_threshold]
    y_co = y_co[z_co > shifted_threshold]
    heightmap[x_co, y_co] = 255
    kernel = np.ones((5, 5), np.uint8)
    heightmap = cv2.dilate(heightmap, kernel, iterations=2)
    # Note: the 3-value return of cv2.findContours matches the OpenCV 3.x API.
    _, cnt, _ = cv2.findContours(heightmap, cv2.RETR_TREE,
                                 cv2.CHAIN_APPROX_SIMPLE)
    c = max(cnt, key=cv2.contourArea)
    rect = cv2.minAreaRect(c)
    box = cv2.boxPoints(rect)
    bbox = np.int0(box)
    return bbox, heightmap

def generate_heightmap(points, threshold, dim=500, mask=False):
    heightmap = np.zeros((dim, dim, 3)).astype('float')
    x_co, y_co, z_co = utils.discretize(points)
    x_co -= np.amin(x_co)
    y_co -= np.amin(y_co)
    min_val = np.amin(z_co)
    z_co -= min_val
    threshold -= min_val
    utils.publish_threshold_frame(threshold)
    heightmap[x_co, y_co, 0] = z_co * 20
    heightmap[x_co, y_co, 1] = z_co * 20
    heightmap[x_co, y_co, 2] = z_co * 20
    if mask:
        masked_x_co = x_co[z_co < threshold]
        masked_y_co = y_co[z_co < threshold]
        heightmap[masked_x_co, masked_y_co, 1] = 1
    return heightmap, threshold

def generate_heightmap(points, dim=500):
    '''
    Create a 2D representation of the point cloud

    Input:
        Nx3 array of points in the cloud
        Dimension of the 2D image
    Output:
        2D image of size (dim x dim) representing the point cloud data
    '''
    heightmap = np.zeros((dim, dim)).astype('float')
    x_co, y_co, z_co = utils.discretize(points)
    x_co -= np.amin(x_co)
    y_co -= np.amin(y_co)
    min_val = np.amin(z_co)
    z_co -= min_val
    heightmap[x_co, y_co] = z_co * 10
    return heightmap

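# The heightmap builders above index a fixed-size grid with the output of
# `utils.discretize(points)`, which is not shown. A minimal sketch, assuming
# the helper scales the float x/y/z coordinates by a fixed cell size and
# casts them to integer grid indices (the resolution value is an assumption;
# the callers handle offsets by subtracting the minimum afterwards):
import numpy as np

def discretize(points, resolution=0.01):
    """Convert an Nx3 float point cloud into integer grid coordinates.

    Each coordinate is divided by `resolution` (grid cell size in the
    cloud's units) and cast to int, so the result can index a 2D heightmap.
    """
    points = np.asarray(points)
    x_co = (points[:, 0] / resolution).astype(int)
    y_co = (points[:, 1] / resolution).astype(int)
    z_co = (points[:, 2] / resolution).astype(int)
    return x_co, y_co, z_co
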
def train(model, train_queue, criterion, optimizer, gen):
    model.train()
    for step, (inputs, targets) in enumerate(train_queue):
        #model.copy_arch_parameters(population.get_population()[step % args.pop_size].arch_parameters)
        #assert utils.check_equality(model, population.get_population()[step % args.pop_size].arch_parameters)
        discrete_alphas = utils.discretize(
            population.get_population()[step % args.pop_size].arch_parameters,
            device)
        model.copy_arch_parameters(discrete_alphas)
        assert utils.check_equality(model, discrete_alphas)
        n = inputs.size(0)
        inputs = inputs.to(device)
        targets = targets.to(device)
        optimizer.zero_grad()
        logits = model(inputs)
        loss = criterion(logits, targets)
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
        optimizer.step()
        prec1, prec5 = utils.accuracy(logits, targets, topk=(1, 5))
        population.get_population()[step % args.pop_size].objs.update(loss.data, n)
        population.get_population()[step % args.pop_size].top1.update(prec1.data, n)
        population.get_population()[step % args.pop_size].top5.update(prec5.data, n)
        #population.get_population()[step % args.pop_size].accumulate()
        #print(step)
        if (step + 1) % 100 == 0:
            # break
            logging.info("[{} Generation]".format(gen))
            logging.info(
                "Using Training batch #{} for {}/{} architecture with loss: {}, prec1: {}, prec5: {}"
                .format(step, step % args.pop_size,
                        len(population.get_population()),
                        population.get_population()[step % args.pop_size].objs.avg,
                        population.get_population()[step % args.pop_size].top1.avg,
                        population.get_population()[step % args.pop_size].top5.avg))

def validation(model, valid_queue, criterion, gen):
    #model.eval()
    for i in range(len(population.get_population())):
        valid_start = time.time()
        #model.copy_arch_parameters(population.get_population()[i].arch_parameters)
        #assert utils.check_equality(model, population.get_population()[i].arch_parameters)
        discrete_alphas = utils.discretize(
            population.get_population()[i].arch_parameters, device)
        model.copy_arch_parameters(discrete_alphas)
        assert utils.check_equality(model, discrete_alphas)
        population.get_population()[i].objs.reset()
        population.get_population()[i].top1.reset()
        population.get_population()[i].top5.reset()
        with torch.no_grad():
            for step, (inputs, targets) in enumerate(valid_queue):
                n = inputs.size(0)
                inputs = inputs.to(device)
                targets = targets.to(device)
                logits = model(inputs)
                loss = criterion(logits, targets)
                prec1, prec5 = utils.accuracy(logits, targets, topk=(1, 5))
                population.get_population()[i].objs.update(loss.data, n)
                population.get_population()[i].top1.update(prec1.data, n)
                population.get_population()[i].top5.update(prec5.data, n)
                #print(step)
                #if (step + 1) % 10 == 0:
                #    break
        #print("Finished in {} seconds".format(time.time() - valid_start))
        logging.info(
            "[{} Generation] {}/{} finished with validation loss: {}, prec1: {}, prec5: {}"
            .format(gen, i + 1, len(population.get_population()),
                    population.get_population()[i].objs.avg,
                    population.get_population()[i].top1.avg,
                    population.get_population()[i].top5.avg))

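# Both loops above overwrite the supernet's architecture parameters with
# `utils.discretize(arch_parameters, device)`, which is not included here.
# A minimal sketch of one common choice, assuming each alpha matrix is turned
# into a one-hot (argmax-per-row) matrix on the target device; the scheme
# used by the original utils helper may differ:
import torch

def discretize(arch_parameters, device):
    """Turn each continuous alpha matrix into a one-hot matrix.

    For every row, the largest entry becomes 1 and all others 0, which
    corresponds to picking a single operation per edge.
    """
    discrete = []
    for alphas in arch_parameters:
        one_hot = torch.zeros_like(alphas, device=device)
        one_hot.scatter_(1, alphas.argmax(dim=1, keepdim=True), 1.0)
        discrete.append(one_hot)
    return discrete
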
def analyze_density_by_time(data):
    print("Analyze density by time")
    clean_folders([RESULT_FOLDER])
    T, R = extract_rating_by_time(data, lambda x: True)

    print("Analyze density by time: hourly for weekdays")
    rolled_timestamps = _roll_timestamps(T, _TimestampFormat.HOURLY_WEEKDAY)
    N_rolled, R_rolled = discretize(X=rolled_timestamps,
                                    Y=R,
                                    bins=2 * 24,
                                    normalize=False)
    corresponding_time_ticks = np.arange(0, 24 * 60 * 60, 30 * 60)
    path_to_save = os.path.join(RESULT_FOLDER, "hourly_weekday")
    draw_rating_hourly(corresponding_time_ticks,
                       R_rolled,
                       N_rolled,
                       path_to_save=path_to_save)
    path_to_save = os.path.join(RESULT_FOLDER, "lores_hourly_weekday")
    draw_rating_hourly(corresponding_time_ticks,
                       R_rolled,
                       N_rolled,
                       path_to_save=path_to_save,
                       figsize=(12, 6))

    print("Analyze density by time: daily for weeks")
    rolled_timestamps = _roll_timestamps(T, _TimestampFormat.DAILY)
    N_rolled, R_rolled = discretize(X=rolled_timestamps,
                                    Y=R,
                                    bins=7 * 24,
                                    normalize=False)
    corresponding_time_ticks = np.arange(0, 7 * 24 * 60 * 60, 60 * 60)
    path_to_save = os.path.join(RESULT_FOLDER, "daily")
    draw_rating_daily(corresponding_time_ticks,
                      R_rolled,
                      N_rolled,
                      path_to_save=path_to_save)
    path_to_save = os.path.join(RESULT_FOLDER, "lores_daily")
    draw_rating_daily(corresponding_time_ticks,
                      R_rolled,
                      N_rolled,
                      path_to_save=path_to_save,
                      figsize=(12, 6))

    print("Analyze density by time: monthly")
    rolled_timestamps = _roll_timestamps(T, _TimestampFormat.MONTHLY)
    N_rolled, R_rolled = discretize(X=rolled_timestamps,
                                    Y=R,
                                    bins=30 * 12,
                                    normalize=False)
    corresponding_time_ticks = np.arange(0, 30 * 24 * 60 * 60, 2 * 60 * 60)
    path_to_save = os.path.join(RESULT_FOLDER, "monthly")
    draw_rating_monthly(corresponding_time_ticks,
                        R_rolled,
                        N_rolled,
                        path_to_save=path_to_save)
    path_to_save = os.path.join(RESULT_FOLDER, "lores_monthly")
    draw_rating_monthly(corresponding_time_ticks,
                        R_rolled,
                        N_rolled,
                        path_to_save=path_to_save,
                        figsize=(12, 6))

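# The calls above rely on a local `discretize(X, Y, bins, normalize)` helper
# that is not included in this section. A minimal sketch of the binning
# behaviour assumed by those calls, built on np.histogram; note that other
# call sites in this section (e.g. _get_normalized_density) pass extra keyword
# arguments (discretizing_divisor, begin_x, end_x) that this sketch does not
# handle:
import numpy as np

def discretize(X, Y=None, bins=100, normalize=False):
    """Bin X into `bins` equal-width bins.

    Returns the per-bin counts N and, when Y is given, the per-bin sums of Y.
    With normalize=True the counts are scaled to sum to 1.
    """
    X = np.asarray(X, dtype=float)
    edges = np.linspace(X.min(), X.max(), bins + 1)
    N, _ = np.histogram(X, bins=edges)
    N = N.astype(float)
    if normalize and N.sum() > 0:
        N = N / N.sum()
    if Y is None:
        return N
    Y_binned, _ = np.histogram(X, bins=edges, weights=np.asarray(Y, dtype=float))
    return N, Y_binned
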
def main():
    parser = argparse.ArgumentParser(description="")
    parser.add_argument("-b", "--biom-file", help="An input biom file", required=True)
    parser.add_argument("-m", "--mapping-file", help="A mapping file", required=True)
    parser.add_argument("-c", "--class-label", help="Which data are we trying to analyze", required=True)
    parser.add_argument(
        "-d",
        "--subclass",
        action="append",
        help="Subselect only some of the data - if specified, this option should appear at least twice with the appropriate values. ex: -c SEX -d male -d female",
        required=False,
    )
    parser.add_argument("-o", "--output-folder", help="The folder to output our data to", required=True)
    parser.add_argument("-p", "--min-features", help="Minimum number of features to test", default=50, required=False)
    parser.add_argument("-q", "--max-features", help="Maximum number of features to test", default=150, required=False)
    parser.add_argument(
        "-s",
        "--step-size",
        help="Step size within the range of the number of features to be tested",
        default=1,
        required=False,
    )
    # parser.add_argument("-p", "--predictor", help="Classifier/Predictor used", default="nbc", required=False)
    # As of today, contains only nbc
    parser.add_argument(
        "-j",
        "--objective-function",
        help="Objective function for the feature selection algorithm",
        default="mim",
        required=False,
    )
    parser.add_argument(
        "-t",
        "--output-type",
        help="Data output format. default: csv; options: csv, matlab, r, numpy",
        default="csv",
        required=False,
    )
    parser.add_argument(
        "-f",
        "--select-field",
        help="Field used to extract a subset of the data, e.g. EN_BIOME, COUNTRY. By default the whole dataset is considered",
        default=None,
        required=False,
    )
    parser.add_argument(
        "-g",
        "--value-field",
        action="append",
        help="When used with -f, specifies the value of the field to filter on - THIS IS REQUIRED IF -f IS PRESENT",
        default=None,
        required=False,
    )
    parser.add_argument(
        "-k",
        "--cluster",
        action="append",
        help="Allows subgrouping of some of the labels, e.g. -k 'Vegan;Vegan+Seafood'. The different values are separated with semicolons. Requires at least two appearances. Cannot be used in conjunction with the -d option",
        default=None,
        required=False,
    )
    ## Need to be continued!!!!
print "Definition of the arguments done" global output_type print "Start of the program" args = parser.parse_args() output_type = args.output_type.lower() # if our folder doesn't exist create it if not os.path.isdir(args.output_folder): os.mkdir(args.output_folder) nb_features = range(int(args.min_features), int(args.max_features) + 1, int(args.step_size)) print "nb_features prepared" matrix, site_names, otu_ids, otu_phylo = utils.load_biom(args.biom_file) metadata = utils.load_map(args.mapping_file) class_labels = [] for sample in site_names: class_labels.append(metadata[sample][args.class_label]) print "class_labels loaded" interesting_samples = range(0, len(site_names)) if args.select_field is not None: interesting_fields = [it.lower() for it in args.value_field] print interesting_fields subsample_habitat = [ i for i, sample in enumerate(site_names) if metadata[sample][args.select_field].lower() in interesting_fields ] interesting_samples = list(set(interesting_samples).intersection(set(subsample_habitat))) if args.subclass is not None: target_labels = [it.lower() for it in args.subclass] subsamples = [i for i in xrange(0, len(class_labels)) if class_labels[i].lower() in target_labels] interesting_samples = list(set(interesting_samples).intersection(set(subsamples))) if (args.cluster is not None) and (args.subclass is None): print "In da cluster separation" clusters = [it for it in args.cluster] clusters_dict = {} print "Initial Dictionary created" for idx, a_cluster in enumerate(clusters): print "In da loop" # keys = a_cluster.split() keys = a_cluster.split(";") print keys for a_key in keys: clusters_dict[a_key.lower()] = idx subsamples = [i for i in xrange(0, len(class_labels)) if class_labels[i].lower() in clusters_dict] interesting_samples = list(set(interesting_samples).intersection(set(subsamples))) for i in subsamples: class_labels[i] = "cluster" + str(clusters_dict[class_labels[i].lower()]) matrix = matrix[interesting_samples, :] class_labels = [class_labels[i] for i in interesting_samples] class_labels, labels_key = utils.discretize(class_labels) matrix = matrix + 1 row_sums = matrix.sum(axis=1) matrix = matrix / row_sums[:, np.newaxis] matrix = np.ceil(matrix / matrix.min()) # So far, we have the biom file open and the environment parameters # We can now launch our feature selection algorithm further_param = [] # This has to be adapted to the case we are using other objective functions nb_tests = 10 nb_folds = 5 launch_tests_feature_selection( matrix, np.array(map(int, class_labels)), site_names, otu_ids, otu_phylo, args.objective_function, nb_features, nb_tests, args.output_folder, nb_folds, ) avg_consistency, max_consistency, min_consistency, std_consistency = get_consistencies( nb_features, len(otu_ids), nb_tests, args.output_folder ) save_results( "consistency", os.path.join(args.output_folder, "consistencyresults.txt"), avg_consistency, max_consistency, min_consistency, std_consistency, ) avg_accuracy_g, max_accuracy_g, min_accuracy_g, std_accuracy_g, avg_accuracy, max_accuracy, min_accuracy, std_accuracy = get_accuracies( nb_features, len(otu_ids), nb_tests, args.output_folder ) save_results( "Accuracy", os.path.join(args.output_folder, "accuracyGaussianresults.txt"), avg_accuracy_g, max_accuracy_g, min_accuracy_g, std_accuracy_g, )
def reward(self, state, action, reward, new_state):
    state = utils.discretize(state, self.observation_space, QUANTA)
    new_state = utils.discretize(new_state, self.observation_space, QUANTA)
    self.q[tuple(state)][action] = reward + GAMMA * max(self.q[tuple(new_state)])

def analyze_rating_density(data):
    print("Analyze rating overall density")
    clean_folders([RESULT_FOLDER])
    ratings = sorted([v["rating"] for v in data.values()], reverse=True)
    names = []
    hlines = []
    idxes = list(range(8)) + [11, 15, 20, 30, 40, 60, 80]
    for i in idxes:
        r = ratings[i]
        try:
            record = next(record for record in data.values()
                          if record["rating"] == r)
            hlines.append([
                0.61 + 0.285 * (i % 2), r,
                record["title"] + " (rating: " + str(r) + ", " +
                timestamp_to_date(record["timestamp"]) + ")"
            ])
            names.append(record["title"])
        except StopIteration as si:
            warnings.warn(
                "Could not find record with such rating: {}".format(r),
                RuntimeWarning)

    gini = _compute_gini_coefficient(ratings)
    p999 = np.percentile(ratings, 99.9)
    p99 = np.percentile(ratings, 99)
    p95 = np.percentile(ratings, 95)
    mean = np.mean(ratings)
    median = np.median(ratings)
    hlines.append([1.0, p999, "99.9th percentile ({0:.2f})".format(p999)])
    hlines.append([1.0, p99, "99th percentile ({0:.2f})".format(p99)])
    hlines.append([1.0, p95, "95th percentile ({0:.2f})".format(p95)])
    hlines.append([
        0.61, 0.0,
        "Gini index: {0:.4f}, mean: {1:.2f}, median: {2:.2f}".format(
            gini, mean, median)
    ])
    scatter_top_posts = list(zip([1.0] * 80, ratings[:80]))

    name = os.path.join(RESULT_FOLDER, "rating_violinplot.png")
    draw_rating_violinplot(ratings,
                           hlines=hlines,
                           scatter=scatter_top_posts,
                           path_to_save=name)
    name = os.path.join(RESULT_FOLDER, "lores_rating_violinplot.png")
    hlines = [hlines[0]] + [hlines[2]] + [hlines[4]] + [hlines[6]] + hlines[-4:]
    draw_rating_violinplot(ratings,
                           hlines=hlines,
                           scatter=scatter_top_posts[:25],
                           path_to_save=name,
                           figsize=(8, 10))

    n_bins_for_logplot = 100
    N = discretize([r for r in ratings if 100 < r <= 10000],
                   bins=n_bins_for_logplot,
                   normalize=False)
    name = os.path.join(RESULT_FOLDER, "logplot.png")
    draw_post_number_logplot([n_bins_for_logplot * i for i in range(len(N))],
                             [N], [u"Number of posts"],
                             path_to_save=name)
    name = os.path.join(RESULT_FOLDER, "lores_logplot.png")
    draw_post_number_logplot([n_bins_for_logplot * i for i in range(len(N))],
                             [N], [u"Number of posts"],
                             path_to_save=name,
                             figsize=(14, 8))

def preprocess_state(state, state_grid):
    """Map a continuous state to its discretized representation."""
    return discretize(state, state_grid)

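# `discretize(state, state_grid)` (also used by sarsa below) is not defined in
# this section. A minimal sketch of the usual uniform-grid approach built on
# np.digitize; the grid bounds and bin counts in the example are purely
# illustrative:
import numpy as np

def create_uniform_grid(low, high, bins=(10, 10)):
    """Internal split points of a uniform grid over a continuous space."""
    return [np.linspace(low[d], high[d], bins[d] + 1)[1:-1]
            for d in range(len(bins))]

def discretize(sample, grid):
    """Map a continuous sample to a tuple of integer bin indices."""
    return tuple(int(np.digitize(s, g)) for s, g in zip(sample, grid))

# Example: a 2D state in [-1, 1] x [-5, 5] mapped onto a 10x10 grid.
example_grid = create_uniform_grid([-1.0, -5.0], [1.0, 5.0])
print(discretize([0.25, -3.4], example_grid))  # -> (6, 1)
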
def eval(context, question):
    with open(os.path.join(config.data_dir, "train", "word2idx.pkl"), "rb") as wi, \
         open(os.path.join(config.data_dir, "train", "char2idx.pkl"), "rb") as ci, \
         open(os.path.join(config.data_dir, "train", "word_embeddings.pkl"), "rb") as wb, \
         open(os.path.join(config.data_dir, "train", "char_embeddings.pkl"), "rb") as cb:
        word2idx = pickle.load(wi)
        char2idx = pickle.load(ci)
        word_embedding_matrix = pickle.load(wb)
        char_embedding_matrix = pickle.load(cb)

    # transform them into Tensors
    word_embedding_matrix = torch.from_numpy(
        np.array(word_embedding_matrix)).type(torch.float32)
    char_embedding_matrix = torch.from_numpy(
        np.array(char_embedding_matrix)).type(torch.float32)
    idx2word = dict([(y, x) for x, y in word2idx.items()])

    context = clean_text(context)
    context = [w for w in word_tokenize(context) if w]
    question = clean_text(question)
    question = [w for w in word_tokenize(question) if w]

    if len(context) > config.max_len_context:
        print("The context is too long. Maximum accepted length is",
              config.max_len_context, "words.")
    if max([len(w) for w in context]) > config.max_len_word:
        print("Some words in the context are longer than",
              config.max_len_word, "characters.")
    if len(question) > config.max_len_question:
        print("The question is too long. Maximum accepted length is",
              config.max_len_question, "words.")
    if max([len(w) for w in question]) > config.max_len_word:
        print("Some words in the question are longer than",
              config.max_len_word, "characters.")
    if len(question) < 3:
        print("The question is too short. It needs to be at least a three words question.")

    context_idx = np.zeros([config.max_len_context], dtype=np.int32)
    question_idx = np.zeros([config.max_len_question], dtype=np.int32)
    context_char_idx = np.zeros([config.max_len_context, config.max_len_word],
                                dtype=np.int32)
    question_char_idx = np.zeros([config.max_len_question, config.max_len_word],
                                 dtype=np.int32)

    # replace 0 values with word and char IDs
    for j, word in enumerate(context):
        if word in word2idx:
            context_idx[j] = word2idx[word]
        else:
            context_idx[j] = 1
        for k, char in enumerate(word):
            if char in char2idx:
                context_char_idx[j, k] = char2idx[char]
            else:
                context_char_idx[j, k] = 1

    for j, word in enumerate(question):
        if word in word2idx:
            question_idx[j] = word2idx[word]
        else:
            question_idx[j] = 1
        for k, char in enumerate(word):
            if char in char2idx:
                question_char_idx[j, k] = char2idx[char]
            else:
                question_char_idx[j, k] = 1

    model = BiDAF(word_vectors=word_embedding_matrix,
                  char_vectors=char_embedding_matrix,
                  hidden_size=config.hidden_size,
                  drop_prob=config.drop_prob)
    try:
        if config.cuda:
            model.load_state_dict(
                torch.load(os.path.join(config.squad_models,
                                        "model_final.pkl"))["state_dict"])
        else:
            model.load_state_dict(
                torch.load(os.path.join(config.squad_models, "model_final.pkl"),
                           map_location=lambda storage, loc: storage)["state_dict"])
        print("Model weights successfully loaded.")
    except:
        print("Model weights not found, initialized model with random weights.")
    model.to(device)
    model.eval()

    with torch.no_grad():
        context_idx = torch.tensor(context_idx, dtype=torch.int64).unsqueeze(0).to(device)
        context_char_idx = torch.tensor(context_char_idx, dtype=torch.int64).unsqueeze(0).to(device)
        question_idx = torch.tensor(question_idx, dtype=torch.int64).unsqueeze(0).to(device)
        question_char_idx = torch.tensor(question_char_idx, dtype=torch.int64).unsqueeze(0).to(device)
        pred1, pred2 = model(context_idx, context_char_idx, question_idx,
                             question_char_idx)
        starts, ends = discretize(pred1.exp(), pred2.exp(), 15, False)

    prediction = " ".join(context[starts.item():ends.item() + 1])
    return prediction

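# The call `discretize(pred1.exp(), pred2.exp(), 15, False)` selects an answer
# span from the start/end probability distributions. A minimal sketch of that
# kind of span picker, assuming the standard joint-probability argmax with a
# maximum span length; the masking convention and the handling of the last
# argument in the original helper may differ:
import torch

def discretize(p_start, p_end, max_len=15, no_answer=False):
    """Pick the most likely answer span from start/end probabilities.

    Computes the outer product p_start[i] * p_end[j], keeps only pairs with
    start <= end and span length <= max_len, and returns the argmax indices.
    `no_answer` is accepted for signature compatibility but ignored here.
    """
    # (batch, len, len) matrix of joint span probabilities
    joint = p_start.unsqueeze(2) * p_end.unsqueeze(1)
    length = p_start.size(1)
    # band mask: 1 where start <= end < start + max_len, 0 elsewhere
    ones = torch.ones(length, length, device=joint.device)
    mask = torch.triu(ones) - torch.triu(ones, diagonal=max_len)
    joint = joint * mask
    # row/column of the global maximum give the best start and end indices
    starts = joint.max(dim=2)[0].argmax(dim=1)
    ends = joint.max(dim=1)[0].argmax(dim=1)
    return starts, ends
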
def sarsa(env, num_episodes, state_grid, alpha, gamma=1.0):
    np.random.seed(928)
    brain_name = env.brain_names[0]
    brain = env.brains[brain_name]
    nA = brain.vector_action_space_size
    Q = defaultdict(lambda: np.zeros(nA))

    epsilon = 0.7
    min_epsilon = 0.05
    decay_epsilon = 0.999
    num_episodes_concluded = 0
    scores = []
    max_avg_score = -np.inf

    for i_episode in range(1, num_episodes + 1):
        # monitor progress
        if i_episode % 100 == 0:
            print("\rEpisode {}/{} - Epsilon: {} Max avg score: {}".format(
                i_episode, num_episodes, epsilon, max_avg_score),
                  end="")
            sys.stdout.flush()

        env_info = env.reset(train_mode=True)[brain_name]
        state = env_info.vector_observations[0]
        state = discretize(state, state_grid)
        epsilon = max(min_epsilon, decay_epsilon * epsilon)
        action = np.random.choice(np.arange(nA),
                                  p=epsilon_greedy_probs(Q[state], epsilon, nA))
        total_reward = 0
        while True:
            env_info = env.step(action)[brain_name]
            next_state = env_info.vector_observations[0]
            next_state = discretize(next_state, state_grid)
            reward = env_info.rewards[0]
            done = env_info.local_done[0]
            total_reward += reward
            if done:
                num_episodes_concluded += 1
                break
            next_action = np.random.choice(np.arange(nA),
                                           p=get_probs(Q[next_state], epsilon, nA))
            # SARSA update on the Q-table entry for (state, action)
            Q[state][action] = Q[state][action] + alpha * (
                reward + gamma * Q[next_state][next_action] - Q[state][action])
            state = next_state
            action = next_action

        scores.append(total_reward)
        if len(scores) > 100:
            avg_score = np.mean(scores[-100:])
            if avg_score > max_avg_score:
                max_avg_score = avg_score
        if max_avg_score >= 13:
            print("The expected average score was reached. avg score: {}".format(
                max_avg_score))
            break

    print("\n\n{}/{} episodes were completely finished".format(
        num_episodes_concluded, num_episodes))
    return Q, scores

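# sarsa relies on epsilon_greedy_probs and get_probs, neither of which is
# defined in this section. A minimal sketch, assuming both names refer to the
# same epsilon-greedy distribution helper:
import numpy as np

def epsilon_greedy_probs(q_values, epsilon, nA):
    """Action probabilities for an epsilon-greedy policy.

    Every action gets epsilon / nA probability mass; the greedy action gets
    the remaining 1 - epsilon on top of that.
    """
    probs = np.ones(nA) * epsilon / nA
    probs[np.argmax(q_values)] += 1.0 - epsilon
    return probs

# Assumed alias: the loop above also calls get_probs with the same arguments.
get_probs = epsilon_greedy_probs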