import numpy as np
import pandas as pd


def load_arithmetic_data():
    data = pd.read_csv("./data/arithmetic-data.csv")
    equations = list(data['input'])
    answers = list(data['output'])

    # Split every equation and answer into its individual characters.
    equations_of_chars = [[c for c in str(equation)] for equation in equations]
    answers_of_chars = [[c for c in str(answer)] for answer in answers]

    # Vocabulary: digits 0-9, then the operator symbols. Indices run
    # contiguously from 0 so every one-hot index stays within vocab_size.
    index_to_char = {i: str(i) for i in range(10)}
    for symbol in ('+', '*', '-'):
        index_to_char[len(index_to_char)] = symbol
    vocab_size = len(index_to_char)
    char_to_index = {v: k for k, v in index_to_char.items()}

    # Pad every sequence to the length of the longest equation or answer.
    max_len = max(
        max(len(equation_of_chars) for equation_of_chars in equations_of_chars),
        max(len(answer_of_chars) for answer_of_chars in answers_of_chars),
    )
    n_equations = len(equations_of_chars)

    # One-hot encode the equations (X) and the answers (y).
    X = np.zeros(shape=(n_equations, max_len, vocab_size), dtype='float32')
    y = np.zeros(shape=(n_equations, max_len, vocab_size), dtype='float32')
    for equation_index in range(n_equations):
        current_equation = equations_of_chars[equation_index]
        for char_position in range(len(current_equation)):
            token_index = char_to_index[current_equation[char_position]]
            X[equation_index][char_position][token_index] = 1
        current_answer = answers_of_chars[equation_index]
        for char_position in range(len(current_answer)):
            token_index = char_to_index[current_answer[char_position]]
            y[equation_index][char_position][token_index] = 1

    return X, y, index_to_char, equations, answers, max_len
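# A minimal usage sketch for load_arithmetic_data(), assuming
# ./data/arithmetic-data.csv exists with 'input' and 'output' columns
# (e.g. input "3+5", output "8"). The demo function name is hypothetical.
def _demo_load_arithmetic_data():
    X, y, index_to_char, equations, answers, max_len = load_arithmetic_data()
    print(X.shape)  # (n_equations, max_len, vocab_size)
    print(equations[0], "->", answers[0])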
def relu(feature_map):
    # Preparing the output of the ReLU activation function:
    # element-wise max(value, 0) over every position of every map.
    relu_out = np.zeros(feature_map.shape)
    for map_num in range(feature_map.shape[-1]):
        for r in np.arange(0, feature_map.shape[0]):
            for c in np.arange(0, feature_map.shape[1]):
                relu_out[r, c, map_num] = np.max([feature_map[r, c, map_num], 0])
    return relu_out
def pooling(feature_map, size=2, stride=2):
    # Preparing the output of the pooling operation. The output size along
    # each spatial axis is (dim - size) // stride + 1, which matches the
    # number of window positions the loops below actually visit.
    pool_out = np.zeros((np.uint16((feature_map.shape[0] - size) / stride + 1),
                         np.uint16((feature_map.shape[1] - size) / stride + 1),
                         feature_map.shape[-1]))
    for map_num in range(feature_map.shape[-1]):
        r2 = 0
        for r in np.arange(0, feature_map.shape[0] - size + 1, stride):
            c2 = 0
            for c in np.arange(0, feature_map.shape[1] - size + 1, stride):
                # Max over the size x size window of the current map.
                pool_out[r2, c2, map_num] = np.max(feature_map[r:r + size, c:c + size, map_num])
                c2 = c2 + 1
            r2 = r2 + 1
    return pool_out
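# A small sketch exercising relu() and pooling() together on a random
# two-channel 6x6 feature map; with size=2 and stride=2 each spatial
# dimension shrinks to (6 - 2) // 2 + 1 = 3. The demo name is hypothetical.
def _demo_relu_pooling():
    feature_map = np.random.randn(6, 6, 2)
    activated = relu(feature_map)                  # negatives clamped to zero
    pooled = pooling(activated, size=2, stride=2)  # max over each 2x2 window
    print(activated.shape)  # (6, 6, 2)
    print(pooled.shape)     # (3, 3, 2)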
import json
import os


def save(stories, query, error=False, exc="", name=""):
    if error:
        print("Ran into error!")
    print(f"fetched {len(stories)} total!")
    if stories:
        query["last_processed_id"] = stories[-1]["processed_stories_id"]
    else:
        query["last_processed_id"] = 0

    # Chuck the whole thing into a df.
    df = pd.DataFrame(stories)
    DATA_FILENAME = f"{name}_us_mainstream_stories.tsv"
    METADATA_FILENAME = f"{name}_metadata.json"

    # Append without headers if the file exists already, otherwise create a new file.
    if os.path.exists(DATA_FILENAME):
        with open(DATA_FILENAME, "at") as f:
            df.to_csv(f, sep="\t", header=False)
    else:
        with open(DATA_FILENAME, "wt") as f:
            df.to_csv(f, sep="\t", header=True)
    print(f"Saved to {DATA_FILENAME}")

    # Always rescan to get the latest date.
    # df_all = pd.read_csv(DATA_FILENAME, sep='\t')
    latest_date = str(np.max(pd.to_datetime(df["publish_date"])).date())
    new_metadata = {
        "error": error,
        "exc": exc,
        "last_query": query,
        "latest": latest_date,
    }
    with open(METADATA_FILENAME, "wt") as f:
        f.write(json.dumps(new_metadata))
    print(f"Metadata saved to {METADATA_FILENAME}")
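# A hypothetical invocation of save(), using dummy story dicts that carry
# only the fields the function actually touches (processed_stories_id and
# publish_date); real story records would have many more fields.
def _demo_save():
    stories = [
        {"processed_stories_id": 1, "publish_date": "2021-01-01", "title": "a"},
        {"processed_stories_id": 2, "publish_date": "2021-01-02", "title": "b"},
    ]
    save(stories, query={"q": "example"}, name="demo")
    # Writes demo_us_mainstream_stories.tsv and demo_metadata.json.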
def softmax(x):
    # Subtracting the max before exponentiating keeps np.exp from
    # overflowing; it does not change the result.
    m = np.max(x)
    e = np.exp(x - m)
    return e / e.sum()
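# Quick check that softmax() normalizes to one and is shift-invariant,
# which is exactly why subtracting the max is safe.
def _demo_softmax():
    logits = np.array([1.0, 2.0, 3.0])
    p = softmax(logits)
    print(p, p.sum())                               # probabilities sum to 1.0
    print(np.allclose(p, softmax(logits + 100.0)))  # True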
import gurobipy as gb
from torch.utils.data import ConcatDataset


def query(self, n, model, train_dataset, pool_dataset, budget=10000):
    device = model.state_dict()['softmax.bias'].device
    full_dataset = ConcatDataset([pool_dataset, train_dataset])
    pool_len = len(pool_dataset)
    self.embeddings = self.get_embeddings(model, device, full_dataset)

    # Calc distance matrix.
    num_images = self.embeddings.shape[0]
    dist_mat = self.calc_distance_matrix(num_images)

    # We need to get k centers; start with a greedy solution, then binary
    # search the covering radius between the lower and upper bound.
    upper_bound = dist_mat.max()  # the largest pairwise distance always yields a feasible initial radius
    lower_bound = upper_bound / 2.0
    max_dist = upper_bound

    _x, _y = np.where(dist_mat <= max_dist)
    _distances = dist_mat[_x, _y]
    subset = [0]
    milp = solve_fac_loc(_x, _y, subset, num_images, budget)
    # milp.setParam('OutputFlag', False)
    # solve_fac_loc is assumed to attach its variables as __data; getattr
    # avoids Python's name-mangling of milp.__data inside this method.
    x, y, z = getattr(milp, "__data")

    delta = 1e-7
    while upper_bound - lower_bound > delta:
        print("State", upper_bound, lower_bound)
        current_radius = (upper_bound + lower_bound) / 2.0
        violate = np.where(_distances > current_radius)  # point distances which violate the radius
        new_max_d = np.min(_distances[_distances >= current_radius])
        new_min_d = np.max(_distances[_distances <= current_radius])
        print("If it succeeds, new max is:", new_max_d, new_min_d)
        for v in violate[0]:
            x[_x[v], _y[v]].UB = 0  # the upper bound for edges which violate the radius is set to zero
        milp.update()
        milp.optimize()
        if milp.getAttr(gb.GRB.Attr.Status) == gb.GRB.INFEASIBLE:
            failed = True
            print("Infeasible")
        elif sum([z[i].X for i in range(len(z))]) > 0:
            failed = True
            print("Failed")
        else:
            failed = False
        if failed:
            lower_bound = max(current_radius, new_max_d)
            # Failed, so put the edges back.
            for v in violate[0]:
                x[_x[v], _y[v]].UB = 1
        else:
            print("solution found", current_radius, lower_bound, upper_bound)
            upper_bound = min(current_radius, new_min_d)
            milp.write("s_{}_solution_{}.sol".format(budget, current_radius))

    idxs_labeled = np.arange(start=pool_len, stop=pool_len + len(train_dataset))

    # Perform k-center greedy selection, seeded with the labeled set.
    self.update_distances(idxs_labeled, idxs_labeled, only_new=False, reset_dist=True)
    sel_ind = []
    for _ in range(n):
        ind = np.argmax(self.min_distances)  # get the sample farthest from every selected center
        assert ind not in idxs_labeled, "Core-set picked index already labeled"
        self.update_distances([ind], idxs_labeled, only_new=True, reset_dist=False)
        sel_ind.append(ind)
    assert len(set(sel_ind)) == len(sel_ind), "Core-set picked duplicate samples"
    remaining_ind = list(set(np.arange(pool_len)) - set(sel_ind))
    return sel_ind, remaining_ind