def __init__(self, downsampling_step, sequence_length):
    loading_dataset_since = time()
    extension = 'xlsx'
    self.downsampling_step = downsampling_step
    self.sequence_length = sequence_length

    # find all files
    all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
    # concat all the data
    data_pd = pd.concat(
        [pd.read_excel(f).iloc[2:, 4:] for f in all_filenames],
        ignore_index=True)
    data_numpy = data_pd.to_numpy().astype(float)

    zeros_removed = remove_zeros(data_numpy)
    downsampled_data = downsample(zeros_removed, downsampling_step)
    time_series_data = split_time_series(downsampled_data, sequence_length)

    sc = StandardScaler()
    scaled_data = sc.fit_transform(time_series_data)
    scaled_data_tensor = torch.from_numpy(scaled_data)
    scaled_data_tensor_reshaped = scaled_data_tensor.unsqueeze(0).transpose(1, 0)

    self.len = scaled_data_tensor_reshaped.shape[0]
    self.training_data_tensor = scaled_data_tensor_reshaped

    loading_dataset_end = time()
    hours, minutes, seconds = timer(loading_dataset_since, loading_dataset_end)
    print('The length of the dataset is {}'.format(len(self.training_data_tensor)))
    print("Time taken {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
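The dataset code above unpacks a two-argument timer(since, end) into hours, minutes and seconds. That helper is not included in these snippets; a minimal sketch consistent with that usage (hypothetical, built only on divmod) might look like this:

# Hypothetical sketch of the timer(start, end) helper assumed above;
# it converts an elapsed interval in seconds into hours, minutes, seconds.
def timer(start, end):
    elapsed = end - start
    hours, remainder = divmod(elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    return hours, minutes, seconds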
def fetch_comment_op_thread(comment_op, comment_keys, username, only_authored):
    # Skip comments we've already fetched
    op_key = get_key(comment_op)
    if op_key in comment_keys:
        return []

    post_comments = []

    # We can't fetch deleted comments from the api, so try to use the parent
    # to fill in the information missing from the operation
    comment = fetch_comment(op_key)
    if comment_is_not_found(comment):
        parent_comment = fetch_comment(get_parent_key(comment_op))
        if comment_is_not_found(parent_comment):
            print_with_timestamp('Could not find \'{}\', skipping...'.format(get_link_string(*op_key)))
            return []
        post_comments.append(make_comment_from_parent(comment_op, parent_comment))
        print_with_timestamp('Comment \'{}\' was deleted but was able to fill in information from parent'.format(get_link_string(*op_key)))
        comment = parent_comment

    # Now that the deleted comment was added, find the root comment from the parent
    if only_authored and comment['root_author'] != username:
        return []

    # If this isn't the root comment, fetch it
    comment_key = get_key(comment)
    root_key = get_root_key(comment)
    root_comment = comment if root_key == comment_key else fetch_comment(root_key)

    with timer('Fetching post \'{}\''.format(get_link_string(*root_key))):
        post_comments.extend(fetch_thread(root_comment, comment_keys))

    return post_comments
def archive_user_history(username, start_date, end_date, only_authored):
    message = 'Archive history for user \'{}\' from {} to {}'.format(username, start_date, end_date)
    if only_authored:
        message += ' (only self-authored posts)'

    with timer(message):
        # Keep track of the comments we've already fetched
        comment_keys = db.get_comment_keys()
        for post_comments in fetch.fetch_user_history_rows(username, start_date, end_date, comment_keys, only_authored):
            if post_comments:
                insert_comments(post_comments)
def archive_thread(author, permlink):
    with timer(f'Archive thread "{get_link_string(author, permlink)}"'):
        thread_comments = fetch.fetch_thread_rows(author, permlink)
        if thread_comments:
            insert_comments(thread_comments)
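The archive helpers use timer as a context manager that labels a block of work and reports how long it took. The real implementation is not part of these snippets; a minimal sketch, assuming it only prints the message and the elapsed time, could be:

# Minimal sketch (assumption): a context-manager timer for "with timer('...')" blocks.
from contextlib import contextmanager
from time import time

@contextmanager
def timer(message):
    start = time()
    print(message)
    try:
        yield
    finally:
        print('{} took {:.2f} s'.format(message, time() - start))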
def __init__(self, downsampling_step, sequence_length, train=True, normalize=False):
    loading_dataset_since = time()
    extension = 'xlsx'
    self.downsampling_step = downsampling_step
    self.sequence_length = sequence_length

    # find all files and concatenate
    all_filenames = [i for i in glob.glob('*{}'.format(extension))]
    data = pd.concat(
        [pd.read_excel(f).iloc[2:, 4:] for f in all_filenames],
        ignore_index=True)

    # extract torque and label
    torque = data.iloc[:, 0].to_numpy().astype(float)
    label = data.iloc[:, 1].to_numpy().astype(float)

    # remove zeros from torque and label
    label = np.delete(label, np.where(torque == 0))
    torque = remove_zeros(torque)

    # expand dimension and store the zero-removed data
    torque = np.expand_dims(torque, axis=1)
    label = np.expand_dims(label, axis=1)
    data = np.append(torque, label, axis=1)

    # find the normal and anomalous labeled sequences and divide the data into segments
    segmented_list = consecutive(
        (np.where(data[:, 1] == 0))[0]) + consecutive(
        (np.where(data[:, 1] == 1))[0])
    segmented_list.sort(key=lambda segment: segment[1])
    segmented_data = []
    for i in range(len(segmented_list)):
        segments = segmented_list[i]
        start_index = segments[0]
        end_index = segments[len(segments) - 1]
        seg_data = data[start_index:end_index + 1, :]
        segmented_data.append(seg_data)

    # downsample the data and make sequences
    sequenced_data = []
    for i in range(len(segmented_data)):
        label = segmented_data[i][0, 1]
        data = downsample(segmented_data[i][:, 0], self.downsampling_step)
        data = split_time_series(data, self.sequence_length)
        if label == 0.:
            label_column = [0] * len(data)
        else:
            label_column = [1] * len(data)
        sequenced_data.append(np.column_stack((data, label_column)))

    data = np.empty((0, self.sequence_length + 1))
    for i in range(len(sequenced_data)):
        if sequenced_data[i].shape[1] == self.sequence_length + 1:
            data = np.append(data, sequenced_data[i], axis=0)

    if normalize:
        # scale the data and return the tensor output
        sc = StandardScaler()
        training_data = data[0:int(0.7 * (len(data))), 0:self.sequence_length]
        testing_data = data[int(0.7 * (len(data))):, 0:self.sequence_length]
        training_label = data[0:int(0.7 * (len(data))), -1]
        testing_label = data[int(0.7 * (len(data))):, -1]
        sc_fit = sc.fit(training_data)
        if train:
            unlabeled_data = sc_fit.transform(training_data)
            data = np.column_stack((unlabeled_data, training_label))
        else:
            unlabeled_data = sc_fit.transform(testing_data)
            data = np.column_stack((unlabeled_data, testing_label))
    else:
        if train:
            data = data[0:int(0.7 * (len(data))), :]
        else:
            data = data[int(0.7 * (len(data))):, :]

    data = torch.from_numpy(data).unsqueeze(0).transpose(1, 0)
    self.len = data.shape[0]
    self.data = data

    loading_dataset_end = time()
    hours, minutes, seconds = timer(loading_dataset_since, loading_dataset_end)
    print('The length of the dataset is {}'.format(self.len))
    print("Time taken {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
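This __init__ stores self.data and self.len, which suggests a torch-style Dataset. The accompanying accessors are not shown in the snippet; under that assumption they would reduce to:

# Sketch of the accessors such a Dataset usually pairs with the __init__ above
# (assumed, not shown in the source).
def __getitem__(self, index):
    return self.data[index]

def __len__(self):
    return self.len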
import numpy as np
from matplotlib import pyplot as plt

from helpers import parse_args, timer
from generate_data import generate_x, find_min_max
from visualisation import visualise_x

if __name__ == '__main__':
    filename = 'task_2.log'
    args = parse_args()

    x_msg = f"X generation with N = {args.N} and M = {args.M}"
    X = timer(generate_x, filename, x_msg)(args.M, args.N)

    y_msg = "Finding optimums for X"
    YMin, YMax = timer(find_min_max, filename, y_msg)(X, args.T, args.k)

    for _ in range(args.amount_graphs):
        start = np.random.randint(0, args.N * (args.M - 1))
        visualise_x(X, start, args.N, YMin, YMax)
        plt.legend()
        plt.show()
import numpy as np

from dataset import make_data_loader
from helpers import parse_args, timer
from generate_data import generate_x, find_min_max


def sample(loader):
    for x in loader:
        pass


if __name__ == '__main__':
    args = parse_args()
    filename = 'task_3.log'

    x_msg = f"X generation with N = {args.N} and M = {args.M}"
    X = timer(generate_x, filename, x_msg)(args.M, args.N)

    y_msg = "Finding optimums for X"
    YMin, YMax = timer(find_min_max, filename, y_msg)(X, args.T, args.k)

    loader = make_data_loader(X, YMin, YMax, N=args.N, batch_size=args.batch_size, num_batches=args.num_batches)
    timer(
        sample, filename,
        f"{args.num_batches} batches sampling with batch size = {args.batch_size}"
    )(loader)
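In these task scripts, timer(func, filename, message) wraps a function and records its runtime in a task log. The helper itself is not part of the snippets; a sketch consistent with that call pattern, assuming it simply appends one line per call to the log file, might be:

# Hypothetical sketch of the wrapping timer(func, filename, message) used above.
from time import time

def timer(func, filename, message):
    def wrapped(*args, **kwargs):
        start = time()
        result = func(*args, **kwargs)
        with open(filename, 'a') as log:
            log.write('{}: {:.3f} s\n'.format(message, time() - start))
        return result
    return wrapped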
                 alpha=1.0, color_one='red', color_two='green')
run.plot_nmi_ari(list_of_nmi, list_of_ari, top_acc_maps_idx)

# plot testing results
run.plot_acc_test(list_of_acc_test, top_acc_maps_idx_test, alpha=0.8, color='red')
run.plot_nmi_ari_test(list_of_nmi_test, list_of_ari_test, top_acc_maps_idx_test, alpha=0.8)

end = time.time()
hours, minutes, seconds = helpers.timer(since, end)
print("Time taken {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))

tsne_since = time.time()
run.apply_TSNE(embeddings, labels_pred, list_of_centers, top_acc_maps_idx, n_components=2, perplexity=30.0)
tsne_end = time.time()
hours, minutes, seconds = helpers.timer(tsne_since, tsne_end)
print("Time taken {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
def archive_thread(author, permlink):
    with timer('Archive thread \'{}\''.format(get_link_string(author, permlink))):
        thread_comments = fetch.fetch_thread_rows(author, permlink)
        if thread_comments:
            insert_comments(thread_comments)
# COFHAE config
parser.add_argument('--skip_cofhae', type=int, default=0)
parser.add_argument('--softmax_temperature', type=float, default=1.0)
parser.add_argument('--adversarial_penalty', type=float, default=1.0)
parser.add_argument('--assignment_penalty', type=float, default=1000.0)

FLAGS = parser.parse_args()

# Set up path to save model artifacts, possibly suffixed with an experiment ID
path = FLAGS.output_dir or f"/tmp/{int(time.time())}"
os.system('mkdir -p ' + path)

with open(os.path.join(path, 'flags.json'), 'w') as f:
    f.write(json.dumps(FLAGS.__dict__))

with timer("loading data"):
    if 'chopsticks' in FLAGS.dataset:
        from chopsticks import Chopsticks
        m = re.search(r'depth(\d)_([a-z]+)', FLAGS.dataset)
        depth = int(m.group(1))
        variant = m.group(2)
        noise = 0
        if 'noise' in FLAGS.dataset:
            noise = float(re.search(r'noise([0-9\.]+)', FLAGS.dataset).group(1))
        dataset = Chopsticks(depth, variant, noise)
    elif FLAGS.dataset == 'spaceshapes':
        from spaceshapes import Spaceshapes
        dataset = Spaceshapes()
    else:
        raise ValueError(f"Unrecognized dataset {FLAGS.dataset} -- should either be 'spaceshapes' or a Chopsticks variant string with a depth and slope/inter/either/both, e.g. 'chopsticks_depth3_both' or 'chopsticks_depth2_slope'.")
if __name__ == '__main__':
    logger = logger_call()
    queue, queue_out = Queue(), Queue()
    create_bd()
    links = get_links()
    start = time()
    thread_count = 5

    # Populate queue
    for link in links:
        queue.put(link)

    # Create threads for links parsing
    for _ in range(thread_count):
        t = GetData(queue)
        t.daemon = True
        t.start()

    # Create thread for data storing to db
    db_thread = StoreData(queue_out)
    db_thread.daemon = True
    db_thread.name = 'Thread-DB'
    db_thread.start()

    queue.join()
    queue_out.join()

    # Measure time spent
    logger.info("Time spent: " + timer(start, time()))
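In this script timer(start, time()) is concatenated directly onto a log message, so it must return a string rather than an (hours, minutes, seconds) tuple. A minimal sketch under that assumption:

# Assumed string-returning variant of timer(start, end) for the logger call above.
def timer(start, end):
    minutes, seconds = divmod(end - start, 60)
    return '{:02d} min {:05.2f} s'.format(int(minutes), seconds)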
def MIMOSA(Z,
           num_nearest_neighbors=40,
           eig_cumsum_thresh=0.95,
           eig_decay_thresh=4,
           cos_simil_thresh=0.99,
           ransac_frac=0.6667,
           contagion_num=5,
           min_size_init=20,
           min_size_merged=2000,
           neighbor_lengthscale_mult=10):
    with timer("BallTree"):
        ball_tree = BallTree(Z)
        neighbors = ball_tree.query(Z, k=num_nearest_neighbors)[1]

    with timer("LocalSVD"):
        svd_kwargs = dict(eig_cumsum_thresh=eig_cumsum_thresh,
                          eig_decay_thresh=eig_decay_thresh,
                          cos_simil_thresh=cos_simil_thresh,
                          ransac_frac=ransac_frac)
        svds = [LocalSVD(Z[n], **svd_kwargs) for n in neighbors]

    covered = set()

    def BuildComponent(start):
        similar_neighbors = [
            n for n in neighbors[start]
            if n not in covered and svds[start].is_similar(svds[n])
        ]
        component = set([start] + similar_neighbors)
        frontier = similar_neighbors
        visits = defaultdict(int)
        while len(frontier):
            i = frontier.pop()
            for j in neighbors[i]:
                if j in covered:
                    continue
                if j in component:
                    continue
                if svds[i].is_similar(svds[j]):
                    visits[j] += 1
                    if visits[j] >= contagion_num:
                        component.add(j)
                        frontier.append(j)
        idx = list(component)
        return ManifoldComponent(Z[idx], [svds[i] for i in idx], idx)

    with timer("BuildComponent"):
        components = []
        for i in range(len(Z)):
            if i not in covered:
                component = BuildComponent(i)
                components.append(component)
                for j in component.index_list:
                    covered.add(j)

    with timer("MergeComponents"):
        components2 = MergeComponents(components,
                                      min_size_init=min_size_init,
                                      min_size_merged=min_size_merged)

    with timer("ConstructHierarchy"):
        hierarchy, assignments = ConstructHierarchy(
            len(Z), components2,
            neighbor_lengthscale_mult=neighbor_lengthscale_mult)

    return components, components2, hierarchy, assignments
def begin(self):
    helpers.timer(self.duration, self.endStage).start()
def startBidTimer(self):
    # start and announce to players
    self.bidTimer = helpers.timer(self.bidDuration, self.bidEnded)
    self.bidTimer.start()
    self.game.sendEventToAllPlayers('TimerBegin', {'duration': self.bidDuration})
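Both begin and startBidTimer treat helpers.timer(duration, callback) as something with a .start() method, i.e. a one-shot timer thread. A minimal sketch, assuming it is a thin wrapper around threading.Timer:

# Minimal sketch (assumption): helpers.timer wrapping threading.Timer.
import threading

def timer(duration, callback):
    # The caller decides when to .start() the returned timer.
    return threading.Timer(duration, callback)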
def train_bultmann(self, tr_loader, tr_dataset_length, Adam=True, scheduler=True):
    since = time.time()
    print('Training the network {}'.format(self.network.__class__.__name__))
    print('Network Architecture \n{}'.format(self.network))
    print('Network Criterion {}'.format(self.network_criterion))

    list_of_network_loss = []
    list_of_clustering_loss = []
    list_of_total_loss = []
    list_of_losses = []
    learning_rates = []
    list_of_centers = []
    list_of_ranks_of_center_distances = []
    list_of_center_distances = []

    if Adam:
        optimizer = torch.optim.Adam(self.network.parameters(), lr=self.lr, weight_decay=0.0)
    else:
        optimizer = torch.optim.SGD(self.network.parameters(), lr=self.lr, momentum=0.0,
                                    weight_decay=0.0, nesterov=False)
    if scheduler:
        scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

    for epoch in range(self.n_epochs):
        embedded_representation = []
        batched_center_index = 0
        total_combined_loss = 0.0
        total_network_loss = 0.0
        total_clustering_loss = 0.0
        labels = np.empty((0, 1), float)

        for batch in tr_loader:
            # extract the sequence and label from the batch, make predictions and return the bottleneck
            sequences = batch[:, :, 0:self.sequence_length].float()
            batch_labels = batch[:, :, self.sequence_length]
            labels = np.append(labels, batch_labels.numpy(), axis=0)
            target_sequences = sequences.clone()
            predictions, bottleneck = self.network(sequences)
            embedded_representation.append(bottleneck.clone().detach())
            batch_embeddings = torch.cat(embedded_representation)

            # compute the network loss
            network_loss = self.network_criterion(predictions, target_sequences)

            if epoch <= self.no_of_pretrain_epochs:
                # pretrain mode: the clustering loss is zero, so only the network loss drives the step
                clustering_loss = torch.zeros([1, 1], dtype=torch.float64)
                combined_loss = network_loss  # + self.alpha * clustering_loss
                optimizer.zero_grad()
                # retain the computation graph so that backward can be called twice
                combined_loss.backward(retain_graph=True)
                optimizer.step()
            else:
                # joint training mode
                clustering_loss = self.clustering_criterion(
                    bottleneck, batched_center_designation[batched_center_index])
                batched_center_index += 1  # increment the batched center index
                combined_loss = (1 - self.alpha) * network_loss + self.alpha * clustering_loss
                optimizer.zero_grad()
                # calculate the gradients but do not take a step yet
                combined_loss.backward(retain_graph=True)

                # update the weights of the clustering-friendly channels wrt the combined loss
                bottleneck_layer = helpers.get_bottleneck_name(self.network)
                # train_reporter.print_grads(network)
                with torch.no_grad():
                    for name, parameters in self.network.named_parameters():
                        if name == bottleneck_layer:
                            ranked_channels = torch.from_numpy(ranks_of_center_distances)
                            parameters.grad[torch.where(
                                ranked_channels <= self.no_of_clustering_channels)] = 0.0
                optimizer.step()

                # update the weights of the rest of the channels wrt the network loss
                optimizer.zero_grad()
                network_loss.backward()
                with torch.no_grad():
                    for name, parameters in self.network.named_parameters():
                        if name == bottleneck_layer:
                            ranked_channels = torch.from_numpy(ranks_of_center_distances)
                            parameters.grad[torch.where(
                                ranked_channels > self.no_of_clustering_channels)] = 0.0
                optimizer.step()

            total_network_loss += network_loss.item()
            total_clustering_loss += clustering_loss.item()
            total_combined_loss += combined_loss.item()

        # extract embeddings
        embeddings = batch_embeddings

        # make list of losses
        list_of_network_loss.append(total_network_loss / (tr_dataset_length) / self.batch_size)
        list_of_clustering_loss.append(total_clustering_loss / (tr_dataset_length) / self.batch_size)
        list_of_total_loss.append(total_combined_loss / (tr_dataset_length) / self.batch_size)

        # make cluster update interval array
        cluster_update = np.arange(self.no_of_pretrain_epochs, self.n_epochs,
                                   self.cluster_update_interval)

        # clustering
        for update in cluster_update:
            if update == epoch:
                print('Updating Cluster Centers')
                center_designation_pre = []
                cluster_label_pre = []
                centers_pre = []
                no_of_channels = embeddings.shape[1]
                for i in range(no_of_channels):
                    channel = embeddings[:, i, :].numpy()
                    choice_cluster, initial_centers, cluster_ass = helpers.kmeansalter(
                        channel, self.n_clusters)
                    cluster_label_pre.append(
                        torch.from_numpy(choice_cluster).unsqueeze(0).transpose(1, 0))
                    cluster_label = torch.cat(cluster_label_pre, dim=1)
                    centers_pre.append(
                        torch.from_numpy(initial_centers).unsqueeze(0).transpose(1, 0))
                    centers = torch.cat(centers_pre, dim=1)
                    center_designation_pre.append(cluster_ass.unsqueeze(0).transpose(1, 0))
                    center_designation = torch.cat(center_designation_pre, dim=1)
                batched_center_designation = list(
                    helpers.divide_batches(center_designation, self.batch_size))
                center_distances, ranks_of_center_distances = helpers.rank_channels(centers)

        print('Epoch : {}/{} Network Loss : {} Clustering Loss : {} Total Loss : {}'.format(
            epoch + 1, self.n_epochs,
            (total_network_loss / (tr_dataset_length / self.batch_size)),
            (total_clustering_loss / (tr_dataset_length / self.batch_size)),
            (total_combined_loss / (tr_dataset_length / self.batch_size))))

    list_of_centers.append(centers.numpy())
    list_of_ranks_of_center_distances.append(ranks_of_center_distances)
    list_of_center_distances.append(center_distances)
    list_of_losses.append(list_of_network_loss)
    list_of_losses.append(list_of_clustering_loss)
    list_of_losses.append(list_of_total_loss)

    end = time.time()
    hours, minutes, seconds = helpers.timer(since, end)
    print("Time taken {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))

    return self.network, optimizer, list_of_network_loss, list_of_clustering_loss, list_of_total_loss, list_of_losses, embeddings, labels, list_of_centers, list_of_ranks_of_center_distances, list_of_center_distances
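The joint-training branch indexes batched_center_designation per batch, which helpers.divide_batches builds from the full center-designation tensor. That helper is not shown; a plausible sketch, assuming it just slices the tensor into batch-sized chunks in order:

# Hypothetical sketch of helpers.divide_batches used to align center
# designations with the training batches.
def divide_batches(tensor, batch_size):
    for i in range(0, len(tensor), batch_size):
        yield tensor[i:i + batch_size]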
import json
import os

import torch

from model import Model, train
from generate_data import generate_x, find_min_max
from helpers import parse_args, timer

if __name__ == '__main__':
    args = parse_args()
    config = {'hidden_size': args.hidden_size, 'num_layers': args.num_layers}
    model = Model(**config)
    print(model)

    filename = "task_4.log"
    x_msg = f"X generation with N = {args.N} and M = {args.M}"
    X = timer(generate_x, filename, x_msg)(args.M, args.N)

    y_msg = "Finding optimums for X"
    YMin, YMax = timer(find_min_max, filename, y_msg)(X, args.T, args.k)

    X_val = generate_x(1, args.val_size)
    YMin_val, YMax_val = find_min_max(X_val, args.T, args.k)[:args.N]

    X, X_val, YMin, YMin_val, YMax, YMax_val = [
        torch.from_numpy(x) for x in [X, X_val, YMin, YMin_val, YMax, YMax_val]
    ]

    training_message = "Training model"
    timer(train, filename, training_message)(model, X, X_val, YMin, YMin_val, YMax, YMax_val,
                                             args.N, args.M, args.epochs, args.lr,
                                             args.num_batches, args.batch_size, args.device)

    if not os.path.exists(args.model_path):
        os.mkdir(args.model_path)
    torch.save(model.state_dict(), os.path.join(args.model_path, 'model.pkl'))
    with open(os.path.join(args.model_path, 'config.json'), 'w') as f:
        json.dump(config, f, ensure_ascii=False, indent=4)
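The script ends by saving model.pkl and config.json. A hypothetical companion snippet (not part of the original) that reloads those artifacts, assuming the same Model signature and file layout:

import json
import os

import torch

from model import Model


def load_model(model_path):
    # Rebuild the model from the saved config and weights (assumed layout).
    with open(os.path.join(model_path, 'config.json')) as f:
        config = json.load(f)
    model = Model(**config)
    model.load_state_dict(torch.load(os.path.join(model_path, 'model.pkl')))
    model.eval()
    return model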