def process_tweets(self):
    """Turn every parsed tweet JSON object into linked Tweet/Hashtag objects.

    Iterates over ``self.tweet_JSON_objs``; objects without a ``"text"``
    key are skipped. For the rest, the lower-cased, stripped text is run
    through ``Tweet_Tokenizer.apply_regex_to_text`` using the
    ``replace_hashtags`` / ``replace_links`` / ``replace_user_refs`` flags
    stored on the instance, a ``Tweet`` is built from the result, and one
    ``Hashtag`` per entry of ``tweet_json["entities"]["hashtags"]`` is
    created and cross-registered with the tweet.

    When ``self.show_progress`` is set, a ``Progress`` context reports one
    tick per tweet; otherwise a single status line is printed.
    """
    from utils.twitter.tokenizer import Tweet_Tokenizer

    # for more documentation, visit:
    # https://dev.twitter.com/overview/api/tweets
    def extract_text(tweet_json):
        """Process one tweet JSON object; returns None in every case."""
        try:
            raw_text = tweet_json["text"]
        except KeyError:
            # Not every object in the stream carries text; skip those.
            return

        orig_text = raw_text.lower().strip()
        tweet_text = Tweet_Tokenizer.apply_regex_to_text(
            orig_text, self.replace_hashtags, self.replace_links,
            self.replace_user_refs)
        tweet_obj = Tweet(tweet_text, orig_text)

        # get, process, and package hashtags
        # NOTE(review): the original comment says constructing Hashtag also
        # stores and counts them -- confirm against the Hashtag class.
        hashtags = [
            Hashtag(h["text"].lower())
            for h in tweet_json["entities"]["hashtags"]
        ]

        # connect hashtags to tweet and vice versa
        tweet_obj.register_hashtags(hashtags)
        for hashtag in hashtags:
            hashtag.register_tweet(tweet_obj)

    if self.show_progress:
        # Same function-scope import the constructor uses for Progress.
        from utils.progress import Progress

        with Progress("Extracting tweets", len(self.tweet_JSON_objs)) as up:
            # Iterate over ALL tweets. The previous code iterated over
            # self.tweet_JSON_objs[0], i.e. only the first tweet object.
            for tweet_json in self.tweet_JSON_objs:
                up(extract_text(tweet_json))
    else:
        print("Extracting tweets.")
        for tweet_json in self.tweet_JSON_objs:
            extract_text(tweet_json)
def __init__(self, tag: str, from_top: bool = True):
    """Prepare the listing URL, progress state and output paths for a tag.

    Args:
        tag: Tag name. It is percent-quoted and every "." in the quoted
            form is replaced by "*d*" before being placed in the URL.
        from_top: Forwarded to ``self._start_from_top``; the result is
            stored as ``self.from_top``.
    """
    encoded_tag = quote(tag).replace(".", "*d*")
    self.base_url = ('https://archiveofourown.org/tags/'
                     + encoded_tag
                     + '/works?page=')

    # Progress is keyed on the tag path; the first item it reads back is
    # kept as ``self.last``.
    self.progress = Progress(paths.tag_path(tag))
    self.last = self.progress.read()[0]

    self.path = paths.meta_path(tag)

    # The base class is initialised with a "<tag>_meta" name and the
    # metadata log path.
    super().__init__(tag + '_meta', paths.meta_log_path(tag))

    self.from_top = self._start_from_top(from_top)
def main():
    """Train a DCGN model on MNIST, saving the model after every epoch.

    Reads ``dataset_path``, ``max_epoch``, ``max_loop_z`` and ``model_dir``
    from the module-level ``args``. For each mini-batch the model's z update
    (followed by ``project_z_L2``) is run ``max_loop_z`` times, then one
    theta update is run and its cost accumulated. The per-epoch cost divided
    by the number of batches is printed.
    """
    # load data
    mnist = input_data.read_data_sets(args.dataset_path + 'MNIST_data',
                                      one_hot=True)
    # Images are reachable as mnist.train.images and labels as
    # mnist.train.labels.
    print(mnist.train.num_examples)

    max_epoch = args.max_epoch
    max_loop_z = args.max_loop_z

    with tf.Graph().as_default():
        config = Config()
        model = DCGN(config)
        tf.get_default_graph().finalize()

        progress = Progress()
        n_batch_loop = int(mnist.train.num_examples / config.batch_size)

        for epoch in range(max_epoch):
            epoch_cost = 0
            progress.start_epoch(epoch, max_epoch)

            for step in range(n_batch_loop):
                # Labels are fetched together with the images but not used.
                batch_X, _ = mnist.train.next_batch(config.batch_size,
                                                    shuffle=False)
                # Batches are drawn unshuffled, so these are the positions
                # of the batch's samples within the training set.
                first = step * config.batch_size
                batch_indices = np.arange(first, first + config.batch_size)

                for _ in range(max_loop_z):
                    model.forward_backprop_z(batch_X, batch_indices)
                    model.project_z_L2()

                epoch_cost += model.forward_backprop_theta(batch_X,
                                                           batch_indices)

                if step % 10 == 0:
                    progress.show(step, n_batch_loop, {})

            print("cost: {}".format(epoch_cost / n_batch_loop))
            model.save(epoch, args.model_dir)
def extract_batch(filelist, savedir, descobj, verbose=False):
    """ Extract features/descriptors from a batch of images. Single-threaded.

    This function calls an image descriptor object on a batch of images in
    order to extract the image descriptors. If a feature/descriptor file
    already exists for the image, it is skipped. This is a single-threaded
    pipeline.

    Arguments:
        filelist: A list of image file names, including their paths, to
            read and extract descriptors from.
        savedir: A directory in which to save all of the image features.
            It is created if it does not exist. The features are pickled
            objects (protocol 2) with the same name as the image file. The
            object that is pickled is the return from descobj.extract().
        descobj: An image descriptor object which does the actual
            extraction work. The method called is descobj.extract(image).
            See descriptors.Descriptor for an abstract base class.
        verbose: bool, display progress?

    Returns:
        True if there were any errors extracting image features, False
        otherwise. If there is a problem extracting any image descriptors,
        a file "errors.log" is created in the savedir directory with a list
        of file names, error numbers and messages.
    """

    # Try to make the save path
    if not os.path.exists(savedir):
        os.mkdir(savedir)

    errflag = False

    # Set up progress updates
    nfiles = len(filelist)
    progbar = Progress(nfiles, title='Extracting descriptors',
                       verbose=verbose)

    # Iterate through all of the images in filelist and extract features
    for i, impath in enumerate(filelist):
        errflag |= extract(impath, savedir, descobj)
        progbar.update(i)

    progbar.finished()

    if errflag:
        print('Done with errors. See the "errors.log" file in ' + savedir)

    # The docstring has always promised this value; previously the function
    # fell off the end and returned None.
    return bool(errflag)
def AddSequences():
    """Writes one TFRecord file per (optionally sharded) input sequence.

    Sequences are discovered with FindPatternFiles from FLAGS.input_dir and
    FLAGS.view_pattern. If FLAGS.max_per_shard is positive they are
    re-sharded with ShardSequences. Each sequence is written to
    '<FLAGS.output_dir>/<sequence name>.tfrecord'. An existing record is
    skipped when FLAGS.overwrite is unset and CheckRecord accepts it;
    otherwise it is (re)written. Collected errors and the list of written
    files are logged at the end.
    """
    errors = []

    # Generate datasets file lists.
    sequences = FindPatternFiles(FLAGS.input_dir, FLAGS.view_pattern, errors)
    num_frames = PrintSequencesInfo(sequences,
                                    'Found the following datasets and files:')

    # Sharding and randomizing sets.
    if FLAGS.max_per_shard > 0:
        sequences = ShardSequences(sequences, FLAGS.max_per_shard)
        num_frames = PrintSequencesInfo(sequences, 'After sharding:')
        tf.logging.info('')

    # Process sets.
    progress = Progress(num_frames)
    output_list = []
    for sequence in sequences:
        record_name = os.path.join(FLAGS.output_dir,
                                   '%s.tfrecord' % sequence['name'])
        if tf.gfile.Exists(record_name) and not FLAGS.overwrite:
            # Use a separate name so the total frame count above is not
            # shadowed by the per-record count.
            ok, record_frames = CheckRecord(record_name, sequence)
            if ok:
                progress.Add(record_frames)
                tf.logging.info('Skipping existing output file: %s' %
                                record_name)
                continue
            tf.logging.info('File does not match sequence, reprocessing...')

        output_dir = os.path.dirname(record_name)
        if not tf.gfile.Exists(output_dir):
            tf.logging.info('Creating output directory: %s' % output_dir)
            tf.gfile.MakeDirs(output_dir)

        output_list.append(record_name)
        tf.logging.info('Writing to ' + record_name)
        writer = tf.python_io.TFRecordWriter(record_name)
        try:
            AddSequence(sequence, writer, progress, errors)
        finally:
            # Always release the file handle, even if AddSequence raises.
            writer.close()

    tf.logging.info('Wrote dataset files: ' + str(output_list))
    tf.logging.info('All errors (%d): %s' % (len(errors), str(errors)))
def __init__(self,
             json_txt,
             show_progress=False,
             replace_hashtags=True,
             replace_user_refs=False,
             replace_links=True):
    """Parse tweet JSON lines and process the resulting tweets.

    Args:
        json_txt: Sized iterable of strings, each a JSON document; every
            item is passed to ``json.loads``.
        show_progress: If true, report progress through
            ``utils.progress.Progress``; otherwise print one status line.
        replace_hashtags: Stored on the instance as ``replace_hashtags``.
        replace_user_refs: Stored on the instance as ``replace_user_refs``.
        replace_links: Stored on the instance as ``replace_links``.

    The parsed objects are stored in ``self.tweet_JSON_objs`` and
    ``self.process_tweets()`` is then called.
    """
    import json

    self.show_progress = show_progress
    self.replace_links = replace_links
    self.replace_hashtags = replace_hashtags
    self.replace_user_refs = replace_user_refs

    # parse text into json objects
    if self.show_progress:
        from utils.progress import Progress

        parsed_objs = []
        with Progress("Parsing text into JSON Object", len(json_txt)) as up:
            for line in json_txt:
                parsed = json.loads(line)
                # The previous code called an undefined name ``u`` here.
                # Tick the progress callback and keep the parsed object
                # ourselves rather than relying on what ``up`` returns.
                up(parsed)
                parsed_objs.append(parsed)
        self.tweet_JSON_objs = parsed_objs
    else:
        print("Parsing text into JSON Object.")
        self.tweet_JSON_objs = [json.loads(line) for line in json_txt]

    # extract text from tweets
    self.process_tweets()
def main(args):
    """Train a VQ-VAE variant, keep the best checkpoint and plot results.

    Args:
        args: Options namespace. Reads ``data_folder``, ``batch_size``,
            ``enc_height``, ``model``, ``learning_rate``, ``num_epochs``,
            ``num_codebooks``, ``num_embeddings`` and ``save_path``.
            Derived values are written back onto it: ``input_size``,
            ``downsample``, ``data_variance``, ``KL``, ``num_pixels`` and
            ``global_it``.

    Raises:
        ValueError: If ``args.model`` is neither ``'diffvqvae'`` nor
            ``'vqvae'``.
    """
    ###############################
    # TRAIN PREP
    ###############################
    print("Loading data")
    train_loader, valid_loader, data_var, input_size = \
        data.get_data(args.data_folder, args.batch_size)

    args.input_size = input_size
    args.downsample = args.input_size[-1] // args.enc_height
    args.data_variance = data_var
    print(f"Training set size {len(train_loader.dataset)}")
    print(f"Validation set size {len(valid_loader.dataset)}")

    print("Loading model")
    if args.model == 'diffvqvae':
        model = DiffVQVAE(args).to(device)
    elif args.model == 'vqvae':
        model = VQVAE(args).to(device)
    else:
        # Without this branch an unknown name left ``model`` unbound and
        # failed further down with an UnboundLocalError.
        raise ValueError(
            f"Unknown model {args.model!r}; expected 'diffvqvae' or 'vqvae'")
    print(
        f'The model has {utils.count_parameters(model):,} trainable parameters'
    )

    optimizer = optim.Adam(model.parameters(),
                           lr=args.learning_rate,
                           amsgrad=False)

    print(f"Start training for {args.num_epochs} epochs")
    num_batches = math.ceil(
        len(train_loader.dataset) / train_loader.batch_size)
    pbar = Progress(num_batches, bar_length=10, custom_increment=True)

    # Needed for bpd
    args.KL = args.enc_height * args.enc_height * args.num_codebooks * \
        np.log(args.num_embeddings)
    args.num_pixels = np.prod(args.input_size)

    ###############################
    # MAIN TRAIN LOOP
    ###############################
    best_valid_loss = float('inf')
    train_bpd = []
    train_recon_error = []
    train_perplexity = []
    args.global_it = 0
    for epoch in range(args.num_epochs):
        pbar.epoch_start()
        train_epoch(args, vq_vae_loss, pbar, train_loader, model, optimizer,
                    train_bpd, train_recon_error, train_perplexity)
        valid_loss = evaluate(args, vq_vae_loss, pbar, valid_loader, model)
        # Checkpoint only when the validation loss improves.
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), args.save_path)
        pbar.print_end_epoch()

    print("Plotting training results")
    utils.plot_results(train_recon_error, train_perplexity,
                       "results/train.png")

    print("Evaluate and plot validation set")
    generate_samples(model, valid_loader)
def embed_tweets_hashtags(self, tweets, hashtags):
    """Compute an embedding for every tweet, then for every hashtag.

    Calls ``self.tweet_embedding`` on each tweet and
    ``self.hashtag_embedding`` on each hashtag, passing each result to the
    progress callback. The results are not collected or returned here.

    Args:
        tweets: Sized iterable of tweets.
        hashtags: Sized iterable of hashtags.
    """
    total = len(hashtags) + len(tweets)
    with Progress("Calculating hashtag and tweet embeddings", total) as up:
        # Plain loops: the calls are made for their side effects, so there
        # is no reason to build throwaway lists of the callback's results.
        for tweet in tweets:
            up(self.tweet_embedding(tweet))
        for hashtag in hashtags:
            up(self.hashtag_embedding(hashtag))
def extract_smp(filelist, savedir, descobj, njobs=None, verbose=False):
    """ Extract features/descriptors from a batch of images. Multi-process.

    This function calls an image descriptor object on a batch of images in
    order to extract the image descriptors. If a feature/descriptor file
    already exists for the image, it is skipped. This is a parallel (SMP)
    pipeline, built on a multiprocessing pool, suitable for running on a
    single computer.

    Arguments:
        filelist: A list of image file names, including their paths, to
            read and extract descriptors from.
        savedir: A directory in which to save all of the image features.
            It is created if it does not exist. The features are pickled
            objects (protocol 2) with the same name as the image file. The
            object that is pickled is the return from descobj.extract().
        descobj: An image descriptor object which does the actual
            extraction work. The method called is descobj.extract(image).
            See descriptors.Descriptor for an abstract base class.
        njobs: int, number of worker processes to use. If None, the pool
            chooses its default (the number of cores).
        verbose: bool, display progress?

    Returns:
        True if there were any errors extracting image features, False
        otherwise. If there is a problem extracting any image descriptors,
        a file "errors.log" is created in the savedir directory with a list
        of file names, error numbers and messages.
    """

    # Try to make the save path
    if not os.path.exists(savedir):
        os.mkdir(savedir)

    nfiles = len(filelist)

    # Set up parallel job
    pool = mp.Pool(processes=njobs)
    try:
        # Hand every (file, savedir, descobj) triple to the pool. The
        # builtin zip stops at the end of filelist and, unlike
        # itertools.izip, exists on both Python 2 and Python 3.
        result = pool.map_async(
            __extract_star,
            zip(filelist, itertools.repeat(savedir),
                itertools.repeat(descobj)))

        # Set up progress updates
        progbar = Progress(nfiles, title='Extracting descriptors',
                           verbose=verbose)

        # Poll for status only when progress is requested; otherwise
        # result.get() below simply blocks until the work is done.
        while verbose and not result.ready():
            # Estimate built from private attributes of the async result;
            # it can be off (even negative), hence the clamp.
            approx_done = nfiles - result._number_left * result._chunksize
            progbar.update(max(0, approx_done))
            time.sleep(5)

        progbar.finished()

        # Get notification of errors
        errflag = any(result.get())
        pool.close()
    except BaseException:
        # Do not leave worker processes behind if anything above fails
        # (including Ctrl-C while waiting).
        pool.terminate()
        pool.join()
        raise
    pool.join()

    if errflag:
        print('Done, with errors. See the "errors.log" file in ' + savedir)

    # The docstring has always promised this value; previously the function
    # fell off the end and returned None.
    return errflag