Exemple #1
0
    def process_tweets(self):
        """Build linked ``Tweet`` / ``Hashtag`` objects from the parsed tweets.

        Every element of ``self.tweet_JSON_objs`` that has a ``"text"`` key is
        lower-cased, stripped and passed through
        ``Tweet_Tokenizer.apply_regex_to_text`` together with the
        ``replace_hashtags`` / ``replace_links`` / ``replace_user_refs``
        settings stored on ``self``. A ``Tweet`` is created from the result,
        one ``Hashtag`` is created per entry in ``entities.hashtags``, and the
        tweet and its hashtags are registered with each other. Elements
        without a ``"text"`` key are skipped.

        When ``self.show_progress`` is true the work is reported through a
        ``Progress`` context manager; otherwise a single status line is
        printed. Nothing is returned.
        """
        from utils.twitter.tokenizer import Tweet_Tokenizer

        # for more documentation, visit:
        # https://dev.twitter.com/overview/api/tweets
        def extract_text(tweet_json):
            # Objects without a "text" field are ignored.
            try:
                raw_text = tweet_json["text"]
            except KeyError:
                return

            orig_text = raw_text.lower().strip()
            tweet_text = Tweet_Tokenizer.apply_regex_to_text(
                orig_text, self.replace_hashtags, self.replace_links,
                self.replace_user_refs)
            tweet_obj = Tweet(tweet_text, orig_text)
            # get, process, and package hashtags
            hashtags = [
                Hashtag(h["text"].lower())
                for h in tweet_json["entities"]["hashtags"]
            ]
            # connect hashtags to tweet and vice versa
            tweet_obj.register_hashtags(hashtags)
            for hashtag in hashtags:
                hashtag.register_tweet(tweet_obj)

        if self.show_progress:
            with Progress("Extracting tweets",
                          len(self.tweet_JSON_objs)) as up:
                # Walk every parsed tweet, exactly like the non-progress
                # branch below; the progress total is the full list length.
                for tweet_json in self.tweet_JSON_objs:
                    up(extract_text(tweet_json))
        else:
            print("Extracting tweets.")
            for tweet_json in self.tweet_JSON_objs:
                extract_text(tweet_json)
Exemple #2
0
    def __init__(self, tag: str, from_top: bool = True):
        """Prepare the listing URL, progress tracker and output paths for ``tag``.

        Args:
            tag: Tag name. It is URL-quoted and every "." in the quoted form
                is replaced by "*d*" when building ``self.base_url``.
            from_top: Passed to ``self._start_from_top``; the result is kept
                as ``self.from_top``.
        """
        encoded_tag = quote(tag).replace(".", "*d*")
        self.base_url = ('https://archiveofourown.org/tags/'
                         + encoded_tag
                         + '/works?page=')

        # The progress record is read before the base class is initialised.
        self.progress = Progress(paths.tag_path(tag))
        self.last = self.progress.read()[0]

        self.path = paths.meta_path(tag)
        super().__init__(tag + '_meta', paths.meta_log_path(tag))

        # Resolved last, once the base class has been set up.
        self.from_top = self._start_from_top(from_top)
Exemple #3
0
def main():
    """Train a ``DCGN`` model on MNIST and save it after every epoch.

    Uses the module-level ``args`` for ``dataset_path``, ``max_epoch``,
    ``max_loop_z`` and ``model_dir``. For each batch the model first runs
    ``max_loop_z`` rounds of ``forward_backprop_z`` + ``project_z_L2`` and
    then one ``forward_backprop_theta`` step whose return value is
    accumulated into the epoch cost.
    """
    # load data
    data_dir = args.dataset_path + 'MNIST_data'
    mnist = input_data.read_data_sets(data_dir, one_hot=True)
    # we can access to images like this:
    # images = mnist.train.images;  images.shape = []
    # labels = mnist.train.labels; each label is a probability distribution.
    print(mnist.train.num_examples)

    max_epoch = args.max_epoch
    max_loop_z = args.max_loop_z
    with tf.Graph().as_default():
        config = Config()
        # model = MLP(config)
        model = DCGN(config)
        tf.get_default_graph().finalize()

        progress = Progress()

        # Number of whole batches per epoch.
        n_batch_loop = int(mnist.train.num_examples / config.batch_size)
        for epoch in range(max_epoch):
            sum_cost = 0
            progress.start_epoch(epoch, max_epoch)

            for t in range(n_batch_loop):
                # batch_X: batch_size x n_input
                # batch_y: batch_size
                batch_X, batch_y = mnist.train.next_batch(
                    config.batch_size, shuffle=False)

                # Indices of this batch's samples: [t * B, (t + 1) * B).
                first_index = t * config.batch_size
                batch_indices = np.arange(first_index,
                                          first_index + config.batch_size)

                for _ in range(max_loop_z):
                    model.forward_backprop_z(batch_X, batch_indices)
                    model.project_z_L2()

                sum_cost += model.forward_backprop_theta(
                    batch_X, batch_indices)
                # model.increaseBatchID()

                # Refresh the progress display every tenth batch.
                if not t % 10:
                    progress.show(t, n_batch_loop, {})

            mean_cost = sum_cost / n_batch_loop
            print("cost: {}".format(mean_cost))
            model.save(epoch, args.model_dir)
Exemple #4
0
def extract_batch(filelist, savedir, descobj, verbose=False):
    """ Extract features/descriptors from a batch of images. Single-threaded.

    This function runs ``extract(impath, savedir, descobj)`` on every image in
    ``filelist``, one after another. Images that already have a
    feature/descriptor file are expected to be skipped by ``extract``.

    Arguments:
        filelist: A list of image file names, including their paths, to read
                  and extract descriptors from.
        savedir:  A directory in which to save all of the image features. It
                  is created (including missing parents) if it does not
                  exist. The features are pickled objects (protocol 2) with
                  the same name as the image file. The object that is pickled
                  is the return from descobj.extract().
        descobj:  An image descriptor object which does the actual extraction
                  work. The method called is descobj.extract(image). See
                  descriptors.Descriptor for an abstract base class.
        verbose:  bool, display progress?

    Returns:
        True if there were any errors extracting image features, False
        otherwise. If there is a problem extracting any image descriptors, a
        file "errors.log" is created in the savedir directory with a list of
        file names, error numbers and messages.

    """

    # Try to make the save path
    if not os.path.exists(savedir):
        os.makedirs(savedir)

    errflag = False

    # Set up progress updates
    nfiles = len(filelist)
    progbar = Progress(nfiles, title='Extracting descriptors', verbose=verbose)

    # Iterate through all of the images in filelist and extract features
    for i, impath in enumerate(filelist):
        # Keep the flag raised once any single extraction reports an error.
        errflag |= extract(impath, savedir, descobj)
        progbar.update(i)

    progbar.finished()

    if errflag:
        print('Done with errors. See the "errors.log" file in ' + savedir)

    # Hand the error status back to the caller, as documented above.
    return errflag
Exemple #5
0
def AddSequences():
    """Write one TFRecord file per sequence found under ``FLAGS.input_dir``.

    Sequences are discovered with ``FindPatternFiles`` and, when
    ``FLAGS.max_per_shard`` is positive, re-split with ``ShardSequences``.
    Each sequence is written to ``<FLAGS.output_dir>/<name>.tfrecord``. An
    existing file is kept (and its frames added to the progress counter) when
    ``CheckRecord`` accepts it and ``FLAGS.overwrite`` is not set; otherwise
    it is rewritten. The list of written files and all collected errors are
    logged at the end.
    """
    errors = []

    # Generate datasets file lists.
    sequences = FindPatternFiles(FLAGS.input_dir, FLAGS.view_pattern, errors)
    num_frames = PrintSequencesInfo(sequences,
                                    'Found the following datasets and files:')

    # Sharding and randomizing sets.
    if FLAGS.max_per_shard > 0:
        sequences = ShardSequences(sequences, FLAGS.max_per_shard)
        num_frames = PrintSequencesInfo(sequences, 'After sharding:')
        tf.logging.info('')

    # Process sets.
    progress = Progress(num_frames)
    output_list = []
    for sequence in sequences:
        record_name = os.path.join(FLAGS.output_dir,
                                   '%s.tfrecord' % sequence['name'])
        if tf.gfile.Exists(record_name) and not FLAGS.overwrite:
            # Separate name so the total frame count above is not clobbered.
            ok, existing_frames = CheckRecord(record_name, sequence)
            if ok:
                progress.Add(existing_frames)
                tf.logging.info('Skipping existing output file: %s' %
                                record_name)
                continue
            tf.logging.info('File does not match sequence, reprocessing...')

        output_dir = os.path.dirname(record_name)
        if not tf.gfile.Exists(output_dir):
            tf.logging.info('Creating output directory: %s' % output_dir)
            tf.gfile.MakeDirs(output_dir)
        output_list.append(record_name)
        tf.logging.info('Writing to ' + record_name)
        writer = tf.python_io.TFRecordWriter(record_name)
        try:
            AddSequence(sequence, writer, progress, errors)
        finally:
            # Close the writer even if AddSequence raises.
            writer.close()
    tf.logging.info('Wrote dataset files: ' + str(output_list))
    tf.logging.info('All errors (%d): %s' % (len(errors), str(errors)))
Exemple #6
0
 def __init__(self,
              json_txt,
              show_progress=False,
              replace_hashtags=True,
              replace_user_refs=False,
              replace_links=True):
     """Parse raw tweet lines into JSON objects and process them.

     Args:
         json_txt: Iterable of JSON strings, one per element. It must also
             support ``len()`` when ``show_progress`` is true.
         show_progress: If true, report parsing through a ``Progress``
             context manager; otherwise print a single status line.
         replace_hashtags: Stored on the instance as ``replace_hashtags``.
         replace_user_refs: Stored on the instance as ``replace_user_refs``.
         replace_links: Stored on the instance as ``replace_links``.

     The parsed objects are stored in ``self.tweet_JSON_objs`` and
     ``self.process_tweets()`` is called afterwards.
     """
     import json
     self.show_progress = show_progress
     self.replace_links = replace_links
     self.replace_hashtags = replace_hashtags
     self.replace_user_refs = replace_user_refs
     #parse text into json objects
     if self.show_progress:
         from utils.progress import Progress
         with Progress("Parsing text into JSON Object",
                       len(json_txt)) as up:
             parsed_objs = []
             for line in json_txt:
                 parsed = json.loads(line)
                 # Tick the progress callback with the parsed object; the
                 # object itself is stored, so the result does not depend
                 # on what the callback returns.
                 up(parsed)
                 parsed_objs.append(parsed)
             self.tweet_JSON_objs = parsed_objs
     else:
         print("Parsing text into JSON Object.")
         self.tweet_JSON_objs = [json.loads(line) for line in json_txt]
     #extract text from tweets
     self.process_tweets()
Exemple #7
0
def main(args):
    """Train a VQ-VAE style model, keep the best checkpoint, and plot results.

    Loads the data loaders via ``data.get_data``, builds the model selected
    by ``args.model`` (``'diffvqvae'`` or ``'vqvae'``), trains it for
    ``args.num_epochs`` epochs with Adam, and saves the state dict to
    ``args.save_path`` whenever the validation loss improves. Afterwards the
    training curves are plotted to ``results/train.png`` and samples are
    generated from the validation loader.

    Several derived values are written back onto ``args`` (``input_size``,
    ``downsample``, ``data_variance``, ``KL``, ``num_pixels``,
    ``global_it``).

    Raises:
        ValueError: if ``args.model`` is not one of the supported names.
    """
    ###############################
    # TRAIN PREP
    ###############################
    print("Loading data")
    train_loader, valid_loader, data_var, input_size = data.get_data(
        args.data_folder, args.batch_size)

    args.input_size = input_size
    args.downsample = args.input_size[-1] // args.enc_height
    args.data_variance = data_var
    print(f"Training set size {len(train_loader.dataset)}")
    print(f"Validation set size {len(valid_loader.dataset)}")

    print("Loading model")
    if args.model == 'diffvqvae':
        model = DiffVQVAE(args).to(device)
    elif args.model == 'vqvae':
        model = VQVAE(args).to(device)
    else:
        # Fail here with a clear message instead of an unbound ``model``
        # error further down.
        raise ValueError(
            f"Unknown model {args.model!r}; expected 'diffvqvae' or 'vqvae'")
    print(
        f'The model has {utils.count_parameters(model):,} trainable parameters'
    )

    optimizer = optim.Adam(model.parameters(),
                           lr=args.learning_rate,
                           amsgrad=False)

    print(f"Start training for {args.num_epochs} epochs")
    num_batches = math.ceil(
        len(train_loader.dataset) / train_loader.batch_size)
    pbar = Progress(num_batches, bar_length=10, custom_increment=True)

    # Needed for bpd
    args.KL = (args.enc_height * args.enc_height * args.num_codebooks *
               np.log(args.num_embeddings))
    args.num_pixels = np.prod(args.input_size)

    ###############################
    # MAIN TRAIN LOOP
    ###############################
    best_valid_loss = float('inf')
    train_bpd = []
    train_recon_error = []
    train_perplexity = []
    args.global_it = 0
    for epoch in range(args.num_epochs):
        pbar.epoch_start()
        train_epoch(args, vq_vae_loss, pbar, train_loader, model, optimizer,
                    train_bpd, train_recon_error, train_perplexity)
        # loss, _ = test(valid_loader, model, args)
        # pbar.print_eval(loss)
        valid_loss = evaluate(args, vq_vae_loss, pbar, valid_loader, model)
        # Checkpoint only when the validation loss improves.
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), args.save_path)
        pbar.print_end_epoch()

    print("Plotting training results")
    utils.plot_results(train_recon_error, train_perplexity,
                       "results/train.png")

    print("Evaluate and plot validation set")
    generate_samples(model, valid_loader)
Exemple #8
0
 def embed_tweets_hashtags(self, tweets, hashtags):
     """Compute embeddings for all tweets, then all hashtags, with progress.

     Calls ``self.tweet_embedding`` on each tweet and
     ``self.hashtag_embedding`` on each hashtag, passing every result to the
     ``Progress`` callback. The progress total is the combined number of
     items. Nothing is returned.
     """
     total_items = len(hashtags) + len(tweets)
     with Progress("Calculating hashtag and tweet embeddings",
                   total_items) as up:
         for tweet in tweets:
             up(self.tweet_embedding(tweet))
         for hashtag in hashtags:
             up(self.hashtag_embedding(hashtag))
Exemple #9
0
def extract_smp(filelist, savedir, descobj, njobs=None, verbose=False):
    """ Extract features/descriptors from a batch of images using a worker pool.

    This function dispatches one ``(image path, savedir, descobj)`` job per
    image to an ``mp.Pool`` via ``map_async`` and ``__extract_star``. Images
    that already have a feature/descriptor file are expected to be skipped by
    the worker function. This is an SMP pipeline suitable for running on a
    single computer.

    Arguments:
        filelist: A list of image file names, including their paths, to read
                  and extract descriptors from.
        savedir:  A directory in which to save all of the image features. It
                  is created (including missing parents) if it does not
                  exist. The features are pickled objects (protocol 2) with
                  the same name as the image file. The object that is pickled
                  is the return from descobj.extract().
        descobj:  An image descriptor object which does the actual extraction
                  work. The method called is descobj.extract(image). See
                  descriptors.Descriptor for an abstract base class.
        njobs:    int, number of workers, passed to the pool as
                  ``processes``. If None, the pool picks its own default.
        verbose:  bool, display progress?

    Returns:
        True if there were any errors extracting image features, False
        otherwise. If there is a problem extracting any image descriptors, a
        file "errors.log" is created in the savedir directory with a list of
        file names, error numbers and messages.

    """

    # Try to make the save path
    if not os.path.exists(savedir):
        os.makedirs(savedir)

    # Set up parallel job
    pool = mp.Pool(processes=njobs)

    # One argument tuple per image. A plain list works on both Python 2 and
    # Python 3 (itertools.izip only exists on Python 2).
    jobs = [(impath, savedir, descobj) for impath in filelist]
    result = pool.map_async(__extract_star, jobs)

    # Set up progress updates
    nfiles = len(filelist)
    progbar = Progress(nfiles, title='Extracting descriptors', verbose=verbose)

    # Poll for status only when progress display was requested; otherwise
    # result.get() below simply blocks until the jobs are done.
    while verbose and not result.ready():
        # Rough count of finished images, derived from the async result's
        # private chunk bookkeeping (_number_left, _chunksize).
        approx_done = nfiles - result._number_left * result._chunksize
        progbar.update(max(0, approx_done))
        time.sleep(5)

    progbar.finished()

    # Get notification of errors
    errflag = any(result.get())
    pool.close()
    pool.join()

    if errflag:
        print('Done, with errors. See the "errors.log" file in ' + savedir)

    # Hand the error status back to the caller, as documented above.
    return errflag