Example #1
    def process(self, filename):
        """Scan the file, handle 'RF' records, and report progress by bytes read."""
        curSize = 0
        progress = Progress(filename, 0, os.stat(filename).st_size)
        with open(filename) as f:
            for ln in f:
                if ln.startswith('RF'):
                    self.processRF(ln)
                curSize += len(ln)
                progress.update(curSize)
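The Progress implementation behind this snippet is not shown. As a rough sketch only, a class compatible with the `Progress(name, start, total)` / `update(current)` calls above could look like the following (the percentage formatting and attribute names are assumptions, not the original code):

import sys


class Progress:
    """Minimal sketch: prints a percentage based on current/total units processed."""

    def __init__(self, name, start, total):
        self.name = name
        self.total = max(total, 1)   # avoid division by zero on empty inputs
        self.current = start

    def update(self, current):
        self.current = current
        percent = 100.0 * self.current / self.total
        sys.stdout.write('\r%s: %5.1f%%' % (self.name, percent))
        sys.stdout.flush()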
Example #2
    def __init__(self, tag: str, from_top: bool = True):
        self.base_url = (f'https://archiveofourown.org/tags/'
                         f'{quote(tag).replace(".", "*d*")}/works?page=')
        tag_path = paths.tag_path(tag)
        self.progress = Progress(tag_path)
        self.last = self.progress.read()[0]

        self.path = paths.meta_path(tag)
        log_path = paths.meta_log_path(tag)
        super().__init__(tag + '_meta', log_path)

        self.from_top = self._start_from_top(from_top)
Example #3
def main(argv=None):
    _init_output_directories()

    # step 1: create the negatives and positives directories
    if FLAGS.do_full_prepare:
        print('Loading labels from %s' % FLAGS.label_file)
        lr = LabelRecord()
        label_records = lr.load(FLAGS.label_file)
        all_bounding_boxes = Box.get_all_bounding_boxes(label_records)
        counter = 0

        # fill examples, originals, negatives, and positives directories
        print('Processing images...')
        for (_, v) in label_records.items():
            Progress.show_progress(counter)
            image = CXRImage.get_image_data(v.filename, FLAGS.image_path)
            basefilename = os.path.splitext(v.filename)[0]
            if v.hasBoundingBox:
                for i in range(0, v.boundingBoxes.shape[0]):
                    box = v.boundingBoxes[i, :]
                    #CXRImage.extract_center_and_write(image,box,1024,1024,FLAGS.positives_dir)
                    CXRImage.extract_anisotropic_scale_and_write(
                        image, box, FLAGS.image_size, FLAGS.image_size,
                        FLAGS.positives_dir)
                CXRImage.write_image(image, FLAGS.examples_dir,
                                     "%s.jpg" % basefilename)
            else:
                i = np.int32(
                    np.random.randint(0, all_bounding_boxes.shape[0] - 1))
                box = all_bounding_boxes[i, :]
                CXRImage.extract_anisotropic_scale_and_write(
                    image, box, FLAGS.image_size, FLAGS.image_size,
                    FLAGS.negatives_dir)
                #CXRImage.extract_center_and_write(image,box,1024,1024,FLAGS.negatives_dir)

            if v.hasBoundingBox:
                img = (CXRImage.xlate_image(image) * 255).astype(np.uint8)
                CXRImage.write_image_with_bounding_boxes(
                    img, FLAGS.originals_dir, "%s.jpg" % basefilename,
                    v.boundingBoxes)
            counter += 1

    # step 2: create the pre-training features by combining negatives and positives into pre_train.tfrecord
    print('\nCreating pre-train file...')
    rec = Record(1024, 1024, 1 if FLAGS.grayscale else 3)
    total = rec.create_pre_train_file(FLAGS.positives_dir, FLAGS.negatives_dir,
                                      FLAGS.pre_train_file)
    print('\n%d files combined in %s' % (total, FLAGS.pre_train_file))
Example #4
    def process_tweets(self):
        from utils.twitter.tokenizer import Tweet_Tokenizer

        # for more documentation, visit:
        # https://dev.twitter.com/overview/api/tweets
        def extract_text(tweet_json):
            try:
                tweet_json["text"]
            except KeyError:
                return

            orig_text = tweet_json["text"].lower().strip()
            tweet_text = Tweet_Tokenizer.apply_regex_to_text(
                orig_text, self.replace_hashtags, self.replace_links,
                self.replace_user_refs)
            tweet_obj = Tweet(tweet_text, orig_text)
            #get, process, and package hashtags (this also stores and counts them)
            hashtags = [
                Hashtag(h["text"].lower())
                for h in tweet_json["entities"]["hashtags"]
            ]
            #connect hashtags to tweet and vice versa
            tweet_obj.register_hashtags(hashtags)
            [h.register_tweet(tweet_obj) for h in hashtags]

        if self.show_progress:
            with Progress("Extracting tweets",
                          len(self.tweet_JSON_objs)) as up:
                [
                    up(extract_text(tweet_obj))
                    for tweet_obj in self.tweet_JSON_objs
                ]
        else:
            print("Extracting tweets.")
            [extract_text(tweet_obj) for tweet_obj in self.tweet_JSON_objs]
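The context-manager form used here (and in Examples #12 and #17) treats the object yielded by `with Progress(...) as up` as a callable that advances the counter; in Example #12 it must also return its argument so the surrounding list comprehension keeps the parsed JSON objects. A minimal sketch under those assumptions, not the project's actual utils.progress module:

import sys


class Progress:
    """Minimal sketch of the context-manager usage above: up(value) advances
    the counter by one and returns value unchanged."""

    def __init__(self, message, total):
        self.message = message
        self.total = max(total, 1)
        self.count = 0

    def __enter__(self):
        return self._tick

    def __exit__(self, exc_type, exc_value, traceback):
        sys.stdout.write('\n')
        return False

    def _tick(self, value=None):
        self.count += 1
        sys.stdout.write('\r%s: %d/%d' % (self.message, self.count, self.total))
        sys.stdout.flush()
        return value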
Example #5
    def process(self, filename, minlevel, maxlevel):
        """Scan the file, dispatch on record type, and report progress by bytes read."""
        self.__minlevel = minlevel
        self.__maxlevel = maxlevel
        cursize = 0
        progress = Progress(filename, 0, os.stat(filename).st_size)
        with open(filename) as f:
            for ln in f:
                if ln.startswith('AF'):
                    self.processAF(ln)
                elif ln.startswith('LF'):
                    self.processLF(ln)
                elif ln.startswith('LM'):
                    self.processLM(ln)
                elif ln.startswith('PF'):
                    self.processPF(ln)
                cursize += len(ln)
                progress.update(cursize)
Example #6
def extract_batch(filelist, savedir, descobj, verbose=False):
    """ Extract features/descriptors from a batch of images. Single-threaded. 

    This function calls an image descripor object on a batch of images in order
    to extract the images descripor. If a feature/descriptor file already exists
    for the image, it is skipped. This is a single-threaded pipeline.

    Arguments:
        filelist: A list of files of image names including their paths of images
                  to read and extract descriptors from
        savedir:  A directory in which to save all of the image features. They
                  are pickled objects (protocol 2) with the same name as the
                  image file. The object that is pickled is the return from
                  descobj.extract().
        decobj:   An image descriptor object which does the actual extraction
                  work. the method called is descobj.extract(image). See
                  descriptors.Descriptor for an abstract base class. 
        verbose:  bool, display progress?

    Returns:
        True if there we any errors extracting image features. False otherwise. 
        If there is a problem extracting any image descriptors, a file
        "errors.log" is created in the savedir directory with a list of file
        names, error number and messages.

    """

    # Try to make the save path
    if not os.path.exists(savedir):
        os.mkdir(savedir)

    errflag = False

    # Set up progress updates
    nfiles = len(filelist)
    progbar = Progress(nfiles, title="Extracting descriptors", verbose=verbose)

    # Iterate through all of the images in filelist and extract features
    for i, impath in enumerate(filelist):
        errflag |= extract(impath, savedir, descobj)
        progbar.update(i)

    progbar.finished()

    if errflag:
        print('Done with errors. See the "errors.log" file in ' + savedir)

    return errflag
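A hedged usage sketch for extract_batch: the descriptor class, the glob pattern, and the assumption that extract() receives a NumPy image array are all illustrative, not taken from the project; the boolean return is the one documented in the docstring.

from glob import glob


class MeanColourDescriptor:
    """Toy stand-in for descriptors.Descriptor: returns the mean pixel value."""

    def extract(self, image):
        return image.mean(axis=(0, 1))


had_errors = extract_batch(glob('images/*.jpg'), 'features/',
                           MeanColourDescriptor(), verbose=True)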
Example #7
    def build(self, model, grid, level):
        """Serialize each mesh parcel into the stream and report progress per mesh."""
        simplify = True
        self.__meshset = set()
        total = len(model)
        self.__model = model
        counter = 0
        progress = Progress("construct<%d>...." % level, 0, total)
        for mesh, parcel in self.__model.items():
            mainmap = MainMapBuilder()
            rect = grid.getMeshRect(mesh)
            data = mainmap.build(parcel, rect, simplify)
            offset = self.__stream.tell()
            self.__stream.write(data)
            self.__index_builder.registerParcel(mesh, offset, len(data))
            counter += 1
            progress.update(counter)
Example #8
    def process(self, level):
        # enumerate the meshes to build at this level
        meshList = set()
        if level == 0:
            for meshId in self.__modelSrc.keys():
                meshList.add(meshId)
        else:
            for meshId in self.__modelSrc.keys():
                meshList.add(self.__grid.getUpperMesh(meshId))
        # construct each parcel, reporting progress as we go
        counter = 0
        progress = Progress("building <%d>...." % level, 0, len(meshList))
        for meshId in meshList:
            progress.update(counter)
            counter += 1
            parcel = self.__constructParcel(meshId)
            if parcel:
                self.__modelOut.put(meshId, parcel)
        return None
Example #9
def AddSequences():
    """Creates one training, validation."""
    errors = []

    # Generate datasets file lists.
    sequences = FindPatternFiles(FLAGS.input_dir, FLAGS.view_pattern, errors)
    num_frames = PrintSequencesInfo(sequences,
                                    'Found the following datasets and files:')

    # Sharding and randomizing sets.
    if FLAGS.max_per_shard > 0:
        sequences = ShardSequences(sequences, FLAGS.max_per_shard)
        num_frames = PrintSequencesInfo(sequences, 'After sharding:')
        tf.logging.info('')

    # Process sets.
    progress = Progress(num_frames)
    output_list = []
    for sequence in sequences:
        record_name = os.path.join(FLAGS.output_dir,
                                   '%s.tfrecord' % sequence['name'])
        if tf.gfile.Exists(record_name) and not FLAGS.overwrite:
            ok, num_frames = CheckRecord(record_name, sequence)
            if ok:
                progress.Add(num_frames)
                tf.logging.info('Skipping existing output file: %s' %
                                record_name)
                continue
            else:
                tf.logging.info(
                    'File does not match sequence, reprocessing...')
        output_dir = os.path.dirname(record_name)
        if not tf.gfile.Exists(output_dir):
            tf.logging.info('Creating output directory: %s' % output_dir)
            tf.gfile.MakeDirs(output_dir)
        output_list.append(record_name)
        tf.logging.info('Writing to ' + record_name)
        writer = tf.python_io.TFRecordWriter(record_name)
        AddSequence(sequence, writer, progress, errors)
        writer.close()
    tf.logging.info('Wrote dataset files: ' + str(output_list))
    tf.logging.info('All errors (%d): %s' % (len(errors), str(errors)))
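Here Progress is constructed with a total frame count and advanced with .Add(num_frames), both directly and inside AddSequence. A minimal sketch of a counter with that interface, offered purely as an assumption about the unshown class:

class Progress:
    """Minimal sketch: accumulate processed frames and report the fraction done.
    The Add() name mirrors the call sites above; the print format is an assumption."""

    def __init__(self, total_frames):
        self.total = max(total_frames, 1)
        self.done = 0

    def Add(self, num_frames):
        self.done += num_frames
        print('Processed %d / %d frames (%.1f%%)'
              % (self.done, self.total, 100.0 * self.done / self.total))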
Example #10
def main():
    # load data
    mnist = input_data.read_data_sets(args.dataset_path + 'MNIST_data',
                                      one_hot=True)
    # we can access images like this:
    # images = mnist.train.images;  images.shape = []
    # labels = mnist.train.labels; each label is a probability distribution.
    print(mnist.train.num_examples)

    max_epoch = args.max_epoch
    max_loop_z = args.max_loop_z
    with tf.Graph().as_default():
        config = Config()
        # model = MLP(config)
        model = DCGN(config)
        tf.get_default_graph().finalize()

        progress = Progress()

        n_batch_loop = int(mnist.train.num_examples / config.batch_size)
        for epoch in range(max_epoch):
            sum_cost = 0
            progress.start_epoch(epoch, max_epoch)

            for t in range(n_batch_loop):
                # batch_X: batch_size x n_input
                # batch_y: batch_size

                batch_X, batch_y = mnist.train.next_batch(config.batch_size,
                                                          shuffle=False)
                batch_indices = np.arange(
                    t * config.batch_size,
                    t * config.batch_size + config.batch_size)
                for z_t in range(max_loop_z):
                    cost_z = model.forward_backprop_z(batch_X, batch_indices)
                    model.project_z_L2()
                    #print(cost_z)

                cost_per_sample = model.forward_backprop_theta(
                    batch_X, batch_indices)
                sum_cost += cost_per_sample
                # model.increaseBatchID()

                if t % 10 == 0:
                    progress.show(t, n_batch_loop, {})

            print("cost: {}".format(sum_cost / n_batch_loop))
            model.save(epoch, args.model_dir)
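This example drives a Progress object through start_epoch(epoch, max_epoch) and show(t, n_batch_loop, {}). A minimal sketch compatible with those two calls; the output format, and the guess that the third argument is a dict of extra named values, are assumptions:

import sys


class Progress:
    """Minimal sketch matching the calls above: start_epoch() announces the epoch,
    show() prints in-epoch batch progress plus any extra named values."""

    def start_epoch(self, epoch, max_epoch):
        self.epoch = epoch
        print('Epoch %d/%d' % (epoch + 1, max_epoch))

    def show(self, t, total, values):
        extra = ' '.join('%s=%s' % (k, v) for k, v in values.items())
        sys.stdout.write('\r  batch %d/%d %s' % (t + 1, total, extra))
        sys.stdout.flush()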
Example #11
def extract_batch(filelist, savedir, descobj, verbose=False):
    """ Extract features/descriptors from a batch of images. Single-threaded. 

    This function calls an image descripor object on a batch of images in order
    to extract the images descripor. If a feature/descriptor file already exists
    for the image, it is skipped. This is a single-threaded pipeline.

    Arguments:
        filelist: A list of files of image names including their paths of images
                  to read and extract descriptors from
        savedir:  A directory in which to save all of the image features. They
                  are pickled objects (protocol 2) with the same name as the
                  image file. The object that is pickled is the return from
                  descobj.extract().
        decobj:   An image descriptor object which does the actual extraction
                  work. the method called is descobj.extract(image). See
                  descriptors.Descriptor for an abstract base class. 
        verbose:  bool, display progress?

    Returns:
        True if there we any errors extracting image features. False otherwise. 
        If there is a problem extracting any image descriptors, a file
        "errors.log" is created in the savedir directory with a list of file
        names, error number and messages.

    """

    # Try to make the save path
    if not os.path.exists(savedir):
        os.mkdir(savedir)

    errflag = False

    # Set up progress updates
    nfiles = len(filelist)
    progbar = Progress(nfiles, title='Extracting descriptors', verbose=verbose)

    # Iterate through all of the images in filelist and extract features
    for i, impath in enumerate(filelist):
        errflag |= extract(impath, savedir, descobj)
        progbar.update(i)

    progbar.finished()

    if errflag:
        print('Done with errors. See the "errors.log" file in ' + savedir)

    return errflag
Example #12
    def __init__(self,
                 json_txt,
                 show_progress=False,
                 replace_hashtags=True,
                 replace_user_refs=False,
                 replace_links=True):
        import json
        self.show_progress = show_progress
        self.replace_links = replace_links
        self.replace_hashtags = replace_hashtags
        self.replace_user_refs = replace_user_refs
        # parse text into JSON objects
        if self.show_progress:
            from utils.progress import Progress
            with Progress("Parsing text into JSON Object",
                          len(json_txt)) as up:
                self.tweet_JSON_objs = [
                    up(json.loads(line)) for line in json_txt
                ]
        else:
            print("Parsing text into JSON Object.")
            self.tweet_JSON_objs = [json.loads(line) for line in json_txt]
        # extract text from tweets
        self.process_tweets()
Example #13
def extract_smp(filelist, savedir, descobj, njobs=None, verbose=False):
    """ Extract features/descriptors from a batch of images. Multi-threaded. 

    This function calls an image descripor object on a batch of images in order
    to extract the images descripor. If a feature/descriptor file already exists
    for the image, it is skipped. This is a multi-threaded (SMP) pipeline
    suitable for running on a single computer.

    Arguments:
        filelist: A list of files of image names including their paths of images
                  to read and extract descriptors from
        savedir:  A directory in which to save all of the image features. They
                  are pickled objects (protocol 2) with the same name as the
                  image file. The object that is pickled is the return from
                  descobj.extract().
        decobj:   An image descriptor object which does the actual extraction
                  work. the method called is descobj.extract(image). See
                  descriptors.Descriptor for an abstract base class. 
        njobs:    int, Number of threads to use. If None, then the number of
                  threads is chosen to be the same as the number of cores.
        verbose:  bool, display progress?

    Returns:
        True if there we any errors extracting image features. False otherwise. 
        If there is a problem extracting any image descriptors, a file
        "errors.log" is created in the savedir directory with a list of file
        names, error number and messages.

    """

    # Try to make the save path
    if not os.path.exists(savedir):
        os.mkdir(savedir)

    # Set up parallel job
    pool = mp.Pool(processes=njobs)

    # Iterate through all of the images in filelist and extract features
    result = pool.map_async(
        __extract_star,
        itertools.izip(filelist, itertools.repeat(savedir),
                       itertools.repeat(descobj)))

    # Set up progress updates
    nfiles = len(filelist)
    progbar = Progress(nfiles, title='Extracting descriptors', verbose=verbose)

    # Get the status
    while not result.ready() and verbose:
        approx_rem = nfiles - result._number_left * result._chunksize
        progbar.update(max(0, approx_rem))
        time.sleep(5)

    progbar.finished()

    # Get notification of errors
    errflag = any(result.get())
    pool.close()
    pool.join()

    if errflag:
        print('Done, with errors. See the "errors.log" file in ' + savedir)

    return errflag
Example #14
def main(args):
    ###############################
    # TRAIN PREP
    ###############################
    print("Loading data")
    train_loader, valid_loader, data_var, input_size = \
        data.get_data(args.data_folder, args.batch_size)

    args.input_size = input_size
    args.downsample = args.input_size[-1] // args.enc_height
    args.data_variance = data_var
    print(f"Training set size {len(train_loader.dataset)}")
    print(f"Validation set size {len(valid_loader.dataset)}")

    print("Loading model")
    if args.model == 'diffvqvae':
        model = DiffVQVAE(args).to(device)
    elif args.model == 'vqvae':
        model = VQVAE(args).to(device)
    else:
        raise ValueError(f'Unknown model type: {args.model}')
    print(
        f'The model has {utils.count_parameters(model):,} trainable parameters'
    )

    optimizer = optim.Adam(model.parameters(),
                           lr=args.learning_rate,
                           amsgrad=False)

    print(f"Start training for {args.num_epochs} epochs")
    num_batches = math.ceil(
        len(train_loader.dataset) / train_loader.batch_size)
    pbar = Progress(num_batches, bar_length=10, custom_increment=True)

    # Needed for bpd
    args.KL = args.enc_height * args.enc_height * args.num_codebooks * \
                                                    np.log(args.num_embeddings)
    args.num_pixels = np.prod(args.input_size)

    ###############################
    # MAIN TRAIN LOOP
    ###############################
    best_valid_loss = float('inf')
    train_bpd = []
    train_recon_error = []
    train_perplexity = []
    args.global_it = 0
    for epoch in range(args.num_epochs):
        pbar.epoch_start()
        train_epoch(args, vq_vae_loss, pbar, train_loader, model, optimizer,
                    train_bpd, train_recon_error, train_perplexity)
        # loss, _ = test(valid_loader, model, args)
        # pbar.print_eval(loss)
        valid_loss = evaluate(args, vq_vae_loss, pbar, valid_loader, model)
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            best_valid_epoch = epoch
            torch.save(model.state_dict(), args.save_path)
        pbar.print_end_epoch()

    print("Plotting training results")
    utils.plot_results(train_recon_error, train_perplexity,
                       "results/train.png")

    print("Evaluate and plot validation set")
    generate_samples(model, valid_loader)
Example #15
class Meta(Page):
    def __init__(self, tag: str, from_top: bool = True):
        self.base_url = (f'https://archiveofourown.org/tags/'
                         f'{quote(tag).replace(".", "*d*")}/works?page=')
        tag_path = paths.tag_path(tag)
        self.progress = Progress(tag_path)
        self.last = self.progress.read()[0]

        self.path = paths.meta_path(tag)
        log_path = paths.meta_log_path(tag)
        super().__init__(tag + '_meta', log_path)

        self.from_top = self._start_from_top(from_top)

    def scrape(self) -> None:

        if self.from_top or not self.path.is_file():
            mode = 'w'
        else:
            mode = 'a'

        with open(self.path, mode) as f_out:
            pages = self._pages()
            for page, progress_num in pages:
                page_elements = self._page_elements(page)
                for element in page_elements:
                    f_out.write(json.dumps(element) + '\n')
                self.progress.write(progress_num)
        self.logger.info(f'Completed scraping "{self.page_kind}"')
        return

    def _pages(self) -> Generator[Tuple[BeautifulSoup, str], None, None]:

        try:
            page_num = int(self.last)
        except ValueError:
            self.logger.error(f'Last scraped value ({self.last})'
                              f' in .meta is not a number')
            raise ValueError

        if page_num == -1 or self.from_top:
            page_num = 1
        else:
            page_num += 1
        errors = 0

        self.logger.info(f"Scraping: {self.base_url}")
        try:
            max_pages = self._total_pages()
        except ConnectionError:
            self.logger.error(f'Base URL: {self.base_url} Not found.')
            raise ConnectionError(f"Error connecting to: {self.base_url}\n"
                                  f"Could your fandom name be incorrect?")
        except Exception as e:
            self.logger.error(f'Base URL: {self.base_url} Not found.')
            raise Exception(f"Other error: {e}")

        while errors < cfg.MAX_ERRORS and page_num <= max_pages:
            try:
                url = self.base_url + str(page_num)
                soup = self._get_soup(url)
            except HTTPError:
                # just move onto next page
                self.logger.error(f'PAGE: {url} 404 Error. Skipping this work.'
                                  f' {cfg.MAX_ERRORS-errors} attempts left.')
                errors += 1
                time.sleep(cfg.DELAY)
                page_num += 1
                url = self.base_url + str(page_num)
            except ConnectTimeout:
                # Try again
                errors += 1
                self.logger.error(f'PAGE: {url} Not found. '
                                  f'{cfg.MAX_ERRORS-errors} attempts left.')
                time.sleep(cfg.DELAY * errors)  # back off: wait longer after each error
            else:
                self.logger.info(f'Scraping PAGE: {str(page_num)}')
                time.sleep(cfg.DELAY)
                yield (soup, str(page_num))
                page_num += 1
                url = self.base_url + str(page_num)

    def _page_elements(self,
                       page: BeautifulSoup) -> Generator[MetaJson, None, None]:
        """ Find each HTML element and parse out the details into a row. """

        scrape_time = datetime.datetime.now().strftime("%d/%b/%Y %H:%M")

        works = page.find_all(class_="work blurb group")
        for work in works:
            # build a fresh row per work so yielded dicts do not alias each other
            meta: MetaJson = {}  # type: ignore
            meta.update(self._get_header(work))
            meta.update(self._get_required_tags(work))
            meta.update(self._get_tags(work))
            meta.update(self._get_stats(work))
            meta['fandom'] = self._get_fandoms(work)
            meta['summary'] = self._get_summary(work)
            meta['series_part'], meta['series_name'] = self._get_series(work)
            meta['updated'] = self._get_updated(work)
            meta['scrape_date'] = scrape_time

            yield meta

    def _total_pages(self) -> int:
        ''' Make up to cfg.MAX_ERRORS attempts at loading the base URL to get the page count. '''

        for attempts in range(cfg.MAX_ERRORS):
            try:
                soup = self._get_soup(self.base_url)
                next_element = soup.find('li', class_='next')
                max_pages = int(next_element.find_previous('li').text)
                self.logger.info(f'Attempting to scrape up to '
                                 f'{str(max_pages)} pages.')
                return max_pages
            except AttributeError:
                self.logger.info('Attempting to scrape 1 page.')
                return 1
            except ConnectTimeout:
                self.logger.error(f'Base URL: {self.base_url} Not found. '
                                  f'{cfg.MAX_ERRORS-attempts} attempts left.')
        raise ConnectTimeout

    def _get_tags(self, meta: BeautifulSoup) -> Any:
        """Find relationships, characters, and freeforms tags"""
        tag_dict = {}  # type: Dict[str, Optional[List[str]]]
        tags = ['relationships', 'characters', 'freeforms']
        for tag in tags:
            tag_dict[tag] = self._get_tag_info(tag, meta)
        return tag_dict

    def _get_tag_info(self, category: str, meta: BeautifulSoup) -> \
            Optional[List[str]]:
        """ Find relationships, characters, and freeforms tags."""
        try:
            tag_list = meta.find_all("li", class_=category)
        except AttributeError:
            return None
        return [result.text for result in tag_list]

    def _get_required_tags(self, work: BeautifulSoup) -> Any:
        """Finds required tags."""
        req_dict = {}
        try:
            req_tags = work.find(class_='required-tags').find_all('a')
            req_dict['rating'] = req_tags[0].text
            req_dict['warnings'] = req_tags[1].text.split(',')
            req_dict['category'] = req_tags[2].text.split(',')
            req_dict['status'] = req_tags[3].text
        except Exception:
            req_dict['rating'] = None
            req_dict['warnings'] = []
            req_dict['category'] = []
            req_dict['status'] = None
        return req_dict

    def _get_stats(self, work: BeautifulSoup) -> Any:
        """
        Find stats (language, chapters, collections, words, comments, kudos,
        bookmarks, hits).
        """
        str_categories = ['language', 'chapters']
        num_categories = [
            'collections', 'words', 'comments', 'kudos', 'bookmarks', 'hits'
        ]
        stats = {}
        for s_cat in str_categories:
            try:
                stats[s_cat] = work.find("dd", class_=s_cat).text
            except AttributeError:
                stats[s_cat] = None
        for n_cat in num_categories:
            try:
                str_num = work.find("dd", class_=n_cat).text
                stats[n_cat] = int(str_num.replace(',', ''))
            except (AttributeError, ValueError):
                stats[n_cat] = 0
        return stats

    def _get_header(self, work: BeautifulSoup) -> Any:
        '''Finds header information
           (work_id, title, author, gifted to user).'''
        header_dict = {}

        result = work.find('h4', class_='heading').find_all('a')
        header_dict['work_id'] = result[0].get('href').split('/')[-1]
        header_dict['title'] = result[0].text

        auth_list = []
        header_text = work.find('h4', class_='heading').text
        if "Anonymous" in header_text:
            header_dict['author'] = ["Anonymous"]
        else:
            authors = work.find_all('a', rel='author')
            for author in authors:
                auth_list.append(author.text)
            header_dict['author'] = auth_list

        gift_list = []
        for link in result:
            href = link.get('href')
            if 'gifts' in href:
                gift_list.append(link.text)

        header_dict['gifted'] = gift_list

        return header_dict

    def _get_fandoms(self, work: BeautifulSoup) -> List[str]:
        """ Find the list of fandoms."""
        try:
            tag_list = work.find('h5', class_='fandoms heading').find_all('a')
            fan_list = [x.text for x in tag_list]
            return fan_list
        except AttributeError:
            return []

    def _get_summary(self, work: BeautifulSoup) -> Optional[str]:
        """ Find summary description and return as list of strings. """

        try:
            summary_string = work.find('blockquote',
                                       class_='userstuff summary')
            summary = summary_string.text.strip().replace('\n', ' ')
        except AttributeError:
            summary = None
        return summary

    def _get_updated(self, work: BeautifulSoup) -> Optional[str]:
        """ Find update date. Return as list of strings. """

        try:
            date = work.find('p', class_='datetime').text
        except AttributeError:
            date = None
        return date

    def _get_series(self, work: BeautifulSoup) \
            -> Tuple[Optional[str], Optional[str]]:
        """ Find series info and return as list. """

        try:
            series = work.find('ul', class_='series')
            part = series.find('strong').text
            s_name = series.find('a').text
        except AttributeError:
            part, s_name = None, None
        return part, s_name

    def _start_from_top(self, from_top: bool) -> bool:

        if from_top is True:
            self.logger.info("Scraping from the top.")
            return True
        elif self.last == self.progress.unscraped_flag:
            self.logger.info(
                f"Last scraped unknown: {self.progress.unscraped_flag}. "
                f"Scraping from the top.")
            return True
        else:
            self.logger.info(f"Picking up from {self.last} ")
            return False
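Unlike the other examples, the Progress object here is a small file-backed checkpoint: read() returns the last recorded values, write() replaces them, and unscraped_flag marks a tag that has never been scraped. A minimal sketch under those assumptions; the one-line file format and the '-1' flag value are guesses, not the project's paths/progress code:

class Progress:
    """Minimal sketch of a file-backed scrape checkpoint compatible with the
    read()/write()/unscraped_flag usage above."""

    unscraped_flag = '-1'

    def __init__(self, path):
        self.path = path

    def read(self):
        # return the stored values, or the unscraped flag if nothing is recorded yet
        try:
            with open(self.path) as f:
                values = f.read().split()
        except FileNotFoundError:
            values = []
        return values or [self.unscraped_flag]

    def write(self, value):
        # overwrite the checkpoint with the latest scraped page number
        with open(self.path, 'w') as f:
            f.write(str(value))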
Example #16
def extract_smp(filelist, savedir, descobj, njobs=None, verbose=False):
    """ Extract features/descriptors from a batch of images. Multi-threaded. 

    This function calls an image descripor object on a batch of images in order
    to extract the images descripor. If a feature/descriptor file already exists
    for the image, it is skipped. This is a multi-threaded (SMP) pipeline
    suitable for running on a single computer.

    Arguments:
        filelist: A list of files of image names including their paths of images
                  to read and extract descriptors from
        savedir:  A directory in which to save all of the image features. They
                  are pickled objects (protocol 2) with the same name as the
                  image file. The object that is pickled is the return from
                  descobj.extract().
        decobj:   An image descriptor object which does the actual extraction
                  work. the method called is descobj.extract(image). See
                  descriptors.Descriptor for an abstract base class. 
        njobs:    int, Number of threads to use. If None, then the number of
                  threads is chosen to be the same as the number of cores.
        verbose:  bool, display progress?

    Returns:
        True if there we any errors extracting image features. False otherwise. 
        If there is a problem extracting any image descriptors, a file
        "errors.log" is created in the savedir directory with a list of file
        names, error number and messages.

    """

    # Try to make the save path
    if not os.path.exists(savedir):
        os.mkdir(savedir)

    # Set up parallel job
    pool = mp.Pool(processes=njobs)

    # Iterate through all of the images in filelist and extract features
    result = pool.map_async(
        __extract_star, itertools.izip(filelist, itertools.repeat(savedir), itertools.repeat(descobj))
    )

    # Set up progress updates
    nfiles = len(filelist)
    progbar = Progress(nfiles, title="Extracting descriptors", verbose=verbose)

    # Get the status
    while not result.ready() and verbose:
        approx_rem = nfiles - result._number_left * result._chunksize
        progbar.update(max(0, approx_rem))
        time.sleep(5)

    progbar.finished()

    # Get notification of errors
    errflag = any(result.get())
    pool.close()
    pool.join()

    if errflag:
        print('Done, with errors. See the "errors.log" file in ' + savedir)

    return errflag
Example #17
    def embed_tweets_hashtags(self, tweets, hashtags):
        with Progress("Calculating hashtag and tweet embeddings",
                      len(hashtags) + len(tweets)) as up:
            [up(self.tweet_embedding(t)) for t in tweets]
            [up(self.hashtag_embedding(h)) for h in hashtags]