Example #1
 def _load_archive(self, path, extract_path):
     from zipfile import ZipFile, ZIP_DEFLATED
     try:
         zfile = ZipFile(path, mode='r', compression=ZIP_DEFLATED)
         allfile = zfile.namelist()
         # validate extract_path
         if not os.path.isdir(extract_path):
             raise ValueError('Extract path must be a folder, but '
                              'path={} is not a directory'.format(extract_path))
         extract_path = os.path.join(
             extract_path,
             os.path.basename(path).replace('.zip', ''))
         # found the extracted dir, use it
         if os.path.isdir(extract_path) and \
            set(os.listdir(extract_path)) == set(allfile):
             self._set_path(extract_path)
             return
         # decompress everything
         if not os.path.exists(extract_path):
             os.mkdir(extract_path)
         maxlen = max([len(i) for i in allfile])
         progbar = Progbar(len(allfile))
         for i, f in enumerate(allfile):
             zfile.extract(f, path=extract_path)
             progbar.title = ('Unarchiving: %-' + str(maxlen) + 's') % f
             progbar.update(i + 1)
         # ====== finally set path ====== #
         self._set_path(extract_path)
     except IOError as e:
         raise IOError('Error loading archived dataset, path:{}, error:{}'
                       '.'.format(path, e))
     return None
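The unarchiving loop above shows the basic Progbar pattern used throughout these examples: construct the bar with a target count, set its title per item, and advance it with update(i + 1). A minimal standalone sketch of that pattern; the import path and the list of work items are assumptions, and only the pieces of the API used above (Progbar(target), .title, .update) are relied on:

    # Minimal sketch of the Progbar loop pattern from Example #1.
    from odin.utils import Progbar   # import path is an assumption

    names = ['a.txt', 'b.txt', 'c.txt']   # hypothetical work items
    progbar = Progbar(len(names))
    for i, name in enumerate(names):
        # ... extract / process `name` here ...
        progbar.title = 'Unarchiving: %s' % name
        progbar.update(i + 1)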
Example #2
 def fit(self, texts, vocabulary=None):
     """q
     Parameters
     ----------
     texts: iterator of unicode
         iterator, generator or list of unicode string.
     """
     texts = self._validate_texts(texts)
     word_counts = self._word_counts
     word_docs = self._word_docs
     # ====== pick engine ====== #
     if self.__engine == 'spacy':
         processor = self._preprocess_docs_spacy
     elif self.__engine == 'odin':
         processor = self._preprocess_docs_odin
     # ====== start processing ====== #
     prog = Progbar(target=1208)
     start_time = timeit.default_timer()
     for nb_docs, doc in processor(texts, vocabulary, keep_order=False):
         total_docs_tokens = 0
         seen_words = {}
         # update words->count
         for token in doc:
             total_docs_tokens += 1
             word_counts[token] += 1
             # update words->doc
             if token not in seen_words:
                 seen_words[token] = 1
                 word_docs[token] += 1
         # save longest docs
         if total_docs_tokens > self.__longest_document[-1]:
             self.__longest_document = [doc, total_docs_tokens]
         # print progress
         if self.print_progress:
             prog.title = '[Training]#Doc:%d #Tok:%d' % (nb_docs,
                                                         len(word_counts))
             prog.add(1)
             if prog.seen_so_far >= 0.8 * prog.target:
                 prog.target = 1.2 * prog.target
     # ====== print summary of the process ====== #
     if self.print_progress:
         prog.target = nb_docs
         prog.update(nb_docs)
     processing_time = timeit.default_timer() - start_time
     print('Processed %d-docs, %d-tokens in %f second.' %
           (nb_docs, len(word_counts), processing_time))
     self.nb_docs += nb_docs
     # ====== sorting ====== #
     self._refresh_dictionary()
     return self
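Because fit() may consume a generator whose length is unknown, the progress bar starts from a rough target, grows it by 20% whenever the count reaches 80% of the current target, and snaps to the true total at the end. A minimal sketch of that growing-target trick, assuming only the Progbar attributes used above (.target, .seen_so_far, .add, .update); the import path and the document stream are made up:

    from odin.utils import Progbar                # import path is an assumption

    def stream_of_documents():                    # hypothetical stand-in for a generator
        for doc in (['a', 'b'], ['c'], ['d', 'e', 'f']):
            yield doc

    prog = Progbar(target=2)                      # rough initial estimate
    count = 0
    for doc in stream_of_documents():
        count += 1
        prog.add(1)
        # grow the target before the bar would overflow
        if prog.seen_so_far >= 0.8 * prog.target:
            prog.target = 1.2 * prog.target
    prog.target = count                           # snap to the true total
    prog.update(count)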
Example #3
    def archive(self):
        from zipfile import ZipFile, ZIP_DEFLATED
        path = self.archive_path
        zfile = ZipFile(path, mode='w', compression=ZIP_DEFLATED)

        files = set([_[-1] for _ in self._data_map.values()])

        progbar = Progbar(len(files), title='Archiving:')
        maxlen = max([len(os.path.basename(i)) for i in files])
        for i, f in enumerate(files):
            zfile.write(f, os.path.basename(f))
            progbar.title = ('Archiving: %-' + str(maxlen) +
                             's') % os.path.basename(f)
            progbar.update(i + 1)
        zfile.close()
        return path
Example #4
    def predict_proba(self, *args):
        self._auto_create_inputs(args)
        self._create_function()

        n = 0
        nb_samples = args[0].shape[0]
        batch_size = self._batch_size
        prediction = []
        prog = Progbar(target=nb_samples, title='Predicting')
        while n < nb_samples:
            end = min(n + batch_size, nb_samples)
            x = [i[n:end] for i in args]
            x = self._functions['pred'](*x)
            # rescale each row to [0, 1], then normalize so each row sums to 1
            _min = np.min(x, axis=-1)[:, None]
            _max = np.max(x, axis=-1)[:, None]
            x = (x - _min) / (_max - _min)
            x = x / x.sum(-1)[:, None]
            prediction.append(x)
            n = end
            prog.update(n)

        return np.concatenate(prediction, axis=0)
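The only non-obvious step above is the row-wise normalization: each batch of raw outputs is min-max rescaled to [0, 1] and then divided by the row sums so every row becomes a probability distribution. A small numpy-only sketch of exactly that arithmetic (the input matrix is made up):

    import numpy as np

    # Row-wise normalization used in predict_proba above.
    scores = np.array([[2.0, 4.0, 6.0],
                       [1.0, 1.0, 3.0]])          # hypothetical raw outputs
    _min = np.min(scores, axis=-1)[:, None]
    _max = np.max(scores, axis=-1)[:, None]
    scaled = (scores - _min) / (_max - _min)      # note: divides by zero if a row is constant
    proba = scaled / scaled.sum(-1)[:, None]
    print(proba)                                  # each row sums to 1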
Example #5
 def transform(self,
               texts,
               mode='seq',
               dtype='int32',
               padding='pre',
               truncating='pre',
               value=0.,
               end_document=None,
               maxlen=None,
               token_not_found='ignore'):
     """
     Parameters
     ----------
     mode: 'binary', 'tfidf', 'count', 'freq', 'seq'
         'binary', abc
         'tfidf', abc
         'count', abc
         'freq', abc
         'seq', abc
     token_not_found: 'ignore', 'raise', a token string, an integer
         pass
     """
     # ====== check arguments ====== #
     texts = self._validate_texts(texts)
     # ====== check mode ====== #
     mode = str(mode)
     if mode not in ('seq', 'binary', 'count', 'freq', 'tfidf'):
         raise ValueError('The "mode" argument must be: "seq", "binary", '
                          '"count", "freq", or "tfidf".')
     # ====== check token_not_found ====== #
     if not isinstance(token_not_found, Number) and \
             not is_string(token_not_found) and \
             token_not_found not in ('ignore', 'raise'):
         raise ValueError('token_not_found can be: "ignore", "raise", '
                          'an integer token index, or a string '
                          'representing a token.')
     if isinstance(token_not_found, Number):
         token_not_found = int(token_not_found)
     elif token_not_found not in ('ignore', 'raise'):
         # a token string: substitute unknown tokens with this token's index
         token_not_found = int(self.dictionary[token_not_found])
     # ====== pick engine ====== #
     if self.__engine == 'spacy':
         processor = self._preprocess_docs_spacy
     elif self.__engine == 'odin':
         processor = self._preprocess_docs_odin
     # ====== Initialize variables ====== #
     dictionary = self.dictionary
     results = []
     # ====== preprocess arguments ====== #
     if isinstance(end_document, str):
         end_document = dictionary.index(end_document)
     elif isinstance(end_document, Number):
         end_document = int(end_document)
     # ====== processing ====== #
     if hasattr(texts, '__len__'):
         target_len = len(texts)
         auto_adjust_len = False
     else:
         # length unknown (e.g. a generator): start from a rough target and
         # let the progress bar grow it as documents are processed
         target_len = 1208
         auto_adjust_len = True
     prog = Progbar(target=target_len)
     for nb_docs, doc in processor(texts, vocabulary=None, keep_order=True):
         # found the word in dictionary
         vec = []
         for x in doc:
             idx = dictionary.get(x, -1)
             if idx >= 0:
                 vec.append(idx)
             # token not found in the dictionary
             elif token_not_found == 'ignore':
                 continue
             elif token_not_found == 'raise':
                 raise RuntimeError(
                     'Cannot find token: "%s" in dictionary' % x)
             elif isinstance(token_not_found, int):
                 vec.append(token_not_found)
         # append ending document token
         if end_document is not None:
             vec.append(end_document)
         # add the final results
         results.append(vec)
         # print progress
         if self.print_progress:
             prog.title = "[Transforming] %d docs" % nb_docs
             prog.add(1)
             if auto_adjust_len and prog.seen_so_far >= 0.8 * prog.target:
                 prog.target = 1.2 * prog.target
     # end the process
     if self.print_progress and auto_adjust_len:
         prog.target = nb_docs
         prog.update(nb_docs)
     # ====== pad the sequence ====== #
     # just transform into sequence of tokens
     if mode == 'seq':
         maxlen = self.longest_document_length if maxlen is None \
             else int(maxlen)
         results = pad_sequences(results,
                                 maxlen=maxlen,
                                 dtype=dtype,
                                 padding=padding,
                                 truncating=truncating,
                                 value=value)
     # transform into one-hot matrix
     else:
         X = np.zeros(shape=(len(results), self.nb_words))
         for i, seq in enumerate(results):
             if mode == 'binary':
                 X[i, seq] = 1
             elif mode == 'freq':
                 length = len(seq)
                 count = freqcount(seq)
                 for tok, n in count.items():
                     X[i, tok] = n / float(length)
             elif mode == 'count':
                 count = freqcount(seq)
                 for tok, n in count.items():
                     X[i, tok] = n
             elif mode == 'tfidf':
                 count = freqcount(seq)
                 for tok, n in count.items():
                     tf = 1 + np.log(n)
                     docs_freq = self._word_dictionary_info.get(
                         tok, (0, 0))[-1]
                     idf = np.log(1 + self.nb_docs / (1 + docs_freq))
                     X[i, tok] = tf * idf
         results = X
     return results
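For mode='tfidf' the weight written into X[i, tok] follows the smoothed formulation visible in the code: tf = 1 + log(count in the document) and idf = log(1 + nb_docs / (1 + docs_freq)), where docs_freq comes from the fitted dictionary info (presumably the number of training documents containing the token). A tiny worked sketch with made-up numbers:

    import numpy as np

    nb_docs = 1000        # documents seen during fit() (hypothetical)
    docs_freq = 9         # training documents containing the token (hypothetical)
    count_in_doc = 4      # occurrences of the token in the current document

    tf = 1 + np.log(count_in_doc)                  # ~2.386
    idf = np.log(1 + nb_docs / (1 + docs_freq))    # ~4.615
    weight = tf * idf                              # value stored in X[i, tok]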
Example #6
class ProgressMonitor(Callback):

    '''
    Parameters
    ----------
    name : str
        name of the task to monitor; the callback is only active when this
        matches the running task's event name.
    format: str
        format string for the results, using the new python style
        (e.g. {0}, {1}, {:.4f}, ... and not %s, %d, ...)
    tracking: list
        list of [(index, postprocessing_func)] or a dictionary; the value at
        the given index of the batch results is collected during batch_end,
        then post-processed and printed at epoch_end.

    Example
    -------
    >>> t = training.Task(dataset=ds, batch_size=512)
    >>> t.set_callback(training.ProgressMonitor(name='Test',
    ...                                         format='Result: {:.4f}',
    ...                                         tracking={1: lambda x: sum(x)}))
    >>> t.run()
    # Result: 52751.29 98/98 [=======================================] - 0s

    Note
    ----
    This callback requires `samples_size` to be specified in the **kwargs of record.
    '''

    def __init__(self, name, format='', tracking=[]):
        super(ProgressMonitor, self).__init__()
        self.name = name
        self._history = []
        self._prog = Progbar(100, title='')
        # ====== format ====== #
        self._format_results = 0
        for i in _PLACEHOLDER:
            self._format_results += len(i.findall(format))
        self._format = format
        # ====== one-time tracking at epoch_end ====== #
        if isinstance(tracking, dict):
            tracking = tracking.items()
        self.tracking = [(int(i), j) for i, j in tracking if callable(j)]
        self._tracking_history = defaultdict(list)

    @property
    def _saveable_variables(self):
        return {'_format': self._format,
                '_format_results': self._format_results,
                '_history': [],
                'tracking': self.tracking,
                '_tracking_history': defaultdict(list),
                '_prog': Progbar(100, title=''),
                'name': self.name}

    def epoch_start(self):
        # reset the timer of the ProgressBar
        if self.name == self.event_name:
            self._prog.start = time.time()

    def batch_end(self):
        # do nothing if this is not the monitored task or samples_size is missing
        if self.name != self.event_name or 'samples_size' not in self:
            return
        samples_size = self['samples_size']
        # ====== title ====== #
        r = self.results if isinstance(self.results, (tuple, list)) else (self.results,)
        r = [i.tolist() if isinstance(i, np.ndarray) and i.ndim == 0 else i
             for i in r]
        # ====== tracking ====== #
        for i, j in self.tracking:
            self._tracking_history[i].append(r[i])
        r = r[:self._format_results]
        self._history.append(r)
        title = (self._format.format(*r)
                 if self._format_results else self._format)
        # title
        self._prog.title = 'Name:%-8s,Epoch:%2d,' % (
            self.name[:8], self.nb_epoch) + title
        # progress
        n = round(((self.nb_samples % samples_size) / samples_size) * 100)
        self._prog.update(min(int(n), 99))

    def epoch_end(self):
        # do nothing if this is not the monitored task
        if self.name != self.event_name:
            return
        # risky move: get the mean of all results
        if self._format_results:
            r = np.mean(self._history, axis=0).tolist()
            title = self._format.format(*r)
        else:
            title = self._format
        # reset
        self._history = []
        # title
        self._prog.title = 'Name:%-8s,Epoch:%2d,' % (
            self.event_name, self.nb_epoch) + title
        # always 100% at the end of epoch
        self._prog.target = 100
        self._prog.update(100)
        # tracking
        for i, f in self.tracking:
            r = self._tracking_history[i]
            r = f(r)
            print('Tracking name-"%s" at location-%d:' % (self.name, i))
            print(r)
        self._tracking_history = defaultdict(list)
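The tracking mechanism in ProgressMonitor is independent of the progress bar itself: for each (index, postprocessing_func) pair, the value at that index of every batch result is accumulated during batch_end, and the post-processed aggregate is printed at epoch_end. A self-contained sketch of that bookkeeping with made-up batch results:

    from collections import defaultdict

    tracking = [(1, lambda xs: sum(xs))]              # track index 1, reduce with sum()
    tracking_history = defaultdict(list)

    batch_results = [(0.9, 12), (0.8, 7), (0.7, 5)]   # hypothetical per-batch returns
    for r in batch_results:                           # what batch_end does
        for idx, func in tracking:
            tracking_history[idx].append(r[idx])

    for idx, func in tracking:                        # what epoch_end does
        print('Tracking at location-%d:' % idx, func(tracking_history[idx]))   # prints 24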