Code Example #1
 def fit(self, texts, vocabulary=None):
     """
     Parameters
     ----------
     texts: iterator of unicode
         iterator, generator or list of unicode strings.
     """
     texts = self._validate_texts(texts)
     word_counts = self._word_counts
     word_docs = self._word_docs
     # ====== pick engine ====== #
     if self.__engine == 'spacy':
         processor = self._preprocess_docs_spacy
     elif self.__engine == 'odin':
         processor = self._preprocess_docs_odin
     # ====== start processing ====== #
     prog = Progbar(target=1208)
     start_time = timeit.default_timer()
     for nb_docs, doc in processor(texts, vocabulary, keep_order=False):
         total_docs_tokens = 0
         seen_words = {}
         # update words->count
         for token in doc:
             total_docs_tokens += 1
             word_counts[token] += 1
             # update words->doc
             if token not in seen_words:
                 seen_words[token] = 1
                 word_docs[token] += 1
         # save longest docs
         if total_docs_tokens > self.__longest_document[-1]:
             self.__longest_document = [doc, total_docs_tokens]
         # print progress
         if self.print_progress:
             prog.title = '[Training]#Doc:%d #Tok:%d' % (nb_docs,
                                                         len(word_counts))
             prog.add(1)
             if prog.seen_so_far >= 0.8 * prog.target:
                 prog.target = 1.2 * prog.target
     # ====== print summary of the process ====== #
     if self.print_progress:
         prog.target = nb_docs
         prog.update(nb_docs)
     processing_time = timeit.default_timer() - start_time
     print('Processed %d-docs, %d-tokens in %f seconds.' %
           (nb_docs, len(word_counts), processing_time))
     self.nb_docs += nb_docs
     # ====== sorting ====== #
     self._refresh_dictionary()
     return self
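
The bookkeeping that fit performs (total count per token, document frequency per token, and the longest document) can be illustrated with plain dictionaries. A minimal standalone sketch, not the project's API; count_tokens and the sample docs are made up for illustration:

from collections import defaultdict

def count_tokens(docs):
    """Mimic the word_counts / word_docs bookkeeping done by fit()."""
    word_counts = defaultdict(int)   # token -> total occurrences
    word_docs = defaultdict(int)     # token -> number of documents containing it
    longest_document = ([], 0)
    for doc in docs:                 # each doc is a list of tokens
        seen_words = set()
        for token in doc:
            word_counts[token] += 1
            if token not in seen_words:
                seen_words.add(token)
                word_docs[token] += 1
        if len(doc) > longest_document[-1]:
            longest_document = (doc, len(doc))
    return word_counts, word_docs, longest_document

# usage
docs = [[u'a', u'b', u'a'], [u'b', u'c']]
counts, doc_freq, longest = count_tokens(docs)
print(dict(counts))    # {'a': 2, 'b': 2, 'c': 1}
print(dict(doc_freq))  # {'a': 1, 'b': 2, 'c': 1}
print(longest)         # (['a', 'b', 'a'], 3)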
Code Example #2
    def save_cache(self, path, datatype='memmap', print_progress=True):
        """ Save all preprocessed data to a Dataset """
        if not isinstance(path, str) or os.path.isfile(path):
            raise ValueError('path must be string path to a folder.')
        if os.path.exists(path):
            print('Remove old dataset at path:', path)
            shutil.rmtree(path)

        ds = Dataset(path)
        # ====== start caching ====== #
        if print_progress:
            prog = Progbar(target=self.shape[0], title='Caching:')
        for X in self:
            if not isinstance(X, (tuple, list)):
                X = (X, )
            # saving preprocessed data
            for i, x in enumerate(X):
                name = 'data%d' % i
                if name in ds: ds[name].append(x)
                else: ds[(name, datatype)] = x
            # print progress
            if print_progress:
                prog.add(X[0].shape[0])
        # finalize the progress bar (only if one was created)
        if print_progress:
            prog.target = prog.seen_so_far
            prog.add(0)
        ds.flush()
        ds.close()
        # end
        return self
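
save_cache iterates over the feeder, normalizes each yield to a tuple, and appends the i-th output to a 'data%d' entry of the Dataset. A rough standalone equivalent using only numpy, with np.concatenate plus np.save standing in for odin's Dataset container (an assumption made purely for illustration):

import os
import numpy as np

def save_cache_numpy(batches, path):
    """Accumulate per-output batches and dump each one as a .npy file."""
    os.makedirs(path, exist_ok=True)
    buffers = {}                     # 'data0', 'data1', ... -> list of arrays
    for X in batches:
        if not isinstance(X, (tuple, list)):
            X = (X,)
        for i, x in enumerate(X):
            buffers.setdefault('data%d' % i, []).append(np.asarray(x))
    for name, parts in buffers.items():
        np.save(os.path.join(path, name + '.npy'), np.concatenate(parts, axis=0))

# usage: two batches, each yielding two outputs
batches = [(np.ones((2, 3)), np.zeros(2)), (np.ones((1, 3)), np.zeros(1))]
save_cache_numpy(batches, '/tmp/cache_demo')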
Code Example #3
File: text.py  Project: imito/odin
 def fit(self, texts, vocabulary=None):
   """
   Parameters
   ----------
   texts: iterator of unicode
       iterator, generator or list (e.g. [u'a', u'b', ...])
       of unicode documents.
   """
   texts = self._validate_texts(texts)
   word_counts = self._word_counts
   word_docs = self._word_docs
   # ====== pick engine ====== #
   if self.__engine == 'spacy':
     processor = self._preprocess_docs_spacy
   elif self.__engine == 'odin':
     processor = self._preprocess_docs_odin
   # ====== start processing ====== #
   prog = Progbar(target=1208, name="Fitting tokenizer",
                  print_report=True, print_summary=True)
   start_time = timeit.default_timer()
   for nb_docs, doc in processor(texts, vocabulary, keep_order=False):
     total_docs_tokens = 0
     seen_words = {}
     # update words->count
     for token in doc:
       total_docs_tokens += 1
       word_counts[token] += 1
       # update words->doc
       if token not in seen_words:
         seen_words[token] = 1
         word_docs[token] += 1
     # save longest docs
     if total_docs_tokens > self.__longest_document[-1]:
       self.__longest_document = [doc, total_docs_tokens]
     # print progress
     prog['#Doc'] = nb_docs
     prog['#Tok'] = len(word_counts)
     prog.add(1)
     if prog.seen_so_far >= 0.8 * prog.target:
       prog.target = 1.2 * prog.target
   # ====== print summary of the process ====== #
   # if self.print_progress:
   #     prog.target = nb_docs; prog.update(nb_docs)
   processing_time = timeit.default_timer() - start_time
   print('Processed %d-docs, %d-tokens in %f seconds.' %
       (nb_docs, len(word_counts), processing_time))
   self.nb_docs += nb_docs
   # ====== sorting ====== #
   self._refresh_dictionary()
   return self
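
Both fit variants rely on the same trick to show progress over a generator of unknown length: start from an arbitrary target (1208 above) and grow it by 20% whenever 80% of it has been consumed, so the bar never hits 100% prematurely. A minimal sketch with a hypothetical stand-in class instead of odin's Progbar:

class TinyProgbar(object):
    """Hypothetical stand-in for Progbar: only tracks target and progress."""
    def __init__(self, target):
        self.target = target
        self.seen_so_far = 0

    def add(self, n):
        self.seen_so_far += n
        print('%d / %d' % (self.seen_so_far, self.target))

def stream_of_unknown_length(n):
    for i in range(n):
        yield i

prog = TinyProgbar(target=10)             # arbitrary initial guess
for _ in stream_of_unknown_length(25):    # actual length unknown to the loop
    prog.add(1)
    # grow the target before the bar can reach 100%
    if prog.seen_so_far >= 0.8 * prog.target:
        prog.target = int(1.2 * prog.target)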
Code Example #4
 def transform(self,
               texts,
               mode='seq',
               dtype='int32',
               padding='pre',
               truncating='pre',
               value=0.,
               end_document=None,
               maxlen=None,
               token_not_found='ignore'):
     """
     Parameters
     ----------
     texts: iterator of unicode
         iterator, generator or list (e.g. [u'a', u'b', ...])
         of unicode documents.
     mode: 'binary', 'tfidf', 'count', 'freq', 'seq'
         'binary': mark each token index that appears in the document with 1.
         'tfidf': weight each token by tf-idf, where tf = 1 + log(count)
             and idf = log(1 + nb_docs / (1 + doc_freq)).
         'count': raw count of each token within the document.
         'freq': count of each token divided by the document length.
         'seq': sequence of token indices, padded or truncated to maxlen.
     token_not_found: 'ignore', 'raise', a token string, an integer
         how to handle out-of-dictionary tokens: 'ignore' skips them,
         'raise' raises a RuntimeError, a token string is mapped to its
         index in the dictionary, and an integer is used as the index.
     """
     # ====== check arguments ====== #
     texts = self._validate_texts(texts)
     # ====== check mode ====== #
     mode = str(mode)
     if mode not in ('seq', 'binary', 'count', 'freq', 'tfidf'):
         raise ValueError('The "mode" argument must be: "seq", "binary", '
                          '"count", "freq", or "tfidf".')
     # ====== check token_not_found ====== #
     if not is_number(token_not_found) and \
     not is_string(token_not_found) and \
     token_not_found not in ('ignore', 'raise'):
         raise ValueError('token_not_found can be: "ignore", "raise", '
                          'an integer of token index, or a string '
                          'representing a token.')
     # convert a numeric index or a token string into an integer token index
     if is_number(token_not_found):
         token_not_found = int(token_not_found)
     elif token_not_found not in ('ignore', 'raise'):
         token_not_found = int(self.dictionary[token_not_found])
     # ====== pick engine ====== #
     if self.__engine == 'spacy':
         processor = self._preprocess_docs_spacy
     elif self.__engine == 'odin':
         processor = self._preprocess_docs_odin
     # ====== Initialize variables ====== #
     dictionary = self.dictionary
     results = []
     # ====== preprocess arguments ====== #
     if isinstance(end_document, str):
         end_document = dictionary.index(end_document)
     elif is_number(end_document):
         end_document = int(end_document)
     # ====== processing ====== #
     if hasattr(texts, '__len__'):
         target_len = len(texts)
         auto_adjust_len = False
     else:
         target_len = 1234
         auto_adjust_len = True
     prog = Progbar(target=target_len,
                    name="Tokenize Transform",
                    print_report=True,
                    print_summary=True)
     for nb_docs, doc in processor(texts, vocabulary=None, keep_order=True):
         # found the word in dictionary
         vec = []
         for x in doc:
             idx = dictionary.get(x, -1)
             if idx >= 0:
                 vec.append(idx)
             # the token was not found in the dictionary
             elif token_not_found == 'ignore':
                 continue
             elif token_not_found == 'raise':
                 raise RuntimeError(
                     'Cannot find token: "%s" in dictionary' % x)
             elif isinstance(token_not_found, int):
                 vec.append(token_not_found)
         # append ending document token
         if end_document is not None:
             vec.append(end_document)
         # add the final results
         results.append(vec)
         # print progress
         if self.print_progress:
             prog['#Docs'] = nb_docs
             prog.add(1)
             if auto_adjust_len and prog.seen_so_far >= 0.8 * prog.target:
                 prog.target = 1.2 * prog.target
     # end the process
     # if self.print_progress and auto_adjust_len:
     #     prog.target = nb_docs; prog.update(nb_docs)
     # ====== pad the sequence ====== #
     # just transform into sequence of tokens
     if mode == 'seq':
         maxlen = self.longest_document_length if maxlen is None \
             else int(maxlen)
         results = pad_sequences(results,
                                 maxlen=maxlen,
                                 dtype=dtype,
                                 padding=padding,
                                 truncating=truncating,
                                 value=value)
     # transform into one-hot matrix
     else:
         X = np.zeros(shape=(len(results), self.nb_words))
         for i, seq in enumerate(results):
             if mode == 'binary':
                 X[i, seq] = 1
             elif mode == 'freq':
                 length = len(seq)
                 count = freqcount(seq)
                 for tok, n in count.items():
                     X[i, tok] = n / float(length)
             elif mode == 'count':
                 count = freqcount(seq)
                 for tok, n in count.items():
                     X[i, tok] = n
             elif mode == 'tfidf':
                 count = freqcount(seq)
                 for tok, n in count.items():
                     tf = 1 + np.log(n)
                     docs_freq = self._word_dictionary_info.get(
                         tok, (0, 0))[-1]
                     idf = np.log(1 + self.nb_docs / (1 + docs_freq))
                     X[i, tok] = tf * idf
         results = X
     return results
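
The 'tfidf' branch above applies a sub-linear term frequency, tf = 1 + log(n), and a smoothed inverse document frequency, idf = log(1 + nb_docs / (1 + doc_freq)). A self-contained numpy sketch of that weighting for a single document; the vocabulary, counts, and document frequencies are made up for illustration:

import numpy as np

nb_docs = 100                                 # total documents seen during fit
doc_freq = {'cat': 20, 'sat': 5, 'mat': 1}    # documents containing each token
counts = {'cat': 3, 'sat': 1, 'mat': 2}       # token counts within one document

weights = {}
for tok, n in counts.items():
    tf = 1 + np.log(n)
    idf = np.log(1 + nb_docs / (1 + doc_freq[tok]))
    weights[tok] = tf * idf
print(weights)   # 'mat' gets the largest weight: rare in the corpus, repeated in the doc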
Code Example #5
File: text.py  Project: imito/odin
 def transform(self, texts, mode='seq', dtype='int32',
               padding='pre', truncating='pre', value=0.,
               end_document=None, maxlen=None,
              token_not_found='ignore'):
   """
   Parameters
   ----------
   texts: iterator of unicode
       iterator, generator or list (e.g. [u'a', u'b', ...])
       of unicode documents.
   mode: 'binary', 'tfidf', 'count', 'freq', 'seq'
       'binary': mark each token index that appears in the document with 1.
       'tfidf': weight each token by tf-idf, where tf = 1 + log(count)
           and idf = log(1 + nb_docs / (1 + doc_freq)).
       'count': raw count of each token within the document.
       'freq': count of each token divided by the document length.
       'seq': sequence of token indices, padded or truncated to maxlen.
   token_not_found: 'ignore', 'raise', a token string, an integer
       how to handle out-of-dictionary tokens: 'ignore' skips them,
       'raise' raises a RuntimeError, a token string is mapped to its
       index in the dictionary, and an integer is used as the index.
   """
   # ====== check arguments ====== #
   texts = self._validate_texts(texts)
   # ====== check mode ====== #
   mode = str(mode)
   if mode not in ('seq', 'binary', 'count', 'freq', 'tfidf'):
     raise ValueError('The "mode" argument must be: "seq", "binary", '
                      '"count", "freq", or "tfidf".')
   # ====== check token_not_found ====== #
   if not is_number(token_not_found) and \
   not is_string(token_not_found) and \
   token_not_found not in ('ignore', 'raise'):
     raise ValueError('token_not_found can be: "ignore", "raise", '
                      'an integer of token index, or a string '
                      'representing a token.')
   # convert a numeric index or a token string into an integer token index
   if is_number(token_not_found):
     token_not_found = int(token_not_found)
   elif token_not_found not in ('ignore', 'raise'):
     token_not_found = int(self.dictionary[token_not_found])
   # ====== pick engine ====== #
   if self.__engine == 'spacy':
     processor = self._preprocess_docs_spacy
   elif self.__engine == 'odin':
     processor = self._preprocess_docs_odin
   # ====== Initialize variables ====== #
   dictionary = self.dictionary
   results = []
   # ====== preprocess arguments ====== #
   if isinstance(end_document, str):
     end_document = dictionary.index(end_document)
   elif is_number(end_document):
     end_document = int(end_document)
   # ====== processing ====== #
   if hasattr(texts, '__len__'):
     target_len = len(texts)
     auto_adjust_len = False
   else:
     target_len = 1208
     auto_adjust_len = True
   prog = Progbar(target=target_len, name="Tokenize Transform",
                  print_report=True, print_summary=True)
   for nb_docs, doc in processor(texts, vocabulary=None, keep_order=True):
     # found the word in dictionary
     vec = []
     for x in doc:
       idx = dictionary.get(x, -1)
       if idx >= 0: vec.append(idx)
       # not found the token in dictionary
       elif token_not_found == 'ignore':
         continue
       elif token_not_found == 'raise':
         raise RuntimeError('Cannot find token: "%s" in dictionary' % x)
       elif isinstance(token_not_found, int):
         vec.append(token_not_found)
     # append ending document token
     if end_document is not None:
       vec.append(end_document)
     # add the final results
     results.append(vec)
     # print progress
     if self.print_progress:
       prog['#Docs'] = nb_docs
       prog.add(1)
       if auto_adjust_len and prog.seen_so_far >= 0.8 * prog.target:
         prog.target = 1.2 * prog.target
   # end the process
   # if self.print_progress and auto_adjust_len:
   #     prog.target = nb_docs; prog.update(nb_docs)
   # ====== pad the sequence ====== #
   # just transform into sequence of tokens
   if mode == 'seq':
     maxlen = self.longest_document_length if maxlen is None \
         else int(maxlen)
     results = pad_sequences(results, maxlen=maxlen, dtype=dtype,
                             padding=padding, truncating=truncating,
                             value=value)
   # transform into one-hot matrix
   else:
     X = np.zeros(shape=(len(results), self.nb_words))
     for i, seq in enumerate(results):
       if mode == 'binary':
         X[i, seq] = 1
       elif mode == 'freq':
         length = len(seq)
         count = freqcount(seq)
         for tok, n in count.items():
           X[i, tok] = n / float(length)
       elif mode == 'count':
         count = freqcount(seq)
         for tok, n in count.items():
           X[i, tok] = n
       elif mode == 'tfidf':
         count = freqcount(seq)
         for tok, n in count.items():
           tf = 1 + np.log(n)
           docs_freq = self._word_dictionary_info.get(tok, (0, 0))[-1]
           idf = np.log(1 + self.nb_docs / (1 + docs_freq))
           X[i, tok] = tf * idf
     results = X
   return results
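
Stripped of the tokenizer plumbing, the matrix modes reduce to filling one row per document from its token indices. A minimal sketch of the 'binary' and 'count' cases, with collections.Counter standing in for freqcount; nb_words and the index sequences are made up:

import numpy as np
from collections import Counter

nb_words = 6
results = [[0, 2, 2, 5], [1, 1, 3]]    # token indices per document

X_binary = np.zeros((len(results), nb_words))
X_count = np.zeros((len(results), nb_words))
for i, seq in enumerate(results):
    X_binary[i, seq] = 1               # presence / absence of each token
    for tok, n in Counter(seq).items():
        X_count[i, tok] = n            # raw term counts
print(X_binary)
print(X_count)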