import os
import shutil
import timeit

import numpy as np

# Progbar, Dataset, pad_sequences, freqcount, is_number and is_string are
# helpers from the surrounding library and are assumed to be imported at
# module level.


def fit(self, texts, vocabulary=None):
    """
    Parameters
    ----------
    texts: iterator of unicode
        iterator, generator or list of unicode strings.
    """
    texts = self._validate_texts(texts)
    word_counts = self._word_counts
    word_docs = self._word_docs
    # ====== pick engine ====== #
    if self.__engine == 'spacy':
        processor = self._preprocess_docs_spacy
    elif self.__engine == 'odin':
        processor = self._preprocess_docs_odin
    # ====== start processing ====== #
    prog = Progbar(target=1208)
    start_time = timeit.default_timer()
    for nb_docs, doc in processor(texts, vocabulary, keep_order=False):
        total_docs_tokens = 0
        seen_words = {}
        # update words->count
        for token in doc:
            total_docs_tokens += 1
            word_counts[token] += 1
            # update words->doc (count each token at most once per document)
            if token not in seen_words:
                seen_words[token] = 1
                word_docs[token] += 1
        # save longest document
        if total_docs_tokens > self.__longest_document[-1]:
            self.__longest_document = [doc, total_docs_tokens]
        # print progress
        if self.print_progress:
            prog.title = '[Training]#Doc:%d #Tok:%d' % (nb_docs, len(word_counts))
            prog.add(1)
            if prog.seen_so_far >= 0.8 * prog.target:
                prog.target = 1.2 * prog.target
    # ====== print summary of the process ====== #
    if self.print_progress:
        prog.target = nb_docs
        prog.update(nb_docs)
    processing_time = timeit.default_timer() - start_time
    print('Processed %d-docs, %d-tokens in %f seconds.' %
          (nb_docs, len(word_counts), processing_time))
    self.nb_docs += nb_docs
    # ====== sorting ====== #
    self._refresh_dictionary()
    return self
def save_cache(self, path, datatype='memmap', print_progress=True):
    """ Save all preprocessed data to a Dataset """
    if not isinstance(path, str) or os.path.isfile(path):
        raise ValueError('path must be a string path to a folder.')
    if os.path.exists(path):
        print('Remove old dataset at path:', path)
        shutil.rmtree(path)
    ds = Dataset(path)
    # ====== start caching ====== #
    if print_progress:
        prog = Progbar(target=self.shape[0], title='Caching:')
    for X in self:
        if not isinstance(X, (tuple, list)):
            X = (X,)
        # saving preprocessed data
        for i, x in enumerate(X):
            name = 'data%d' % i
            if name in ds:
                ds[name].append(x)
            else:
                ds[(name, datatype)] = x
        # print progress
        if print_progress:
            prog.add(X[0].shape[0])
    # finalize the progress bar (guarded so `prog` is only touched when it exists)
    if print_progress:
        prog.target = prog.seen_so_far
        prog.add(0)
    ds.flush()
    ds.close()
    # end
    return self
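
# Standalone sketch of the append-or-create caching pattern used in `save_cache`
# above: each yielded batch is stored under the key 'data%d', appended when the
# entry already exists. A plain dict of numpy arrays stands in for the library's
# `Dataset` here purely for illustration (an assumption; the real `Dataset`
# persists the arrays to disk instead of keeping them in memory).
def cache_batches(batches):
    store = {}
    for X in batches:
        if not isinstance(X, (tuple, list)):
            X = (X,)
        for i, x in enumerate(X):
            name = 'data%d' % i
            if name in store:
                store[name] = np.concatenate([store[name], x], axis=0)
            else:
                store[name] = x
    return store

# e.g. two (2, 3) batches end up cached as a single (4, 3) array:
#   cache_batches([np.ones((2, 3)), np.zeros((2, 3))])['data0'].shape == (4, 3)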
def fit(self, texts, vocabulary=None):
    """
    Parameters
    ----------
    texts: iterator of unicode
        iterator, generator or list (e.g. [u'a', u'b', ...]) of
        unicode documents.
    """
    texts = self._validate_texts(texts)
    word_counts = self._word_counts
    word_docs = self._word_docs
    # ====== pick engine ====== #
    if self.__engine == 'spacy':
        processor = self._preprocess_docs_spacy
    elif self.__engine == 'odin':
        processor = self._preprocess_docs_odin
    # ====== start processing ====== #
    prog = Progbar(target=1208, name="Fitting tokenizer",
                   print_report=True, print_summary=True)
    start_time = timeit.default_timer()
    for nb_docs, doc in processor(texts, vocabulary, keep_order=False):
        total_docs_tokens = 0
        seen_words = {}
        # update words->count
        for token in doc:
            total_docs_tokens += 1
            word_counts[token] += 1
            # update words->doc (count each token at most once per document)
            if token not in seen_words:
                seen_words[token] = 1
                word_docs[token] += 1
        # save longest document
        if total_docs_tokens > self.__longest_document[-1]:
            self.__longest_document = [doc, total_docs_tokens]
        # print progress
        prog['#Doc'] = nb_docs
        prog['#Tok'] = len(word_counts)
        prog.add(1)
        if prog.seen_so_far >= 0.8 * prog.target:
            prog.target = 1.2 * prog.target
    # ====== print summary of the process ====== #
    processing_time = timeit.default_timer() - start_time
    print('Processed %d-docs, %d-tokens in %f seconds.' %
          (nb_docs, len(word_counts), processing_time))
    self.nb_docs += nb_docs
    # ====== sorting ====== #
    self._refresh_dictionary()
    return self
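
# Standalone sketch of the two counters maintained in `fit` above: `word_counts`
# accumulates total occurrences across the corpus, while `word_docs` records in
# how many documents each token appears (at most once per document). Written
# with stdlib containers purely for illustration.
from collections import defaultdict

def count_tokens(tokenized_docs):
    word_counts = defaultdict(int)
    word_docs = defaultdict(int)
    for doc in tokenized_docs:
        for token in doc:            # term frequency: every occurrence counts
            word_counts[token] += 1
        for token in set(doc):       # document frequency: once per document
            word_docs[token] += 1
    return word_counts, word_docs

# e.g. count_tokens([[u'a', u'a', u'b'], [u'a']])
#   -> word_counts == {'a': 3, 'b': 1} and word_docs == {'a': 2, 'b': 1}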
def transform(self, texts, mode='seq', dtype='int32',
              padding='pre', truncating='pre', value=0.,
              end_document=None, maxlen=None,
              token_not_found='ignore'):
    """
    Parameters
    ----------
    texts: iterator of unicode
        iterator, generator or list (e.g. [u'a', u'b', ...]) of
        unicode documents.
    mode: 'binary', 'tfidf', 'count', 'freq', 'seq'
        'binary': a (nb_documents, nb_words) matrix of 0/1 marking which
            tokens occur in each document.
        'tfidf': a (nb_documents, nb_words) matrix of tf-idf weights.
        'count': a (nb_documents, nb_words) matrix of raw token counts.
        'freq': a (nb_documents, nb_words) matrix of token counts
            normalized by the document length.
        'seq': padded sequences of token indices.
    token_not_found: 'ignore', 'raise', a token string, an integer
        'ignore' skips out-of-dictionary tokens, 'raise' raises an error,
        a token string is replaced by that token's index, and an integer
        is used directly as the replacement index.
    """
    # ====== check arguments ====== #
    texts = self._validate_texts(texts)
    # ====== check mode ====== #
    mode = str(mode)
    if mode not in ('seq', 'binary', 'count', 'freq', 'tfidf'):
        raise ValueError('The "mode" argument must be: "seq", "binary", '
                         '"count", "freq", or "tfidf".')
    # ====== check token_not_found ====== #
    if not is_number(token_not_found) and \
            not is_string(token_not_found) and \
            token_not_found not in ('ignore', 'raise'):
        raise ValueError('token_not_found can be: "ignore", "raise"'
                         ', an integer of token index, or a string '
                         'representing a token.')
    if is_number(token_not_found):
        # an integer is used directly as the replacement index
        token_not_found = int(token_not_found)
    elif token_not_found not in ('ignore', 'raise'):
        # a token string is mapped to its index in the dictionary
        token_not_found = int(self.dictionary[token_not_found])
    # ====== pick engine ====== #
    if self.__engine == 'spacy':
        processor = self._preprocess_docs_spacy
    elif self.__engine == 'odin':
        processor = self._preprocess_docs_odin
    # ====== initialize variables ====== #
    dictionary = self.dictionary
    results = []
    # ====== preprocess arguments ====== #
    if isinstance(end_document, str):
        end_document = dictionary.index(end_document)
    elif is_number(end_document):
        end_document = int(end_document)
    # ====== processing ====== #
    if hasattr(texts, '__len__'):
        target_len = len(texts)
        auto_adjust_len = False
    else:
        target_len = 1234
        auto_adjust_len = True
    prog = Progbar(target=target_len, name="Tokenize Transform",
                   print_report=True, print_summary=True)
    for nb_docs, doc in processor(texts, vocabulary=None, keep_order=True):
        vec = []
        for x in doc:
            idx = dictionary.get(x, -1)
            # the token is found in the dictionary
            if idx >= 0:
                vec.append(idx)
            # the token is not found in the dictionary
            elif token_not_found == 'ignore':
                continue
            elif token_not_found == 'raise':
                raise RuntimeError('Cannot find token: "%s" in dictionary' % x)
            elif isinstance(token_not_found, int):
                vec.append(token_not_found)
        # append end-of-document token
        if end_document is not None:
            vec.append(end_document)
        # add the final results
        results.append(vec)
        # print progress
        if self.print_progress:
            prog['#Docs'] = nb_docs
            prog.add(1)
            if auto_adjust_len and prog.seen_so_far >= 0.8 * prog.target:
                prog.target = 1.2 * prog.target
    # ====== pad the sequence ====== #
    # just transform into sequences of token indices
    if mode == 'seq':
        maxlen = self.longest_document_length if maxlen is None else int(maxlen)
        results = pad_sequences(results, maxlen=maxlen, dtype=dtype,
                                padding=padding, truncating=truncating,
                                value=value)
    # transform into a (nb_documents, nb_words) matrix
    else:
        X = np.zeros(shape=(len(results), self.nb_words))
        for i, seq in enumerate(results):
            if mode == 'binary':
                X[i, seq] = 1
            elif mode == 'freq':
                length = len(seq)
                count = freqcount(seq)
                for tok, n in count.items():
                    X[i, tok] = n / float(length)
            elif mode == 'count':
                count = freqcount(seq)
                for tok, n in count.items():
                    X[i, tok] = n
            elif mode == 'tfidf':
                count = freqcount(seq)
                for tok, n in count.items():
                    tf = 1 + np.log(n)
                    docs_freq = self._word_dictionary_info.get(tok, (0, 0))[-1]
                    idf = np.log(1 + self.nb_docs / (1 + docs_freq))
                    X[i, tok] = tf * idf
        results = X
    return results
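
# Standalone sketch of the weighting applied above for mode='tfidf'. It simply
# restates the formula from `transform` for one token, assuming `nb_docs` total
# documents in the fitted corpus and `docs_freq` documents containing the token
# (the function and argument names here are chosen only for illustration).
def tfidf_weight(token_count, nb_docs, docs_freq):
    tf = 1 + np.log(token_count)                  # sub-linear term frequency
    idf = np.log(1 + nb_docs / (1 + docs_freq))   # smoothed inverse document frequency
    return tf * idf

# e.g. a token seen 3 times in a document and present in 10 of 100 documents:
#   tfidf_weight(3, 100, 10) == (1 + np.log(3)) * np.log(1 + 100 / 11)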