def _load_archive(self, path, extract_path):
    from zipfile import ZipFile, ZIP_DEFLATED
    try:
        zfile = ZipFile(path, mode='r', compression=ZIP_DEFLATED)
        allfile = zfile.namelist()
        # validate extract_path
        if not os.path.isdir(extract_path):
            raise ValueError('Extract path must be a folder, but path'
                             '={} is a file'.format(extract_path))
        extract_path = os.path.join(
            extract_path, os.path.basename(path).replace('.zip', ''))
        # found the extracted dir, use it
        if os.path.isdir(extract_path) and \
                set(os.listdir(extract_path)) == set(allfile):
            self._set_path(extract_path)
            return
        # decompress everything
        if not os.path.exists(extract_path):
            os.mkdir(extract_path)
        maxlen = max(len(i) for i in allfile)
        progbar = Progbar(len(allfile))
        for i, f in enumerate(allfile):
            zfile.extract(f, path=extract_path)
            progbar.title = ('Unarchiving: %-' + str(maxlen) + 's') % f
            progbar.update(i + 1)
        # ====== finally set path ====== #
        self._set_path(extract_path)
    except IOError as e:
        raise IOError('Error loading archived dataset, path:{}, error:{}'
                      '.'.format(path, e))
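
# ====== example: extract-or-reuse, standalone sketch ====== #
# A minimal illustration of the pattern `_load_archive` implements: reuse a
# previous extraction when its contents already match the archive, otherwise
# decompress. It assumes a flat archive (members are plain basenames), which
# is what `archive()` below produces. `load_zip_cached` is a hypothetical
# helper name, not part of this codebase.
import os
from zipfile import ZipFile

def load_zip_cached(zip_path, extract_root):
    with ZipFile(zip_path, mode='r') as zfile:
        members = zfile.namelist()
        target = os.path.join(extract_root,
                              os.path.basename(zip_path).replace('.zip', ''))
        # reuse the folder only if it holds exactly the archived files
        if os.path.isdir(target) and set(os.listdir(target)) == set(members):
            return target
        if not os.path.isdir(target):
            os.makedirs(target)
        zfile.extractall(path=target)
    return target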
def fit(self, texts, vocabulary=None):
    """
    Parameters
    ----------
    texts: iterator of unicode
        iterator, generator or list of unicode string.
    """
    texts = self._validate_texts(texts)
    word_counts = self._word_counts
    word_docs = self._word_docs
    # ====== pick engine ====== #
    if self.__engine == 'spacy':
        processor = self._preprocess_docs_spacy
    elif self.__engine == 'odin':
        processor = self._preprocess_docs_odin
    # ====== start processing ====== #
    prog = Progbar(target=1208)  # initial guess; the target grows as docs stream in
    start_time = timeit.default_timer()
    for nb_docs, doc in processor(texts, vocabulary, keep_order=False):
        total_docs_tokens = 0
        seen_words = {}
        # update words->count
        for token in doc:
            total_docs_tokens += 1
            word_counts[token] += 1
            # update words->doc
            if token not in seen_words:
                seen_words[token] = 1
                word_docs[token] += 1
        # save longest docs
        if total_docs_tokens > self.__longest_document[-1]:
            self.__longest_document = [doc, total_docs_tokens]
        # print progress
        if self.print_progress:
            prog.title = '[Training]#Doc:%d #Tok:%d' % (nb_docs, len(word_counts))
            prog.add(1)
            if prog.seen_so_far >= 0.8 * prog.target:
                prog.target = 1.2 * prog.target
    # ====== print summary of the process ====== #
    if self.print_progress:
        prog.target = nb_docs
        prog.update(nb_docs)
    processing_time = timeit.default_timer() - start_time
    print('Processed %d-docs, %d-tokens in %f seconds.' %
          (nb_docs, len(word_counts), processing_time))
    self.nb_docs += nb_docs
    # ====== sorting ====== #
    self._refresh_dictionary()
    return self
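
# ====== example: term counts vs. document counts ====== #
# `fit` maintains two counters per token: total occurrences (`word_counts`)
# and the number of documents containing the token (`word_docs`). A
# standalone sketch of that bookkeeping (names and data are illustrative):
from collections import Counter

def count_tokens(docs):
    word_counts, word_docs = Counter(), Counter()
    for doc in docs:
        word_counts.update(doc)     # every occurrence counts
        word_docs.update(set(doc))  # at most once per document
    return word_counts, word_docs

# count_tokens([['a', 'b', 'a'], ['b', 'c']])
# -> word_counts['a'] == 2, word_docs['a'] == 1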
def archive(self):
    from zipfile import ZipFile, ZIP_DEFLATED
    path = self.archive_path
    zfile = ZipFile(path, mode='w', compression=ZIP_DEFLATED)
    files = set(_[-1] for _ in self._data_map.values())
    progbar = Progbar(len(files), title='Archiving:')
    maxlen = max(len(os.path.basename(i)) for i in files)
    for i, f in enumerate(files):
        zfile.write(f, os.path.basename(f))
        progbar.title = ('Archiving: %-' + str(maxlen) + 's') % os.path.basename(f)
        progbar.update(i + 1)
    zfile.close()
    return path
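
# ====== example: flat-archive round trip ====== #
# Because `archive` stores each file under its basename, the resulting zip
# is flat; that is exactly the invariant `_load_archive` checks with
# `set(os.listdir(...)) == set(allfile)`. A small round-trip sketch (paths
# and names are illustrative):
import os
import tempfile
from zipfile import ZipFile, ZIP_DEFLATED

def roundtrip_demo():
    tmp = tempfile.mkdtemp()
    src = os.path.join(tmp, 'feat.dat')
    with open(src, 'w') as f:
        f.write('data')
    zip_path = os.path.join(tmp, 'dataset.zip')
    with ZipFile(zip_path, mode='w', compression=ZIP_DEFLATED) as zfile:
        zfile.write(src, os.path.basename(src))  # basename only -> flat archive
    with ZipFile(zip_path, mode='r') as zfile:
        assert zfile.namelist() == ['feat.dat']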
def run(self):
    if self.pca:
        from odin.ml import MiniBatchPCA
    if not hasattr(self, 'jobs'):
        raise Exception('the Processor must have a "jobs" attribute, which is '
                        'the list of all jobs.')
    njobs = len(self.jobs) if self.njobs == 0 else self.njobs
    prog = Progbar(target=njobs)
    dataset = self.dataset
    datatype = self.datatype
    if self.ncpu is None:  # auto select number of CPU
        ncpu = min(njobs, int(1.2 * cpu_count()))
    else:
        ncpu = self.ncpu
    # ====== indices ====== #
    indices = defaultdict(list)
    # ====== MmapDict ====== #
    dicts = {}
    for name, dtype, stats in self.features_properties:
        if 'dict' in str(dtype).lower():
            dicts[name] = MmapDict(os.path.join(dataset.path, name))
    # ====== statistic ====== #
    statistic_able = {i[0]: i[-1] for i in self.features_properties}
    sum1 = defaultdict(int)
    sum2 = defaultdict(int)
    # init PCA (defaultdict calls the factory with no arguments)
    pca = defaultdict(lambda: MiniBatchPCA(n_components=None,
                                           whiten=self.pca_whiten,
                                           copy=True, batch_size=None)
                      if self.pca else None)
    # all data are cached, then periodically flushed
    cache = defaultdict(list)
    if self.ncache <= 1:
        cache_limit = max(2, int(0.12 * njobs))
    else:
        cache_limit = int(self.ncache)
    ref_vars = {'start': defaultdict(int), 'processed_count': 0}

    # ====== helper ====== #
    def flush_feature(name, cache_data):
        if len(cache_data) > 0:
            cache_data = np.concatenate(cache_data, 0)
            # NOTE: if nb_samples < nb_features, fitting PCA
            # will cause an error
            if self.pca and statistic_able[name]:
                pca[name].partial_fit(cache_data)
            # flush data
            if name in dataset:
                dataset[name].append(cache_data)
            else:
                dataset[(name, datatype)] = cache_data

    def wrapped_reduce(result):
        name, data = result
        ref_vars['processed_count'] += 1
        # check data
        if not isinstance(data, (tuple, list)):
            data = (data,)
        length = []  # store length of all data for validation
        # processing
        for prop, d in zip(self.features_properties, data):
            n, t, s = prop  # feature-name, dtype, stats
            # mmapdict type:
            if 'dict' in str(t).lower():
                dicts[n][name] = d.tolist() if isinstance(d, np.ndarray) else d
                del d
                continue
            # auto-create new indices
            if len(d) not in length:
                length.append(len(d))
                indices[n].append([name,
                                   ref_vars['start'][n],
                                   ref_vars['start'][n] + len(d)])
                ref_vars['start'][n] += len(d)
            # cache data, only if we have more than 0 sample
            if len(d) > 0:
                cache[n].append(d.astype(t))
                if self.save_stats and s:  # save stats
                    sum1[n] += np.sum(d, axis=0, dtype='float64')
                    sum2[n] += np.sum(np.power(d, 2), axis=0, dtype='float64')
            del d
        # ====== flush cache periodically ====== #
        if ref_vars['processed_count'] % cache_limit == 0:
            for i, j in cache.items():
                flush_feature(i, j)
            cache.clear()
        # ====== update progress ====== #
        return name

    # ====== processing ====== #
    mpi = MPI(self.jobs, self.map, wrapped_reduce,
              ncpu=ncpu, buffer_size=1, maximum_queue_size=ncpu * 3)
    for name in mpi:
        prog.title = '%-20s' % name
        prog.add(1)
    # ====== end, flush the last time ====== #
    for i, j in cache.items():
        flush_feature(i, j)
    cache = None
    dataset.flush()
    # ====== saving indices ====== #
    for n, ids in indices.items():
        outpath = os.path.join(
            dataset.path,
            'indices' if n in self.primary_indices else 'indices_%s' % n)
        _ = MmapDict(outpath)
        for name, start, end in ids:
            _[name] = (int(start), int(end))
        _.flush()
        _.close()

    # ====== save mean and std ====== #
    def save_mean_std(sum1, sum2, pca, name, dataset):
        N = dataset[name].shape[0]
        mean = sum1 / N
        std = np.sqrt(sum2 / N - mean ** 2)
        if self.substitute_nan is not None:
            mean = np.where(np.isnan(mean), self.substitute_nan, mean)
            std = np.where(np.isnan(std), self.substitute_nan, std)
        else:
            assert not np.any(np.isnan(mean)), 'Mean contains NaN, %s' % name
            assert not np.any(np.isnan(std)), 'Std contains NaN, %s' % name
        dataset[name + '_sum1'] = sum1
        dataset[name + '_sum2'] = sum2
        dataset[name + '_mean'] = mean
        dataset[name + '_std'] = std
        dataset[name + '_pca'] = pca

    # save all stats
    if self.save_stats:
        print('Saving statistics of each data ...')
        for n, d, s in self.features_properties:
            if s:  # save stats
                print(' * Name:', n)
                s1, s2, pca_ = sum1[n], sum2[n], pca[n]
                save_mean_std(s1, s2, pca_, n, dataset)
    # ====== dataset flush() ====== #
    dataset.flush()
    dataset.close()
    # ====== all MmapDict flush() ====== #
    for d in dicts.values():
        d.flush()
        d.close()
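
# ====== example: streaming mean/std from two accumulators ====== #
# `run` never holds all features in memory: it accumulates sum(x) and
# sum(x**2) per feature, then recovers mean and std at the end via
# std = sqrt(E[x^2] - E[x]^2). A standalone check on synthetic chunks
# (names are illustrative):
import numpy as np

def streaming_mean_std(chunks):
    sum1 = sum(np.sum(c, axis=0, dtype='float64') for c in chunks)
    sum2 = sum(np.sum(c ** 2, axis=0, dtype='float64') for c in chunks)
    N = sum(len(c) for c in chunks)
    mean = sum1 / N
    std = np.sqrt(sum2 / N - mean ** 2)
    return mean, std

# chunks = [np.random.randn(100, 4) for _ in range(5)]
# mean, std = streaming_mean_std(chunks)
# full = np.concatenate(chunks)
# assert np.allclose(mean, full.mean(0)) and np.allclose(std, full.std(0))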
def transform(self, texts, mode='seq', dtype='int32',
              padding='pre', truncating='pre', value=0.,
              end_document=None, maxlen=None,
              token_not_found='ignore'):
    """
    Parameters
    ----------
    mode: 'binary', 'tfidf', 'count', 'freq', 'seq'
        'binary': 1 at each token index that appears in the document
        'tfidf': smoothed tf-idf weight of each token per document
        'count': number of occurrences of each token per document
        'freq': occurrence count normalized by the document length
        'seq': sequence of token indices, padded/truncated to `maxlen`
    token_not_found: 'ignore', 'raise', a token string, an integer
        how to handle tokens that are absent from the dictionary.
    """
    # ====== check arguments ====== #
    texts = self._validate_texts(texts)
    # ====== check mode ====== #
    mode = str(mode)
    if mode not in ('seq', 'binary', 'count', 'freq', 'tfidf'):
        raise ValueError('The "mode" argument must be: "seq", "binary", '
                         '"count", "freq", or "tfidf".')
    # ====== check token_not_found ====== #
    if not isinstance(token_not_found, Number) and \
            not is_string(token_not_found) and \
            token_not_found not in ('ignore', 'raise'):
        raise ValueError('token_not_found can be: "ignore", "raise"'
                         ', an integer of token index, or a string '
                         'representing a token.')
    if isinstance(token_not_found, Number):
        token_not_found = int(token_not_found)
    elif token_not_found not in ('ignore', 'raise'):
        # a token string: map it to its index in the dictionary
        token_not_found = int(self.dictionary[token_not_found])
    # ====== pick engine ====== #
    if self.__engine == 'spacy':
        processor = self._preprocess_docs_spacy
    elif self.__engine == 'odin':
        processor = self._preprocess_docs_odin
    # ====== Initialize variables ====== #
    dictionary = self.dictionary
    results = []
    # ====== preprocess arguments ====== #
    if isinstance(end_document, str):
        end_document = int(dictionary[end_document])
    elif isinstance(end_document, Number):
        end_document = int(end_document)
    # ====== processing ====== #
    if hasattr(texts, '__len__'):
        target_len = len(texts)
        auto_adjust_len = False
    else:
        target_len = 1208  # initial guess; adjusted as documents stream in
        auto_adjust_len = True
    prog = Progbar(target=target_len)
    for nb_docs, doc in processor(texts, vocabulary=None, keep_order=True):
        vec = []
        for x in doc:
            idx = dictionary.get(x, -1)
            # found the word in dictionary
            if idx >= 0:
                vec.append(idx)
            # token not found in dictionary
            elif token_not_found == 'ignore':
                continue
            elif token_not_found == 'raise':
                raise RuntimeError('Cannot find token: "%s" in dictionary' % x)
            elif isinstance(token_not_found, int):
                vec.append(token_not_found)
        # append ending document token
        if end_document is not None:
            vec.append(end_document)
        # add the final results
        results.append(vec)
        # print progress
        if self.print_progress:
            prog.title = "[Transforming] %d docs" % nb_docs
            prog.add(1)
            if auto_adjust_len and prog.seen_so_far >= 0.8 * prog.target:
                prog.target = 1.2 * prog.target
    # end the process
    if self.print_progress and auto_adjust_len:
        prog.target = nb_docs
        prog.update(nb_docs)
    # ====== pad the sequence ====== #
    # just transform into sequence of tokens
    if mode == 'seq':
        maxlen = self.longest_document_length if maxlen is None else int(maxlen)
        results = pad_sequences(results, maxlen=maxlen, dtype=dtype,
                                padding=padding, truncating=truncating,
                                value=value)
    # transform into document-term matrix
    else:
        X = np.zeros(shape=(len(results), self.nb_words))
        for i, seq in enumerate(results):
            if mode == 'binary':
                X[i, seq] = 1
            elif mode == 'freq':
                length = len(seq)
                count = freqcount(seq)
                for tok, n in count.items():
                    X[i, tok] = n / float(length)
            elif mode == 'count':
                count = freqcount(seq)
                for tok, n in count.items():
                    X[i, tok] = n
            elif mode == 'tfidf':
                count = freqcount(seq)
                for tok, n in count.items():
                    tf = 1 + np.log(n)
                    docs_freq = self._word_dictionary_info.get(tok, (0, 0))[-1]
                    idf = np.log(1 + self.nb_docs / (1 + docs_freq))
                    X[i, tok] = tf * idf
        results = X
    return results
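
# ====== example: the 'tfidf' weighting in isolation ====== #
# For a token appearing n times in a document, `transform` stores
# tf * idf with tf = 1 + log(n) (sub-linear term frequency) and
# idf = log(1 + nb_docs / (1 + docs_freq)) (smoothed inverse document
# frequency). A worked example with illustrative numbers:
import numpy as np

nb_docs, docs_freq, n = 1000, 10, 3
tf = 1 + np.log(n)                           # ~2.10
idf = np.log(1 + nb_docs / (1 + docs_freq))  # ~4.52
weight = tf * idf                            # ~9.49, value stored at X[i, tok]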