def make_from_serializable(cls, obj):
    """Rebuild an IndexingPrePostProcessor from its serialized form.

    The serialized object is first migrated to the current schema, then
    the indexer — and, when present, the preprocessor — are deserialized
    into a fresh instance, which is finally marked as initialized.
    """
    obj = cls.make_serialized_form_compatible_with_newer_version(obj)
    result = IndexingPrePostProcessor(voc_limit=obj["voc_limit"])
    result.indexer = Indexer.make_from_serializable(obj["indexer"])
    # "preprocessor" is optional in the serialized form; only restore it
    # when the key is actually present.
    if "preprocessor" in obj:
        result.preprocessor = PreProcessor.make_from_serializable(
            obj["preprocessor"])
    result.is_initialized_ = True
    return result
def make_serialized_form_compatible_with_newer_version(cls, obj):
    """Migrate older serialized layouts to the current schema.

    Three layouts are handled:
      * a bare data indexer — wrapped into a fresh base object, with a
        default word segmenter recorded as the preprocessor;
      * a "processors_list" chain — the last processor becomes the
        top-level object, and any preceding processors are nested under
        its "preprocessor" key;
      * anything else is assumed to already be in the current format and
        is returned unchanged.
    """
    if Indexer.check_if_data_indexer(obj):
        migrated = cls.make_base_serializable_object()
        migrated["indexer"] = obj
        migrated["voc_limit"] = len(obj)
        segmenter = SimpleSegmenter("word")
        segmenter.initialize(None)
        migrated["preprocessor"] = segmenter.to_serializable()
        return migrated

    if "processors_list" in obj:
        migrated = obj["processors_list"][-1]
        if len(obj["processors_list"]) > 1:
            # Deep-copy so the caller's object is not mutated when the
            # remaining chain is nested under "preprocessor".
            remaining = copy.deepcopy(obj)
            remaining["processors_list"] = remaining["processors_list"][:-1]
            migrated["preprocessor"] = remaining
        return migrated

    return obj
def build_index_from_iterable(iterable, voc_limit=None):
    """Build a finalized Indexer over the words occurring in *iterable*.

    Args:
        iterable: an iterable of token sequences (e.g. tokenized lines).
        voc_limit: optional int; if given, keep only the ``voc_limit``
            most frequent words. ``None`` keeps the full vocabulary.

    Returns:
        A finalized ``Indexer`` whose words are added in order of
        decreasing frequency (ties keep first-occurrence order, since
        the underlying sort is stable).
    """
    # Counter replaces the manual defaultdict(int) loop; the previous
    # enumerate() produced an unused `num_ex` variable.
    counts = collections.Counter(w for line in iterable for w in line)
    res = Indexer()
    # most_common() with no argument is sorted(items, key=itemgetter(1),
    # reverse=True) — exactly the ordering the original computed by hand.
    # Slicing with voc_limit=None keeps the whole list.
    for w, _ in counts.most_common()[:voc_limit]:
        res.add_word(w, should_be_new=True)
    res.finalize()
    return res