def __init__(
        self,
        save_path,
        min_freq=3,
        max_count=5000,
        include_table_name_in_column=True,
        word_emb=None,
        count_tokens_in_word_emb_for_vocab=False,
        fix_issue_16_primary_keys=False,
        compute_sc_link=False,
        compute_cv_link=False,
        db_path=None):
    """Set up encoder-side preprocessing state and output locations.

    Args:
        save_path: Base directory; the ``enc`` data directory and the
            ``enc_vocab.json`` / ``enc_word_freq.json`` files are placed
            directly under it.
        min_freq: Forwarded as the first argument to ``vocab.VocabBuilder``.
        max_count: Forwarded as the second argument to
            ``vocab.VocabBuilder``.
        include_table_name_in_column: Stored on the instance unchanged.
        word_emb: Optional config handed to
            ``registry.construct('word_emb', ...)``. When ``None`` no
            embedding object is built and ``self.word_emb`` is ``None``.
        count_tokens_in_word_emb_for_vocab: Stored on the instance
            unchanged.
        fix_issue_16_primary_keys: Stored on the instance unchanged.
        compute_sc_link: Stored on the instance unchanged.
        compute_cv_link: Stored on the instance unchanged.
        db_path: Stored on the instance unchanged.
    """
    # Only go through the registry when an embedding config was supplied.
    self.word_emb = (
        None if word_emb is None
        else registry.construct('word_emb', word_emb)
    )

    self.data_dir = os.path.join(save_path, 'enc')

    # Behaviour switches are kept verbatim for use by other methods.
    self.include_table_name_in_column = include_table_name_in_column
    self.count_tokens_in_word_emb_for_vocab = (
        count_tokens_in_word_emb_for_vocab)
    self.fix_issue_16_primary_keys = fix_issue_16_primary_keys
    self.compute_sc_link = compute_sc_link
    self.compute_cv_link = compute_cv_link

    # Missing keys start out as empty lists.
    self.texts = collections.defaultdict(list)
    self.db_path = db_path

    # Vocabulary: builder now, concrete vocab left unset until later.
    self.vocab_builder = vocab.VocabBuilder(min_freq, max_count)
    self.vocab_path = os.path.join(save_path, 'enc_vocab.json')
    self.vocab_word_freq_path = os.path.join(
        save_path, 'enc_word_freq.json')
    self.vocab = None

    # Empty bookkeeping containers filled in by other methods.
    self.counted_db_ids = set()
    self.preprocessed_schemas = {}
def __init__(
        self,
        grammar,
        save_path,
        min_freq=3,
        max_count=5000,
        use_seq_elem_rules=False):
    """Set up decoder-side preprocessing state and output locations.

    Args:
        grammar: Config handed to ``registry.construct('grammar', ...)``;
            the constructed object must expose an ``ast_wrapper``
            attribute.
        save_path: Base directory; ``dec_vocab.json``,
            ``observed_productions.json``, ``grammar_rules.json`` and the
            ``dec`` data directory are placed directly under it.
        min_freq: Forwarded as the first argument to ``vocab.VocabBuilder``.
        max_count: Forwarded as the second argument to
            ``vocab.VocabBuilder``.
        use_seq_elem_rules: Stored on the instance unchanged.
    """
    def under_save_path(name):
        # Every artifact lives directly beneath ``save_path``.
        return os.path.join(save_path, name)

    self.grammar = registry.construct('grammar', grammar)
    self.ast_wrapper = self.grammar.ast_wrapper

    # Output locations.
    self.vocab_path = under_save_path('dec_vocab.json')
    self.observed_productions_path = under_save_path(
        'observed_productions.json')
    self.grammar_rules_path = under_save_path('grammar_rules.json')
    self.data_dir = under_save_path('dec')

    self.vocab_builder = vocab.VocabBuilder(min_freq, max_count)
    self.use_seq_elem_rules = use_seq_elem_rules

    # Accumulators: missing keys default to an empty list / empty set.
    self.items = collections.defaultdict(list)
    self.sum_type_constructors = collections.defaultdict(set)
    self.field_presence_infos = collections.defaultdict(set)
    self.seq_lengths = collections.defaultdict(set)
    self.primitive_types = set()

    # Left unset here; assigned by other methods.
    self.vocab = None
    self.all_rules = None
    self.rules_mask = None
def __init__(self, save_path, min_freq=3, max_count=5000):
    """Set up encoder-side preprocessing state and output locations.

    Args:
        save_path: Base directory; ``enc_vocab.json`` and the ``enc`` data
            directory are placed directly under it.
        min_freq: Forwarded as the first argument to ``vocab.VocabBuilder``.
        max_count: Forwarded as the second argument to
            ``vocab.VocabBuilder``.
    """
    # Output locations beneath ``save_path``.
    self.vocab_path = os.path.join(save_path, 'enc_vocab.json')
    self.data_dir = os.path.join(save_path, 'enc')

    self.vocab_builder = vocab.VocabBuilder(min_freq, max_count)

    # ``init_items`` is defined elsewhere on the class; it is invoked
    # before ``vocab`` is cleared, in the same order as before.
    self.init_items()
    self.vocab = None