def __init__(self, config, trial='trial1'):
    """Initialize MALLET with trial name.

    Copies configuration values onto the instance (as ``cfg_*`` attributes),
    casts the numeric ones, collects replacement files, prepares the MALLET
    argument dictionaries and finally initializes the underlying ``PoloDb``
    with the model database path for the trial.

    Args:
        config: Configuration object. Must provide ``trials``,
            ``set_config_attributes()`` and
            ``generate_model_db_file_path()``.
        trial: Name of the trial to use; must be contained in
            ``config.trials``.

    Raises:
        ValueError: If ``trial`` is not one of ``config.trials``.
    """
    if trial not in config.trials:
        raise ValueError("Invalid trial name `{}`.".format(trial))

    self.config = config
    self.trial = trial
    self.config.set_config_attributes(self)  # Prefixes keys with cfg_
    self.config.set_config_attributes(self, self.trial)

    # todo: Put this in config.ini
    self.cfg_tw_quantile = 0.8

    # Temporary hack to handle casting: config values arrive uncast, so the
    # numeric ones are converted here.
    int_keys = (
        'num_topics',
        'num_iterations',
        'optimize_interval',
        'num_threads',
        'num_top_words',
    )
    for key in int_keys:
        att = 'cfg_{}'.format(key)
        setattr(self, att, int(getattr(self, att)))
    self.cfg_thresh = float(self.cfg_thresh)

    # Get replacement files: start from the configured value and append every
    # file in ./corpus whose name contains 'replacements_' (space-separated).
    # todo: Fix order; higher ngrams should go first ... argues for sortable names
    self.replacement_files = self.cfg_replacements
    for filename in os.listdir('corpus'):
        if 'replacements_' in filename:
            self.replacement_files += ' corpus/' + filename

    self.trial_name = self.trial  # HACK
    self.file_prefix = '{}/{}'.format(self.cfg_mallet_out_dir, self.trial_name)
    self.mallet = {'import-file': {}, 'train-topics': {}}
    self.mallet_init()

    dbfile = self.config.generate_model_db_file_path(self.trial)
    PoloDb.__init__(self, dbfile)
def __init__(self, config):
    """Initialize corpus object.

    Copies the configuration onto the instance, verifies that the configured
    source file exists, initializes the underlying ``PoloDb`` with the corpus
    database path, and loads the NLTK resources used for tokenizing.

    Args:
        config: Configuration object providing ``set_config_attributes()``
            and ``generate_corpus_db_file_path()``.

    Raises:
        ValueError: If ``cfg_src_file_name`` does not point to an existing
            file.
    """
    # Import Configs
    self.config = config
    self.config.set_config_attributes(self)

    source_exists = os.path.isfile(self.cfg_src_file_name)
    if not source_exists:
        raise ValueError(
            "Missing source file. Check value of `src_file_name` in INI file."
        )

    self.dbfile = config.generate_corpus_db_file_path()
    PoloDb.__init__(self, self.dbfile)
    # self.db = PoloDb(self.dbfile) # Why not do this?

    # Register an extra NLTK data directory only when one is configured.
    extra_nltk_path = self.cfg_nltk_data_path
    if extra_nltk_path:
        nltk.data.path.append(extra_nltk_path)

    # For tokenizing into sentences
    # fixme: TOKENIZER ASSUMES ENGLISH -- PARAMETIZE THIS
    for resource in ('punkt', 'tagsets', 'averaged_perceptron_tagger'):
        nltk.download(resource)
    self.tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')