def synthetic_sequences_experiment(exp_params, path_dict, sample_retries=100, reuse_sequences=None):
    """Run a 'synthetic sequences' experiment.

    Parses the experiment parameters, obtains train/test sequences together
    with the ground-truth data ``gt_AB`` (freshly sampled or reused), and
    hands everything to ``_standard_vs_dense``.

    Args:
        exp_params: Raw experiment parameters. They are passed through
            ``_parse_syntheticgt_dirichlet_parameters`` and
            ``_parse_standard_and_dense``; afterwards the keys
            'n_emissions', 'standard_params' and 'dense_params' are read,
            and 'fair_standard_params' is used when present.
        path_dict: Path configuration forwarded to the parsing, saving and
            sampling helpers.
        sample_retries: Forwarded to ``_sample_sequences_from_gt_hmm``.
        reuse_sequences: Optional ``(train_X, test_X, gt_AB)``. It is only
            used when it is exactly a ``tuple`` of length 3; any other
            value causes fresh sequences to be sampled.

    Returns:
        None.
    """
    experiment_timer = Timer()
    _started_at = experiment_timer.tic("Starting a 'synthetic sequences' experiment.")

    # Parse the parameters and persist the effective configuration.
    parse_timer = Timer()
    parse_timer.tic("Parsing parameters ...")
    exp_params = _parse_syntheticgt_dirichlet_parameters(exp_params, path_dict)
    exp_params = _parse_standard_and_dense(exp_params, path_dict, exp_params['n_emissions'])
    saved_params = _save_experiment_parameters(exp_params, path_dict)
    parse_timer.toc("Parameters parsed. Using parameters: %s" % str(saved_params))

    # Only an exact tuple (subclasses are rejected) of length 3 is reused.
    reusable = (
        reuse_sequences is not None
        and type(reuse_sequences) is tuple
        and len(reuse_sequences) == 3
    )
    if reusable:
        train_X, test_X, gt_AB = reuse_sequences
        timestamp_msg("Reusing sequences")
    else:
        train_X, test_X, gt_AB = _sample_sequences_from_gt_hmm(
            exp_params, path_dict, sample_retries=sample_retries)

    # With 'fair_standard_params' present, both standard configurations are
    # passed on as a pair; otherwise only the plain standard one.
    if 'fair_standard_params' in exp_params:
        standard_arg = (exp_params['standard_params'], exp_params['fair_standard_params'])
    else:
        standard_arg = exp_params['standard_params']
    _standard_vs_dense(train_X, test_X, standard_arg, exp_params['dense_params'], gt_AB)

    _finished_at, _elapsed = experiment_timer.toc("Finished a 'synthetic sequences' experiment.")
def _load_protein():
    """Load the protein sequences from ``pdb_seqres.txt.gz`` (cached).

    The parsed data is kept in the module-level globals ``protein_seqs`` and
    ``protein_meta``. The file is only read when the cache is missing or
    inconsistent (either global is ``None`` or their lengths differ).

    Lines starting with '>' are collected (stripped) as metadata; every
    other line is stripped and split into a list of single characters.

    Returns:
        tuple: ``(protein_seqs, protein_meta)``.
    """
    global protein_seqs, protein_meta

    cache_is_valid = (
        protein_seqs is not None
        and protein_meta is not None
        and len(protein_seqs) == len(protein_meta)
    )
    if cache_is_valid:
        timestamp_msg('Using already loaded sequences ...')
        return protein_seqs, protein_meta

    # Parse into locals first so that a failure half-way through the file
    # cannot leave a partially filled (but seemingly valid) cache behind.
    seqs, meta = [], []
    filename = os.path.join(DATA_DIR, 'pdb_seqres.txt.gz')
    # gzip.open defaults to binary mode; text mode is required so that the
    # lines are str and can be compared against '>' and split into chars.
    with gzip.open(filename, 'rt') as f:
        for line in f:
            if line.startswith('>'):
                meta.append(line.strip())
            else:
                seqs.append(list(line.strip()))

    protein_seqs, protein_meta = seqs, meta
    return protein_seqs, protein_meta
def _save_data(path_dict, train_X, test_X=None, gt_AB=None): if 'data_dir' in path_dict: data_dir = str(path_dict['data_dir']) check_dir(data_dir) np.save(data_dir + '/train_X', train_X) if test_X is not None: np.save(data_dir + '/test_X', test_X) if gt_AB is not None: np.save(data_dir + '/gt_A', gt_AB[0]) np.save(data_dir + '/gt_B', gt_AB[1]) timestamp_msg("Saved data in %s" % data_dir)
def _load_penntreebank():
    """Load the NLTK 'treebank' corpus as tagged sentences and words (cached).

    The results are kept in the module-level globals
    ``penntree_tagged_sents`` and ``penntree_tagged_words``; the corpus is
    only downloaded and read when one of them is still ``None``.

    Returns:
        tuple: ``(penntree_tagged_sents, penntree_tagged_words)`` -- note
        that the sentences come first. The words are the items of all
        sentences flattened into one list.
    """
    global penntree_tagged_words, penntree_tagged_sents

    already_loaded = (
        penntree_tagged_words is not None
        and penntree_tagged_sents is not None
    )
    if already_loaded:
        timestamp_msg('Using already loaded sequences ...')
        return penntree_tagged_sents, penntree_tagged_words

    nltk.download('treebank')
    from nltk.corpus import treebank

    # The corpus is organized in sentences; flatten them to get the words.
    penntree_tagged_sents = treebank.tagged_sents()
    flattened = []
    for sentence in penntree_tagged_sents:
        flattened.extend(sentence)
    penntree_tagged_words = flattened

    return penntree_tagged_sents, penntree_tagged_words
def dataset_sequences_experiment(exp_params, path_dict, reuse_sequences=None):
    """Run a 'dataset sequences' experiment.

    Obtains train/test sequences (loaded from a dataset and split, or
    reused), checks them, saves them via ``_save_data``, parses the
    experiment parameters and finally calls ``_standard_vs_dense``.

    Args:
        exp_params: Experiment parameters; a shallow ``dict`` copy is made
            before anything is written to it. The keys 'dataset_ident',
            'train_perc' (default 1.0) and 'dataset_params' are read here,
            and 'n_emissions' is set from the training sequences.
        path_dict: Path configuration; 'gt_dir' is read here, and the whole
            dict is forwarded to the saving and parsing helpers.
        reuse_sequences: Optional ``(train_X, test_X)``. It is only used
            when it is exactly a ``tuple`` of length 2; any other value
            causes the dataset to be loaded and split.

    Returns:
        None.

    Raises:
        Exception: If train and test sequences report a different number
            of emissions.
    """
    experiment_timer = Timer()
    exp_params = dict(exp_params)
    ident = dict_get(exp_params, 'dataset_ident', default='', cast=str)
    _started_at = experiment_timer.tic(
        "Starting a 'dataset sequences' experiment. (%s)" % str(ident))

    # Read the settings needed to obtain the sequences.
    parse_timer = Timer()
    parse_timer.tic("Parsing parameters ...")
    train_perc = dict_get(exp_params, 'train_perc', default=1., cast=float)
    gt_dir = dict_get(path_dict, 'gt_dir', default=None)
    check_dir(gt_dir)
    ds_params = dict_get(exp_params, 'dataset_params', default=dict(), cast=dict)

    # Only an exact tuple (subclasses are rejected) of length 2 is reused.
    reusable = (
        reuse_sequences is not None
        and type(reuse_sequences) is tuple
        and len(reuse_sequences) == 2
    )
    if reusable:
        train_X, test_X = reuse_sequences
        timestamp_msg("Reusing sequences ...")
    else:
        gt_sequences, _, _ = get_dataset_sequences(ident, ds_params, gt_dir)
        train_X, test_X = train_test_split(gt_sequences, train_perc)

    # Check the sequences; the test split is optional and may be empty.
    _, _, n_train_emissions = check_sequences(train_X)
    n_test_emissions = None
    has_test_split = test_X is not None and len(test_X) > 0
    if has_test_split:
        _, _, n_test_emissions = check_sequences(test_X)
    _save_data(path_dict, train_X, test_X)

    emissions_mismatch = (
        n_test_emissions is not None
        and n_train_emissions != n_test_emissions
    )
    if emissions_mismatch:
        raise Exception("Number of emissions in train and test sequence differs")

    # Parse the remaining parameters now that the emission count is known.
    exp_params['n_emissions'] = n_train_emissions
    exp_params = _parse_base_parameters(exp_params, path_dict)
    exp_params = _parse_standard_and_dense(exp_params, path_dict, exp_params['n_emissions'])
    saved_params = _save_experiment_parameters(exp_params, path_dict)
    parse_timer.toc("Parameters parsed. Using parameters: %s" % str(saved_params))

    # With 'fair_standard_params' present, both standard configurations are
    # passed on as a pair; otherwise only the plain standard one.
    if 'fair_standard_params' in exp_params:
        standard_arg = (exp_params['standard_params'], exp_params['fair_standard_params'])
    else:
        standard_arg = exp_params['standard_params']
    _standard_vs_dense(train_X, test_X, standard_arg, exp_params['dense_params'])

    _finished_at, _elapsed = experiment_timer.toc("Finished a 'dataset sequences' experiment.")
def _save_experiment_parameters(exp_params, path_dict):
    """Build a saveable copy of the experiment parameters and store it.

    Works on a deep copy of ``exp_params``, so the caller's dict is not
    modified. In the copy, a callable ``gt_params['init_params']`` is
    replaced by its name and every ``HMMLoggingMonitor`` found under a
    'logging_monitor' key is replaced by a plain dict of its
    ``log_config``.

    The copy is written with ``np.save`` to ``<experiment_directory>/exp_params``
    when ``path_dict`` contains 'experiment_directory'. The copy is built
    and returned in either case, so callers can always log it.

    Args:
        exp_params: Experiment parameters. The optional groups 'gt_params',
            'standard_params', 'dense_params' and 'fair_standard_params'
            are inspected.
        path_dict: Path configuration; 'experiment_directory' selects the
            target directory.

    Returns:
        dict: The sanitized deep copy of ``exp_params``.
    """
    _exp_params = copy.deepcopy(exp_params)

    gt_params = dict_get(_exp_params, 'gt_params', default=None, cast=dict)
    if gt_params is not None:
        _exp_params['gt_params'] = gt_params

        init_params = dict_get(gt_params, 'init_params', default=None)
        if callable(init_params):
            # Not every callable has a __name__ (e.g. functools.partial).
            _exp_params['gt_params']['init_params'] = str(
                getattr(init_params, '__name__', repr(init_params)))

        gt_logmon = dict_get(gt_params, 'logging_monitor', default=None)
        if isinstance(gt_logmon, HMMLoggingMonitor):
            _exp_params['gt_params']['logging_monitor'] = dict(gt_logmon.log_config)

    # Each parameter group is handled by the same code so that its monitor
    # is always looked up in the group it belongs to. Previously the dense
    # group's monitor was read from 'standard_params' by mistake.
    for group_key in ('standard_params', 'dense_params', 'fair_standard_params'):
        group_params = dict_get(_exp_params, group_key, default=None, cast=dict)
        logmon = dict_get(group_params, 'logging_monitor', default=None)
        if isinstance(logmon, HMMLoggingMonitor):
            _exp_params[group_key]['logging_monitor'] = dict(logmon.log_config)

    if 'experiment_directory' in path_dict:
        exp_dir = str(path_dict['experiment_directory'])
        check_dir(exp_dir)
        np.save(exp_dir + '/exp_params', _exp_params)
        timestamp_msg("Saved experiment parameters in %s" % exp_dir)

    return _exp_params