Example no. 1
0
def _write_lm_encodings(lm_data_shard_cfg, device):
    data = get_data(
        word_emb_data_path_prefix=GLOVE_PREPROC_WITH_UNKS_PATH_PREFIX,
        tokenized_trn_json_path=TOKENIZED_TRN_JSON_PATH,
        tokenized_dev_json_path=TOKENIZED_DEV_JSON_PATH,
        max_ans_len=MAX_ANS_LEN,
        max_ctx_len=MAX_CTX_LEN)
    write_lm_data_shard(data, lm_data_shard_cfg, device)
Example no. 2
0
def _main(config, config_idx):
    base_filename = config.name + '_cfg' + str(config_idx)
    logger = set_up_logger('logs/' + base_filename + '.log')
    title = '{}: {} ({}) config index {}'.format(__file__, config.name,
                                                 config.desc, config_idx)
    logger.info('START ' + title + '\n\n{}\n'.format(config))

    data = get_data(config)

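    # Select the CUDA device before Theano is imported anywhere else; the
    # assertion below guards against a premature import.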
    if config.device != 'cpu':
        assert 'theano' not in sys.modules
        import theano.sandbox.cuda
        theano.sandbox.cuda.use(config.device)
    from model import get_model
    model = get_model(config, data)

    if not config.is_train:
        if config.tst_load_model_path and not model.load_if_exists(
                config.tst_load_model_path):
            raise AssertionError('Failed loading model weights from {}'.format(
                config.tst_load_model_path))
        ans_hats = _tst_epoch(config, model, data)
        write_test_predictions(ans_hats, config.tst_prd_json_path)
        logger.info('END ' + title)
        return

    # Training loop
    epoch_results = []
    max_em = -np.inf
    max_f1 = -np.inf
    np_rng = np.random.RandomState(config.seed // 2)
    for epoch in range(1, config.max_num_epochs + 1):
        trn_loss, trn_acc, trn_samples_per_sec = _trn_epoch(
            config, model, data, epoch, np_rng)
        dev_min_loss, dev_prx_loss, dev_max_acc, dev_prx_acc, dev_em, dev_f1 = _dev_epoch(
            config, model, data)
        if dev_em > max_em:
            model.save('models/' + base_filename + '_best_em.pkl')
            max_em = dev_em
        if dev_f1 > max_f1:
            model.save('models/' + base_filename + '_best_f1.pkl')
            max_f1 = dev_f1
        if epoch % 5 == 0:
            model.save('models/' + base_filename +
                       '_e{:03d}.pkl'.format(epoch))
        epoch_results.append(
            EpochResult(trn_loss, trn_acc, dev_min_loss, dev_prx_loss,
                        dev_max_acc, dev_prx_acc, dev_em, dev_f1))
        if config.plot:
            plot_epoch_results(epoch_results, 'logs/' + base_filename + '.png')
        logger.info(
            '\n\nFinished epoch {} for: (config index {}) (samples/sec: {:<.1f})\n{}\n\nResults:\n{}\n\n'
            .format(epoch, config_idx, trn_samples_per_sec,
                    config.format_compared(),
                    format_epoch_results(epoch_results)))
    logger.info('END ' + title)
Example no. 3
0
 def __init__(self, args, logger, out_dir):
     self.args = args
     self.logger = logger
     self.out_dir = out_dir
     self.vocab, train_x, test_x, self.overall_maxlen = dataset.get_data(
         self.args.domain,
         vocab_size=self.args.vocab_size,
         maxlen=self.args.maxlen)
     self.train_x = sequence.pad_sequences(train_x,
                                           maxlen=self.overall_maxlen)
     self.test_x = sequence.pad_sequences(test_x,
                                          maxlen=self.overall_maxlen)
     self.vis_path = self.out_dir + "/visualization"
     U.mkdir_p(self.vis_path)
Example no. 4
0
def _main(config, config_idx, train):
  base_filename = config.name + '_cfg' + str(config_idx)
  logger = set_up_logger('logs/' + base_filename + '.log')
  title = '{}: {} ({}) config index {}'.format(__file__, config.name, config.desc, config_idx)
  logger.info('START ' + title + '\n\n{}\n'.format(config))

  data = get_data(config, train)

  if config.device != 'cpu':
    assert 'theano' not in sys.modules 
    import theano.sandbox.cuda
    theano.sandbox.cuda.use(config.device)
  from model import get_model
  model = get_model(config, data)

  if not train:
    assert config.tst_load_model_path
    if not model.load(config.tst_load_model_path):
      raise AssertionError('Failed loading model weights from {}'.format(config.tst_load_model_path))
    ans_hats = _tst_epoch(config, model, data)
    write_test_predictions(ans_hats, config.pred_json_path)
    logger.info('END ' + title)
    return

  # Training loop
  epoch_results = []
  max_em = -np.inf
  max_f1 = -np.inf
  np_rng = np.random.RandomState(config.seed // 2)
  for epoch in range(1, config.max_num_epochs+1):
    trn_loss, trn_acc, trn_samples_per_sec = _trn_epoch(config, model, data, epoch, np_rng)
    dev_loss, dev_acc, dev_em, dev_f1 = _dev_epoch(config, model, data)
    if dev_em > max_em:
      model.save('models/' + base_filename + '_best_em.pkl')
      max_em = dev_em
    if dev_f1 > max_f1:
      model.save('models/' + base_filename + '_best_f1.pkl')
      max_f1 = dev_f1
    if config.save_freq and epoch % config.save_freq == 0:
      model.save('models/' + base_filename + '_e{:03d}.pkl'.format(epoch))
    epoch_results.append(
      EpochResult(trn_loss, trn_acc, dev_loss, dev_acc, dev_em, dev_f1))
    if config.plot:
      plot_epoch_results(epoch_results, 'logs/' + base_filename + '.png')
    logger.info('\n\nFinished epoch {} for: (config index {}) (samples/sec: {:<.1f})\n{}\n\nResults:\n{}\n\n'.format(
      epoch, config_idx, trn_samples_per_sec, config.format_compared(), format_epoch_results(epoch_results)))
  logger.info('END ' + title)
Example no. 5
def prepare_data(datapaths, embedding_path=None, embedding='word2vec', embedd_dim=100, prompt_id=1, vocab_size=0,
                 tokenize_text=True, to_lower=True, sort_by_len=False, vocab_path=None, score_index=3):
    assert len(datapaths) == 3, "data paths should include train, dev and test paths"
    (train_x, train_y), (dev_x, dev_y), (test_x, test_y), vocab, overal_maxlen, overal_maxnum = \
        reader.get_data(datapaths, prompt_id, vocab_size, tokenize_text=True, to_lower=True, sort_by_len=False, vocab_path=None, score_index=6)

    X_train, y_train, mask_train = padding_sentence_sequences(train_x, train_y, overal_maxnum, overal_maxlen, post_padding=True)
    X_dev, y_dev, mask_dev = padding_sentence_sequences(dev_x, dev_y, overal_maxnum, overal_maxlen, post_padding=True)
    X_test, y_test, mask_test = padding_sentence_sequences(test_x, test_y, overal_maxnum, overal_maxlen, post_padding=True)

    train_mean = y_train.mean(axis=0)
    train_std = y_train.std(axis=0)
    dev_mean = y_dev.mean(axis=0)
    dev_std = y_dev.std(axis=0)
    test_mean = y_test.mean(axis=0)
    test_std = y_test.std(axis=0)

    Y_train = reader.get_model_friendly_scores(y_train, prompt_id)
    Y_dev = reader.get_model_friendly_scores(y_dev, prompt_id)
    Y_test = reader.get_model_friendly_scores(y_test, prompt_id)
    scaled_train_mean = reader.get_model_friendly_scores(train_mean, prompt_id)

    logger.info('Statistics:')

    logger.info('  train X shape: ' + str(X_train.shape))
    logger.info('  dev X shape:   ' + str(X_dev.shape))
    logger.info('  test X shape:  ' + str(X_test.shape))

    logger.info('  train Y shape: ' + str(Y_train.shape))
    logger.info('  dev Y shape:   ' + str(Y_dev.shape))
    logger.info('  test Y shape:  ' + str(Y_test.shape))

    logger.info('  train_y mean: %s, stdev: %s, train_y mean after scaling: %s' % 
                (str(train_mean), str(train_std), str(scaled_train_mean)))

    if embedding_path:
        embedd_dict, embedd_dim, _ = load_word_embedding_dict(embedding, embedding_path, vocab, logger, embedd_dim)
        embedd_matrix = build_embedd_table(vocab, embedd_dict, embedd_dim, logger, caseless=True)
    else:
        embedd_matrix = None
    
    return (X_train, Y_train, mask_train), (X_dev, Y_dev, mask_dev), (X_test, Y_test, mask_test), vocab, len(vocab), embedd_matrix, overal_maxlen, overal_maxnum, scaled_train_mean
Example no. 6
0
def main():
    # Get the arguments from the command-line except the filename
    argv = sys.argv[1:]
    params = get_params(argv)
    params['data_params'], type_dict = reader.get_data_settings(params['config_file'])
    header, raw_data = reader.get_data(params)
    type_selector, converters = reader.validate_data_format(params['data_params'], header)
    key_fails, failed_rows, surveys, out_keys = data_processor.process_rows(raw_data, header, params, converters)
    if 'logfile' in params:
        with open(params['logfile'], 'w') as logfile:
            now = datetime.datetime.now()
            logfile.write('Completed: ' + now.strftime("%B %d, %Y") + '\n')
            logfile.write('Total of %s rows were not completed' % len(failed_rows) + '\n')
            for key in key_fails.keys():
                logfile.write(str(key_fails[key]) + ' ' + str(key) + ' key(s) could not be read' + '\n')
            logfile.write('%s surveys read, %s surveys failed' % (len(raw_data), len(failed_rows)) + '\n')
            logfile.write('%s percent success rate' % str(
                round((len(raw_data) - len(failed_rows)) / len(raw_data) * 100, 2)) + '\n')

    out_header = header + out_keys
    writer.write_data(surveys, out_header, params)
Example no. 7
def prepare_sentence_data(datapath, vocab_path, embedding_path=None, embedding='glove', embedd_dim=100,
                          prompt_id=1, vocab_size=0, tokenize_text=True, to_lower=True, sort_by_len=False):

    assert len(datapath) == 1, "datapath should contain a single training path"
    (train_x, train_y, train_prompts), vocab, overal_maxlen, overal_maxnum = reader.get_data(
        datapath, vocab_path, prompt_id, vocab_size,
        tokenize_text=tokenize_text, to_lower=to_lower, sort_by_len=sort_by_len)

    X_train, y_train, mask_train = utils.padding_sentence_sequences(train_x, train_y, overal_maxnum, overal_maxlen,
                                                                    post_padding=True)

    train_mean = y_train.mean(axis=0)
    train_std = y_train.std(axis=0)
    
    # Convert scores to the [0, 1] range for training and evaluation (loss calculation).
    # Note: changed here to use train_prompts instead of prompt_id when prompt_id == -1.
    if prompt_id == -1:
        Y_train = reader.get_model_friendly_scores(y_train, train_prompts)
        scaled_train_mean = Y_train.mean()
    else:
        Y_train = reader.get_model_friendly_scores(y_train, prompt_id)
        scaled_train_mean = reader.get_model_friendly_scores(train_mean, prompt_id)
    # print Y_train.shape

    logger.info('Statistics:')

    logger.info('  train X shape: ' + str(X_train.shape))
    logger.info('  train Y shape: ' + str(Y_train.shape))
    logger.info('  train_y mean: %s, stdev: %s, train_y mean after scaling: %s' %
                (str(train_mean), str(train_std), str(scaled_train_mean)))

    if embedding_path:
        embedd_dict, embedd_dim, _ = utils.load_word_embedding_dict(embedding, embedding_path, vocab, logger, embedd_dim)
        embedd_matrix = utils.build_embedd_table(vocab, embedd_dict, embedd_dim, logger, caseless=True)
    else:
        embedd_matrix = None

    return (X_train, Y_train, mask_train), vocab, len(vocab), embedd_matrix, overal_maxlen, overal_maxnum, scaled_train_mean
Example no. 8
0
def main():
    # Get the arguments from the command-line except the filename
    argv = sys.argv[1:]
    params = get_params(argv)
    print('Program started, input file: %s' % params['filepath'])

    if 'debug' in params.keys():
        if params['debug'] == '1':
            print('Setting debug level: Debug')
            logging.getLogger().setLevel(logging.DEBUG)
        else:
            print('Setting debug level: Warnings')
            logging.getLogger().setLevel(logging.WARNING)
    else:
        logging.getLogger().setLevel(logging.INFO)

    params['data_params'], type_dict = reader.get_data_settings(params['config_file'])
    header, raw_data = reader.get_data(params)
    type_selector, converters = reader.validate_data_format(params['data_params'], header)
    key_fails, failed_rows, surveys, quality_assessment, out_keys, meta, raw_surveys = data_processor.process_rows(
        raw_data, header, params, converters)
    if 'logfile' in params:
        if params['logfile'].endswith('txt'):
            writer.write_log(logfiler.write_txt_log(params, key_fails, raw_data, failed_rows, meta),
                             params['logfile'], 'Log:')
        elif params['logfile'].endswith('csv'):
            writer.write_log(logfiler.create_pbi_log(surveys, quality_assessment, meta['attribute_quality'], meta,
                                                     failed_rows, params),
                             params['logfile'], ['Key', 'Value'])
        else:
            logging.critical('Logfile not written, wrong file extension provided')
    out_header = surveys[0].keys()
    if params['outfile'].endswith('.csv'):
        writer.write_data(surveys, out_header, params, rounding=9, raw_surveys=raw_surveys)
        if len(quality_assessment) > 0:
            writer.write_data(quality_assessment, quality_assessment[0].keys(), params, sub_file='group_qual')
        if len(meta['attribute_quality']) > 0 and 'logfile' in params:
            writer.write_data(meta['attribute_quality'], meta['attribute_quality'][0].keys(), params,
                              sub_file='attr_qual')
    else:
        logging.critical("Your output filename: %s must end with .csv" % params['outfile'])
Example no. 9
0
                    "--emb_technique",
                    dest="emb_technique",
                    type=str,
                    metavar='<str>',
                    default='w2v',
                    help="embedding technique (w2v or fasttext)")
parser.add_argument("-o",
                    "--output",
                    dest="out_file",
                    type=str,
                    metavar='<str>',
                    help="output file name")

args = parser.parse_args()
vocab, train_x, overall_maxlen = dataset.get_data(vocab_size=args.vocab_size,
                                                  maxlen=args.maxlen,
                                                  lang=args.lang)
train_x = sequence.pad_sequences(train_x, maxlen=overall_maxlen)
print('Number of training examples: ', len(train_x))
print('Length of vocab: ', len(vocab))

optimizer = get_optimizer(args.algorithm)
model = create_model(args, overall_maxlen, vocab)

model_param = config.model_param_file[args.lang].format(
    args.emb_technique, config.word_emb_training_type, args.epochs,
    config.filter_word_on)
model.load_weights(model_param)
model.compile(optimizer=optimizer,
              loss=ut.max_margin_loss,
              metrics=[ut.max_margin_loss])
Example no. 10
0
from reader import get_data
from basic_prepro import process, do_tokeinze
text, title = get_data('001.pdf')

processed = process(text)
print(len(do_tokeinze(processed)))
Example no. 11
0
				embedding = np.zeros(300,dtype=np.float32)
			sen_embedding = sen_embedding + embedding
		return sen_embedding
	elif embedding_type == 'WORD2VEC':
		sen_embedding = np.zeros(300,dtype=np.float32)
		for word in sentence:
			try:
				embedding = model_word2vec[word]
			except:
				embedding = np.zeros(300,dtype=np.float32)
			sen_embedding = sen_embedding + embedding
		return sen_embedding		
			
	return None	

x, y, ac_num, ac_words = reader.get_data()
x_embedding = []
for node in x:
	node_embedding = []
	for sent in node:
		temp_sent = sent.split(" ")
		sent_embedding = sentence_embedding([temp_value.strip() for temp_value in temp_sent],"WORD2VEC")
		node_embedding.append(sent_embedding)
	x_embedding.append(node_embedding)
x_embedding = np.asarray(x_embedding)

# done with the data preparation step
x_embedding, y = shuffle_copy(x_embedding, y)

evals = []
# 0.63
Example no. 12
0
}
#assert args.domain in {'restaurant', 'beer'}

if args.seed > 0:
    np.random.seed(args.seed)

# ###############################################################################################################################
# ## Prepare data
# #

from keras.preprocessing import sequence
import reader as dataset

#vocab, train_x, test_x, overall_maxlen = dataset.get_data(args.domain, vocab_size=args.vocab_size, maxlen=args.maxlen)
vocab, train_x, overall_maxlen = dataset.get_data(args.input_path,
                                                  args.out_dir_path,
                                                  vocab_size=args.vocab_size,
                                                  maxlen=args.maxlen)

train_x = sequence.pad_sequences(train_x, maxlen=overall_maxlen)
#test_x = sequence.pad_sequences(test_x, maxlen=overall_maxlen)

print('Number of training examples: ', len(train_x))
print('Length of vocab: ', len(vocab))


def sentence_batch_generator(data, batch_size):
    n_batch = len(data) // batch_size
    batch_count = 0
    np.random.shuffle(data)

    while True:
Example no. 13
0
def prepare_sentence_data(datapaths, embedding_path=None, embedding='word2vec', embedd_dim=100, prompt_id=1, vocab_size=0, tokenize_text=True, \
                         to_lower=True, sort_by_len=False, vocab_path=None, score_index=6):

    assert len(datapaths) == 4, "data paths should include train, dev, test and description paths"
    (train_x, train_y, train_prompts, train_ids), (dev_x, dev_y, dev_prompts, dev_ids), (test_x, test_y, test_prompts, test_ids), vocab, overal_maxlen, overal_maxnum = \
        reader.get_data(datapaths, prompt_id, vocab_size, tokenize_text=True, to_lower=True, sort_by_len=False, vocab_path=None, score_index=6)

    train_d, max_sentnum = reader.read_description(datapaths[3],
                                                   vocab,
                                                   len(train_x),
                                                   tokenize_text=True,
                                                   to_lower=True)
    dev_d, max_sentnum = reader.read_description(datapaths[3],
                                                 vocab,
                                                 len(dev_x),
                                                 tokenize_text=True,
                                                 to_lower=True)
    test_d, max_sentnum = reader.read_description(datapaths[3],
                                                  vocab,
                                                  len(test_x),
                                                  tokenize_text=True,
                                                  to_lower=True)
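    # The same description file (datapaths[3]) is tokenized once per split;
    # max_sentnum keeps the value returned by the last call.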

    X_train, y_train, mask_train = utils.padding_sentence_sequences(
        train_x, train_y, overal_maxnum, overal_maxlen, post_padding=True)
    X_dev, y_dev, mask_dev = utils.padding_sentence_sequences(
        dev_x, dev_y, overal_maxnum, overal_maxlen, post_padding=True)
    X_test, y_test, mask_test = utils.padding_sentence_sequences(
        test_x, test_y, overal_maxnum, overal_maxlen, post_padding=True)

    D_train, mask_d_train = utils.padding_des_sequences(train_d,
                                                        max_sentnum,
                                                        overal_maxlen,
                                                        post_padding=True)
    D_dev, mask_d_dev = utils.padding_des_sequences(dev_d,
                                                    max_sentnum,
                                                    overal_maxlen,
                                                    post_padding=True)
    D_test, mask_d_test = utils.padding_des_sequences(test_d,
                                                      max_sentnum,
                                                      overal_maxlen,
                                                      post_padding=True)

    if prompt_id:
        train_pmt = np.array(train_prompts, dtype='int32')
        dev_pmt = np.array(dev_prompts, dtype='int32')
        test_pmt = np.array(test_prompts, dtype='int32')

    train_mean = y_train.mean(axis=0)
    train_std = y_train.std(axis=0)
    dev_mean = y_dev.mean(axis=0)
    dev_std = y_dev.std(axis=0)
    test_mean = y_test.mean(axis=0)
    test_std = y_test.std(axis=0)

    # We need the dev and test sets in the original scale for evaluation
    # dev_y_org = y_dev.astype(reader.get_ref_dtype())
    # test_y_org = y_test.astype(reader.get_ref_dtype())

    # Convert scores to boundary of [0 1] for training and evaluation (loss calculation)
    Y_train = reader.get_model_friendly_scores(y_train, prompt_id)
    Y_dev = reader.get_model_friendly_scores(y_dev, prompt_id)
    Y_test = reader.get_model_friendly_scores(y_test, prompt_id)
    scaled_train_mean = reader.get_model_friendly_scores(train_mean, prompt_id)
    # print Y_train.shape

    logger.info('Statistics:')

    logger.info('  train X shape: ' + str(X_train.shape))
    logger.info('  dev X shape:   ' + str(X_dev.shape))
    logger.info('  test X shape:  ' + str(X_test.shape))

    logger.info('  train Y shape: ' + str(Y_train.shape))
    logger.info('  dev Y shape:   ' + str(Y_dev.shape))
    logger.info('  test Y shape:  ' + str(Y_test.shape))

    logger.info(
        '  train_y mean: %s, stdev: %s, train_y mean after scaling: %s' %
        (str(train_mean), str(train_std), str(scaled_train_mean)))

    if embedding_path:
        embedd_dict, embedd_dim, _ = utils.load_word_embedding_dict(
            embedding, embedding_path, vocab, logger, embedd_dim)
        embedd_matrix = utils.build_embedd_table(vocab,
                                                 embedd_dict,
                                                 embedd_dim,
                                                 logger,
                                                 caseless=True)
    else:
        embedd_matrix = None

    return (X_train, Y_train, D_train, mask_train, train_ids), (X_dev, Y_dev, D_dev, mask_dev, dev_ids), (X_test, Y_test, D_test, mask_test, test_ids), \
            vocab, len(vocab), embedd_matrix, overal_maxlen, overal_maxnum, max_sentnum, scaled_train_mean
Example no. 14
0
import logging
import logging.config

import yaml

import calculator.flow
import config
import reader
import viewer.dash
import viewer.team_metrics

with open("log_config.yaml", "r") as f:
    log_config = yaml.safe_load(f.read())
    logging.config.dictConfig(log_config)

logger = logging.getLogger(__name__)
logger.info("Starting seshat. Let's do team magic!")

config = config.get()

projects = []
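# Build one Team_Metrics entry per configured input source.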
for source_config in config["input"]:
    logger.info(f"Reading data for {source_config['name']}")
    data = reader.get_data(source_config)
    cycle_data = calculator.flow.cycle_data(data, source_config)
    team_metrics = viewer.team_metrics.Team_Metrics(cycle_data, source_config)
    projects.append(team_metrics)

dash = viewer.dash.Dash(projects, config)

server = dash.server

if __name__ == "__main__":
    dash.run()
Example no. 15
0
def _main(config):
    base_filename = config.name
    logger_filename = 'logs/' + base_filename + '.log'
    logger = set_up_logger(logger_filename)
    title = '{}: {} ({})'.format(__file__, config.name, config.desc)
    logger.info('START ' + title + '\n\n{}\n'.format(config))

    data = get_data(
        word_emb_data_path_prefix=GLOVE_PREPROC_WITH_UNKS_PATH_PREFIX,
        tokenized_trn_json_path=TOKENIZED_TRN_JSON_PATH,
        tokenized_dev_json_path=TOKENIZED_DEV_JSON_PATH,
        max_ans_len=MAX_ANS_LEN,
        max_ctx_len=MAX_CTX_LEN)

    if config.device != 'cpu':
        assert 'theano' not in sys.modules
        import theano.sandbox.cuda
        theano.sandbox.cuda.use(config.device)

    from model import get_model
    model = get_model(config, data)

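    # Language-model activations are only loaded when running in 'LM' mode.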
    lm_data = get_lm_data(config.lm_layer) if config.mode == 'LM' else None

    # Training loop
    epoch_results = []
    max_em = -np.inf
    max_f1 = -np.inf
    np_rng = np.random.RandomState(config.seed // 2)
    for epoch in range(1, config.max_num_epochs + 1):
        trn_loss, trn_acc, trn_samples_per_sec, trn_num_all_samples, trn_num_valid_samples, \
          trn_mean_grad_norm, trn_max_grad_norm, trn_min_grad_norm, trn_num_unsafe_samples = \
            _trn_epoch(config, model, data, lm_data, epoch, np_rng)
        dev_loss, dev_acc, dev_em, dev_f1, dev_num_all_samples, dev_num_valid_samples = \
          _dev_epoch(config, model, data, lm_data)

        best_filename = base_filename
        if dev_em > max_em:
            model.save('models/' + best_filename + '_best_em.pkl')
            max_em = dev_em
        if dev_f1 > max_f1:
            model.save('models/' + best_filename + '_best_f1.pkl')
            max_f1 = dev_f1
        if config.save_freq and epoch % config.save_freq == 0:
            model.save('models/' + base_filename +
                       '_e{:03d}.pkl'.format(epoch))

        epoch_results.append(
            EpochResult(trn_loss, trn_acc, dev_loss, dev_acc, dev_em, dev_f1))
        if config.plot:
            plot_epoch_results(epoch_results, 'logs/' + base_filename + '.png')
        logger.info(
            ('\n\nEpc {} {}: (smp/sec: {:<.1f})'
             ' (trn: {}/{}) (dev: {}/{})'
             ' (grad: avg:{} max:{} min:{}) (low probability predictions:{})'
             '\n{}\n\nResults:\n{}\n\n').format(
                 epoch, config.name, trn_samples_per_sec,
                 trn_num_valid_samples, trn_num_all_samples,
                 dev_num_valid_samples, dev_num_all_samples,
                 trn_mean_grad_norm, trn_max_grad_norm, trn_min_grad_norm,
                 trn_num_unsafe_samples, config.format_compared(),
                 format_epoch_results(epoch_results)))

    logger.info('END ' + title)
Example no. 16
0
# ###############################################################################################################################
# ## Prepare data
# #

from keras.preprocessing import sequence
import reader as dataset  # provides three functions:
# create_vocab
# read_dataset
# get_data

# args = parser.parse_args()  # holds all the parsed command-line arguments

# This step builds the vocab.
vocab, train_x, test_x, overall_maxlen = dataset.get_data(
    args.domain, vocab_size=args.vocab_size,
    maxlen=args.maxlen)  # reads data that has already been preprocessed.
train_x = sequence.pad_sequences(train_x, maxlen=overall_maxlen)
test_x = sequence.pad_sequences(test_x, maxlen=overall_maxlen)

train_x = train_x[0:3000]  # cap the training set for a quick test
print('Number of training examples: ', len(train_x))
print('Length of vocab: ', len(vocab))


def sentence_batch_generator(data, batch_size):
    n_batch = len(data) // batch_size
    batch_count = 0
    np.random.shuffle(data)

    while True:
Example no. 17
0
def set_rss():
    for key in rss.keys():
        if key not in rss_contents.keys():
            rss_data = reader.get_data(rss[key])
            rss_contents[key] = rss_data
Example no. 18
0
def _main(config, config_idx, train):
    base_filename = config.name + '_cfg' + str(config_idx)
    logger = set_up_logger('logs/' + base_filename + '.log')
    title = '{}: {} ({}) config index {}'.format(__file__, config.name,
                                                 config.desc, config_idx)
    logger.info('START ' + title + '\n\n{}\n'.format(config))

    data = get_data(config, train)

    if config.device != 'cpu':
        assert 'theano' not in sys.modules
        import theano.sandbox.cuda
        theano.sandbox.cuda.use(config.device)
    from model import get_model
    model = get_model(config, data)

    if not train:
        assert config.tst_load_model_path
        if not model.load(config.tst_load_model_path):
            raise AssertionError('Failed loading model weights from {}'.format(
                config.tst_load_model_path))
        ans_hats = _tst_epoch(config, model, data)
        write_test_predictions(ans_hats, config.pred_json_path)
        logger.info('END ' + title)
        return

    # Training loop
    epoch_results = []
    max_em = -np.inf
    max_f1 = -np.inf
    epochs_with_no_improvement = 0
    np_rng = np.random.RandomState(config.seed // 2)
    for epoch in range(1, config.max_num_epochs + 1):
        trn_loss, trn_acc, trn_samples_per_sec = _trn_epoch(
            config, model, data, epoch, np_rng)
        dev_loss, dev_acc, dev_em, dev_f1 = _dev_epoch(config, model, data)
        improved = False
        if dev_em > max_em:
            model.save('models/' + base_filename + '_best_em.pkl')
            max_em = dev_em
            improved = True
        if dev_f1 > max_f1:
            model.save('models/' + base_filename + '_best_f1.pkl')
            max_f1 = dev_f1
            improved = True
        if improved:
            # Best EM and/or F1 so far, reset epochs_with_no_improvement.
            epochs_with_no_improvement = 0
        else:
            # Neither dev_em nor dev_f1 improved, so count this epoch towards
            # early stopping.
            epochs_with_no_improvement += 1
        if config.save_freq and epoch % config.save_freq == 0:
            model.save('models/' + base_filename +
                       '_e{:03d}.pkl'.format(epoch))
        epoch_results.append(
            EpochResult(trn_loss, trn_acc, dev_loss, dev_acc, dev_em, dev_f1))
        if config.plot:
            plot_epoch_results(epoch_results, 'logs/' + base_filename + '.png')
        logger.info(
            '\n\nFinished epoch {} for: (config index {}) (samples/sec: {:<.1f})\n{}\n\nResults:\n{}\n\n'
            .format(epoch, config_idx, trn_samples_per_sec,
                    config.format_compared(),
                    format_epoch_results(epoch_results)))
        # Check if we have to do early stopping.
        if epochs_with_no_improvement > config.patience:
            logger.info("Patience exceeded.")
            break
    logger.info('END ' + title)
Example no. 19
    'rmsprop', 'sgd', 'adagrad', 'adadelta', 'adam', 'adamax'
}
# assert args.domain in {'restaurant', 'beer'}

if args.seed > 0:
    np.random.seed(args.seed)

# ###############################################################################################################################
# ## Prepare data
# #

from keras.preprocessing import sequence
import reader as dataset

vocab, train_x, overall_maxlen = dataset.get_data(args.domain,
                                                  vocab_size=args.vocab_size,
                                                  maxlen=args.maxlen,
                                                  require_test=False)
train_x = sequence.pad_sequences(train_x, maxlen=overall_maxlen)

# train_x = train_x[0:30000]
print('Number of training examples: ', len(train_x))
print('Length of vocab: ', len(vocab))


def sentence_batch_generator(data, batch_size):
    n_batch = len(data) // batch_size
    batch_count = 0
    np.random.shuffle(data)

    while True:
        if batch_count == n_batch:
Example no. 20
0
    anss_val = np.array([_np_ans_word_idxs_to_ans_idx(ans_stt, ans_end, max_ans_len) for \
                         ans_stt, ans_end in anss], dtype=np.int32)
    ans_stts_val = anss[:, 0]
    ans_ends_val = anss[:, 1]

    gpu_anss = torch.from_numpy(anss_val)
    gpu_ans_stts = torch.from_numpy(ans_stts_val)
    gpu_ans_ends = torch.from_numpy(ans_ends_val)
    return gpu_anss, gpu_ans_stts, gpu_ans_ends

config = Config()
base_filename = config.name + '_cfg' + str(0)
logger = set_up_logger('logs/' + base_filename + '.log')
title = '{}: {}'.format(__file__, config.name)
logger.info('START ' + title + '\n\n{}\n'.format(config))
data = get_data(config, train=True)
emb_val = data.word_emb_data.word_emb  # (voc size, emb_dim)
first_known_word = data.word_emb_data.first_known_word
assert config.emb_dim == emb_val.shape[1]
assert first_known_word > 0
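# Rows before first_known_word are not regular vocabulary entries; zero them out.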
emb_val[:first_known_word] = 0
emb = torch.from_numpy(emb_val)

# load all the data (train and dev)
trn_ctxs, trn_ctx_masks, trn_ctx_lens, trn_qtns, trn_qtn_masks, trn_qtn_lens, trn_qtn_ctx_idxs, trn_anss, trn_ans_stts, trn_ans_ends = _gpu_dataset(
    'trn', data.trn, config)
dev_ctxs, dev_ctx_masks, dev_ctx_lens, dev_qtns, dev_qtn_masks, dev_qtn_lens, dev_qtn_ctx_idxs, dev_anss, dev_ans_stts, dev_ans_ends = _gpu_dataset(
    'dev', data.dev, config)

def print_param(model):
    for name, param in model.state_dict().items():
Example no. 21
0
def prepare_sentence_data(
        datapaths,
        embedding_path=None,
        embedding='word2vec',
        emb_dim=100,
        prompt_id=1,
        vocab_size=0,
        tokenize_text=True,
        to_lower=True,
        vocab_path=None,
        score_index=6,
        need_context=True
):
    assert len(datapaths) == 3, "data paths should include train, dev and test paths"
    (train_x, train_y, train_prompts, train_text), \
    (dev_x, dev_y, dev_prompts, dev_text), \
    (test_x, test_y, test_prompts, test_text), \
    vocab, overall_maxlen, overall_maxnum = \
        reader.get_data(
            datapaths,
            prompt_id,
            vocab_size,
            tokenize_text,
            to_lower,
            vocab_path,
            score_index)

    X_train, y_train, mask_train = utils.padding_sentence_sequences(train_x, train_y, overall_maxnum, overall_maxlen,
                                                                    post_padding=True)
    X_dev, y_dev, mask_dev = utils.padding_sentence_sequences(dev_x, dev_y, overall_maxnum, overall_maxlen,
                                                              post_padding=True)
    X_test, y_test, mask_test = utils.padding_sentence_sequences(test_x, test_y, overall_maxnum, overall_maxlen,
                                                                 post_padding=True)

    if need_context:
        context, context_len, context_num = reader.get_context(prompt_id, vocab, to_lower)
    else:
        # Dummy context
        context = [[0]]
        context_len = 1
        context_num = 1
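    # Every sample in a split shares the same prompt context, so it is simply repeated per sample.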
    train_context = [context] * len(train_x)
    dev_context = [context] * len(dev_x)
    test_context = [context] * len(test_x)

    train_context, _, _ = utils.padding_sentence_sequences(train_context, train_y, context_num, context_len, post_padding=True)
    dev_context, _, _ = utils.padding_sentence_sequences(dev_context, dev_y, context_num, context_len, post_padding=True)
    test_context, _, _ = utils.padding_sentence_sequences(test_context, test_y, context_num, context_len, post_padding=True)

    train_mean = y_train.mean(axis=0)
    train_std = y_train.std(axis=0)
    dev_mean = y_dev.mean(axis=0)
    dev_std = y_dev.std(axis=0)
    test_mean = y_test.mean(axis=0)
    test_std = y_test.std(axis=0)

    # We need the dev and test sets in the original scale for evaluation
    # dev_y_org = y_dev.astype(reader.get_ref_dtype())
    # test_y_org = y_test.astype(reader.get_ref_dtype())

    # Convert scores to boundary of [0 1] for training and evaluation (loss calculation)
    Y_train = utils.get_model_friendly_scores(y_train, prompt_id)
    Y_dev = utils.get_model_friendly_scores(y_dev, prompt_id)
    Y_test = utils.get_model_friendly_scores(y_test, prompt_id)
    scaled_train_mean = utils.get_model_friendly_scores(train_mean, prompt_id)
    scaled_dev_mean = utils.get_model_friendly_scores(dev_mean, prompt_id)
    scaled_test_mean = utils.get_model_friendly_scores(test_mean, prompt_id)
    # print Y_train.shape

    logger.info('Statistics:')

    logger.info('  train X shape: ' + str(X_train.shape))
    logger.info('  dev X shape:   ' + str(X_dev.shape))
    logger.info('  test X shape:  ' + str(X_test.shape))

    if need_context:
        logger.info('  train context shape: ' + str(train_context.shape))
        logger.info('  dev context shape: ' + str(dev_context.shape))
        logger.info('  test context shape: ' + str(test_context.shape))

    logger.info('  train Y shape: ' + str(Y_train.shape))
    logger.info('  dev Y shape:   ' + str(Y_dev.shape))
    logger.info('  test Y shape:  ' + str(Y_test.shape))

    logger.info('  train_y mean: %s, stdev: %s, train_y mean after scaling: %s' %
                (str(train_mean), str(train_std), str(scaled_train_mean)))
    logger.info('  dev_y mean: %s, stdev: %s, dev_y mean after scaling: %s' %
                (str(dev_mean), str(dev_std), str(scaled_dev_mean)))
    logger.info('  test_y mean: %s, stdev: %s, test_y mean after scaling: %s' %
                (str(test_mean), str(test_std), str(scaled_test_mean)))

    if embedding_path:
        emb_dict, emb_dim, _ = utils.load_word_embedding_dict(embedding, embedding_path, vocab, logger, emb_dim)
        emb_matrix = utils.build_embedding_table(vocab, emb_dict, emb_dim, logger, caseless=True)
    else:
        emb_matrix = None

    return (X_train, Y_train, mask_train, train_context, train_text), \
           (X_dev, Y_dev, mask_dev, dev_context, dev_text), \
           (X_test, Y_test, mask_test, test_context, test_text), \
           vocab, len(vocab), emb_matrix, overall_maxlen, overall_maxnum, scaled_train_mean, context_len, context_num
Example no. 22
0
				embedding = np.zeros(300,dtype=np.float32)
			sen_embedding = sen_embedding + embedding
		return sen_embedding
	elif embedding_type == 'WORD2VEC':
		sen_embedding = np.zeros(300,dtype=np.float32)
		for word in sentence:
			try:
				embedding = model_word2vec[word]
			except:
				embedding = np.zeros(300,dtype=np.float32)
			sen_embedding = sen_embedding + embedding
		return sen_embedding		
			
	return None	

x, y, ac_num, ac_words = reader.get_data()
x_embedding = []
for node in x:
	node_embedding = []
	for sent in node:
		temp_sent = sent.split(" ")
		sent_embedding = sentence_embedding([temp_value.strip() for temp_value in temp_sent],"WORD2VEC")
		node_embedding.append(sent_embedding)
	x_embedding.append(node_embedding)
x_embedding = np.asarray(x_embedding)

# done with the data preparation step
x_embedding, y = shuffle_copy(x_embedding, y)

evals = []
# 0.63
Example no. 23
0
def train_model_each_cluster(args, cluster_size, embtype):
    logger.info("Cluster Size: {}".format(cluster_size))
    args.aspect_size = cluster_size
    if args.seed > 0:
        np.random.seed(args.seed)

    aspect_file_name = config.aspect_file_name[args.lang].format(args.emb_technique, config.word_emb_training_type, args.epochs, config.filter_word_on, embtype, cluster_size)
    model_path = config.model_param_file[args.lang].format(args.emb_technique, config.word_emb_training_type, args.epochs, config.filter_word_on)
    util.createPath(aspect_file_name)

    vocab, train_x, overall_maxlen = dataset.get_data(vocab_size=args.vocab_size,
                                                      maxlen=args.maxlen, lang=args.lang)
    train_x = sequence.pad_sequences(train_x, maxlen=overall_maxlen)
    print('Number of training examples: ', len(train_x))
    print('Length of vocab: ', len(vocab))

    optimizer = get_optimizer(args.algorithm)
    logger.info('Building {} based model for {}'.format(args.emb_technique, args.lang))
    model = create_model(args, overall_maxlen, vocab)
    # freeze the word embedding layer
    model.get_layer('word_emb').trainable = False
    model.compile(optimizer=optimizer, loss=util.max_margin_loss, metrics=[util.max_margin_loss])

    logger.info("-" * 80)

    vocab_inv = {}
    for w, ind in vocab.items():
        vocab_inv[ind] = w

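    # Positive (sentence) batches and negative-sample batches come from separate generators.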
    sen_gen = sentence_batch_generator(train_x, args.batch_size)
    neg_gen = negative_batch_generator(train_x, args.batch_size, args.neg_size)
    batches_per_epoch = len(train_x) // args.batch_size

    min_loss = float('inf')
    for ii in range(args.epochs):
        t0 = time()
        loss, max_margin_loss = 0., 0.

        for b in tqdm(range(batches_per_epoch)):
            sen_input = next(sen_gen)
            neg_input = next(neg_gen)

            batch_loss, batch_max_margin_loss = model.train_on_batch([sen_input, neg_input],
                                                                     np.ones((args.batch_size, 1)))
            loss += batch_loss / batches_per_epoch
            max_margin_loss += batch_max_margin_loss / batches_per_epoch

        tr_time = time() - t0

        if loss < min_loss:
            min_loss = loss
            word_emb = K.get_value(model.get_layer('word_emb').embeddings)
            aspect_emb = K.get_value(model.get_layer('aspect_emb').W)
            word_emb = word_emb / np.linalg.norm(word_emb, axis=-1, keepdims=True)
            aspect_emb = aspect_emb / np.linalg.norm(aspect_emb, axis=-1, keepdims=True)
            model.save(model_path)
            # Write the top-50 words closest to each aspect embedding; the context
            # manager ensures the file is closed after writing.
            with open(aspect_file_name, 'wt', encoding='utf-8') as aspect_file:
                for ind in range(len(aspect_emb)):
                    desc = aspect_emb[ind]
                    sims = word_emb.dot(desc.T)
                    ordered_words = np.argsort(sims)[::-1]
                    desc_list = [vocab_inv[w] + "|" + str(sims[w]) for w in ordered_words[:50]]
                    # print('Aspect %d:' % ind)
                    # print(desc_list)
                    aspect_file.write('Aspect %d:\n' % ind)
                    aspect_file.write(' '.join(desc_list) + '\n\n')

        per_cluster_train_loss = loss

        logger.info('Epoch %d, train: %is' % (ii, tr_time))
        logger.info(
            'Total loss: %.4f, max_margin_loss: %.4f, ortho_reg: %.4f' % (
                loss, max_margin_loss, loss - max_margin_loss))

    return per_cluster_train_loss
Example no. 24
assert args.algorithm in {
    'rmsprop', 'sgd', 'adagrad', 'adadelta', 'adam', 'adamax'
}
# assert args.domain in {'restaurant', 'beer'}

if args.seed > 0:
    np.random.seed(args.seed)

# ###############################################################################################################################
# ## Prepare data
# #

from keras.preprocessing import sequence
import reader as dataset

vocab, train_x, test_x, overall_maxlen = dataset.get_data(
    args.domain, vocab_size=args.vocab_size, maxlen=args.maxlen)
train_x = sequence.pad_sequences(train_x, maxlen=overall_maxlen)
test_x = sequence.pad_sequences(test_x, maxlen=overall_maxlen)

# train_x = train_x[0:30000]
print('Number of training examples: ', len(train_x))
print('Length of vocab: ', len(vocab))


def sentence_batch_generator(data, batch_size):
    n_batch = len(data) // batch_size
    batch_count = 0
    np.random.shuffle(data)

    while True:
        if batch_count == n_batch:
Example no. 25
0
        if args.train:
            print("Training...")
            df_train = pd.read_json(args.data_dir + "csqa.train.json")
            df_dev = pd.read_json(args.data_dir + "csqa.dev.json")
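            # When args.random_emb is set, uniformly random vectors stand in for the precomputed embeddings.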
            if args.random_emb:
                train_embs = np.random.uniform(-1, 1, (406213, 768))
                dev_embs = np.random.uniform(-1, 1, (45076, 768))
            else:
                train_embs = reader.load_embs(args.data_dir +
                                              "csqa.train.embeddings.bin")
                dev_embs = reader.load_embs(args.data_dir +
                                            "csqa.dev.embeddings.bin")
            train_data = reader.get_data(
                df_train,
                train_embs,
                rel2idx,
                subset=int(config["parameters"]["subset_train"]),
                shuffle=False,
                random=args.random_emb,
            )

            dev_data = reader.get_data(
                df_dev,
                dev_embs,
                rel2idx,
                subset=int(config["parameters"]["subset_dev"]),
                shuffle=False,
                random=args.random_emb,
            )
            classifier.train(train_data, dev_data)

        if args.test: