Example #1
0
def main():
    # Entry point: refresh API credentials, pull underlying market data,
    # then scan it for option spreads.
    config = Config()

    # get_token() returning 1 signals a failed OAuth token rotation
    # (presumably an expired/invalid refresh token — TODO confirm with Config).
    if config.get_token() == 1:
        print("oAuth rotation failed, is the refresh token valid?")
        return 401  # NOTE(review): HTTP-style code returned from a CLI main — callers should check this
    underlying_data = get_underlying_data(config)
    # Scan for spreads with implied volatility >= 70 (units presumably percent — verify).
    spread_data = SpreadFinder(underlying_data=underlying_data, min_imp_vol=70)

    print('\n')
Example #2
0
    def __init__(self):
        """Wire up the audio-control object graph.

        Construction order matters: PulseControl is built first because every
        other component receives it as a dependency, and paDatabase must exist
        before SoundGen and paModuleManager, which both consume it.
        """
        # init class structure

        self.pc = PulseControl()      # low-level PulseAudio interface
        self.eq = EqControl(self.pc)  # equalizer control layered on pulse
        self.config = Config()        # persistent settings store

        self.padb = paDatabase(self.pc)         # database of pulse objects
        self.sg = SoundGen(self.padb, self.pc)  # sound generation
        self.pamm = paModuleManager(self.pc, self.eq, self.padb, self.config)
Example #3
0
def gen_config(target_max_length):
    """Build a seq2seq Config from the pickled embeddings/vocabs in FLAGS.

    Args:
        target_max_length: maximum length of a decoded target sequence.

    Returns:
        A Config carrying embeddings, vocab maps, vocab sizes derived from
        the embedding matrices, and fixed training hyperparameters.
    """
    # Load each pickle through a context manager in binary mode: the original
    # `pkl.load(open(path))` leaked file handles and breaks on Python 3,
    # where pickle requires a binary-mode file object.
    def _load(path):
        with open(path, "rb") as f:
            return pkl.load(f)

    source_emb = _load(FLAGS.source_emb)
    target_emb = _load(FLAGS.target_emb)
    source_word2id = _load(FLAGS.source_word2id)
    source_id2word = _load(FLAGS.source_id2word)
    target_word2id = _load(FLAGS.target_word2id)
    target_id2word = _load(FLAGS.target_id2word)

    return Config(source_emb=source_emb,
                  target_emb=target_emb,
                  source_word2id=source_word2id,
                  source_id2word=source_id2word,
                  target_word2id=target_word2id,
                  target_id2word=target_id2word,
                  source_vocab_size=source_emb.shape[0],
                  target_vocab_size=target_emb.shape[0],
                  word_dim=300,
                  source_hidden_units=512,
                  target_hidden_units=512,
                  target_max_length=target_max_length,
                  batch_size=128,
                  learning_rate=1e-3,
                  epochs=10,
                  sample_size=512)
Example #4
0
def main4():
    """Smoke-test the FullyShare model: build a Config and a hand-made
    Document, then run a single forward pass on GPU."""
    args = get_args()
    # Only the vocab path depends on the dataset; pick it, then load once.
    # (The original duplicated the whole open/pickle.load block in both branches.)
    if args.data == "nyt":
        vocab_file = "/home/ml/lyu40/PycharmProjects/data/nyt/lda_domains/preprocessed/vocab_100d.p"
    else:
        vocab_file = '/home/ml/ydong26/data/CNNDM/CNN_DM_pickle_data/vocab_100d.p'
    with open(vocab_file, "rb") as f:
        vocab = pickle.load(f, encoding='latin1')
    config = Config(
        vocab_size=vocab.embedding.shape[0],
        embedding_dim=vocab.embedding.shape[1],
        category_size=args.category_size,
        category_dim=50,
        word_input_size=100,
        sent_input_size=2 * args.hidden,
        word_GRU_hidden_units=args.hidden,
        sent_GRU_hidden_units=args.hidden,
        pretrained_embedding=vocab.embedding,
        word2id=vocab.w2i,
        id2word=vocab.i2w,
    )
    # Sample article: content/summary are lists of token lists, one per
    # sentence ('00' appears to be the corpus's digit mask — TODO confirm).
    doc = Document(
        content=[
            ['to', 'the', 'editor', 're', 'for', 'women', 'worried', 'about',
             'fertility', 'egg', 'bank', 'is', 'a', 'new', 'option', 'sept', '00',
             'imagine', 'my', 'joy', 'in', 'reading', 'the', 'morning',
             'newspapers', 'on', 'the', 'day', 'of', 'my', '00th', 'birthday',
             'and', 'finding', 'not', 'one', 'but', 'two', 'articles', 'on', 'how',
             'women', 's', 'fertility', 'drops', 'off', 'precipitously', 'after',
             'age', '00'],
            ['one', 'in', 'the', 'times', 'and', 'one', 'in', 'another', 'newspaper'],
            ['i', 'sense', 'a', 'conspiracy', 'here'],
            ['have', 'you', 'been', 'talking', 'to', 'my', 'mother', 'in', 'law'],
            ['laura', 'heymann', 'washington'],
        ],
        summary=[
            ['laura', 'heymann', 'letter', 'on', 'sept', '00',
             'article', 'about', 'using', 'egg', 'bank', 'to',
             'prolong', 'fertility', 'expresses', 'ironic', 'humor',
             'about', 'her', 'age', 'and', 'chances', 'of',
             'becoming', 'pregnant'],
        ],
        label=[0.01] * 100,
        label_idx=[0.01] * 100)
    extract_net = model_all.FullyShare(config)
    label_idx = torch.tensor([2], dtype=torch.float, device='cuda:0').cuda()

    x = prepare_data(doc, vocab.w2i)
    sents = Variable(torch.from_numpy(x)).cuda()

    # A 2-D label tensor is treated as batched; use its first row.
    if label_idx.dim() == 2:
        outputs = extract_net(sents, label_idx[0])
    else:
        outputs = extract_net(sents, label_idx)
Example #5
0
def dm_analysis(dm_model_path, docs):
    """Analyze DomainModel sentence embeddings for a list of docs.

    Loads cached embeddings from "analyze_embeddings.p" when present;
    otherwise rebuilds the model from `dm_model_path`, encodes/decodes each
    doc, prints the decoder probabilities, and caches the embeddings.

    Args:
        dm_model_path: path to a torch checkpoint with a 'state_dict' entry.
        docs: iterable of Doc objects with .content and .label_idx.
    """
    try:
        # Context manager instead of `pickle.load(open(...))`, which leaked
        # the file handle. FileNotFoundError from open() still triggers the
        # rebuild path below.
        with open("analyze_embeddings.p", "rb") as f:
            embeddings = pickle.load(f)
    except FileNotFoundError:
        args = get_args()
        with open(args.vocab_file, "rb") as f:
            vocab = pickle.load(f, encoding='latin1')
        config = Config(
            vocab_size=vocab.embedding.shape[0],
            embedding_dim=vocab.embedding.shape[1],
            category_size=args.category_size,
            category_dim=50,
            word_input_size=100,
            sent_input_size=2 * args.hidden,
            word_GRU_hidden_units=args.hidden,
            sent_GRU_hidden_units=args.hidden,
            pretrained_embedding=vocab.embedding,
            word2id=vocab.w2i,
            id2word=vocab.i2w,
        )
        dm_model = DomainModel(config)
        dm_model_dict = torch.load(dm_model_path)['state_dict']
        dm_model.load_state_dict(dm_model_dict)

        dm_enc_analyzer = Dm_Enc_Analyzer(dm_model.encoder_list)
        dm_dec_analyzer = Dm_Dec_Analyzer(dm_model.decoder_list)

        # evaluate example articles
        # each doc is a Doc object
        embeddings = []
        probs = []
        for doc in docs:
            try:
                print(doc.content)
                x = prepare_data(doc, vocab.w2i)
                sents = Variable(torch.from_numpy(x))
                label_idx = Variable(
                    torch.from_numpy(np.array([doc.label_idx])))
                embedding = dm_enc_analyzer(sents, label_idx)
                embeddings.append(embedding)

                prob = dm_dec_analyzer(embedding)
                probs.append(prob)
            except Exception as e:
                # Narrowed from a bare `except:` so KeyboardInterrupt and
                # SystemExit still propagate; report the cause rather than
                # swallowing it silently (still best-effort per doc).
                print("problem in doing evaluation, skip this doc")
                print(e)

        with open("analyze_embeddings.p", "wb") as f:
            pickle.dump(embeddings, f)
        print(probs)
Example #6
0
def main5():
    """Evaluation test: load a GeneralModel checkpoint and score it on the
    validation split, comparing against the lead-3 baseline."""
    torch.manual_seed(233)
    torch.cuda.set_device(0)
    args = get_args()
    # Only the vocab path depends on the dataset; pick it, then load once.
    # (The original duplicated the whole open/pickle.load block in both branches.)
    if args.data == "nyt":
        vocab_file = '/home/ml/ydong26/data/nyt_c/processed/vocab_100d.p'
    else:
        vocab_file = '/home/ml/ydong26/data/CNNDM/CNN_DM_pickle_data/vocab_100d.p'
    with open(vocab_file, "rb") as f:
        vocab = pickle.load(f, encoding='latin1')
    config = Config(
        vocab_size=vocab.embedding.shape[0],
        embedding_dim=vocab.embedding.shape[1],
        category_size=args.category_size,
        category_dim=50,
        word_input_size=100,
        sent_input_size=2 * args.hidden,
        word_GRU_hidden_units=args.hidden,
        sent_GRU_hidden_units=args.hidden,
        pretrained_embedding=vocab.embedding,
        word2id=vocab.w2i,
        id2word=vocab.i2w,
    )
    extract_net = model_all.GeneralModel(config)
    extract_net.cuda()
    model_name = "/home/ml/lyu40/PycharmProjects/E_Yue/model/nyt/5/model.epoch.9.gm.tr"
    checkpoint = torch.load(model_name)
    best_eval_reward = checkpoint['best_eval_reward']
    extract_net.load_state_dict(checkpoint['state_dict'])

    eval_reward, lead3_reward = evaluate.ext_model_eval(
        extract_net, vocab, args, "val")
    print('epoch 9 reward in validation for gm model on nyt data set: ' +
          str(eval_reward) + ' lead3: ' + str(lead3_reward) +
          " best eval award: " + str(best_eval_reward))
Example #7
0
def extractive_training(args, vocab):
    """Train an extractive summarization model with REINFORCE.

    Builds a Config from the pretrained vocab embedding, derives model/log
    file names from the hyperparameters, selects one of several extractive
    architectures, then runs the RL training loop with periodic validation.

    Args:
        args: parsed command-line namespace (hidden, dropout, lr, ext_model,
            batch_size, epochs_ext, ... — see get_args()).
        vocab: vocabulary object exposing .embedding, .w2i and .i2w.

    Returns:
        The extractive network in its final state (the best-scoring weights
        are saved to `model_name` during training, not necessarily returned).
    """
    print(args)
    print("generating config")
    config = Config(
        vocab_size=vocab.embedding.shape[0],
        embedding_dim=vocab.embedding.shape[1],
        position_size=500,
        position_dim=50,
        word_input_size=100,
        sent_input_size=2 * args.hidden,
        word_GRU_hidden_units=args.hidden,
        sent_GRU_hidden_units=args.hidden,
        pretrained_embedding=vocab.embedding,
        word2id=vocab.w2i,
        id2word=vocab.i2w,
        dropout=args.dropout,
    )
    # Encode the hyperparameter set into the model file name so runs with
    # different settings never overwrite each other.
    model_name = ".".join((args.model_file,
                           str(args.ext_model),
                           str(args.rouge_metric), str(args.std_rouge),
                           str(args.rl_baseline_method), "oracle_l", str(args.oracle_length),
                           "bsz", str(args.batch_size), "rl_loss", str(args.rl_loss_method),
                           "train_example_quota", str(args.train_example_quota),
                           "length_limit", str(args.length_limit),
                           "data", os.path.split(args.data_dir)[-1],
                           "hidden", str(args.hidden),
                           "dropout", str(args.dropout),
                           'ext'))
    print(model_name)

    log_name = ".".join(("../log/model",
                         str(args.ext_model),
                         str(args.rouge_metric), str(args.std_rouge),
                         str(args.rl_baseline_method), "oracle_l", str(args.oracle_length),
                         "bsz", str(args.batch_size), "rl_loss", str(args.rl_loss_method),
                         "train_example_quota", str(args.train_example_quota),
                         "length_limit", str(args.length_limit),
                         "hidden", str(args.hidden),
                         "dropout", str(args.dropout),
                         'ext'))

    print("init data loader and RL learner")
    data_loader = PickleReader(args.data_dir)

    # init statistics
    reward_list = []
    best_eval_reward = 0.
    model_save_name = model_name

    # Fine-tuning forces the (slower) standard ROUGE scorer and writes to
    # separate model/log files so the base run is preserved.
    if args.fine_tune:
        model_save_name = model_name + ".fine_tune"
        log_name = log_name + ".fine_tune"
        args.std_rouge = True
        print("fine_tune model with std_rouge, args.std_rouge changed to %s" % args.std_rouge)

    reinforce = ReinforceReward(std_rouge=args.std_rouge, rouge_metric=args.rouge_metric,
                                b=args.batch_size, rl_baseline_method=args.rl_baseline_method,
                                loss_method=1)

    print('init extractive model')

    # Architecture dispatch; falls through with only a warning on an unknown
    # name, which makes the .cuda() below raise NameError.
    if args.ext_model == "lstm_summarunner":
        extract_net = model.SummaRuNNer(config)
    elif args.ext_model == "gru_summarunner":
        extract_net = model.GruRuNNer(config)
    elif args.ext_model == "bag_of_words":
        extract_net = model.SimpleRuNNer(config)
    elif args.ext_model == "simpleRNN":
        extract_net = model.SimpleRNN(config)
    elif args.ext_model == "RNES":
        extract_net = model.RNES(config)
    elif args.ext_model == "Refresh":
        extract_net = model.Refresh(config)
    elif args.ext_model == "simpleCONV":
        extract_net = model.simpleCONV(config)
    else:
        print("this is no model to load")

    extract_net.cuda()

    # print("current model name: %s"%model_name)
    # print("current log file: %s"%log_name)

    logging.basicConfig(filename='%s.log' % log_name,
                        level=logging.INFO, format='%(asctime)s [INFO] %(message)s')
    if args.load_ext:
        # Resume from a fully-pickled model (not a state_dict) and seed
        # best_eval_reward from a validation pass so early checkpoints
        # don't overwrite a better resumed model.
        print("loading existing model%s" % model_name)
        extract_net = torch.load(model_name, map_location=lambda storage, loc: storage)
        extract_net.cuda()
        print("finish loading and evaluate model %s" % model_name)
        # evaluate.ext_model_eval(extract_net, vocab, args, eval_data="test")
        best_eval_reward, _ = evaluate.ext_model_eval(extract_net, vocab, args, "val")

    # Loss and Optimizer
    optimizer_ext = torch.optim.Adam(extract_net.parameters(), lr=args.lr, betas=(0., 0.999))

    print("starting training")
    n_step = 100
    for epoch in range(args.epochs_ext):
        train_iter = data_loader.chunked_data_reader("train", data_quota=args.train_example_quota)
        step_in_epoch = 0
        for dataset in train_iter:
            for step, docs in enumerate(BatchDataLoader(dataset, shuffle=True)):
                # Best-effort per-document training: any failure is printed
                # and the document skipped so one bad sample cannot kill a run.
                try:
                    extract_net.train()
                    # if True:
                    step_in_epoch += 1
                    # for i in range(1):  # how many times a single data gets updated before proceeding
                    doc = docs[0]
                    doc.content = tokens_to_sentences(doc.content)
                    doc.summary = tokens_to_sentences(doc.summary)
                    if args.oracle_length == -1:  # use true oracle length
                        oracle_summary_sent_num = len(doc.summary)
                    else:
                        oracle_summary_sent_num = args.oracle_length

                    x = prepare_data(doc, vocab)
                    if min(x.shape) == 0:
                        continue
                    sents = Variable(torch.from_numpy(x)).cuda()
                    outputs = extract_net(sents)

                    # Roughly 1-in-100 steps print diagnostics when enabled.
                    if args.prt_inf and np.random.randint(0, 100) == 0:
                        prt = True
                    else:
                        prt = False

                    loss, reward = reinforce.train(outputs, doc,
                                                   max_num_of_sents=oracle_summary_sent_num,
                                                   max_num_of_bytes=args.length_limit,
                                                   prt=prt)
                    if prt:
                        print('Probabilities: ', outputs.squeeze().data.cpu().numpy())
                        print('-' * 80)

                    reward_list.append(reward)

                    if isinstance(loss, Variable):
                        loss.backward()

                    # `step % 1 == 0` is always true — effectively an update
                    # every step; presumably a placeholder for gradient
                    # accumulation.
                    if step % 1 == 0:
                        torch.nn.utils.clip_grad_norm(extract_net.parameters(), 1)  # gradient clipping
                        optimizer_ext.step()
                        optimizer_ext.zero_grad()
                    # print('Epoch %d Step %d Reward %.4f'%(epoch,step_in_epoch,reward))
                    logging.info('Epoch %d Step %d Reward %.4f' % (epoch, step_in_epoch, reward))
                except Exception as e:
                    print(e)

                # Report the mean reward over the last n_step documents.
                if (step_in_epoch) % n_step == 0 and step_in_epoch != 0:
                    print('Epoch ' + str(epoch) + ' Step ' + str(step_in_epoch) +
                          ' reward: ' + str(np.mean(reward_list)))
                    reward_list = []

                # Periodic validation; checkpoint only on improvement.
                if (step_in_epoch) % 10000 == 0 and step_in_epoch != 0:
                    print("doing evaluation")
                    extract_net.eval()
                    eval_reward, lead3_reward = evaluate.ext_model_eval(extract_net, vocab, args, "val")
                    if eval_reward > best_eval_reward:
                        best_eval_reward = eval_reward
                        print("saving model %s with eval_reward:" % model_save_name, eval_reward, "leadreward",
                              lead3_reward)
                        # NOTE(review): saves to model_name, not model_save_name —
                        # fine-tune runs overwrite the base checkpoint; verify intent.
                        torch.save(extract_net, model_name)
                    print('epoch ' + str(epoch) + ' reward in validation: '
                          + str(eval_reward) + ' lead3: ' + str(lead3_reward))
    return extract_net
Example #8
0
# set seed
torch.manual_seed(args.seed)
torch.cuda.manual_seed(args.seed)
random.seed(args.seed)

# set cuda device
torch.cuda.set_device(args.gpu)

logging.info('generate config')

# Load every pickle through a binary-mode context manager: the original
# `pkl.load(open(path))` leaked file handles and fails on Python 3, where
# pickle requires a binary file object.
with open(args.emb_file, 'rb') as f:
    pretrained_embedding = pkl.load(f)
config = Config(vocab_size=pretrained_embedding.shape[0],
                embedding_dim=pretrained_embedding.shape[1],
                position_size=500,
                position_dim=50,
                word_input_size=100,
                sent_input_size=2 * args.hidden,
                word_GRU_hidden_units=args.hidden,
                sent_GRU_hidden_units=args.hidden,
                pretrained_embedding=pretrained_embedding)

with open('../data/word2id.pkl', 'rb') as f:
    word2id = pkl.load(f)

logging.info('loadding train dataset')
with open(args.train_file, 'rb') as f:
    train_dataset = pkl.load(f)
train_loader = DataLoader(train_dataset)

logging.info('loadding validation dataset')
with open(args.validation_file, 'rb') as f:
    validation_dataset = pkl.load(f)
validation_loader = DataLoader(validation_dataset, shuffle=False)
Example #9
0
class MessageCentral():
    """Central hub that wires PulseAudio components together and routes
    incoming socket messages to whichever components implement a handler."""

    def __init__(self):
        """Build the component graph; PulseControl first, since every other
        component depends on it."""
        # init class structure

        self.pc = PulseControl()
        self.eq = EqControl(self.pc)
        self.config = Config()

        self.padb = paDatabase(self.pc)
        self.sg = SoundGen(self.padb, self.pc)
        self.pamm = paModuleManager(self.pc, self.eq, self.padb, self.config)

    #
    #	start message, called if pulse audio gets connected
    #

    def on_pulse_connect(self):
        """Start pulse control, notify components, and ping the kodi side."""
        log("pact: start pulse control")
        self.pc.start()
        self.padb.on_pa_connect()
        self.pamm.on_pa_connect()

        SocketCom("kodi").call_func("up", "service", [])
        SocketCom("kodi").call_func("get", "player", [])

    #
    # Dispatch messages
    #

    def on_message(self, target, func, arg, conn):
        """Route one message to every component that defines `on_<target>_<func>`.

        The padb gets first refusal; a truthy return from padb.on_message
        consumes the message. PulseError triggers a stop/reconnect recovery
        attempt; any other exception is logged and swallowed.
        """
        try:
            # filter messages
            if self.padb.on_message(target, func, arg): return

            # other messages are just forwarded

            cmd = "on_" + target + "_" + func
            methods = []

            # Collect matching handlers from all components (lookup order is
            # significant: padb before self before pamm/eq/sg).
            for cl in [self.padb, self, self.pamm, self.eq, self.sg]:
                try:
                    methods.append(getattr(cl, cmd))
                except AttributeError:
                    pass  # component simply has no handler for this message
                except Exception as e:
                    opthandle(e)

            if len(methods) == 0:
                SocketCom.respond(conn, None)
                return

            # NOTE(review): respond() is called once per matching handler on
            # the same connection — confirm the protocol allows multiple
            # responses per request.
            for method in methods:
                ret = method(*arg)
                SocketCom.respond(conn, ret)

        except PulseError as e:
            handle(e)
            logerror("pact: in {},{},{}".format(target, func, str(arg)))
            logerror("pact: try to recover")

            # Recovery: restart pulse control from scratch.
            try:
                self.pc.stop()
                self.on_pulse_connect()
            except Exception as e:
                handle(e)
                logerror("pact: recover failed")

        except Exception as e:
            logerror("pact: in {},{},{}".format(target, func, str(arg)))
            handle(e)

    #
    #	message collector, just collect fast incomeing messages from pulse audio
    #	handle them after a timeout
    #

    def on_pa_update(self):
        """Drain padb's collected pulse messages and replay them, first to the
        module manager, then to the sound generator."""
        log("pact: on_pa_update")

        messages = self.padb.do_update()

        for message, arg in messages:
            try:
                method = getattr(self.pamm, message)
            except Exception:
                continue  # pamm has no handler for this message
            method(*arg)

        self.pamm.do_update()
        log("pact: %s" % str(self.padb))

        for message, arg in messages:
            try:
                method = getattr(self.sg, message)
            except Exception:
                continue  # sg has no handler for this message
            method(*arg)

    #
    #	message handler of self
    #

    #def on_outlist_get(self):
    #	return self.padb.get_outlist()

    def on_latency_get(self):
        """Return the current latency as reported by the pulse database."""
        return self.padb.get_latency()

    def on_latency_set(self, latency_info):
        """Apply a port latency and persist it per output sink, under the
        'eq_latency' key when an equalizer sink is active, else 'latency'."""
        self.pc.set_port_latency(latency_info)

        if self.padb.output_sink:
            if self.padb.cureq_sink:
                self.config.set("eq_latency", int(latency_info["latency"]),
                                self.padb.output_sink.name)
            else:
                self.config.set("latency", int(latency_info["latency"]),
                                self.padb.output_sink.name)

    # just save the current selected profile to config file
    def on_eq_profile_load(self, _index, profile):
        if self.padb.output_sink:
            self.config.set("eq_profile", profile, self.padb.output_sink.name)

    # just save the current selected room correction to config file
    def on_room_correction_set(self, _index, name):
        if self.padb.output_sink:
            self.config.set("eq_correction", name, self.padb.output_sink.name)

    # just remove the current selected room correction from config file
    def on_room_correction_unset(self, _index):
        if self.padb.output_sink:
            self.config.set("eq_correction", None, self.padb.output_sink.name)

    # helper
    def on_pa_module_log(self):
        """Dump every attribute of the module manager to the log."""
        for key, val in list(vars(self.pamm).items()):
            log(key + "=" + str(val))
Example #10
0
def extractive_training(args, vocab):
    """Train the SHE extractive summarizer with REINFORCE, logging to
    TensorBoard.

    Builds a Config from the pretrained vocab embedding, derives model/log
    file names from the hyperparameters, then runs the RL training loop with
    periodic validation, checkpointing, and learning-curve logging.

    Args:
        args: parsed command-line namespace (hidden, dropout, lr, batch_size,
            terminated_way, pooling_way, ... — see get_args()).
        vocab: vocabulary object exposing .embedding, .w2i and .i2w.

    Returns:
        The extractive network in its final state (best weights are saved to
        `model_name` during training, not necessarily returned).
    """
    writer = SummaryWriter('../log')
    print(args)
    print("generating config")
    config = Config(
        vocab_size=vocab.embedding.shape[0],
        embedding_dim=vocab.embedding.shape[1],
        position_size=500,
        position_dim=50,
        word_input_size=100,
        sent_input_size=2 * args.hidden,
        word_GRU_hidden_units=args.hidden,
        sent_GRU_hidden_units=args.hidden,
        pretrained_embedding=vocab.embedding,
        word2id=vocab.w2i,
        id2word=vocab.i2w,
        dropout=args.dropout,
        pooling_way=args.pooling_way,
        num_layers = args.num_layers,
        num_directions = args.num_directions,
        fixed_length=args.fixed_length,
        num_filters=args.num_filters,
        filter_sizes=args.filter_sizes,
        batch_size=args.batch_size,
        novelty=args.novelty,
    )
    # Encode the hyperparameter set into the model file name so runs with
    # different settings never overwrite each other.
    model_name = ".".join(("../model/"+str(args.ext_model),
                         "termination_", str(args.terminated_way),
                         "pooling_", str(args.pooling_way),
                         "max_sent", str(args.oracle_length),
                         "min_sents", str(args.min_num_of_sents),
                         "rl_m",str(args.rl_baseline_method),
                         "oracle_l", str(args.oracle_length),
                         "bsz", str(args.batch_size),
                         "rl_loss", str(args.rl_loss_method),
                         "hidden", str(args.hidden),
                         "dropout", str(args.dropout),
                         'ext'))
    print(model_name)

    log_name = ".".join(("../log/"+str(args.ext_model),
                         "termination_", str(args.terminated_way),
                         "pooling_", str(args.pooling_way),
                         "max_sent", str(args.oracle_length),
                         "min_sents", str(args.min_num_of_sents),
                         "rl_m",str(args.rl_baseline_method),
                         "oracle_l", str(args.oracle_length),
                         "bsz", str(args.batch_size),
                         "rl_loss", str(args.rl_loss_method),
                         "hidden", str(args.hidden),
                         "dropout", str(args.dropout),
                         'log'))

    print("init data loader and RL learner")
    data_loader = PickleReader(args.data_dir)

    # init statistics
    reward_list = []
    best_eval_reward = 0.
    model_save_name = model_name

    # Fine-tuning forces the (slower) standard ROUGE scorer and writes to
    # separate model/log file names so the base run is preserved.
    if args.fine_tune:
        model_save_name = model_name + ".fine_tune"
        log_name = log_name + ".fine_tune"
        args.std_rouge = True
        print("fine_tune model with std_rouge, args.std_rouge changed to %s" % args.std_rouge)

    print('init extractive model')

    extract_net = model.SHE(config).cuda()
    reinforce = ReinforceReward(terminated_way=args.terminated_way, std_rouge=args.std_rouge, rouge_metric=args.rouge_metric,
                                    b=args.batch_size, rl_baseline_method=args.rl_baseline_method,
                                    loss_method=1)
    extract_net.cuda()


    logging.basicConfig(filename='%s' % log_name,
                        level=logging.INFO, format='%(asctime)s [INFO] %(message)s')
    if args.load_ext:
        # Resume from a fully-pickled model (not a state_dict) and seed
        # best_eval_reward from a validation pass so early checkpoints
        # don't overwrite a better resumed model.
        print("loading existing model%s" % model_name)
        extract_net = torch.load(model_name, map_location=lambda storage, loc: storage)
        extract_net.cuda()
        print("finish loading and evaluate model %s" % model_name)
        # evaluate.ext_model_eval(extract_net, vocab, args, eval_data="test")
        best_eval_reward, _ = evaluate.ext_model_eval(extract_net, vocab, args, "val")

    # Loss and Optimizer
    optimizer_ext = torch.optim.Adam(extract_net.parameters(), lr=args.lr, betas=(0., 0.999))

    print("starting training")
    n_step = 100
    error_counter = 0

    for epoch in range(args.epochs_ext):
        train_iter = data_loader.chunked_data_reader("train", data_quota=args.train_example_quota)
        step_in_epoch = 0
        for dataset in train_iter:
            # for step, docs in enumerate(BatchDataLoader(dataset, shuffle=True, batch_size=args.batch_size )):
            for step, docs in enumerate(BatchDataLoader(dataset, shuffle=True)):
                # Best-effort per-document training: failures are counted,
                # printed, and the document skipped.
                try:
                    extract_net.train()
                    # if True:
                    step_in_epoch += 1
                    # for i in range(1):  # how many times a single data gets updated before proceeding
                    doc = docs[0]
                    doc.content = tokens_to_sentences(doc.content)
                    doc.summary = tokens_to_sentences(doc.summary)

                    if len(doc.content) == 0 or len(doc.summary) == 0:
                        continue

                    # Degenerate documents (< 3 sentences): take every
                    # sentence as the summary, skip the policy gradient
                    # (loss stays 0) and just score with ROUGE.
                    if len(doc.content) <3:
                        summary_index_list = range(min(len(doc.content),3))
                        loss = 0
                        reward = from_summary_index_compute_rouge(doc, summary_index_list,
                                                            std_rouge=args.std_rouge,
                                                            rouge_metric=args.rouge_metric,
                                                            max_num_of_bytes=args.length_limit)



                    else:
                        if args.oracle_length == -1:  # use true oracle length
                            oracle_summary_sent_num = len(doc.summary)
                        else:
                            oracle_summary_sent_num = args.oracle_length

                        x = prepare_data(doc, vocab)
                        if min(x.shape) == 0:
                            continue
                        sents = Variable(torch.from_numpy(x)).cuda()

                        outputs = extract_net(sents)

                        # Roughly 1-in-1000 steps print diagnostics when enabled.
                        if args.prt_inf and np.random.randint(0, 1000) == 0:
                            prt = True
                        else:
                            prt = False
                        loss, reward = reinforce.train(outputs, doc,
                                                min_num_of_sents=args.min_num_of_sents,
                                                max_num_of_sents=oracle_summary_sent_num,
                                                max_num_of_bytes=args.length_limit,
                                                prt=prt)
                        if prt:
                            print('Probabilities: ', outputs.squeeze().data.cpu().numpy())
                            print('-' * 80)
                    reward_list.append(reward)

                    if isinstance(loss, Variable):
                        loss.backward()

                    # `step % 1 == 0` is always true — effectively an update
                    # every step; presumably a placeholder for gradient
                    # accumulation.
                    if step % 1 == 0:
                        torch.nn.utils.clip_grad_norm(extract_net.parameters(), 1)  # gradient clipping
                        optimizer_ext.step()
                        optimizer_ext.zero_grad()
                    # print('Epoch %d Step %d Reward %.4f'%(epoch,step_in_epoch,reward))
                    logging.info('Epoch %d Step %d Reward %.4f' % (epoch, step_in_epoch, reward))

                except Exception as e:
                    error_counter += 1
                    print(e)

                # Report mean reward over the last n_step documents and push
                # it to TensorBoard.
                if (step_in_epoch) % n_step == 0 and step_in_epoch != 0:
                    print('Epoch ' + str(epoch) + ' Step ' + str(step_in_epoch) +
                       ' reward: ' + str(np.mean(reward_list)))
                    print('error_count: ',error_counter)
                    mean_loss = np.mean(reward_list)
                    writer.add_scalar('Train/SHE', mean_loss, step_in_epoch)
                    reward_list = []

                # Periodic validation; checkpoint only on improvement and
                # append to the learning-curve file.
                if (step_in_epoch) % 2000 == 0 and step_in_epoch != 0:
                    print("doing evaluation")
                    extract_net.eval()
                    eval_reward, lead3_reward = evaluate.ext_model_eval(extract_net, vocab, args, "val")
                    if eval_reward > best_eval_reward:
                        best_eval_reward = eval_reward
                        print("saving model %s with eval_reward:" % model_save_name, eval_reward, "leadreward",
                              lead3_reward)
                        # NOTE(review): saves to model_name, not model_save_name —
                        # fine-tune runs overwrite the base checkpoint; verify intent.
                        torch.save(extract_net, model_name)
                    writer.add_scalar('val/SHE', eval_reward, step_in_epoch)
                    f = open('log/learning_curve','a')
                    f.write(str(eval_reward)+'\t'+str(lead3_reward)+'\n')
                    f.close()
                    print('epoch ' + str(epoch) + ' reward in validation: '
                          + str(eval_reward) +  ' lead3: ' + str(lead3_reward))
                    print('Error Counter: ',error_counter)


    return extract_net
Example #11
0
def main6():
    """VAE test: build a sample Document, run the VAE over each sentence,
    accumulate the loss, and take one optimizer step."""
    # Sample article: content/summary are lists of token lists, one per
    # sentence ('00' appears to be the corpus's digit mask — TODO confirm).
    doc = Document(
        content=[
            ['to', 'the', 'editor', 're', 'for', 'women', 'worried', 'about',
             'fertility', 'egg', 'bank', 'is', 'a', 'new', 'option', 'sept', '00',
             'imagine', 'my', 'joy', 'in', 'reading', 'the', 'morning',
             'newspapers', 'on', 'the', 'day', 'of', 'my', '00th', 'birthday',
             'and', 'finding', 'not', 'one', 'but', 'two', 'articles', 'on', 'how',
             'women', 's', 'fertility', 'drops', 'off', 'precipitously', 'after',
             'age', '00'],
            ['one', 'in', 'the', 'times', 'and', 'one', 'in', 'another', 'newspaper'],
            ['i', 'sense', 'a', 'conspiracy', 'here'],
            ['have', 'you', 'been', 'talking', 'to', 'my', 'mother', 'in', 'law'],
            ['laura', 'heymann', 'washington'],
        ],
        summary=[
            ['laura', 'heymann', 'letter', 'on', 'sept', '00',
             'article', 'about', 'using', 'egg', 'bank', 'to',
             'prolong', 'fertility', 'expresses', 'ironic', 'humor',
             'about', 'her', 'age', 'and', 'chances', 'of',
             'becoming', 'pregnant'],
        ],
        label=[0.01] * 100,
        label_idx=[0.01] * 100)
    torch.manual_seed(233)
    torch.cuda.set_device(0)
    args = get_args()
    # Only the vocab path depends on the dataset; pick it, then load once.
    # (The original duplicated the whole open/pickle.load block in both branches.)
    if args.data == "nyt":
        vocab_file = "/home/ml/lyu40/PycharmProjects/data/nyt/lda_domains/preprocessed/vocab_100d.p"
    else:
        vocab_file = '/home/ml/ydong26/data/CNNDM/CNN_DM_pickle_data/vocab_100d.p'
    with open(vocab_file, "rb") as f:
        vocab = pickle.load(f, encoding='latin1')
    config = Config(
        vocab_size=vocab.embedding.shape[0],
        embedding_dim=vocab.embedding.shape[1],
        category_size=args.category_size,
        category_dim=50,
        word_input_size=100,
        sent_input_size=2 * args.hidden,
        word_GRU_hidden_units=args.hidden,
        sent_GRU_hidden_units=args.hidden,
        pretrained_embedding=vocab.embedding,
        word2id=vocab.w2i,
        id2word=vocab.i2w,
    )
    model = VAE(config)

    if torch.cuda.is_available():
        model.cuda()
    train_loss = 0
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    x = prepare_data(
        doc, vocab.w2i
    )  # list of tokens ex.x=[[1,2,1],[1,1]] x = Variable(torch.from_numpy(x)).cuda()
    sents = Variable(torch.from_numpy(x)).cuda()
    optimizer.zero_grad()
    loss = 0
    # Sum reconstruction+KL loss over all sentences, then do a single
    # backward pass and optimizer step.
    for sent in sents:
        recon_batch, mu, logvar = model(sent.float())
        loss += loss_function(recon_batch, sent, mu, logvar)
    loss.backward()
    # NOTE(review): `.data[0]` is legacy (pre-0.4) PyTorch; kept as-is to
    # match the torch version this file targets (it also uses Variable).
    train_loss += loss.data[0]
    optimizer.step()
Example #12
0
def load_and_test_model(model_type, model_path):
    """Load a trained extractive-summarization checkpoint and run test-set eval.

    Args:
        model_type: model key -- "fs" (FullyShare), "ps" (PrivateShare),
            "dm" (DomainModel) or "gm" (GeneralModel).
        model_path: filesystem path to the saved checkpoint.

    Raises:
        IOError: if the checkpoint file is missing or cannot be loaded.
        ValueError: if `model_type` is not one of the known model keys.
    """
    if not os.path.isfile(model_path):
        raise IOError('Cant find the model path.')
    torch.manual_seed(233)  # fixed seed for reproducible evaluation

    args = get_args()
    if args.length_limit > 0:
        # When a character budget is imposed, cap the oracle at 2 sentences.
        args.oracle_length = 2
    torch.cuda.set_device(args.device)

    print('generate config')
    with open(args.homepath + args.vocab_file, "rb") as f:
        vocab = pickle.load(f)
    print(vocab)

    print(args)
    print("generating config")
    config = Config(
        vocab_size=vocab.embedding.shape[0],
        embedding_dim=vocab.embedding.shape[1],
        category_size=args.category_size,
        category_dim=50,
        word_input_size=100,
        sent_input_size=2 * args.hidden,
        word_GRU_hidden_units=args.hidden,
        sent_GRU_hidden_units=args.hidden,
        pretrained_embedding=vocab.embedding,
        word2id=vocab.w2i,
        id2word=vocab.i2w,
    )

    print('init extractive model')
    model_classes = {
        "fs": model_all.FullyShare,
        "ps": model_all.PrivateShare,
        "dm": model_all.DomainModel,
        "gm": model_all.GeneralModel,
    }
    if model_type not in model_classes:
        # Fail fast: the original fell through and hit a NameError below.
        raise ValueError("this model is not implemented yet")
    extract_net = model_classes[model_type](config)

    try:
        print("=> loading model '{}'".format(model_path))
        # Remap checkpoints saved on other GPUs onto cuda:0.
        checkpoint = torch.load(model_path,
                                map_location={
                                    'cuda:1': 'cuda:0',
                                    'cuda:2': 'cuda:0',
                                    'cuda:3': 'cuda:0'
                                })
        args.start_epoch = checkpoint['epoch']
        best_eval_reward = checkpoint['best_eval_reward']
        extract_net.load_state_dict(checkpoint['state_dict'])
        print("=> loaded model '{}' (epoch {})".format(model_path,
                                                       checkpoint['epoch']))
    except Exception as e:
        # Chain the cause so the underlying load failure is not hidden.
        raise IOError('Cant load the model %s' % model_path) from e

    extract_net.cuda()

    ext_model_eval(extract_net, vocab, args, 'test')
Example #13
0
def main():
    """Train the VAE sentence model and save its weights to ./vae.pth.

    Reads the dataset choice from the module-level `args`, loads the matching
    pickled vocabulary, then trains for `args.epochs_ext` epochs, logging the
    per-step and per-epoch reconstruction loss.
    """
    torch.manual_seed(233)
    # NOTE: the path must NOT carry the ".log" suffix itself -- basicConfig
    # appends it below (the original produced "...vae_nyt.log.log").
    log_name = "/home/ml/lyu40/PycharmProjects/E_Yue/log/vae/vae_" + args.data
    logging.basicConfig(filename='%s.log' % log_name,
                        level=logging.INFO,
                        format='%(asctime)s [INFO] %(message)s')
    torch.cuda.set_device(0)
    data_loader = PickleReader()
    print('generate config')
    # Pick the vocabulary file per dataset; the load itself is identical.
    if args.data == "nyt":
        vocab_file = "/home/ml/lyu40/PycharmProjects/data/nyt/lda_domains/preprocessed/vocab_100d.p"
    else:
        vocab_file = '/home/ml/ydong26/data/CNNDM/CNN_DM_pickle_data/vocab_100d.p'
    with open(vocab_file, "rb") as f:
        vocab = pickle.load(f, encoding='latin1')

    config = Config(
        vocab_size=vocab.embedding.shape[0],
        embedding_dim=vocab.embedding.shape[1],
        category_size=args.category_size,
        category_dim=50,
        word_input_size=100,
        sent_input_size=2 * args.hidden,
        word_GRU_hidden_units=args.hidden,
        sent_GRU_hidden_units=args.hidden,
        pretrained_embedding=vocab.embedding,
        word2id=vocab.w2i,
        id2word=vocab.i2w,
    )
    print("vocab_size:", vocab.embedding.shape[0])
    print("V:", len(vocab.w2i))
    model = VAE(config)
    if torch.cuda.is_available():
        model.cuda()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    print("starting training")
    for epoch in range(args.start_epoch + 1, args.epochs_ext):
        model.train()
        train_loss = 0
        train_iter = data_loader.chunked_data_reader(
            "train", data_quota=args.train_example_quota)
        train_size = 0
        # train_iter: the data sets for this training epoch
        print("finish loading the data for this epoch")
        step_in_epoch = 0
        for dataset in train_iter:
            for step, docs in enumerate(BatchDataLoader(dataset,
                                                        shuffle=False)):
                train_size += 1
                step_in_epoch += 1
                doc = docs[0]
                # x: sentence-major token-id matrix; one_hot_x unused here.
                x, one_hot_x = prepare_data(doc, vocab.w2i)
                print("min(x.shape):", min(x.shape))
                if min(x.shape) == 0:
                    # Skip degenerate (empty) documents.
                    continue
                sents = Variable(torch.from_numpy(x)).cuda()

                print("type of sents:", sents.type())
                recon_x, mu, logvar = model(sents)
                step_loss = 0
                x = flatten_list(x)
                # One optimizer step per reconstructed sentence.
                for i in range(recon_x.size()[0]):
                    optimizer.zero_grad()
                    loss = loss_function(recon_x[i], x[i], mu, logvar)
                    loss.backward()
                    optimizer.step()
                    train_loss += loss.data[0]
                    step_loss += loss.data[0]

                logging.info(
                    'Epoch %d Step %d loss %.4f' %
                    (epoch, step_in_epoch, step_loss / recon_x.size()[0]))
        logging.info('Epoch %d avg loss %.4f' %
                     (epoch, train_loss / train_size))

    torch.save(model.state_dict(), './vae.pth')
Example #14
0
def extractive_training(args, vocab):
    """Train an extractive summarizer with REINFORCE and periodic evaluation.

    Args:
        args: parsed command-line namespace (model choice, paths, hyperparams).
        vocab: vocabulary object exposing `embedding`, `w2i`, `i2w`.

    Returns:
        The trained extractive network (CUDA-resident).

    Raises:
        ValueError: if `args.ext_model` is not one of "fs", "ps", "dm", "gm".
    """
    print(args)
    print("generating config")
    config = Config(
        vocab_size=vocab.embedding.shape[0],
        embedding_dim=vocab.embedding.shape[1],
        category_size=args.category_size,
        category_dim=50,
        word_input_size=100,
        sent_input_size=2 * args.hidden,
        word_GRU_hidden_units=args.hidden,
        sent_GRU_hidden_units=args.hidden,
        pretrained_embedding=vocab.embedding,
        word2id=vocab.w2i,
        id2word=vocab.i2w,
    )

    def create_model_name(epoch):
        # Builds the checkpoint path used for both loading and saving.
        path = args.model_file + args.data + "/" + str(
            args.num_topics) + "/model"
        return ".".join((path, 'epoch', str(epoch), args.ext_model, 'tr'))

    model_name = create_model_name(args.start_epoch)
    print(model_name)

    log_name = '/home/ml/lyu40/PycharmProjects/E_Yue/log/' + args.data + "/" + str(
        args.num_topics) + "/" + args.ext_model + ".tr"
    eval_file_name = '/home/ml/lyu40/PycharmProjects/E_Yue/log/' + args.data + "/" + str(
        args.num_topics) + "/" + args.ext_model + ".eval"

    print("init data loader and RL learner")
    data_loader = PickleReader()

    # Running statistics for the current logging window.
    reward_list = []
    best_eval_reward = 0.
    model_save_name = args.resume
    reinforce = ReinforceReward(std_rouge=args.std_rouge,
                                rouge_metric=args.rouge_metric,
                                b=args.batch_size,
                                rl_baseline_method=args.rl_baseline_method,
                                loss_method=1)

    print('init extractive model')
    model_classes = {
        "fs": model_all.FullyShare,
        "ps": model_all.PrivateShare,
        "dm": model_all.DomainModel,
        "gm": model_all.GeneralModel,
    }
    if args.ext_model not in model_classes:
        # Fail fast: the original only printed and then hit a NameError
        # at extract_net.parameters() below.
        raise ValueError("this model is not implemented yet")
    extract_net = model_classes[args.ext_model](config)

    # Loss and Optimizer
    optimizer = torch.optim.Adam(extract_net.parameters(),
                                 lr=args.lr,
                                 betas=(0., 0.999))
    logging.basicConfig(filename='%s.log' % log_name,
                        level=logging.INFO,
                        format='%(asctime)s [INFO] %(message)s')

    if args.resume:
        if os.path.isfile(model_name):
            try:
                print("=> loading checkpoint '{}'".format(model_name))
                checkpoint = torch.load(model_name)
                args.start_epoch = checkpoint['epoch']
                best_eval_reward = checkpoint['best_eval_reward']
                extract_net.load_state_dict(checkpoint['state_dict'])
                print("=> loaded checkpoint '{}' (epoch {})".format(
                    model_name, checkpoint['epoch']))
            except Exception:
                # Fallback: the file may hold a whole pickled model rather
                # than a state-dict checkpoint.
                extract_net = torch.load(
                    model_name, map_location=lambda storage, loc: storage)
                print("=> finish loaded checkpoint '{}' (epoch {})".format(
                    model_name, args.start_epoch))
        else:
            print("=> no checkpoint found at '{}'".format(model_name))
    extract_net.cuda()

    print("starting training")
    n_step = 200  # console-report interval (in steps); loop-invariant
    for epoch in range(args.start_epoch + 1, args.epochs_ext):
        train_iter = data_loader.chunked_data_reader(
            "train", data_quota=args.train_example_quota)
        # train_iter: the data sets for this training epoch
        print("finish loading the data for this epoch")
        step_in_epoch = 0
        for dataset in train_iter:
            for step, docs in enumerate(BatchDataLoader(dataset,
                                                        shuffle=False)):

                try:
                    step_in_epoch += 1
                    doc = docs[0]
                    if args.oracle_length == -1:  # use true oracle length
                        oracle_summary_sent_num = len(doc.summary)
                    else:
                        oracle_summary_sent_num = args.oracle_length

                    x = prepare_data(doc, vocab.w2i)
                    if min(x.shape) == 0:
                        # Skip degenerate (empty) documents.
                        continue
                    sents = Variable(torch.from_numpy(x)).cuda()
                    label_idx = Variable(
                        torch.from_numpy(np.array([doc.label_idx]))).cuda()
                    print(
                        "label_idx:", label_idx
                    )  # label_idx: tensor([ 2], dtype=torch.int32, device='cuda:0')

                    # Some pickles store label_idx with an extra dimension.
                    if label_idx.dim() == 2:
                        outputs = extract_net(sents, label_idx[0])
                    else:
                        outputs = extract_net(sents, label_idx)

                    prt = False
                    loss, reward, summary_index_list = reinforce.train(
                        outputs,
                        doc,
                        max_num_of_sents=oracle_summary_sent_num,
                        max_num_of_chars=args.length_limit,
                        prt=prt)
                    if prt:
                        print('Probabilities: ',
                              outputs.squeeze().data.cpu().numpy())
                        print('-' * 80)

                    reward_list.append(reward)

                    if isinstance(loss, Variable):
                        loss.backward()

                    # Accumulate gradients over 10 steps before updating.
                    if step % 10 == 0:
                        torch.nn.utils.clip_grad_norm(extract_net.parameters(),
                                                      1)  # gradient clipping
                        optimizer.step()
                        optimizer.zero_grad()
                    if reward < 0.0001:
                        # Dump the instance for debugging near-zero ROUGE.
                        print(
                            "very low rouge score for this instance, with reward =",
                            reward)
                        print("outputs:", outputs)
                        print("content:", doc.content)
                        print("summary:", doc.summary)
                        print("selected sentences index list:",
                              summary_index_list)
                        print("*" * 40)
                    logging.info('Epoch %d Step %d Reward %.4f' %
                                 (epoch, step_in_epoch, reward))
                except Exception as e:
                    # Deliberate best-effort: log the bad example and move on.
                    print(
                        "skip one example because error during training, input is %s"
                        % docs[0].content)
                    print("Exception:")
                    print(e)
                    pass

                if (step_in_epoch) % n_step == 0 and step_in_epoch != 0:
                    print('Epoch ' + str(epoch) + ' Step ' +
                          str(step_in_epoch) + ' reward: ' +
                          str(np.mean(reward_list)))
                    reward_list = []

                if (step_in_epoch) % 50000 == 0 and step_in_epoch != 0:
                    save_checkpoint(
                        {
                            'epoch': epoch,
                            'state_dict': extract_net.state_dict(),
                            'best_eval_reward': best_eval_reward,
                            'optimizer': optimizer.state_dict(),
                        },
                        False,
                        filename=create_model_name(epoch))

                    print("doing evaluation")
                    eval_reward, lead3_reward = evaluate.ext_model_eval(
                        extract_net, vocab, args, "val")
                    if eval_reward > best_eval_reward:
                        best_eval_reward = eval_reward
                        print(
                            "saving model %s with eval_reward:" %
                            model_save_name, eval_reward, "leadreward",
                            lead3_reward)
                        try:
                            save_checkpoint(
                                {
                                    'epoch': epoch,
                                    'step_in_epoch': step_in_epoch,
                                    'state_dict': extract_net.state_dict(),
                                    'best_eval_reward': best_eval_reward,
                                    'optimizer': optimizer.state_dict(),
                                },
                                True,
                                filename=create_model_name(epoch))
                        except Exception:
                            print(
                                'cant save the model since shutil doesnt work')

                    print('epoch ' + str(epoch) + ' reward in validation: ' +
                          str(eval_reward) + ' lead3: ' + str(lead3_reward))
                    with open(eval_file_name, "a") as file:
                        file.write('epoch ' + str(epoch) +
                                   ' reward in validation: ' +
                                   str(eval_reward) + ' lead3: ' +
                                   str(lead3_reward) + "\n")
    return extract_net