def transformation(fun: str, arr: np.ndarray):
    p = Parser(fun)

    # Probe the parsed expression once; silently give up if it cannot be evaluated.
    try:
        p.exec(0)
    except Exception:
        return

    fig = complex_plot(p, arr)

    fig.savefig("output.png")
Example #2
    def __init__(self,
                 agent,
                 kb,
                 lexicon,
                 config,
                 generator,
                 manager,
                 realizer=None):
        parser = Parser(agent, kb, lexicon)
        state = DialogueState(agent, kb)
        super(RulebasedSession, self).__init__(agent,
                                               kb,
                                               parser,
                                               generator,
                                               manager,
                                               state,
                                               sample_temperature=2.)

        self.kb = kb
        self.attr_type = {attr.name: attr.value_type for attr in kb.attributes}
        self.num_items = len(kb.items)
        self.entity_counts = self.count_entity()
        self.entity_coords = self.get_entity_coords()
        self.entity_weights = self.init_entity_weight()
        self.item_weights = [1.] * self.num_items

        self.realizer = realizer
Example #3
    def __init__(self, config, project_dir):
        # config for db and parser
        self.config = config
        self.parser = Parser(config=config['XML'])
        self.DB = DataBase(
            host=config['DB']['HOST'],
            port=config['DB']['PORT']
        )
        # Full path to the directory where project information (*.json) resides.
        self.project_dir = project_dir

        # list of project information from *.json
        self.projects = []
        # TODO: refactor the for loop below into a function
        print('Load project *.json files ...')
        for filename in glob.glob(os.path.join(project_dir, '*.json')):
            data = load_json(filename)
            if data is not None:
                print('\tLoaded {}'.format(filename))
                self.projects.append(data)

        # maximum number of syncer threads
        self.max_num_syncers = 2
        self.num_syncers = 0
        # syncer pool, key: project file name, value: syncer
        self.syncer_pool = {}
Example #4
def parse_example(example, lexicon, templates):
    """Parse example and collect templates.
    """
    kbs = example.scenario.kbs
    parsers = [Parser(agent, kbs[agent], lexicon) for agent in (0, 1)]
    states = [DialogueState(agent, kbs[agent]) for agent in (0, 1)]
    # Add init utterance <start>
    parsed_utterances = [states[0].utterance[0], states[1].utterance[1]]
    for event in example.events:
        writing_agent = event.agent  # Speaking agent
        reading_agent = 1 - writing_agent
        #print event.agent

        received_utterance = parsers[reading_agent].parse(
            event, states[reading_agent])
        if received_utterance:
            sent_utterance = copy.deepcopy(received_utterance)
            if sent_utterance.tokens:
                sent_utterance.template = parsers[
                    writing_agent].extract_template(sent_utterance.tokens,
                                                    states[writing_agent])

            templates.add_template(sent_utterance, states[writing_agent])
            parsed_utterances.append(received_utterance)
            #print 'sent:', ' '.join(sent_utterance.template)
            #print 'received:', ' '.join(received_utterance.template)

            # Update states
            states[reading_agent].update(writing_agent, received_utterance)
            states[writing_agent].update(writing_agent, sent_utterance)
    return parsed_utterances
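
A hedged sketch of driving parse_example over a corpus; `examples`, `lexicon`, and `templates` are placeholders for the surrounding project's objects, and `templates` only needs the add_template method used above:

all_parsed = []
for ex in examples:
    all_parsed.extend(parse_example(ex, lexicon, templates))
print('Parsed {} utterances'.format(len(all_parsed)))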
Example #5
    def __init__(self, agent, kb, lexicon, config, generator, manager):
        parser = Parser(agent, kb, lexicon)
        state = DialogueState(agent, kb)
        super(RulebasedSession, self).__init__(agent, kb, parser, generator, manager, state, sample_temperature=5.)
        self.title_scores = self.score_titles()
        for k, v in self.title_scores.iteritems():
            print k, v
Example #6
def test(args):
    test_set = Dataset.from_bin_file(args.test_file)
    assert args.load_model

    print('load model from [%s]' % args.load_model, file=sys.stderr)
    params = torch.load(args.load_model,
                        map_location=lambda storage, loc: storage)
    vocab = params['vocab']
    transition_system = params['transition_system']
    saved_args = params['args']
    saved_state = params['state_dict']
    saved_args.cuda = args.cuda

    parser = Parser(saved_args, vocab, transition_system)
    parser.load_state_dict(saved_state)

    if args.cuda: parser = parser.cuda()
    parser.eval()

    eval_results, decode_results = evaluation.evaluate(
        test_set.examples,
        parser,
        args,
        verbose=True,
        return_decode_result=True)
    print(eval_results, file=sys.stderr)
    if args.save_decode_to:
        pkl.dump(decode_results, open(args.save_decode_to, 'wb'))
Example #7
def input_function(fun: str, in_zoom: float = 1.0, out_zoom: float = 1.0):
    p = Parser(fun)

    # Probe the parsed expression once; give up if it cannot be evaluated.
    try:
        p.exec(0)
    except Exception:
        return None

    arr = np.arange(-10 * in_zoom, 10 * in_zoom, 0.02)

    out_arr = np.arange(-10 * out_zoom, 10 * out_zoom, 0.02)

    vf = np.vectorize(p.real2complex, otypes=[complex])

    res = vf(out_arr)

    fig = fun_plot(p, arr)

    fig.savefig("input.png")

    return res
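
A sketch pairing input_function with the transformation function from the first snippet, assuming both live in the same module; the expression and zoom factors are illustrative only:

res = input_function("sin(x)", in_zoom=1.0, out_zoom=2.0)
if res is not None:
    transformation("sin(x)", res)  # writes input.png and output.png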
Example #8
    def __init__(self,
                 cfg: Union[str, IO],
                 quant: bool = False,
                 onnx: bool = False):
        super().__init__()
        self.quant = quant
        self.qstub = QuantStub()
        self.destub = DeQuantStub()

        if isinstance(cfg, str):
            cfg = open(cfg, 'r')
        self.module_list = nn.ModuleList(Parser(cfg).torch_layers(quant, onnx))
        cfg.close()
Example #9
    def __init__(self, agent, kb, lexicon, config, generator, manager):
        parser = Parser(agent, kb, lexicon)
        state = DialogueState(agent, kb)
        super(RulebasedSession, self).__init__(agent,
                                               kb,
                                               parser,
                                               generator,
                                               manager,
                                               state,
                                               sample_temperature=5.)

        self.kb = kb
        self.personas = kb.personas
Example #10
    def __init__(self,
                 rootDir,
                 fsmapFn,
                 db_host='localhost',
                 db_port=27017,
                 xml_config=None):
        self.rootDir = os.path.realpath(os.path.abspath(rootDir))
        self.fsmapFn = fsmapFn
        self.db_host = db_host
        self.db_port = db_port

        self.parser = Parser(xml_config) if xml_config is not None else None

        self.extensions = ['.xml', '.jpg', '.tiff']

        # to ensure safe operation on fsmap
        self.fsmap_lock = threading.Lock()

        self.fsMap = self._load()
        self._traverse()
        self._save()  # for debugging...

        # lazy connection to MongoDB server
        # Must ensure mongod is running!
        self.client = pymongo.MongoClient(self.db_host, self.db_port)
        self.clientPool = {}

        # streaming queues
        self.fs_event_q = Queue()
        self.stream_q = Queue()

        # Old map:
        # keeps the previous fsmap information when the file system is changed
        # manually, e.g. a folder is moved or renamed.
        # If this is not an empty dictionary, there is a bug.
        self._old_fsmap = {}
Example #11
    def __init__(self, agent, kb, lexicon, config, generator, manager):
        parser = Parser(agent, kb, lexicon)
        state = DialogueState(agent, kb)
        super(CraigslistRulebasedSession, self).__init__(agent, kb, parser, generator, manager, state, sample_temperature=10.)

        self.kb = kb
        self.title = self.shorten_title(self.kb.facts['item']['Title'])
        self.config = default_config if config is None else config

        self.target = self.kb.target
        self.bottomline = None
        self.listing_price = self.kb.listing_price
        self.category = self.kb.category

        # Direction of desired price
        self.inc = None
Example #12
    def __init__(self, agent, kb, lexicon, config, generator, manager):
        parser = Parser(agent, kb, lexicon)
        state = DialogueState(agent, kb)
        super(RulebasedSession, self).__init__(agent,
                                               kb,
                                               parser,
                                               generator,
                                               manager,
                                               state,
                                               sample_temperature=1.)

        self.kb = kb
        self.item_values = kb.item_values
        self.item_counts = kb.item_counts
        self.items = kb.item_values.keys()
        self.partner_item_weights = {item: 1. for item in self.items}
        self.config = default_config if config is None else config

        items = [(item, value, self.item_counts[item])
                 for item, value in self.item_values.iteritems()]
        # Sort items by value from high to low
        self.sorted_items = sorted(items, key=lambda x: x[1], reverse=True)
        self.init_proposal()
Example #13
def self_training(args):
    """Perform self-training

    First load decoding results on disjoint data
    also load pre-trained model and perform supervised
    training on both existing training data and the
    decoded results
    """

    print('load pre-trained model from [%s]' % args.load_model, file=sys.stderr)
    params = torch.load(args.load_model, map_location=lambda storage, loc: storage)
    vocab = params['vocab']
    transition_system = params['transition_system']
    saved_args = params['args']
    saved_state = params['state_dict']

    # transfer arguments
    saved_args.cuda = args.cuda
    saved_args.save_to = args.save_to
    saved_args.train_file = args.train_file
    saved_args.unlabeled_file = args.unlabeled_file
    saved_args.dev_file = args.dev_file
    saved_args.load_decode_results = args.load_decode_results
    args = saved_args

    update_args(args)

    model = Parser(saved_args, vocab, transition_system)
    model.load_state_dict(saved_state)

    if args.cuda: model = model.cuda()
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    print('load unlabeled data [%s]' % args.unlabeled_file, file=sys.stderr)
    unlabeled_data = Dataset.from_bin_file(args.unlabeled_file)

    print('load decoding results of unlabeled data [%s]' % args.load_decode_results, file=sys.stderr)
    decode_results = pickle.load(open(args.load_decode_results))

    labeled_data = Dataset.from_bin_file(args.train_file)
    dev_set = Dataset.from_bin_file(args.dev_file)

    print('Num. examples in unlabeled data: %d' % len(unlabeled_data), file=sys.stderr)
    assert len(unlabeled_data) == len(decode_results)
    self_train_examples = []
    for example, hyps in zip(unlabeled_data, decode_results):
        if hyps:
            hyp = hyps[0]
            sampled_example = Example(idx='self_train-%s' % example.idx,
                                      src_sent=example.src_sent,
                                      tgt_code=hyp.code,
                                      tgt_actions=hyp.action_infos,
                                      tgt_ast=hyp.tree)
            self_train_examples.append(sampled_example)
    print('Num. self training examples: %d, Num. labeled examples: %d' % (len(self_train_examples), len(labeled_data)),
          file=sys.stderr)

    train_set = Dataset(examples=labeled_data.examples + self_train_examples)

    print('begin training, %d training examples, %d dev examples' % (len(train_set), len(dev_set)), file=sys.stderr)
    print('vocab: %s' % repr(vocab), file=sys.stderr)

    epoch = train_iter = 0
    report_loss = report_examples = 0.
    history_dev_scores = []
    num_trial = patience = 0
    while True:
        epoch += 1
        epoch_begin = time.time()

        for batch_examples in train_set.batch_iter(batch_size=args.batch_size, shuffle=True):
            batch_examples = [e for e in batch_examples if len(e.tgt_actions) <= args.decode_max_time_step]

            train_iter += 1
            optimizer.zero_grad()

            loss = -model.score(batch_examples)
            # print(loss.data)
            loss_val = torch.sum(loss).data[0]
            report_loss += loss_val
            report_examples += len(batch_examples)
            loss = torch.mean(loss)

            loss.backward()

            # clip gradient
            if args.clip_grad > 0.:
                grad_norm = torch.nn.utils.clip_grad_norm(model.parameters(), args.clip_grad)

            optimizer.step()

            if train_iter % args.log_every == 0:
                print('[Iter %d] encoder loss=%.5f' %
                      (train_iter,
                       report_loss / report_examples),
                      file=sys.stderr)

                report_loss = report_examples = 0.

        print('[Epoch %d] epoch elapsed %ds' % (epoch, time.time() - epoch_begin), file=sys.stderr)
        # model_file = args.save_to + '.iter%d.bin' % train_iter
        # print('save model to [%s]' % model_file, file=sys.stderr)
        # model.save(model_file)

        # perform validation
        print('[Epoch %d] begin validation' % epoch, file=sys.stderr)
        eval_start = time.time()
        eval_results = evaluation.evaluate(dev_set.examples, model, args, verbose=True)
        dev_acc = eval_results['accuracy']
        print('[Epoch %d] code generation accuracy=%.5f took %ds' % (epoch, dev_acc, time.time() - eval_start), file=sys.stderr)
        is_better = history_dev_scores == [] or dev_acc > max(history_dev_scores)
        history_dev_scores.append(dev_acc)

        if is_better:
            patience = 0
            model_file = args.save_to + '.bin'
            print('save currently the best model ..', file=sys.stderr)
            print('save model to [%s]' % model_file, file=sys.stderr)
            model.save(model_file)
            # also save the optimizers' state
            torch.save(optimizer.state_dict(), args.save_to + '.optim.bin')
        elif epoch == args.max_epoch:
            print('reached max epoch, stop!', file=sys.stderr)
            exit(0)
        elif patience < args.patience:
            patience += 1
            print('hit patience %d' % patience, file=sys.stderr)

        if patience == args.patience:
            num_trial += 1
            print('hit #%d trial' % num_trial, file=sys.stderr)
            if num_trial == args.max_num_trial:
                print('early stop!', file=sys.stderr)
                exit(0)

            # decay lr, and restore from previously best checkpoint
            lr = optimizer.param_groups[0]['lr'] * args.lr_decay
            print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

            # load model
            params = torch.load(args.save_to + '.bin', map_location=lambda storage, loc: storage)
            model.load_state_dict(params['state_dict'])
            if args.cuda: model = model.cuda()

            # load optimizers
            if args.reset_optimizer:
                print('reset optimizer', file=sys.stderr)
                optimizer = torch.optim.Adam(model.inference_model.parameters(), lr=lr)
            else:
                print('restore parameters of the optimizers', file=sys.stderr)
                optimizer.load_state_dict(torch.load(args.save_to + '.optim.bin'))

            # set new lr
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

            # reset patience
            patience = 0
Example #14
def train_semi_jae(args):

    bi_direction = args.bi_direction

    encoder_params = torch.load(args.load_model,
                                map_location=lambda storage, loc: storage)
    decoder_params = torch.load(args.load_decoder,
                                map_location=lambda storage, loc: storage)

    print('loaded encoder at %s' % args.load_model, file=sys.stderr)
    print('loaded decoder at %s' % args.load_decoder, file=sys.stderr)

    transition_system = encoder_params['transition_system']
    encoder_params['args'].cuda = decoder_params['args'].cuda = args.cuda

    encoder = Parser(encoder_params['args'], encoder_params['vocab'],
                     transition_system)
    encoder.load_state_dict(encoder_params['state_dict'])
    decoder = Reconstructor(decoder_params['args'], decoder_params['vocab'],
                            transition_system)
    decoder.load_state_dict(decoder_params['state_dict'])

    zprior = LSTMPrior.load(args.load_prior,
                            transition_system=transition_system,
                            cuda=args.cuda)
    print('loaded p(z) prior at %s' % args.load_prior, file=sys.stderr)
    # freeze prior parameters
    for p in zprior.parameters():
        p.requires_grad = False
    zprior.eval()
    xprior = LSTMLanguageModel.load(args.load_src_lm)
    print('loaded p(x) prior at %s' % args.load_src_lm, file=sys.stderr)
    xprior.eval()

    if args.cache:
        jae = JAE_cache(encoder, decoder, zprior, xprior, args)
    else:
        jae = JAE(encoder, decoder, zprior, xprior, args)

    jae.train()
    encoder.train()
    decoder.train()
    if args.cuda: jae.cuda()

    labeled_data = Dataset.from_bin_file(args.train_file)
    # labeled_data.examples = labeled_data.examples[:10]
    unlabeled_data = Dataset.from_bin_file(
        args.unlabeled_file)  # pretend they are un-labeled!
    dev_set = Dataset.from_bin_file(args.dev_file)
    # dev_set.examples = dev_set.examples[:10]

    optimizer = torch.optim.Adam(
        [p for p in jae.parameters() if p.requires_grad], lr=args.lr)

    print(
        '*** begin semi-supervised training %d labeled examples, %d unlabeled examples ***'
        % (len(labeled_data), len(unlabeled_data)),
        file=sys.stderr)
    report_encoder_loss = report_decoder_loss = report_examples = 0.
    report_unsup_examples = report_unsup_encoder_loss = report_unsup_decoder_loss = report_unsup_baseline_loss = 0.
    patience = 0
    num_trial = 1
    epoch = train_iter = 0
    history_dev_scores = []
    while True:
        epoch += 1
        epoch_begin = time.time()
        unlabeled_examples_iter = unlabeled_data.batch_iter(
            batch_size=args.unsup_batch_size, shuffle=True)
        for labeled_examples in labeled_data.batch_iter(
                batch_size=args.batch_size, shuffle=True):
            labeled_examples = [
                e for e in labeled_examples
                if len(e.tgt_actions) <= args.decode_max_time_step
            ]

            train_iter += 1

            optimizer.zero_grad()

            report_examples += len(labeled_examples)

            sup_encoder_loss = -encoder.score(labeled_examples)
            sup_decoder_loss = -decoder.score(labeled_examples)

            report_encoder_loss += sup_encoder_loss.sum().data[0]
            report_decoder_loss += sup_decoder_loss.sum().data[0]

            sup_encoder_loss = torch.mean(sup_encoder_loss)
            sup_decoder_loss = torch.mean(sup_decoder_loss)

            sup_loss = sup_encoder_loss + sup_decoder_loss

            # compute unsupervised loss

            try:
                unlabeled_examples = next(unlabeled_examples_iter)
            except StopIteration:
                # if finished unlabeled data stream, restart it
                unlabeled_examples_iter = unlabeled_data.batch_iter(
                    batch_size=args.batch_size, shuffle=True)
                unlabeled_examples = next(unlabeled_examples_iter)
                unlabeled_examples = [
                    e for e in unlabeled_examples
                    if len(e.tgt_actions) <= args.decode_max_time_step
                ]

            unsup_encoder_loss, unsup_decoder_loss, meta_data = jae.get_unsupervised_loss(
                unlabeled_examples, args.moves)
            if bi_direction:
                unsup_encoder_loss_back, unsup_decoder_loss_back, meta_data_back = jae.get_unsupervised_loss_backward(
                    unlabeled_examples, args.moves)

            nan = False
            if nn_utils.isnan(sup_loss.data):
                print('Nan in sup_loss')
                nan = True
            if nn_utils.isnan(unsup_encoder_loss.data):
                print('Nan in unsup_encoder_loss!', file=sys.stderr)
                nan = True
            if nn_utils.isnan(unsup_decoder_loss.data):
                print('Nan in unsup_decoder_loss!', file=sys.stderr)
                nan = True
            if bi_direction:
                if nn_utils.isnan(unsup_encoder_loss_back.data):
                    print('Nan in unsup_encoder_loss_back!', file=sys.stderr)
                    nan = True
                if nn_utils.isnan(unsup_decoder_loss_back.data):
                    print('Nan in unsup_decoder_loss_back!', file=sys.stderr)
                    nan = True

            if nan:
                continue
            if bi_direction:
                report_unsup_encoder_loss += (
                    unsup_encoder_loss.sum().data[0] +
                    unsup_encoder_loss_back.sum().data[0])
                report_unsup_decoder_loss += (
                    unsup_decoder_loss.sum().data[0] +
                    unsup_decoder_loss_back.sum().data[0])
            else:
                report_unsup_encoder_loss += unsup_encoder_loss.sum().data[0]
                report_unsup_decoder_loss += unsup_decoder_loss.sum().data[0]
            report_unsup_examples += unsup_encoder_loss.size(0)

            if bi_direction:
                unsup_loss = torch.mean(unsup_encoder_loss) + torch.mean(
                    unsup_decoder_loss) + torch.mean(
                        unsup_encoder_loss_back) + torch.mean(
                            unsup_decoder_loss_back)
            else:
                unsup_loss = torch.mean(unsup_encoder_loss) + torch.mean(
                    unsup_decoder_loss)
            loss = sup_loss + args.unsup_loss_weight * unsup_loss

            loss.backward()
            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm(jae.parameters(),
                                                      args.clip_grad)
            optimizer.step()
            if train_iter % args.log_every == 0:
                print(
                    '[Iter %d] supervised: encoder loss=%.5f, decoder loss=%.5f'
                    % (train_iter, report_encoder_loss / report_examples,
                       report_decoder_loss / report_examples),
                    file=sys.stderr)

                print(
                    '[Iter %d] unsupervised: encoder loss=%.5f, decoder loss=%.5f, baseline loss=%.5f'
                    % (train_iter,
                       report_unsup_encoder_loss / report_unsup_examples,
                       report_unsup_decoder_loss / report_unsup_examples,
                       report_unsup_baseline_loss / report_unsup_examples),
                    file=sys.stderr)

                samples = meta_data['samples']
                for v in meta_data.values():
                    if isinstance(v, Variable): v.cpu()
                for i, sample in enumerate(samples[:1]):
                    print('\t[%s] Source: %s' %
                          (sample.idx, ' '.join(sample.src_sent)),
                          file=sys.stderr)
                    print('\t[%s] Code: \n%s' % (sample.idx, sample.tgt_code),
                          file=sys.stderr)
                    ref_example = [
                        e for e in unlabeled_examples
                        if e.idx == int(sample.idx[:sample.idx.index('-')])
                    ][0]
                    print('\t[%s] Gold Code: \n%s' %
                          (sample.idx, ref_example.tgt_code),
                          file=sys.stderr)
                    print(
                        '\t[%s] Log p(z|x): %f' %
                        (sample.idx, meta_data['encoding_scores'][i].data[0]),
                        file=sys.stderr)
                    print('\t[%s] Log p(x|z): %f' %
                          (sample.idx,
                           meta_data['reconstruction_scores'][i].data[0]),
                          file=sys.stderr)
                    print('\t[%s] Encoder Loss: %f' %
                          (sample.idx, unsup_encoder_loss[i].data[0]),
                          file=sys.stderr)
                    print('\t**************************', file=sys.stderr)

                report_encoder_loss = report_decoder_loss = report_examples = 0.
                report_unsup_encoder_loss = report_unsup_decoder_loss = report_unsup_baseline_loss = report_unsup_examples = 0.

        print('[Epoch %d] epoch elapsed %ds' %
              (epoch, time.time() - epoch_begin),
              file=sys.stderr)
        # perform validation
        print('[Epoch %d] begin validation' % epoch, file=sys.stderr)

        eval_start = time.time()
        eval_results = evaluation.evaluate(dev_set.examples,
                                           encoder,
                                           args,
                                           verbose=True)
        encoder.train()
        dev_acc = eval_results['accuracy']
        print('[Epoch %d] code generation accuracy=%.5f took %ds' %
              (epoch, dev_acc, time.time() - eval_start),
              file=sys.stderr)
        is_better = history_dev_scores == [] or dev_acc > max(
            history_dev_scores)
        history_dev_scores.append(dev_acc)

        if is_better:
            patience = 0
            model_file = args.save_to + '.bin'
            print('save currently the best model ..', file=sys.stderr)
            print('save model to [%s]' % model_file, file=sys.stderr)
            jae.save(model_file)
            # also save the optimizers' state
            torch.save(optimizer.state_dict(), args.save_to + '.optim.bin')
        elif epoch == args.max_epoch:
            print('reached max epoch, stop!', file=sys.stderr)
            exit(0)
        elif patience < args.patience:
            patience += 1
            print('hit patience %d' % patience, file=sys.stderr)

        if patience == args.patience:
            num_trial += 1
            print('hit #%d trial' % num_trial, file=sys.stderr)
            if num_trial == args.max_num_trial:
                print('early stop!', file=sys.stderr)
                exit(0)

            # decay lr, and restore from previously best checkpoint
            lr = optimizer.param_groups[0]['lr'] * args.lr_decay
            print('load previously best model and decay learning rate to %f' %
                  lr,
                  file=sys.stderr)

            # load best model's parameters
            jae.load_parameters(args.save_to + '.bin')
            if args.cuda: jae = jae.cuda()

            # load optimizers
            if args.reset_optimizer:
                print('reset to a new infer_optimizer', file=sys.stderr)
                optimizer = torch.optim.Adam(
                    [p for p in jae.parameters() if p.requires_grad], lr=lr)
            else:
                print('restore parameters of the optimizers', file=sys.stderr)
                optimizer.load_state_dict(
                    torch.load(args.save_to + '.optim.bin'))

            # set new lr
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

            # reset patience
            patience = 0
Example #15
            self.onFinished()


if __name__ == '__main__':
    from config import CONFIG
    from model.utils import load_json
    import pprint

    pp = pprint.PrettyPrinter(indent=4)

    # load project file to update
    project_filename = '../projects/test_saxs.json'
    project = load_json(project_filename)

    # parser and database
    parser = Parser(config=CONFIG['XML'])
    DB = DataBase(host=CONFIG['DB']['HOST'], port=CONFIG['DB']['PORT'])
    colCursor, fsCursor = DB.get_db('test_db', 'test_col')

    # initiate and run worker
    worker = Syncer(name='syncer',
                    project=project,
                    parser=parser,
                    colCursor=colCursor,
                    fsCursor=fsCursor,
                    extensions=['xml', 'jpg', 'tiff'],
                    interval=500)
    worker.start()

    while worker.t.is_alive():
        time.sleep(1)
Example #16
class DBHandler(object):
    def __init__(self,
                 rootDir,
                 fsmapFn,
                 db_host='localhost',
                 db_port=27017,
                 xml_config=None):
        self.rootDir = os.path.realpath(os.path.abspath(rootDir))
        self.fsmapFn = fsmapFn
        self.db_host = db_host
        self.db_port = db_port

        self.parser = Parser(xml_config) if xml_config is not None else None

        self.extensions = ['.xml', '.jpg', '.tiff']

        # to ensure safe operation on fsmap
        self.fsmap_lock = threading.Lock()

        self.fsMap = self._load()
        self._traverse()
        self._save()  # for debugging...

        # lazy connection to MongoDB server
        # Must ensure mongod is running!
        self.client = pymongo.MongoClient(self.db_host, self.db_port)
        self.clientPool = {}

        # streaming queues
        self.fs_event_q = Queue()
        self.stream_q = Queue()

        # Old map:
        # keeps the previous fsmap information when the file system is changed
        # manually, e.g. a folder is moved or renamed.
        # If this is not an empty dictionary, there is a bug.
        self._old_fsmap = {}

    def __del__(self):
        for _, h in self.clientPool.items():
            h.close()
        self.client.close()

    def _load(self):
        if not os.path.exists(self.fsmapFn): return {}

        try:
            with open(self.fsmapFn) as f:
                data = json.load(f)
        except (FileNotFoundError, TypeError, json.decoder.JSONDecodeError):
            print('[WARN] Failed to load saved fsmap, {}!!!'.format(
                self.fsmapFn))
            print('[WARN] The previous fsmap, if any, will be ignored.')
            return {}

        def __recursive_flatten(fsmap: dict, flattened: dict):
            item = dict(fsmap)
            # Children are stored as absolute paths in the flat map, matching
            # the format produced by _traverse().
            item['children'] = [
                __recursive_flatten(child, flattened)
                for child in item['children']
            ]
            flattened[item['path']] = item
            return item['path']

        t = {}
        for key, value in data.items():
            __recursive_flatten(value, t)
        return t

    def _save(self):
        def __convert_to_hierarchical_format(key: str, fsmap: dict):
            item = dict(fsmap[key])
            item['children'] = [
                __convert_to_hierarchical_format(c, fsmap)
                for c in item['children']
            ]
            return item

        t = {}
        p_keys = [
            key for key, value in self.fsMap.items() if value['parent'] is None
        ]

        for key in p_keys:
            t[key] = __convert_to_hierarchical_format(key, self.fsMap)
        with open(self.fsmapFn, 'w') as f:
            json.dump(t, f, indent=2, sort_keys=True)

    def _traverse(self, save_old=False):
        """Traverse root directory"""
        fsmap = {}
        for dirpath, _, _ in os.walk(self.rootDir, followlinks=True):
            path = dirpath.replace(self.rootDir, '')
            tokens = path.split(os.sep)[1:]
            parent_path = os.path.join(self.rootDir, *tokens[:-1])

            real_path = os.path.realpath(dirpath)
            if len(path) == 0:
                name = dirpath
                parent = None
            else:
                name = os.path.basename(path)
                fsmap[parent_path]['children'].append(dirpath)
                parent = fsmap[parent_path]['path']

            fsmap[dirpath] = {
                'path': dirpath,        # absolute path to the current directory
                'realpath': real_path,  # realpath for symlinks
                'name': name,           # name of the current directory for display
                'children': [],         # absolute paths of direct child directories
                'parent': parent,       # absolute path to the direct parent directory
                'link': None,           # linked path

                # Valid path flag.
                # It turns False if the given path no longer exists when compared
                # with the fsmap stored in the file.
                'valid': True,

                # This is set to True once a client sets the `db` field.
                # After that, the `db` field can only be modified manually via the
                # fsmap file, and such a modification requires restarting the web
                # server.
                'db': None,       # related database (db, collection)
                'fixed': False,   # can modify?

                # used for syncing
                'file': None,       # sample file name used to determine the group name
                'sep': None,        # separator used to parse the group name from the file
                'group': None,      # group name in this folder
                'last_sync': None,  # the last date and time a sync was applied
            }

        # update for symlink
        for key, value in fsmap.items():
            if not (key == value['realpath']):
                if value['realpath'] in fsmap:
                    fsmap[value['realpath']]['link'] = key
                    value['link'] = fsmap[value['realpath']]['path']

        # save unregistered fsmap from old one
        if save_old:
            for key, value in self.fsMap.items():
                if key not in fsmap:
                    self._old_fsmap[key] = dict(value)

        _keys_to_copy = [
            'valid', 'db', 'fixed', 'file', 'sep', 'group', 'last_sync'
        ]

        def __merge_fsmap(dstMap: dict, srcMap: dict):
            for _path, _srcItem in srcMap.items():
                if _path in dstMap:
                    # The parent must be the same because the key is the absolute
                    # path, but the children could differ (sub-directories may have
                    # been deleted, moved, or added); we do not care about that here.
                    _dstItem = dstMap[_path]
                    for _k in _keys_to_copy:
                        _dstItem[_k] = _srcItem[_k]
                else:
                    # This branch can happen when sub-directories are deleted, moved,
                    # or added. Keep the entry so it can be fixed manually in the
                    # json file.
                    _srcItem['children'] = []
                    _srcItem['parent'] = None
                    _srcItem['valid'] = False
                    #srcItem['inSync'] = False
                    dstMap[_path] = _srcItem

        __merge_fsmap(fsmap, self.fsMap)
        self.fsMap = fsmap

    def _update_fsmap(self, event_type, src_path, dst_path):
        """Invoked when filesystem changes (only for directory changes)"""
        with self.fsmap_lock:
            if event_type in ['created', 'deleted']:
                # on create and delete operation, refresh entire fsmap
                self._traverse()
                self._save()
            elif event_type in ['moved'] and dst_path is not None:
                # moved event includes 'rename' and 'relocate a folder'
                cp_key = ['db', 'file', 'fixed', 'group', 'last_sync', 'sep']

                self._traverse(True)
                if src_path in self._old_fsmap and dst_path in self.fsMap:
                    old_item = self._old_fsmap[src_path]
                    new_item = self.fsMap[dst_path]
                    for k, v in old_item.items():
                        if k in cp_key:
                            new_item[k] = v
                    del self._old_fsmap[src_path]
                else:
                    print('Error in handling DirMovedEvent: ', src_path,
                          dst_path)

    def _db_key(self, _db, _col, _fs):
        _key = '{:s}::{:s}::{:s}'.format(_db, _col, _fs)
        return _key

    def _db_key_list(self, path, recursive, isUnique=False):
        _key_list = []

        def __recursive_db(_path, fsmap):
            if _path not in fsmap: return

            _db = fsmap[_path]['db']
            if _db is None: return

            _key = self._db_key(_db[0], _db[1], _db[2])
            if not isUnique:
                _key_list.append((_path, _key))
            else:
                if _key not in _key_list:
                    _key_list.append(_key)

            if recursive:
                for _c_path in fsmap[_path]['children']:
                    __recursive_db(_c_path, fsmap)

        __recursive_db(path, self.fsMap)
        return _key_list

    def _get_db_handler(self, db_col_fs):
        _db, _col, _fs = db_col_fs
        _key = self._db_key(_db, _col, _fs)
        if _key in self.clientPool:
            return self.clientPool[_key]
        else:
            _h = MultiViewMongo(connection=self.client,
                                db_name=_db,
                                collection_name=_col,
                                fs_name=_fs)
            self.clientPool[_key] = _h
            return _h

    def _get_db_handler_by_key(self, key: str):
        if key in self.clientPool:
            return self.clientPool[key]
        else:
            tokens = key.split('::')
            _h = MultiViewMongo(connection=self.client,
                                db_name=tokens[0],
                                collection_name=tokens[1],
                                fs_name=tokens[2])
            self.clientPool[key] = _h
            return _h

    def _update_file(self, event_type, src_path, dst_path):
        """Invoked when files change
            By watchdog:
            By syncer:
        """
        if self.parser is None:
            print('parser is not set.')
            return None
        if dst_path is None:
            _path = src_path
            path, filename = os.path.split(src_path)
        else:
            _path = dst_path
            path, filename = os.path.split(dst_path)

        if len(filename) == 0:
            print('fail to detect filename.')
            return None

        ext = os.path.splitext(filename)[1]
        if len(ext) == 0 or ext not in self.extensions:
            print('Unsupported extension type. {:s}'.format(ext))
            return None

        if path not in self.fsMap:
            print("Path is not in fsmap. {:s}".format(path))
            return None
        if self.fsMap[path]['db'] is None:
            print("DB is not set on this path. {:s}".format(path))
            return None
        if self.fsMap[path]['group'] is None:
            print("Group name is not set to this path. {:s}".format(path))
            return None
        db = self.fsMap[path]['db']
        group = self.fsMap[path]['group']

        if event_type in ['created', 'modified', 'syncing', 'moved']:
            doc = self.parser.run(_path, ext, group)
            if doc is None:
                return None

            h = self._get_db_handler(db)
            if h.save_one(doc, ext) == 0:
                return None

            if ext == '.xml':
                query = {"sample": group, "item": doc['item']}
                res = h.load(query=query, fields={}, getarrays=False)
                res = self.after_query(res)
                return json.dumps(res)

        elif event_type in ['deleted']:
            # currently we do not delete any document in the db (should we?)
            pass
        else:
            # unknown event_type
            pass

        return None

    def _add_fs_event(self, what, event_type, src_path, dst_path):
        """Invoked by observer and syncers"""
        self.fs_event_q.put((what, event_type, src_path, dst_path))

    def get_fsmap_as_list(self):
        """
        Return the latest file system information.
        The file system is always scanned first to detect any changes made to it
        by someone else.
        """
        with self.fsmap_lock:
            self._traverse()
            fsmap_list = [[key, value] for key, value in self.fsMap.items()
                          if value['valid']]
        return fsmap_list

    def set_fsmap(self, fsmap_list):
        """Used to set db config by a client"""
        with self.fsmap_lock:
            for path, value in fsmap_list:
                # path is not found
                # (can happen when file system is manually changed)
                if path not in self.fsMap: continue

                # db is already set by other clients, ignore this.
                # Only administrator can change this manually.
                if self.fsMap[path]['fixed']: continue

                # check db config a client set
                if value['db'] is None: continue  # db is not set
                if len(value['db']) != 3: continue  # must be 3-D array

                new_db = value['db'][0]
                new_col = value['db'][1]
                if len(new_db) == 0 or len(new_col) == 0:
                    continue  # incomplete setting
                if new_db == 'null' or new_col == 'null':
                    continue  # incomplete setting

                # update db config
                item = self.fsMap[path]
                item['db'] = [new_db, new_col, 'fs']
                item['fixed'] = True

            self._save()

    # def get_sync_samples(self, path, recursive):
    #     """
    #     This is called to initiate syncing operation.
    #     Args:
    #         path:
    #         recursive:
    #
    #     Returns:
    #
    #     """
    #     if path not in self.fsMap: return []
    #     if not os.path.exists(path): return []
    #
    #     sample_files = {}
    #     for dirpath, _, files in os.walk(path, followlinks=True):
    #         for f in files:
    #             name, ext = os.path.splitext(f)
    #             if ext in self.extensions:
    #                 sample_files[dirpath] = name
    #                 break
    #
    #         if not recursive: break
    #     return sample_files

    # def set_sync_info(self, info:dict):
    #     """update `inSync` and `sep` fields in fsmap"""
    #
    #     with self.fsmap_lock:
    #         responses = {}
    #         for path, sep in info.items():
    #             resp = {
    #                 'valid': Syncer.CAN_SYNC
    #             }
    #             if path in self.fsMap:
    #                 item = self.fsMap[path]
    #                 if item['inSync']:
    #                     resp['valid'] = Syncer.CANNOT_SYNC
    #                 elif item['db'] is None or len(item['db']) != 3:
    #                     resp['valid'] = Syncer.NO_DB
    #                 else:
    #                     item['inSync'] = True
    #                     item['sep'] = sep
    #             else:
    #                 resp['valid'] = Syncer.NO_PATH
    #             responses[path] = resp
    #
    #         self._save()
    #
    #     return responses

    # def run_syncer(self, resp:dict):
    #     """run syncer, some information will be added to resp"""
    #
    #     files_to_sync = []
    #     for path, info in resp.items():
    #         if info['valid']:
    #             item = {
    #                 'path': path,
    #                 'files': [],
    #                 'client': self.get_client(self.get_db(path))
    #             }
    #             for _, _, files in os.walk(path):
    #                 item['files'] = [f for f in files
    #                                  if os.path.splitext(f)[1] in self.extensions]
    #                 break
    #             files_to_sync.append(item)
    #             info['total'] = len(item['files'])
    #         else:
    #             info['total'] = 0
    #         info['progressed'] = 0
    #
    #     # create syncer
    #     syncer_id = Syncer.generate_syncer_id()
    #     #syncer = Syncer(items_to_sync=files_to_sync)
    #
    #     # update pool
    #     #self.syncerPool[syncer_id] = syncer
    #
    #     # run syncer
    #     #syncer.start()
    #
    #     return syncer_id, resp

    # def get_client(self, db_collection_fs):
    #     if db_collection_fs is None or len(db_collection_fs) != 3:
    #         return None
    #
    #     db = db_collection_fs[0]
    #     col = db_collection_fs[1]
    #     fs = db_collection_fs[2]
    #     key = '{}:{}:{}'.format(db, col, fs)
    #     if key in self.clientPool:
    #         h = self.clientPool[key]
    #     else:
    #         h = MultiViewMongo(
    #             connection=self.client,
    #             db_name=db,
    #             collection_name=col,
    #             fs_name=fs
    #         )
    #         self.clientPool[key] = h
    #     return h

    # def set_db(self, path, db, col):
    #     if path not in self.fsMap:
    #         return False
    #
    #     def __recursive_update(key: str, fsmap: dict):
    #         item = fsmap[key]
    #         if item['db'] is None: item['db'] = [db, col, 'fs']
    #         for child in item['children']:
    #             __recursive_update(child, fsmap)
    #
    #     # update db setting recursively
    #     # If a path is already set before (or maybe by other client),
    #     # it didn't modify it. Given path may be not set as a client wants.
    #     with self.fsmap_lock:
    #         __recursive_update(path, self.fsMap)
    #         self._save()
    #
    #     return True

    # def get_db(self, path):
    #     db = None
    #     with self.fsmap_lock:
    #         if path in self.fsMap:
    #             db = self.fsMap[path]['db']
    #     return db

    def after_query(self, res):
        """Post processor on queried results"""
        if not isinstance(res, list):
            res = [res]

        res = [replace_objid_to_str(doc) for doc in res]
        res = [flatten_dict(doc) for doc in res]
        # for doc in res:
        #     doc['sample'] = '[{:s}][{:s}]{:s}'.format(db, col, doc['sample'])
        #     doc['_id'] = '[{:s}][{:s}]{:s}'.format(db, col, doc['_id'])

        return res

    def get_samplelist(self, path, recursive):
        if path not in self.fsMap:
            return []

        samplelist = {}

        db_key_list = self._db_key_list(path, recursive)
        _db_list = self.client.list_database_names()
        for _path, _key in db_key_list:
            _db, _col, _fs = _key.split("::")

            if _db not in _db_list:
                continue

            _col_list = self.client[_db].collection_names()
            if _col not in _col_list:
                continue

            h = self._get_db_handler_by_key(_key)
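            # Aggregation pipeline: keep documents under this path whose
            # "sample" field exists and is non-null, then count documents
            # per sample name.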
            pipeline = [{
                "$match": {
                    "path": _path
                }
            }, {
                "$match": {
                    "sample": {
                        "$exists": True,
                        "$ne": None
                    }
                }
            }, {
                "$group": {
                    "_id": "$sample",
                    "count": {
                        "$sum": 1
                    }
                }
            }]
            res = list(h.collection.aggregate(pipeline))

            for r in res:
                _id = r['_id']
                _count = r['count']

                if _id in samplelist:
                    samplelist[_id] += _count
                else:
                    samplelist[_id] = _count

        return samplelist

    def get_samples(self, names, path, recursive):
        if path not in self.fsMap:
            return {}

        sampleData = {}
        db_key_list = self._db_key_list(path, recursive, False)
        _db_list = self.client.list_database_names()
        for _path, _key in db_key_list:
            _db, _col, _fs = _key.split("::")

            if _db not in _db_list:
                continue

            _col_list = self.client[_db].collection_names()
            if _col not in _col_list:
                continue

            h = self._get_db_handler_by_key(_key)
            for name in names:
                query = {"sample": name, "path": _path}
                res = h.load(query=query, fields={}, getarrays=False)

                if res is None:
                    continue

                res = self.after_query(res)

                if name in sampleData:
                    sampleData[name].append(res)
                else:
                    sampleData[name] = res
        return sampleData

    def get_tiff(self, id, path):
        if path not in self.fsMap:
            return []

        if self.fsMap[path]['db'] is None:
            return []

        db = self.fsMap[path]['db']
        h = self._get_db_handler(db)

        try:
            _id = ObjectId(id)
        except InvalidId:
            return []

        query = {'_id': _id, 'tiff': {'$exists': True}}
        fields = {'tiff': 1, '_id': 0}
        res = h.load(query, fields, getarrays=True)

        if res is None:
            return []

        data = res['tiff']['data']
        res['tiff']['data'] = data.tolist()
        return res['tiff']
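
A minimal instantiation sketch for DBHandler; the directory, fsmap path, and CONFIG['XML'] parser config are placeholders (CONFIG is borrowed from the __main__ block in Example #15), and a running mongod instance is assumed:

handler = DBHandler(rootDir='/data/experiments',
                    fsmapFn='fsmap.json',
                    db_host='localhost',
                    db_port=27017,
                    xml_config=CONFIG['XML'])
for path, info in handler.get_fsmap_as_list():
    print(path, info['db'])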
Example #17
def train(args):
    grammar = ASDLGrammar.from_text(open(args.asdl_file).read())
    transition_system = TransitionSystem.get_class_by_lang(args.lang)(grammar)
    train_set = Dataset.from_bin_file(args.train_file)
    dev_set = Dataset.from_bin_file(args.dev_file)
    vocab = pickle.load(open(args.vocab))

    model = Parser(args, vocab, transition_system)
    model.train()
    if args.cuda: model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    print('begin training, %d training examples, %d dev examples' % (len(train_set), len(dev_set)), file=sys.stderr)
    print('vocab: %s' % repr(vocab), file=sys.stderr)

    epoch = train_iter = 0
    report_loss = report_examples = 0.
    history_dev_scores = []
    num_trial = patience = 0
    while True:
        epoch += 1
        epoch_begin = time.time()

        for batch_examples in train_set.batch_iter(batch_size=args.batch_size, shuffle=True):
            batch_examples = [e for e in batch_examples if len(e.tgt_actions) <= args.decode_max_time_step]

            train_iter += 1
            optimizer.zero_grad()

            loss = -model.score(batch_examples)
            # print(loss.data)
            loss_val = torch.sum(loss).data[0]
            report_loss += loss_val
            report_examples += len(batch_examples)
            loss = torch.mean(loss)

            loss.backward()

            # clip gradient
            if args.clip_grad > 0.:
                grad_norm = torch.nn.utils.clip_grad_norm(model.parameters(), args.clip_grad)

            optimizer.step()

            if train_iter % args.log_every == 0:
                print('[Iter %d] encoder loss=%.5f' %
                      (train_iter,
                       report_loss / report_examples),
                      file=sys.stderr)

                report_loss = report_examples = 0.

        print('[Epoch %d] epoch elapsed %ds' % (epoch, time.time() - epoch_begin), file=sys.stderr)
        # model_file = args.save_to + '.iter%d.bin' % train_iter
        # print('save model to [%s]' % model_file, file=sys.stderr)
        # model.save(model_file)

        # perform validation
        print('[Epoch %d] begin validation' % epoch, file=sys.stderr)
        eval_start = time.time()
        eval_results = evaluation.evaluate(dev_set.examples, model, args, verbose=True)
        dev_acc = eval_results['accuracy']
        print('[Epoch %d] code generation accuracy=%.5f took %ds' % (epoch, dev_acc, time.time() - eval_start), file=sys.stderr)
        is_better = history_dev_scores == [] or dev_acc > max(history_dev_scores)
        history_dev_scores.append(dev_acc)

        if is_better:
            patience = 0
            model_file = args.save_to + '.bin'
            print('save currently the best model ..', file=sys.stderr)
            print('save model to [%s]' % model_file, file=sys.stderr)
            model.save(model_file)
            # also save the optimizers' state
            torch.save(optimizer.state_dict(), args.save_to + '.optim.bin')
        elif epoch == args.max_epoch:
            print('reached max epoch, stop!', file=sys.stderr)
            exit(0)
        elif patience < args.patience:
            patience += 1
            print('hit patience %d' % patience, file=sys.stderr)

        if patience == args.patience:
            num_trial += 1
            print('hit #%d trial' % num_trial, file=sys.stderr)
            if num_trial == args.max_num_trial:
                print('early stop!', file=sys.stderr)
                exit(0)

            # decay lr, and restore from previously best checkpoint
            lr = optimizer.param_groups[0]['lr'] * args.lr_decay
            print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

            # load model
            params = torch.load(args.save_to + '.bin', map_location=lambda storage, loc: storage)
            model.load_state_dict(params['state_dict'])
            if args.cuda: model = model.cuda()

            # load optimizers
            if args.reset_optimizer:
                print('reset optimizer', file=sys.stderr)
                optimizer = torch.optim.Adam(model.inference_model.parameters(), lr=lr)
            else:
                print('restore parameters of the optimizers', file=sys.stderr)
                optimizer.load_state_dict(torch.load(args.save_to + '.optim.bin'))

            # set new lr
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

            # reset patience
            patience = 0
Example #18
def train_semi(args):
    encoder_params = torch.load(args.load_model, map_location=lambda storage, loc: storage)
    decoder_params = torch.load(args.load_decoder, map_location=lambda storage, loc: storage)

    print('loaded encoder at %s' % args.load_model, file=sys.stderr)
    print('loaded decoder at %s' % args.load_decoder, file=sys.stderr)

    transition_system = encoder_params['transition_system']
    encoder_params['args'].cuda = decoder_params['args'].cuda = args.cuda

    encoder = Parser(encoder_params['args'], encoder_params['vocab'], transition_system)
    encoder.load_state_dict(encoder_params['state_dict'])
    decoder = Reconstructor(decoder_params['args'], decoder_params['vocab'], transition_system)
    decoder.load_state_dict(decoder_params['state_dict'])

    if args.prior == 'lstm':
        prior = LSTMPrior.load(args.load_prior, transition_system=transition_system, cuda=args.cuda)
        print('loaded prior at %s' % args.load_prior, file=sys.stderr)
        # freeze prior parameters
        for p in prior.parameters():
            p.requires_grad = False
        prior.eval()
    else:
        prior = UniformPrior()

    if args.baseline == 'mlp':
        structVAE = StructVAE(encoder, decoder, prior, args)
    elif args.baseline == 'src_lm' or args.baseline == 'src_lm_and_linear':
        src_lm = LSTMLanguageModel.load(args.load_src_lm)
        print('loaded source LM at %s' % args.load_src_lm, file=sys.stderr)
        vae_cls = StructVAE_LMBaseline if args.baseline == 'src_lm' else StructVAE_SrcLmAndLinearBaseline
        structVAE = vae_cls(encoder, decoder, prior, src_lm, args)
    else:
        raise ValueError('unknown baseline')

    structVAE.train()
    if args.cuda: structVAE.cuda()

    labeled_data = Dataset.from_bin_file(args.train_file)
    # labeled_data.examples = labeled_data.examples[:10]
    unlabeled_data = Dataset.from_bin_file(args.unlabeled_file)   # pretend they are un-labeled!
    dev_set = Dataset.from_bin_file(args.dev_file)
    # dev_set.examples = dev_set.examples[:10]

    optimizer = torch.optim.Adam(ifilter(lambda p: p.requires_grad, structVAE.parameters()), lr=args.lr)

    print('*** begin semi-supervised training %d labeled examples, %d unlabeled examples ***' %
          (len(labeled_data), len(unlabeled_data)), file=sys.stderr)
    report_encoder_loss = report_decoder_loss = report_src_sent_words_num = report_tgt_query_words_num = report_examples = 0.
    report_unsup_examples = report_unsup_encoder_loss = report_unsup_decoder_loss = report_unsup_baseline_loss = 0.
    patience = 0
    num_trial = 1
    epoch = train_iter = 0
    history_dev_scores = []
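    # training loop: each labeled batch is paired with one unlabeled batch, and
    # the supervised and unsupervised (StructVAE) losses are combined per step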
    while True:
        epoch += 1
        epoch_begin = time.time()
        unlabeled_examples_iter = unlabeled_data.batch_iter(batch_size=args.unsup_batch_size, shuffle=True)

        for labeled_examples in labeled_data.batch_iter(batch_size=args.batch_size, shuffle=True):
            labeled_examples = [e for e in labeled_examples if len(e.tgt_actions) <= args.decode_max_time_step]

            train_iter += 1
            optimizer.zero_grad()
            report_examples += len(labeled_examples)

            sup_encoder_loss = -encoder.score(labeled_examples)
            sup_decoder_loss = -decoder.score(labeled_examples)
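            # sup_encoder_loss / sup_decoder_loss are per-example negative
            # log-likelihoods of the parser p(z|x) and the reconstructor p(x|z)
            # on the labeled batch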

            report_encoder_loss += sup_encoder_loss.sum().data[0]
            report_decoder_loss += sup_decoder_loss.sum().data[0]

            sup_encoder_loss = torch.mean(sup_encoder_loss)
            sup_decoder_loss = torch.mean(sup_decoder_loss)

            sup_loss = sup_encoder_loss + sup_decoder_loss

            # compute unsupervised loss
            try:
                unlabeled_examples = next(unlabeled_examples_iter)
            except StopIteration:
                # if finished unlabeled data stream, restart it
                unlabeled_examples_iter = unlabeled_data.batch_iter(batch_size=args.batch_size, shuffle=True)
                unlabeled_examples = next(unlabeled_examples_iter)
                unlabeled_examples = [e for e in unlabeled_examples if len(e.tgt_actions) <= args.decode_max_time_step]

            try:
                unsup_encoder_loss, unsup_decoder_loss, unsup_baseline_loss, meta_data = structVAE.get_unsupervised_loss(
                    unlabeled_examples)

                nan = False
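                # guard: if any of the loss terms became NaN, skip this batch entirely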
                if nn_utils.isnan(sup_loss.data):
                    print('Nan in sup_loss!', file=sys.stderr)
                    nan = True
                if nn_utils.isnan(unsup_encoder_loss.data):
                    print('Nan in unsup_encoder_loss!', file=sys.stderr)
                    nan = True
                if nn_utils.isnan(unsup_decoder_loss.data):
                    print('Nan in unsup_decoder_loss!', file=sys.stderr)
                    nan = True
                if nn_utils.isnan(unsup_baseline_loss.data):
                    print('Nan in unsup_baseline_loss!', file=sys.stderr)
                    nan = True

                if nan:
                    # torch.save((unsup_encoder_loss, unsup_decoder_loss, unsup_baseline_loss, meta_data), 'nan_data.bin')
                    continue

                report_unsup_encoder_loss += unsup_encoder_loss.sum().data[0]
                report_unsup_decoder_loss += unsup_decoder_loss.sum().data[0]
                report_unsup_baseline_loss += unsup_baseline_loss.sum().data[0]
                report_unsup_examples += unsup_encoder_loss.size(0)
            except ValueError as e:
                print(e.message, file=sys.stderr)
                continue
            # except Exception as e:
            #     print('********** Error **********', file=sys.stderr)
            #     print('batch labeled examples: ', file=sys.stderr)
            #     for example in labeled_examples:
            #         print('%s %s' % (example.idx, ' '.join(example.src_sent)), file=sys.stderr)
            #     print('batch unlabeled examples: ', file=sys.stderr)
            #     for example in unlabeled_examples:
            #         print('%s %s' % (example.idx, ' '.join(example.src_sent)), file=sys.stderr)
            #     print(e.message, file=sys.stderr)
            #     traceback.print_exc(file=sys.stderr)
            #     for k, v in meta_data.iteritems():
            #         print('%s: %s' % (k, v), file=sys.stderr)
            #     print('********** Error **********', file=sys.stderr)
            #     continue

            unsup_loss = torch.mean(unsup_encoder_loss) + torch.mean(unsup_decoder_loss) + torch.mean(unsup_baseline_loss)

            loss = sup_loss + args.unsup_loss_weight * unsup_loss
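            # joint objective: supervised loss plus the StructVAE unsupervised
            # loss scaled by args.unsup_loss_weight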

            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm(structVAE.parameters(), args.clip_grad)
            optimizer.step()

            if train_iter % args.log_every == 0:
                print('[Iter %d] supervised: encoder loss=%.5f, decoder loss=%.5f' %
                      (train_iter,
                       report_encoder_loss / report_examples,
                       report_decoder_loss / report_examples),
                      file=sys.stderr)

                print('[Iter %d] unsupervised: encoder loss=%.5f, decoder loss=%.5f, baseline loss=%.5f' %
                      (train_iter,
                       report_unsup_encoder_loss / report_unsup_examples,
                       report_unsup_decoder_loss / report_unsup_examples,
                       report_unsup_baseline_loss / report_unsup_examples),
                      file=sys.stderr)

                # print('[Iter %d] unsupervised: baseline=%.5f, raw learning signal=%.5f, learning signal=%.5f' % (train_iter,
                #                                                                        meta_data['baseline'].mean().data[0],
                #                                                                        meta_data['raw_learning_signal'].mean().data[0],
                #                                                                        meta_data['learning_signal'].mean().data[0]), file=sys.stderr)

                if isinstance(structVAE, StructVAE_LMBaseline):
                    print('[Iter %d] baseline: source LM b_lm_weight: %.3f, b: %.3f' % (train_iter,
                                                                                        structVAE.b_lm_weight.data[0],
                                                                                        structVAE.b.data[0]),
                          file=sys.stderr)

                samples = meta_data['samples']
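                # move logged tensors to CPU and print per-sample diagnostics
                # (encoding/reconstruction scores, KL term, prior, baseline, learning signal)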
                for v in meta_data.itervalues():
                    if isinstance(v, Variable): v.cpu()
                for i, sample in enumerate(samples[:15]):
                    print('\t[%s] Source: %s' % (sample.idx, ' '.join(sample.src_sent)), file=sys.stderr)
                    print('\t[%s] Code: \n%s' % (sample.idx, sample.tgt_code), file=sys.stderr)
                    ref_example = [e for e in unlabeled_examples if e.idx == int(sample.idx[:sample.idx.index('-')])][0]
                    print('\t[%s] Gold Code: \n%s' % (sample.idx, ref_example.tgt_code), file=sys.stderr)
                    print('\t[%s] Log p(z|x): %f' % (sample.idx, meta_data['encoding_scores'][i].data[0]), file=sys.stderr)
                    print('\t[%s] Log p(x|z): %f' % (sample.idx, meta_data['reconstruction_scores'][i].data[0]), file=sys.stderr)
                    print('\t[%s] KL term: %f' % (sample.idx, meta_data['kl_term'][i].data[0]), file=sys.stderr)
                    print('\t[%s] Prior: %f' % (sample.idx, meta_data['prior'][i].data[0]), file=sys.stderr)
                    print('\t[%s] baseline: %f' % (sample.idx, meta_data['baseline'][i].data[0]), file=sys.stderr)
                    print('\t[%s] Raw Learning Signal: %f' % (sample.idx, meta_data['raw_learning_signal'][i].data[0]), file=sys.stderr)
                    print('\t[%s] Learning Signal - baseline: %f' % (sample.idx, meta_data['learning_signal'][i].data[0]), file=sys.stderr)
                    print('\t[%s] Encoder Loss: %f' % (sample.idx, unsup_encoder_loss[i].data[0]), file=sys.stderr)
                    print('\t**************************', file=sys.stderr)

                report_encoder_loss = report_decoder_loss = report_examples = 0.
                report_unsup_encoder_loss = report_unsup_decoder_loss = report_unsup_baseline_loss = report_unsup_examples = 0.

        print('[Epoch %d] epoch elapsed %ds' % (epoch, time.time() - epoch_begin), file=sys.stderr)
        # perform validation
        print('[Epoch %d] begin validation' % epoch, file=sys.stderr)

        eval_start = time.time()
        eval_results = evaluation.evaluate(dev_set.examples, encoder, args, verbose=True)
        dev_acc = eval_results['accuracy']
        print('[Epoch %d] code generation accuracy=%.5f took %ds' % (epoch, dev_acc, time.time() - eval_start),
              file=sys.stderr)
        is_better = history_dev_scores == [] or dev_acc > max(history_dev_scores)
        history_dev_scores.append(dev_acc)

        # model_file = args.save_to + '.iter%d.bin' % train_iter
        # print('save model to [%s]' % model_file, file=sys.stderr)
        # structVAE.save(model_file)

        if is_better:
            patience = 0
            model_file = args.save_to + '.bin'
            print('save currently the best model ..', file=sys.stderr)
            print('save model to [%s]' % model_file, file=sys.stderr)
            structVAE.save(model_file)
            # also save the optimizers' state
            torch.save(optimizer.state_dict(), args.save_to + '.optim.bin')
        elif epoch == args.max_epoch:
            print('reached max epoch, stop!', file=sys.stderr)
            exit(0)
        elif patience < args.patience:
            patience += 1
            print('hit patience %d' % patience, file=sys.stderr)

        if patience == args.patience:
            num_trial += 1
            print('hit #%d trial' % num_trial, file=sys.stderr)
            if num_trial == args.max_num_trial:
                print('early stop!', file=sys.stderr)
                exit(0)

            # decay lr, and restore from previously best checkpoint
            lr = optimizer.param_groups[0]['lr'] * args.lr_decay
            print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

            # load best model's parameters
            structVAE.load_parameters(args.save_to + '.bin')
            if args.cuda: structVAE = structVAE.cuda()

            # load optimizers
            if args.reset_optimizer:
                print('reset to a new infer_optimizer', file=sys.stderr)
                optimizer = torch.optim.Adam(ifilter(lambda p: p.requires_grad, structVAE.parameters()), lr=lr)
            else:
                print('restore parameters of the optimizers', file=sys.stderr)
                optimizer.load_state_dict(torch.load(args.save_to + '.optim.bin'))

            # set new lr
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

            # reset patience
            patience = 0
Example #19
def log_semi(args):
    print('loading VAE at %s' % args.load_model, file=sys.stderr)
    fname, ext = os.path.splitext(args.load_model)
    encoder_path = fname + '.encoder' + ext
    decoder_path = fname + '.decoder' + ext

    vae_params = torch.load(args.load_model, map_location=lambda storage, loc: storage)
    encoder_params = torch.load(encoder_path, map_location=lambda storage, loc: storage)
    decoder_params = torch.load(decoder_path, map_location=lambda storage, loc: storage)

    transition_system = encoder_params['transition_system']
    vae_params['args'].cuda = encoder_params['args'].cuda = decoder_params['args'].cuda = args.cuda

    encoder = Parser(encoder_params['args'], encoder_params['vocab'], transition_system)
    decoder = Reconstructor(decoder_params['args'], decoder_params['vocab'], transition_system)

    if vae_params['args'].prior == 'lstm':
        prior = LSTMPrior.load(vae_params['args'].load_prior, transition_system=transition_system, cuda=args.cuda)
        print('loaded prior at %s' % vae_params['args'].load_prior, file=sys.stderr)
        # freeze prior parameters
        for p in prior.parameters():
            p.requires_grad = False
        prior.eval()
    else:
        prior = UniformPrior()

    if vae_params['args'].baseline == 'mlp':
        structVAE = StructVAE(encoder, decoder, prior, vae_params['args'])
    elif vae_params['args'].baseline == 'src_lm' or vae_params['args'].baseline == 'src_lm_and_linear':
        src_lm = LSTMLanguageModel.load(vae_params['args'].load_src_lm)
        print('loaded source LM at %s' % vae_params['args'].load_src_lm, file=sys.stderr)
        vae_cls = StructVAE_LMBaseline if vae_params['args'].baseline == 'src_lm' else StructVAE_SrcLmAndLinearBaseline
        structVAE = vae_cls(encoder, decoder, prior, src_lm, vae_params['args'])
    else:
        raise ValueError('unknown baseline')

    structVAE.load_parameters(args.load_model)
    structVAE.train()
    if args.cuda: structVAE.cuda()

    unlabeled_data = Dataset.from_bin_file(args.unlabeled_file)  # pretend they are un-labeled!

    print('*** begin sampling ***', file=sys.stderr)
    start_time = time.time()
    train_iter = 0
    log_entries = []
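    # scoring-only pass: sample parses for each unlabeled example and record the
    # per-sample ELBO components; no parameters are updated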
    for unlabeled_examples in unlabeled_data.batch_iter(batch_size=args.batch_size, shuffle=False):
        unlabeled_examples = [e for e in unlabeled_examples if len(e.tgt_actions) <= args.decode_max_time_step]

        train_iter += 1
        try:
            unsup_encoder_loss, unsup_decoder_loss, unsup_baseline_loss, meta_data = structVAE.get_unsupervised_loss(
                unlabeled_examples)

        except ValueError as e:
            print(e.message, file=sys.stderr)
            continue

        samples = meta_data['samples']
        for v in meta_data.itervalues():
            if isinstance(v, Variable): v.cpu()

        for i, sample in enumerate(samples):
            ref_example = [e for e in unlabeled_examples if e.idx == int(sample.idx[:sample.idx.index('-')])][0]
            log_entry = {
                'sample': sample,
                'ref_example': ref_example,
                'log_p_z_x': meta_data['encoding_scores'][i].data[0],
                'log_p_x_z': meta_data['reconstruction_scores'][i].data[0],
                'kl': meta_data['kl_term'][i].data[0],
                'prior': meta_data['prior'][i].data[0],
                'baseline': meta_data['baseline'][i].data[0],
                'learning_signal': meta_data['raw_learning_signal'][i].data[0],
                'learning_signal - baseline': meta_data['learning_signal'][i].data[0],
                'encoder_loss': unsup_encoder_loss[i].data[0],
                'decoder_loss': unsup_decoder_loss[i].data[0]
            }

            log_entries.append(log_entry)

    print('done! took %d s' % (time.time() - start_time), file=sys.stderr)
    pkl.dump(log_entries, open(args.save_to, 'wb'))
Example #20
def self_training(args):
    """Perform self-training

    First load decoding results on disjoint data
    also load pre-trained model and perform supervised
    training on both existing training data and the
    decoded results
    """

    print('load pre-trained model from [%s]' % args.load_model,
          file=sys.stderr)
    params = torch.load(args.load_model,
                        map_location=lambda storage, loc: storage)
    vocab = params['vocab']
    transition_system = params['transition_system']
    saved_args = params['args']
    saved_state = params['state_dict']

    # transfer arguments
    saved_args.cuda = args.cuda
    saved_args.save_to = args.save_to
    saved_args.train_file = args.train_file
    saved_args.unlabeled_file = args.unlabeled_file
    saved_args.dev_file = args.dev_file
    saved_args.load_decode_results = args.load_decode_results
    args = saved_args

    update_args(args)

    model = Parser(saved_args, vocab, transition_system)
    model.load_state_dict(saved_state)

    if args.cuda: model = model.cuda()
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    print('load unlabeled data [%s]' % args.unlabeled_file, file=sys.stderr)
    unlabeled_data = Dataset.from_bin_file(args.unlabeled_file)

    print('load decoding results of unlabeled data [%s]' %
          args.load_decode_results,
          file=sys.stderr)
    decode_results = pickle.load(open(args.load_decode_results, 'rb'))

    labeled_data = Dataset.from_bin_file(args.train_file)
    dev_set = Dataset.from_bin_file(args.dev_file)

    print('Num. examples in unlabeled data: %d' % len(unlabeled_data),
          file=sys.stderr)
    assert len(unlabeled_data) == len(decode_results)
    self_train_examples = []
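    # pseudo-label the unlabeled data: the top-scoring decoding hypothesis of
    # each example is treated as its gold target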
    for example, hyps in zip(unlabeled_data, decode_results):
        if hyps:
            hyp = hyps[0]
            sampled_example = Example(idx='self_train-%s' % example.idx,
                                      src_sent=example.src_sent,
                                      tgt_code=hyp.code,
                                      tgt_actions=hyp.action_infos,
                                      tgt_ast=hyp.tree)
            self_train_examples.append(sampled_example)
    print('Num. self training examples: %d, Num. labeled examples: %d' %
          (len(self_train_examples), len(labeled_data)),
          file=sys.stderr)

    train_set = Dataset(examples=labeled_data.examples + self_train_examples)
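    # the final training set mixes gold-labeled and pseudo-labeled examples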

    print('begin training, %d training examples, %d dev examples' %
          (len(train_set), len(dev_set)),
          file=sys.stderr)
    print('vocab: %s' % repr(vocab), file=sys.stderr)

    epoch = train_iter = 0
    report_loss = report_examples = 0.
    history_dev_scores = []
    num_trial = patience = 0
    while True:
        epoch += 1
        epoch_begin = time.time()

        for batch_examples in train_set.batch_iter(batch_size=args.batch_size,
                                                   shuffle=True):
            batch_examples = [
                e for e in batch_examples
                if len(e.tgt_actions) <= args.decode_max_time_step
            ]

            train_iter += 1
            optimizer.zero_grad()

            loss = -model.score(batch_examples)
            # print(loss.data)
            loss_val = torch.sum(loss).data[0]
            report_loss += loss_val
            report_examples += len(batch_examples)
            loss = torch.mean(loss)

            loss.backward()

            # clip gradient
            if args.clip_grad > 0.:
                grad_norm = torch.nn.utils.clip_grad_norm(
                    model.parameters(), args.clip_grad)

            optimizer.step()

            if train_iter % args.log_every == 0:
                print('[Iter %d] encoder loss=%.5f' %
                      (train_iter, report_loss / report_examples),
                      file=sys.stderr)

                report_loss = report_examples = 0.

        print('[Epoch %d] epoch elapsed %ds' %
              (epoch, time.time() - epoch_begin),
              file=sys.stderr)
        # model_file = args.save_to + '.iter%d.bin' % train_iter
        # print('save model to [%s]' % model_file, file=sys.stderr)
        # model.save(model_file)

        # perform validation
        print('[Epoch %d] begin validation' % epoch, file=sys.stderr)
        eval_start = time.time()
        eval_results = evaluation.evaluate(dev_set.examples,
                                           model,
                                           args,
                                           verbose=True)
        dev_acc = eval_results['accuracy']
        print('[Epoch %d] code generation accuracy=%.5f took %ds' %
              (epoch, dev_acc, time.time() - eval_start),
              file=sys.stderr)
        is_better = history_dev_scores == [] or dev_acc > max(
            history_dev_scores)
        history_dev_scores.append(dev_acc)

        if is_better:
            patience = 0
            model_file = args.save_to + '.bin'
            print('save currently the best model ..', file=sys.stderr)
            print('save model to [%s]' % model_file, file=sys.stderr)
            model.save(model_file)
            # also save the optimizers' state
            torch.save(optimizer.state_dict(), args.save_to + '.optim.bin')
        elif epoch == args.max_epoch:
            print('reached max epoch, stop!', file=sys.stderr)
            exit(0)
        elif patience < args.patience:
            patience += 1
            print('hit patience %d' % patience, file=sys.stderr)

        if patience == args.patience:
            num_trial += 1
            print('hit #%d trial' % num_trial, file=sys.stderr)
            if num_trial == args.max_num_trial:
                print('early stop!', file=sys.stderr)
                exit(0)

            # decay lr, and restore from previously best checkpoint
            lr = optimizer.param_groups[0]['lr'] * args.lr_decay
            print('load previously best model and decay learning rate to %f' %
                  lr,
                  file=sys.stderr)

            # load model
            params = torch.load(args.save_to + '.bin',
                                map_location=lambda storage, loc: storage)
            model.load_state_dict(params['state_dict'])
            if args.cuda: model = model.cuda()

            # load optimizers
            if args.reset_optimizer:
                print('reset optimizer', file=sys.stderr)
                optimizer = torch.optim.Adam(
                    model.inference_model.parameters(), lr=lr)
            else:
                print('restore parameters of the optimizers', file=sys.stderr)
                optimizer.load_state_dict(
                    torch.load(args.save_to + '.optim.bin'))

            # set new lr
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

            # reset patience
            patience = 0
Example #21
    if not isinstance(res, list):
        res = [res]

    res = [replace_objid_to_str(doc) for doc in res]
    res = [flatten_dict(doc) for doc in res]

    return res


if __name__ == '__main__':
    from config import CONFIG
    from model.parser import Parser
    import pprint
    import os

    parser = Parser(config=CONFIG['XML'])
    DB = DataBase(host=CONFIG['DB']['HOST'], port=CONFIG['DB']['PORT'])
    pp = pprint.PrettyPrinter(indent=4)

    colCursor, fsCursor = DB.get_db('test_db', 'test_col')

    data_dir = [
        '/Users/scott/Desktop/data/saxs/analysis_proper/results/',
        '/Users/scott/Desktop/data/saxs/analysis_proper/thumbnails',
        '/Users/scott/Desktop/data/saxs/tiff'
    ]

    test_files = [
        'C67_GD2-69-6_th0.110_1929.1s_T200.006C_5.00s_61288_saxs.xml',
        'C67_GD2-69-6_th0.110_1929.1s_T200.006C_5.00s_61288_saxs.jpg',
        'C67_GD2-69-6_th0.110_1929.1s_T200.006C_5.00s_61288_saxs.tiff'