def transformation(fun: str, arr: np.ndarray):
    p = Parser(fun)
    try:
        p.exec(0)
    except Exception:
        return
    else:
        fig = complex_plot(p, arr)
        fig.savefig("output.png")
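# --- Hedged usage sketch (not from the original source). It assumes the expression
# parser above accepts a formula string such as "z**2"; adjust to whatever grammar
# this Parser actually implements.
if __name__ == "__main__":
    import numpy as np

    sample_grid = np.linspace(-2.0, 2.0, 200).astype(complex)  # illustrative input grid
    transformation("z**2", sample_grid)  # writes output.png if the formula parses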
def __init__(self, agent, kb, lexicon, config, generator, manager, realizer=None):
    parser = Parser(agent, kb, lexicon)
    state = DialogueState(agent, kb)
    super(RulebasedSession, self).__init__(agent, kb, parser, generator,
            manager, state, sample_temperature=2.)

    self.kb = kb
    self.attr_type = {attr.name: attr.value_type for attr in kb.attributes}
    self.num_items = len(kb.items)
    self.entity_counts = self.count_entity()
    self.entity_coords = self.get_entity_coords()
    self.entity_weights = self.init_entity_weight()
    self.item_weights = [1.] * self.num_items
    self.realizer = realizer
def __init__(self, config, project_dir):
    # config for db and parser
    self.config = config
    self.parser = Parser(config=config['XML'])
    self.DB = DataBase(
        host=config['DB']['HOST'],
        port=config['DB']['PORT']
    )

    # full path to the directory where project information (*.json) resides
    self.project_dir = project_dir

    # list of project information loaded from *.json
    self.projects = []
    # todo: turn the loop below into a function
    print('Load project *.json files ...')
    for filename in glob.glob(os.path.join(project_dir, '*.json')):
        data = load_json(filename)
        if data is not None:
            print('\tLoaded {}'.format(filename))
            self.projects.append(data)

    # maximum number of syncer threads
    self.max_num_syncers = 2
    self.num_syncers = 0

    # syncer pool, key: project file name, value: syncer
    self.syncer_pool = {}
def parse_example(example, lexicon, templates):
    """Parse example and collect templates."""
    kbs = example.scenario.kbs
    parsers = [Parser(agent, kbs[agent], lexicon) for agent in (0, 1)]
    states = [DialogueState(agent, kbs[agent]) for agent in (0, 1)]
    # Add init utterance <start>
    parsed_utterances = [states[0].utterance[0], states[1].utterance[1]]
    for event in example.events:
        writing_agent = event.agent  # Speaking agent
        reading_agent = 1 - writing_agent
        #print event.agent

        received_utterance = parsers[reading_agent].parse(event, states[reading_agent])
        if received_utterance:
            sent_utterance = copy.deepcopy(received_utterance)
            if sent_utterance.tokens:
                sent_utterance.template = parsers[writing_agent].extract_template(
                    sent_utterance.tokens, states[writing_agent])
                templates.add_template(sent_utterance, states[writing_agent])
            parsed_utterances.append(received_utterance)
            #print 'sent:', ' '.join(sent_utterance.template)
            #print 'received:', ' '.join(received_utterance.template)

            # Update states
            states[reading_agent].update(writing_agent, received_utterance)
            states[writing_agent].update(writing_agent, sent_utterance)
    return parsed_utterances
def __init__(self, agent, kb, lexicon, config, generator, manager):
    parser = Parser(agent, kb, lexicon)
    state = DialogueState(agent, kb)
    super(RulebasedSession, self).__init__(agent, kb, parser, generator,
            manager, state, sample_temperature=5.)

    self.title_scores = self.score_titles()
    # Python 3: dict.iteritems() and the print statement replaced with items()/print()
    for k, v in self.title_scores.items():
        print(k, v)
def test(args):
    test_set = Dataset.from_bin_file(args.test_file)
    assert args.load_model

    print('load model from [%s]' % args.load_model, file=sys.stderr)
    params = torch.load(args.load_model, map_location=lambda storage, loc: storage)
    vocab = params['vocab']
    transition_system = params['transition_system']
    saved_args = params['args']
    saved_state = params['state_dict']

    saved_args.cuda = args.cuda

    parser = Parser(saved_args, vocab, transition_system)
    parser.load_state_dict(saved_state)

    if args.cuda:
        parser = parser.cuda()
    parser.eval()

    eval_results, decode_results = evaluation.evaluate(test_set.examples, parser, args,
                                                       verbose=True, return_decode_result=True)
    print(eval_results, file=sys.stderr)
    if args.save_decode_to:
        # use a context manager so the file handle is closed after dumping
        with open(args.save_decode_to, 'wb') as f:
            pkl.dump(decode_results, f)
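# --- Hedged CLI sketch (not from the original source). The flag names below simply
# mirror the attributes test() reads from `args`; the real experiment script may
# define additional options (beam size, etc.) that evaluation.evaluate() expects.
if __name__ == '__main__':
    import argparse

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--test_file', required=True)
    arg_parser.add_argument('--load_model', required=True)
    arg_parser.add_argument('--save_decode_to', default=None)
    arg_parser.add_argument('--cuda', action='store_true')
    test(arg_parser.parse_args())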
def input_function(fun: str, in_zoom: float = 1.0, out_zoom: float = 1.0):
    p = Parser(fun)
    try:
        p.exec(0)
    except Exception:
        return None

    arr = np.arange(-10 * in_zoom, 10 * in_zoom, 0.02)
    out_arr = np.arange(-10 * out_zoom, 10 * out_zoom, 0.02)

    vf = np.vectorize(p.real2complex, otypes=[complex])
    res = vf(out_arr)

    fig = fun_plot(p, arr)
    fig.savefig("input.png")
    return res
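# --- Hedged usage sketch (not from the original source): "sin(x)" is an assumption
# about the grammar the Parser accepts; input_function() returns None when parsing fails.
# values = input_function("sin(x)", in_zoom=2.0, out_zoom=1.0)  # also writes input.png
# if values is not None:
#     print(values[:5])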
def __init__(self, cfg: Union[str, IO], quant: bool = False, onnx: bool = False):
    super().__init__()
    self.quant = quant
    self.qstub = QuantStub()
    self.destub = DeQuantStub()

    if isinstance(cfg, str):
        cfg = open(cfg, 'r')
    self.module_list = nn.ModuleList(Parser(cfg).torch_layers(quant, onnx))
    cfg.close()
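# --- Hedged construction sketch (not from the original source). The class name and
# cfg path below are placeholders for whichever nn.Module this __init__ belongs to.
# model = DarknetBackbone("model.cfg", quant=False, onnx=False)   # hypothetical name/path
# print(len(model.module_list), "layers parsed from cfg")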
def __init__(self, agent, kb, lexicon, config, generator, manager):
    parser = Parser(agent, kb, lexicon)
    state = DialogueState(agent, kb)
    super(RulebasedSession, self).__init__(agent, kb, parser, generator,
            manager, state, sample_temperature=5.)
    self.kb = kb
    self.personas = kb.personas
def __init__(self, agent, kb, lexicon, config, generator, manager):
    parser = Parser(agent, kb, lexicon)
    state = DialogueState(agent, kb)
    super(CraigslistRulebasedSession, self).__init__(agent, kb, parser, generator,
            manager, state, sample_temperature=10.)

    self.kb = kb
    self.title = self.shorten_title(self.kb.facts['item']['Title'])
    self.config = default_config if config is None else config

    self.target = self.kb.target
    self.bottomline = None
    self.listing_price = self.kb.listing_price
    self.category = self.kb.category

    # Direction of desired price
    self.inc = None
def __init__(self, agent, kb, lexicon, config, generator, manager):
    parser = Parser(agent, kb, lexicon)
    state = DialogueState(agent, kb)
    super(RulebasedSession, self).__init__(agent, kb, parser, generator,
            manager, state, sample_temperature=1.)

    self.kb = kb
    self.item_values = kb.item_values
    self.item_counts = kb.item_counts
    # list() so the result is a reusable sequence under Python 3
    self.items = list(kb.item_values.keys())
    self.partner_item_weights = {item: 1. for item in self.items}
    self.config = default_config if config is None else config

    # Python 3: iteritems() -> items()
    items = [(item, value, self.item_counts[item])
             for item, value in self.item_values.items()]
    # Sort items by value from high to low
    self.sorted_items = sorted(items, key=lambda x: x[1], reverse=True)
    self.init_proposal()
def self_training(args):
    """Perform self-training.

    First load decoding results on disjoint data and the pre-trained model,
    then perform supervised training on both the existing training data and
    the decoded results.
    """
    print('load pre-trained model from [%s]' % args.load_model, file=sys.stderr)
    params = torch.load(args.load_model, map_location=lambda storage, loc: storage)
    vocab = params['vocab']
    transition_system = params['transition_system']
    saved_args = params['args']
    saved_state = params['state_dict']

    # transfer arguments
    saved_args.cuda = args.cuda
    saved_args.save_to = args.save_to
    saved_args.train_file = args.train_file
    saved_args.unlabeled_file = args.unlabeled_file
    saved_args.dev_file = args.dev_file
    saved_args.load_decode_results = args.load_decode_results
    args = saved_args

    update_args(args)

    model = Parser(saved_args, vocab, transition_system)
    model.load_state_dict(saved_state)

    if args.cuda:
        model = model.cuda()
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    print('load unlabeled data [%s]' % args.unlabeled_file, file=sys.stderr)
    unlabeled_data = Dataset.from_bin_file(args.unlabeled_file)

    print('load decoding results of unlabeled data [%s]' % args.load_decode_results, file=sys.stderr)
    # open in binary mode so the pickle also loads under Python 3
    decode_results = pickle.load(open(args.load_decode_results, 'rb'))

    labeled_data = Dataset.from_bin_file(args.train_file)
    dev_set = Dataset.from_bin_file(args.dev_file)

    print('Num. examples in unlabeled data: %d' % len(unlabeled_data), file=sys.stderr)
    assert len(unlabeled_data) == len(decode_results)
    self_train_examples = []
    for example, hyps in zip(unlabeled_data, decode_results):
        if hyps:
            hyp = hyps[0]
            sampled_example = Example(idx='self_train-%s' % example.idx,
                                      src_sent=example.src_sent,
                                      tgt_code=hyp.code,
                                      tgt_actions=hyp.action_infos,
                                      tgt_ast=hyp.tree)
            self_train_examples.append(sampled_example)
    print('Num. self training examples: %d, Num. labeled examples: %d' % (len(self_train_examples), len(labeled_data)),
          file=sys.stderr)

    train_set = Dataset(examples=labeled_data.examples + self_train_examples)

    print('begin training, %d training examples, %d dev examples' % (len(train_set), len(dev_set)), file=sys.stderr)
    print('vocab: %s' % repr(vocab), file=sys.stderr)

    epoch = train_iter = 0
    report_loss = report_examples = 0.
    history_dev_scores = []
    num_trial = patience = 0
    while True:
        epoch += 1
        epoch_begin = time.time()

        for batch_examples in train_set.batch_iter(batch_size=args.batch_size, shuffle=True):
            batch_examples = [e for e in batch_examples if len(e.tgt_actions) <= args.decode_max_time_step]

            train_iter += 1
            optimizer.zero_grad()

            loss = -model.score(batch_examples)
            # print(loss.data)
            loss_val = torch.sum(loss).data[0]
            report_loss += loss_val
            report_examples += len(batch_examples)
            loss = torch.mean(loss)

            loss.backward()

            # clip gradient
            if args.clip_grad > 0.:
                grad_norm = torch.nn.utils.clip_grad_norm(model.parameters(), args.clip_grad)

            optimizer.step()

            if train_iter % args.log_every == 0:
                print('[Iter %d] encoder loss=%.5f' % (train_iter, report_loss / report_examples), file=sys.stderr)
                report_loss = report_examples = 0.

        print('[Epoch %d] epoch elapsed %ds' % (epoch, time.time() - epoch_begin), file=sys.stderr)
        # model_file = args.save_to + '.iter%d.bin' % train_iter
        # print('save model to [%s]' % model_file, file=sys.stderr)
        # model.save(model_file)

        # perform validation
        print('[Epoch %d] begin validation' % epoch, file=sys.stderr)
        eval_start = time.time()
        eval_results = evaluation.evaluate(dev_set.examples, model, args, verbose=True)
        dev_acc = eval_results['accuracy']
        print('[Epoch %d] code generation accuracy=%.5f took %ds' % (epoch, dev_acc, time.time() - eval_start), file=sys.stderr)
        is_better = history_dev_scores == [] or dev_acc > max(history_dev_scores)
        history_dev_scores.append(dev_acc)

        if is_better:
            patience = 0
            model_file = args.save_to + '.bin'
            print('save currently the best model ..', file=sys.stderr)
            print('save model to [%s]' % model_file, file=sys.stderr)
            model.save(model_file)
            # also save the optimizers' state
            torch.save(optimizer.state_dict(), args.save_to + '.optim.bin')
        elif epoch == args.max_epoch:
            print('reached max epoch, stop!', file=sys.stderr)
            exit(0)
        elif patience < args.patience:
            patience += 1
            print('hit patience %d' % patience, file=sys.stderr)

            if patience == args.patience:
                num_trial += 1
                print('hit #%d trial' % num_trial, file=sys.stderr)
                if num_trial == args.max_num_trial:
                    print('early stop!', file=sys.stderr)
                    exit(0)

                # decay lr, and restore from previously best checkpoint
                lr = optimizer.param_groups[0]['lr'] * args.lr_decay
                print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

                # load model
                params = torch.load(args.save_to + '.bin', map_location=lambda storage, loc: storage)
                model.load_state_dict(params['state_dict'])
                if args.cuda: model = model.cuda()

                # load optimizers
                if args.reset_optimizer:
                    print('reset optimizer', file=sys.stderr)
                    optimizer = torch.optim.Adam(model.inference_model.parameters(), lr=lr)
                else:
                    print('restore parameters of the optimizers', file=sys.stderr)
                    optimizer.load_state_dict(torch.load(args.save_to + '.optim.bin'))

                # set new lr
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr

                # reset patience
                patience = 0
def train_semi_jae(args): bi_direction = args.bi_direction encoder_params = torch.load(args.load_model, map_location=lambda storage, loc: storage) decoder_params = torch.load(args.load_decoder, map_location=lambda storage, loc: storage) print('loaded encoder at %s' % args.load_model, file=sys.stderr) print('loaded decoder at %s' % args.load_decoder, file=sys.stderr) transition_system = encoder_params['transition_system'] encoder_params['args'].cuda = decoder_params['args'].cuda = args.cuda encoder = Parser(encoder_params['args'], encoder_params['vocab'], transition_system) encoder.load_state_dict(encoder_params['state_dict']) decoder = Reconstructor(decoder_params['args'], decoder_params['vocab'], transition_system) decoder.load_state_dict(decoder_params['state_dict']) zprior = LSTMPrior.load(args.load_prior, transition_system=transition_system, cuda=args.cuda) print('loaded p(z) prior at %s' % args.load_prior, file=sys.stderr) # freeze prior parameters for p in zprior.parameters(): p.requires_grad = False zprior.eval() xprior = LSTMLanguageModel.load(args.load_src_lm) print('loaded p(x) prior at %s' % args.load_src_lm, file=sys.stderr) xprior.eval() if args.cache: jae = JAE_cache(encoder, decoder, zprior, xprior, args) else: jae = JAE(encoder, decoder, zprior, xprior, args) jae.train() encoder.train() decoder.train() if args.cuda: jae.cuda() labeled_data = Dataset.from_bin_file(args.train_file) # labeled_data.examples = labeled_data.examples[:10] unlabeled_data = Dataset.from_bin_file( args.unlabeled_file) # pretend they are un-labeled! dev_set = Dataset.from_bin_file(args.dev_file) # dev_set.examples = dev_set.examples[:10] optimizer = torch.optim.Adam( [p for p in jae.parameters() if p.requires_grad], lr=args.lr) print( '*** begin semi-supervised training %d labeled examples, %d unlabeled examples ***' % (len(labeled_data), len(unlabeled_data)), file=sys.stderr) report_encoder_loss = report_decoder_loss = report_examples = 0. report_unsup_examples = report_unsup_encoder_loss = report_unsup_decoder_loss = report_unsup_baseline_loss = 0. 
patience = 0 num_trial = 1 epoch = train_iter = 0 history_dev_scores = [] while True: epoch += 1 epoch_begin = time.time() unlabeled_examples_iter = unlabeled_data.batch_iter( batch_size=args.unsup_batch_size, shuffle=True) for labeled_examples in labeled_data.batch_iter( batch_size=args.batch_size, shuffle=True): labeled_examples = [ e for e in labeled_examples if len(e.tgt_actions) <= args.decode_max_time_step ] train_iter += 1 optimizer.zero_grad() report_examples += len(labeled_examples) sup_encoder_loss = -encoder.score(labeled_examples) sup_decoder_loss = -decoder.score(labeled_examples) report_encoder_loss += sup_encoder_loss.sum().data[0] report_decoder_loss += sup_decoder_loss.sum().data[0] sup_encoder_loss = torch.mean(sup_encoder_loss) sup_decoder_loss = torch.mean(sup_decoder_loss) sup_loss = sup_encoder_loss + sup_decoder_loss # compute unsupervised loss try: unlabeled_examples = next(unlabeled_examples_iter) except StopIteration: # if finished unlabeled data stream, restart it unlabeled_examples_iter = unlabeled_data.batch_iter( batch_size=args.batch_size, shuffle=True) unlabeled_examples = next(unlabeled_examples_iter) unlabeled_examples = [ e for e in unlabeled_examples if len(e.tgt_actions) <= args.decode_max_time_step ] unsup_encoder_loss, unsup_decoder_loss, meta_data = jae.get_unsupervised_loss( unlabeled_examples, args.moves) if bi_direction: unsup_encoder_loss_back, unsup_decoder_loss_back, meta_data_back = jae.get_unsupervised_loss_backward( unlabeled_examples, args.moves) nan = False if nn_utils.isnan(sup_loss.data): print('Nan in sup_loss') nan = True if nn_utils.isnan(unsup_encoder_loss.data): print('Nan in unsup_encoder_loss!', file=sys.stderr) nan = True if nn_utils.isnan(unsup_decoder_loss.data): print('Nan in unsup_decoder_loss!', file=sys.stderr) nan = True if bi_direction: if nn_utils.isnan(unsup_encoder_loss_back.data): print('Nan in unsup_encoder_loss_back!', file=sys.stderr) nan = True if nn_utils.isnan(unsup_decoder_loss_back.data): print('Nan in unsup_decoder_loss_back!', file=sys.stderr) nan = True if nan: continue if bi_direction: report_unsup_encoder_loss += ( unsup_encoder_loss.sum().data[0] + unsup_encoder_loss_back.sum().data[0]) report_unsup_decoder_loss += ( unsup_decoder_loss.sum().data[0] + unsup_decoder_loss_back.sum().data[0]) else: report_unsup_encoder_loss += unsup_encoder_loss.sum().data[0] report_unsup_decoder_loss += unsup_decoder_loss.sum().data[0] report_unsup_examples += unsup_encoder_loss.size(0) if bi_direction: unsup_loss = torch.mean(unsup_encoder_loss) + torch.mean( unsup_decoder_loss) + torch.mean( unsup_encoder_loss_back) + torch.mean( unsup_decoder_loss_back) else: unsup_loss = torch.mean(unsup_encoder_loss) + torch.mean( unsup_decoder_loss) loss = sup_loss + args.unsup_loss_weight * unsup_loss loss.backward() # clip gradient grad_norm = torch.nn.utils.clip_grad_norm(jae.parameters(), args.clip_grad) optimizer.step() if train_iter % args.log_every == 0: print( '[Iter %d] supervised: encoder loss=%.5f, decoder loss=%.5f' % (train_iter, report_encoder_loss / report_examples, report_decoder_loss / report_examples), file=sys.stderr) print( '[Iter %d] unsupervised: encoder loss=%.5f, decoder loss=%.5f, baseline loss=%.5f' % (train_iter, report_unsup_encoder_loss / report_unsup_examples, report_unsup_decoder_loss / report_unsup_examples, report_unsup_baseline_loss / report_unsup_examples), file=sys.stderr) samples = meta_data['samples'] for v in meta_data.values(): if isinstance(v, Variable): v.cpu() for i, sample in 
enumerate(samples[:1]): print('\t[%s] Source: %s' % (sample.idx, ' '.join(sample.src_sent)), file=sys.stderr) print('\t[%s] Code: \n%s' % (sample.idx, sample.tgt_code), file=sys.stderr) ref_example = [ e for e in unlabeled_examples if e.idx == int(sample.idx[:sample.idx.index('-')]) ][0] print('\t[%s] Gold Code: \n%s' % (sample.idx, ref_example.tgt_code), file=sys.stderr) print( '\t[%s] Log p(z|x): %f' % (sample.idx, meta_data['encoding_scores'][i].data[0]), file=sys.stderr) print('\t[%s] Log p(x|z): %f' % (sample.idx, meta_data['reconstruction_scores'][i].data[0]), file=sys.stderr) print('\t[%s] Encoder Loss: %f' % (sample.idx, unsup_encoder_loss[i].data[0]), file=sys.stderr) print('\t**************************', file=sys.stderr) report_encoder_loss = report_decoder_loss = report_examples = 0. report_unsup_encoder_loss = report_unsup_decoder_loss = report_unsup_baseline_loss = report_unsup_examples = 0. print('[Epoch %d] epoch elapsed %ds' % (epoch, time.time() - epoch_begin), file=sys.stderr) # perform validation print('[Epoch %d] begin validation' % epoch, file=sys.stderr) eval_start = time.time() eval_results = evaluation.evaluate(dev_set.examples, encoder, args, verbose=True) encoder.train() dev_acc = eval_results['accuracy'] print('[Epoch %d] code generation accuracy=%.5f took %ds' % (epoch, dev_acc, time.time() - eval_start), file=sys.stderr) is_better = history_dev_scores == [] or dev_acc > max( history_dev_scores) history_dev_scores.append(dev_acc) if is_better: patience = 0 model_file = args.save_to + '.bin' print('save currently the best model ..', file=sys.stderr) print('save model to [%s]' % model_file, file=sys.stderr) jae.save(model_file) # also save the optimizers' state torch.save(optimizer.state_dict(), args.save_to + '.optim.bin') elif epoch == args.max_epoch: print('reached max epoch, stop!', file=sys.stderr) exit(0) elif patience < args.patience: patience += 1 print('hit patience %d' % patience, file=sys.stderr) if patience == args.patience: num_trial += 1 print('hit #%d trial' % num_trial, file=sys.stderr) if num_trial == args.max_num_trial: print('early stop!', file=sys.stderr) exit(0) # decay lr, and restore from previously best checkpoint lr = optimizer.param_groups[0]['lr'] * args.lr_decay print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr) # load best model's parameters jae.load_parameters(args.save_to + '.bin') if args.cuda: jae = jae.cuda() # load optimizers if args.reset_optimizer: print('reset to a new infer_optimizer', file=sys.stderr) optimizer = torch.optim.Adam( [p for p in jae.parameters() if p.requires_grad], lr=lr) else: print('restore parameters of the optimizers', file=sys.stderr) optimizer.load_state_dict( torch.load(args.save_to + '.optim.bin')) # set new lr for param_group in optimizer.param_groups: param_group['lr'] = lr # reset patience patience = 0
        self.onFinished()


if __name__ == '__main__':
    from config import CONFIG
    from model.utils import load_json
    import pprint

    pp = pprint.PrettyPrinter(indent=4)

    # load project file to update
    project_filename = '../projects/test_saxs.json'
    project = load_json(project_filename)

    # parser and database
    parser = Parser(config=CONFIG['XML'])
    DB = DataBase(host=CONFIG['DB']['HOST'], port=CONFIG['DB']['PORT'])
    colCursor, fsCursor = DB.get_db('test_db', 'test_col')

    # initiate and run worker
    worker = Syncer(name='syncer',
                    project=project,
                    parser=parser,
                    colCursor=colCursor,
                    fsCursor=fsCursor,
                    extensions=['xml', 'jpg', 'tiff'],
                    interval=500)
    worker.start()

    while worker.t.is_alive():
        time.sleep(1)
class DBHandler(object): def __init__(self, rootDir, fsmapFn, db_host='localhost', db_port=27017, xml_config=None): self.rootDir = os.path.realpath(os.path.abspath(rootDir)) self.fsmapFn = fsmapFn self.db_host = db_host self.db_port = db_port self.parser = Parser(xml_config) if xml_config is not None else None self.extensions = ['.xml', '.jpg', '.tiff'] # to ensure safe operation on fsmap self.fsmap_lock = threading.Lock() self.fsMap = self._load() self._traverse() self._save() # for debuggin... # lazy connection to MongoDB server # Must ensure mongod is running! self.client = pymongo.MongoClient(self.db_host, self.db_port) self.clientPool = {} # streaming queues self.fs_event_q = Queue() self.stream_q = Queue() # old map # This keeps old fsmap information when file system changes manually # e.g. folder move, rename, etc # If it is not empty dictionary, there is a bug.... self._old_fsmap = {} def __del__(self): for _, h in self.clientPool.items(): h.close() self.client.close() def _load(self): if not os.path.exists(self.fsmapFn): return {} try: with open(self.fsmapFn) as f: data = json.load(f) except (FileNotFoundError, TypeError, json.decoder.JSONDecodeError): print('[WARN] Failed to load saved fsmap, {}!!!'.format( self.fsmapFn)) print('[WARN] Previous fsmap will be ignored, if there is.') return {} def __recursive_flatten(fsmap: dict, flattened: dict): item = dict(fsmap) item['children'] = [ __recursive_flatten(child, flattened) for child in item['children'] ] flattened[item['path']] = item t = {} for key, value in data.items(): __recursive_flatten(value, t) return t def _save(self): def __convert_to_hierarchical_format(key: str, fsmap: dict): item = dict(fsmap[key]) item['children'] = [ __convert_to_hierarchical_format(c, fsmap) for c in item['children'] ] return item t = {} p_keys = [ key for key, value in self.fsMap.items() if value['parent'] is None ] for key in p_keys: t[key] = __convert_to_hierarchical_format(key, self.fsMap) with open(self.fsmapFn, 'w') as f: json.dump(t, f, indent=2, sort_keys=True) def _traverse(self, save_old=False): """Traverse root directory""" fsmap = {} for dirpath, _, _ in os.walk(self.rootDir, followlinks=True): path = dirpath.replace(self.rootDir, '') tokens = path.split(os.sep)[1:] parent_path = os.path.join(self.rootDir, *tokens[:-1]) real_path = os.path.realpath(dirpath) if len(path) == 0: name = dirpath parent = None else: name = os.path.basename(path) fsmap[parent_path]['children'].append(dirpath) parent = fsmap[parent_path]['path'] fsmap[dirpath] = { 'path': dirpath, # absolute path to current directory 'realpath': real_path, # realpath for symlink 'name': name, # name of current directory for display 'children': [], # list of absolute pathes of direct children directories 'parent': parent, # absolute path to direct parent directory 'link': None, # linked path # valid path flag # It will turn into False, if the given path doesn't exist by # comparing with fsmap in the file. 'valid': True, # valid path flag # This set to Ture, once a client set the `db` field. # Then, `db` filed can be modified only manually via fsmap file. # Such modification requires to re-run the web server. 'db': None, # related database (db, collection) 'fixed': False, # can modify? 
# used for syncing 'file': None, # sample file name used to determine group name 'sep': None, # separator used to parse group name from the file 'group': None, # group name in this folder 'last_sync': None, # the last date and time sync is applied } # update for symlink for key, value in fsmap.items(): if not (key == value['realpath']): if value['realpath'] in fsmap: fsmap[value['realpath']]['link'] = key value['link'] = fsmap[value['realpath']]['path'] # save unregistered fsmap from old one if save_old: for key, value in self.fsMap.items(): if key not in fsmap: self._old_fsmap[key] = dict(value) _keys_to_copy = [ 'valid', 'db', 'fixed', 'file', 'sep', 'group', 'last_sync' ] def __merge_fsmap(dstMap: dict, srcMap: dict): for _path, _srcItem in srcMap.items(): if _path in dstMap: # Is parent same? yes, it must be same as key is the absolute path. # But children could be different. For example, one might delete/move/add # sub-directories. But, we do not care, here. _dstItem = dstMap[_path] for _k in _keys_to_copy: _dstItem[_k] = _srcItem[_k] else: # This branch can happen when one delete/move/add subdirectories. # Keep it, so that one can fix it manually in the json file. _srcItem['children'] = [] _srcItem['parent'] = None _srcItem['valid'] = False #srcItem['inSync'] = False dstMap[key] = _srcItem __merge_fsmap(fsmap, self.fsMap) self.fsMap = fsmap def _update_fsmap(self, event_type, src_path, dst_path): """Invoked when filesystem changes (only for directory changes)""" with self.fsmap_lock: if event_type in ['created', 'deleted']: # on create and delete operation, refresh entire fsmap self._traverse() self._save() elif event_type in ['moved'] and dst_path is not None: # moved event includes 'rename' and 'relocate a folder' cp_key = ['db', 'file', 'fixed', 'group', 'last_sync', 'sep'] self._traverse(True) if src_path in self._old_fsmap and dst_path in self.fsMap: old_item = self._old_fsmap[src_path] new_item = self.fsMap[dst_path] for k, v in old_item.items(): if k in cp_key: new_item[k] = v del self._old_fsmap[src_path] else: print('Error in handling DirMovedEvent: ', src_path, dst_path) def _db_key(self, _db, _col, _fs): _key = '{:s}::{:s}::{:s}'.format(_db, _col, _fs) return _key def _db_key_list(self, path, recursive, isUnique=False): _key_list = [] def __recursive_db(_path, fsmap): if _path not in fsmap: return _db = fsmap[_path]['db'] if _db is None: return _key = self._db_key(_db[0], _db[1], _db[2]) if not isUnique: _key_list.append((_path, _key)) else: if _key not in _key_list: _key_list.append(_key) if recursive: for _c_path in fsmap[_path]['children']: __recursive_db(_c_path, fsmap) __recursive_db(path, self.fsMap) return _key_list def _get_db_handler(self, db_col_fs): _db, _col, _fs = db_col_fs _key = self._db_key(_db, _col, _fs) if _key in self.clientPool: return self.clientPool[_key] else: _h = MultiViewMongo(connection=self.client, db_name=_db, collection_name=_col, fs_name=_fs) self.clientPool[_key] = _h return _h def _get_db_handler_by_key(self, key: str): if key in self.clientPool: return self.clientPool[key] else: tokens = key.split('::') _h = MultiViewMongo(connection=self.client, db_name=tokens[0], collection_name=tokens[1], fs_name=tokens[2]) self.clientPool[key] = _h return _h def _update_file(self, event_type, src_path, dst_path): """Invoked when files change By watchdog: By syncer: """ if self.parser is None: print('parser is not set.') return None if dst_path is None: _path = src_path path, filename = os.path.split(src_path) else: _path = dst_path path, filename = 
os.path.split(dst_path) if len(filename) == 0: print('fail to detect filename.') return None ext = os.path.splitext(filename)[1] if len(ext) == 0 or ext not in self.extensions: print('Unsupported extension type. {:s}'.format(ext)) return None if path not in self.fsMap: print("Path is not in fsmap. {:s}".format(path)) return None if self.fsMap[path]['db'] is None: print("DB is not set on this path. {:s}".format(path)) return None if self.fsMap[path]['group'] is None: print("Group name is not set to this path. {:s}".format(path)) return None db = self.fsMap[path]['db'] group = self.fsMap[path]['group'] if event_type in ['created', 'modified', 'syncing', 'moved']: doc = self.parser.run(_path, ext, group) if doc is None: return None h = self._get_db_handler(db) if h.save_one(doc, ext) == 0: return None if ext == '.xml': query = {"sample": group, "item": doc['item']} res = h.load(query=query, fields={}, getarrays=False) res = self.after_query(res) return json.dumps(res) elif event_type in ['deleted']: # currently we do not delete any document in the db (should we?) pass else: # unknown event_type pass return None def _add_fs_event(self, what, event_type, src_path, dst_path): """Invoked by observer and syncers""" self.fs_event_q.put((what, event_type, src_path, dst_path)) def get_fsmap_as_list(self): """ Used to return the lastes file system information. Always, first scan file system itself to detect any changes made in the file system by someone else. """ with self.fsmap_lock: self._traverse() fsmap_list = [[key, value] for key, value in self.fsMap.items() if value['valid']] return fsmap_list def set_fsmap(self, fsmap_list): """Used to set db config by a client""" with self.fsmap_lock: for path, value in fsmap_list: # path is not found # (can happen when file system is manually changed) if path not in self.fsMap: continue # db is already set by other clients, ignore this. # Only administrator can change this manually. if self.fsMap[path]['fixed']: continue # check db config a client set if value['db'] is None: continue # db is not set if len(value['db']) != 3: continue # must be 3-D array new_db = value['db'][0] new_col = value['db'][1] if len(new_db) == 0 or len(new_col) == 0: continue # in-complete setting if new_db == 'null' or new_col == 'null': continue # in-complete setting # update db config item = self.fsMap[path] item['db'] = [new_db, new_col, 'fs'] item['fixed'] = True self._save() # def get_sync_samples(self, path, recursive): # """ # This is called to initiate syncing operation. 
# Args: # path: # recursive: # # Returns: # # """ # if path not in self.fsMap: return [] # if not os.path.exists(path): return [] # # sample_files = {} # for dirpath, _, files in os.walk(path, followlinks=True): # for f in files: # name, ext = os.path.splitext(f) # if ext in self.extensions: # sample_files[dirpath] = name # break # # if not recursive: break # return sample_files # def set_sync_info(self, info:dict): # """update `inSync` and `sep` fields in fsmap""" # # with self.fsmap_lock: # responses = {} # for path, sep in info.items(): # resp = { # 'valid': Syncer.CAN_SYNC # } # if path in self.fsMap: # item = self.fsMap[path] # if item['inSync']: # resp['valid'] = Syncer.CANNOT_SYNC # elif item['db'] is None or len(item['db']) != 3: # resp['valid'] = Syncer.NO_DB # else: # item['inSync'] = True # item['sep'] = sep # else: # resp['valid'] = Syncer.NO_PATH # responses[path] = resp # # self._save() # # return responses # def run_syncer(self, resp:dict): # """run syncer, some information will be added to resp""" # # files_to_sync = [] # for path, info in resp.items(): # if info['valid']: # item = { # 'path': path, # 'files': [], # 'client': self.get_client(self.get_db(path)) # } # for _, _, files in os.walk(path): # item['files'] = [f for f in files # if os.path.splitext(f)[1] in self.extensions] # break # files_to_sync.append(item) # info['total'] = len(item['files']) # else: # info['total'] = 0 # info['progressed'] = 0 # # # create syncer # syncer_id = Syncer.generate_syncer_id() # #syncer = Syncer(items_to_sync=files_to_sync) # # # update pool # #self.syncerPool[syncer_id] = syncer # # # run syncer # #syncer.start() # # return syncer_id, resp # def get_client(self, db_collection_fs): # if db_collection_fs is None or len(db_collection_fs) != 3: # return None # # db = db_collection_fs[0] # col = db_collection_fs[1] # fs = db_collection_fs[2] # key = '{}:{}:{}'.format(db, col, fs) # if key in self.clientPool: # h = self.clientPool[key] # else: # h = MultiViewMongo( # connection=self.client, # db_name=db, # collection_name=col, # fs_name=fs # ) # self.clientPool[key] = h # return h # def set_db(self, path, db, col): # if path not in self.fsMap: # return False # # def __recursive_update(key: str, fsmap: dict): # item = fsmap[key] # if item['db'] is None: item['db'] = [db, col, 'fs'] # for child in item['children']: # __recursive_update(child, fsmap) # # # update db setting recursively # # If a path is already set before (or maybe by other client), # # it didn't modify it. Given path may be not set as a client wants. 
# with self.fsmap_lock: # __recursive_update(path, self.fsMap) # self._save() # # return True # def get_db(self, path): # db = None # with self.fsmap_lock: # if path in self.fsMap: # db = self.fsMap[path]['db'] # return db def after_query(self, res): """Post processor on queried results""" if not isinstance(res, list): res = [res] res = [replace_objid_to_str(doc) for doc in res] res = [flatten_dict(doc) for doc in res] # for doc in res: # doc['sample'] = '[{:s}][{:s}]{:s}'.format(db, col, doc['sample']) # doc['_id'] = '[{:s}][{:s}]{:s}'.format(db, col, doc['_id']) return res def get_samplelist(self, path, recursive): if path not in self.fsMap: return [] samplelist = {} db_key_list = self._db_key_list(path, recursive) _db_list = self.client.list_database_names() for _path, _key in db_key_list: _db, _col, _fs = _key.split("::") if _db not in _db_list: continue _col_list = self.client[_db].collection_names() if _col not in _col_list: continue h = self._get_db_handler_by_key(_key) pipeline = [{ "$match": { "path": _path } }, { "$match": { "sample": { "$exists": True, "$ne": None } } }, { "$group": { "_id": "$sample", "count": { "$sum": 1 } } }] res = list(h.collection.aggregate(pipeline)) for r in res: _id = r['_id'] _count = r['count'] if _id in samplelist: samplelist[_id] += _count else: samplelist[_id] = _count return samplelist def get_samples(self, names, path, recursive): if path not in self.fsMap: return {} sampleData = {} db_key_list = self._db_key_list(path, recursive, False) _db_list = self.client.list_database_names() for _path, _key in db_key_list: _db, _col, _fs = _key.split("::") if _db not in _db_list: continue _col_list = self.client[_db].collection_names() if _col not in _col_list: continue h = self._get_db_handler_by_key(_key) for name in names: query = {"sample": name, "path": _path} res = h.load(query=query, fields={}, getarrays=False) if res is None: continue res = self.after_query(res) if name in sampleData: sampleData[name].append(res) else: sampleData[name] = res return sampleData def get_tiff(self, id, path): if path not in self.fsMap: return [] if self.fsMap[path]['db'] is None: return [] db = self.fsMap[path]['db'] h = self._get_db_handler(db) try: _id = ObjectId(id) except InvalidId: return [] query = {'_id': _id, 'tiff': {'$exists': True}} fields = {'tiff': 1, '_id': 0} res = h.load(query, fields, getarrays=True) if res is None: return [] data = res['tiff']['data'] res['tiff']['data'] = data.tolist() return res['tiff']
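# --- Hedged helper (not from the original source): a small viewer over the directory
# map that DBHandler maintains. Field names follow the fsmap entries built in
# _traverse() above; the construction arguments are illustrative placeholders.
def print_fsmap_overview(handler, limit=5):
    """Print a few fsmap entries (path, bound db, group, last sync time)."""
    for path, info in handler.get_fsmap_as_list()[:limit]:
        print(path, info['db'], info['group'], info['last_sync'])

# handler = DBHandler(rootDir='/data/beamline', fsmapFn='fsmap.json', xml_config=None)
# print_fsmap_overview(handler)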
def train(args):
    grammar = ASDLGrammar.from_text(open(args.asdl_file).read())
    transition_system = TransitionSystem.get_class_by_lang(args.lang)(grammar)
    train_set = Dataset.from_bin_file(args.train_file)
    dev_set = Dataset.from_bin_file(args.dev_file)
    # open in binary mode so the pickled vocab also loads under Python 3
    vocab = pickle.load(open(args.vocab, 'rb'))

    model = Parser(args, vocab, transition_system)
    model.train()
    if args.cuda: model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    print('begin training, %d training examples, %d dev examples' % (len(train_set), len(dev_set)), file=sys.stderr)
    print('vocab: %s' % repr(vocab), file=sys.stderr)

    epoch = train_iter = 0
    report_loss = report_examples = 0.
    history_dev_scores = []
    num_trial = patience = 0
    while True:
        epoch += 1
        epoch_begin = time.time()

        for batch_examples in train_set.batch_iter(batch_size=args.batch_size, shuffle=True):
            batch_examples = [e for e in batch_examples if len(e.tgt_actions) <= args.decode_max_time_step]
            train_iter += 1
            optimizer.zero_grad()

            loss = -model.score(batch_examples)
            # print(loss.data)
            loss_val = torch.sum(loss).data[0]
            report_loss += loss_val
            report_examples += len(batch_examples)
            loss = torch.mean(loss)

            loss.backward()

            # clip gradient
            if args.clip_grad > 0.:
                grad_norm = torch.nn.utils.clip_grad_norm(model.parameters(), args.clip_grad)

            optimizer.step()

            if train_iter % args.log_every == 0:
                print('[Iter %d] encoder loss=%.5f' % (train_iter, report_loss / report_examples), file=sys.stderr)
                report_loss = report_examples = 0.

        print('[Epoch %d] epoch elapsed %ds' % (epoch, time.time() - epoch_begin), file=sys.stderr)
        # model_file = args.save_to + '.iter%d.bin' % train_iter
        # print('save model to [%s]' % model_file, file=sys.stderr)
        # model.save(model_file)

        # perform validation
        print('[Epoch %d] begin validation' % epoch, file=sys.stderr)
        eval_start = time.time()
        eval_results = evaluation.evaluate(dev_set.examples, model, args, verbose=True)
        dev_acc = eval_results['accuracy']
        print('[Epoch %d] code generation accuracy=%.5f took %ds' % (epoch, dev_acc, time.time() - eval_start), file=sys.stderr)
        is_better = history_dev_scores == [] or dev_acc > max(history_dev_scores)
        history_dev_scores.append(dev_acc)

        if is_better:
            patience = 0
            model_file = args.save_to + '.bin'
            print('save currently the best model ..', file=sys.stderr)
            print('save model to [%s]' % model_file, file=sys.stderr)
            model.save(model_file)
            # also save the optimizers' state
            torch.save(optimizer.state_dict(), args.save_to + '.optim.bin')
        elif epoch == args.max_epoch:
            print('reached max epoch, stop!', file=sys.stderr)
            exit(0)
        elif patience < args.patience:
            patience += 1
            print('hit patience %d' % patience, file=sys.stderr)

            if patience == args.patience:
                num_trial += 1
                print('hit #%d trial' % num_trial, file=sys.stderr)
                if num_trial == args.max_num_trial:
                    print('early stop!', file=sys.stderr)
                    exit(0)

                # decay lr, and restore from previously best checkpoint
                lr = optimizer.param_groups[0]['lr'] * args.lr_decay
                print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

                # load model
                params = torch.load(args.save_to + '.bin', map_location=lambda storage, loc: storage)
                model.load_state_dict(params['state_dict'])
                if args.cuda: model = model.cuda()

                # load optimizers
                if args.reset_optimizer:
                    print('reset optimizer', file=sys.stderr)
                    optimizer = torch.optim.Adam(model.inference_model.parameters(), lr=lr)
                else:
                    print('restore parameters of the optimizers', file=sys.stderr)
                    optimizer.load_state_dict(torch.load(args.save_to + '.optim.bin'))

                # set new lr
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr

                # reset patience
                patience = 0
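# --- Hedged sketch (not from the original source): the patience / trial bookkeeping
# used by train() above, in isolation with fake accuracies, so the decay-and-restart
# behaviour is easy to trace without running the full training loop.
def early_stopping_trace(dev_accs, max_patience=2, max_num_trial=2, lr=1e-3, lr_decay=0.5):
    best, patience, num_trial = float('-inf'), 0, 0
    for epoch, acc in enumerate(dev_accs, 1):
        if acc > best:
            best, patience = acc, 0            # new best: checkpoint would be saved here
        else:
            patience += 1
            if patience == max_patience:
                num_trial += 1
                if num_trial == max_num_trial:
                    return 'early stop at epoch %d' % epoch
                lr *= lr_decay                 # decay lr and reload the best checkpoint
                patience = 0
    return 'finished with best=%.3f, lr=%g' % (best, lr)

# early_stopping_trace([0.1, 0.2, 0.2, 0.2, 0.15, 0.15]) -> 'early stop at epoch 6'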
def train_semi(args):
    encoder_params = torch.load(args.load_model, map_location=lambda storage, loc: storage)
    decoder_params = torch.load(args.load_decoder, map_location=lambda storage, loc: storage)

    print('loaded encoder at %s' % args.load_model, file=sys.stderr)
    print('loaded decoder at %s' % args.load_decoder, file=sys.stderr)

    transition_system = encoder_params['transition_system']
    encoder_params['args'].cuda = decoder_params['args'].cuda = args.cuda

    encoder = Parser(encoder_params['args'], encoder_params['vocab'], transition_system)
    encoder.load_state_dict(encoder_params['state_dict'])
    decoder = Reconstructor(decoder_params['args'], decoder_params['vocab'], transition_system)
    decoder.load_state_dict(decoder_params['state_dict'])

    if args.prior == 'lstm':
        prior = LSTMPrior.load(args.load_prior, transition_system=transition_system, cuda=args.cuda)
        print('loaded prior at %s' % args.load_prior, file=sys.stderr)
        # freeze prior parameters
        for p in prior.parameters():
            p.requires_grad = False
        prior.eval()
    else:
        prior = UniformPrior()

    if args.baseline == 'mlp':
        structVAE = StructVAE(encoder, decoder, prior, args)
    elif args.baseline == 'src_lm' or args.baseline == 'src_lm_and_linear':
        src_lm = LSTMLanguageModel.load(args.load_src_lm)
        print('loaded source LM at %s' % args.load_src_lm, file=sys.stderr)
        vae_cls = StructVAE_LMBaseline if args.baseline == 'src_lm' else StructVAE_SrcLmAndLinearBaseline
        structVAE = vae_cls(encoder, decoder, prior, src_lm, args)
    else:
        raise ValueError('unknown baseline')

    structVAE.train()
    if args.cuda: structVAE.cuda()

    labeled_data = Dataset.from_bin_file(args.train_file)
    # labeled_data.examples = labeled_data.examples[:10]
    unlabeled_data = Dataset.from_bin_file(args.unlabeled_file)   # pretend they are un-labeled!
    dev_set = Dataset.from_bin_file(args.dev_file)
    # dev_set.examples = dev_set.examples[:10]

    # itertools.ifilter is Python 2 only; the built-in filter works in both versions
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, structVAE.parameters()), lr=args.lr)

    print('*** begin semi-supervised training %d labeled examples, %d unlabeled examples ***' %
          (len(labeled_data), len(unlabeled_data)), file=sys.stderr)
    report_encoder_loss = report_decoder_loss = report_src_sent_words_num = report_tgt_query_words_num = report_examples = 0.
    report_unsup_examples = report_unsup_encoder_loss = report_unsup_decoder_loss = report_unsup_baseline_loss = 0.

    patience = 0
    num_trial = 1
    epoch = train_iter = 0
    history_dev_scores = []
    while True:
        epoch += 1
        epoch_begin = time.time()

        unlabeled_examples_iter = unlabeled_data.batch_iter(batch_size=args.unsup_batch_size, shuffle=True)

        for labeled_examples in labeled_data.batch_iter(batch_size=args.batch_size, shuffle=True):
            labeled_examples = [e for e in labeled_examples if len(e.tgt_actions) <= args.decode_max_time_step]

            train_iter += 1
            optimizer.zero_grad()
            report_examples += len(labeled_examples)

            sup_encoder_loss = -encoder.score(labeled_examples)
            sup_decoder_loss = -decoder.score(labeled_examples)

            report_encoder_loss += sup_encoder_loss.sum().data[0]
            report_decoder_loss += sup_decoder_loss.sum().data[0]

            sup_encoder_loss = torch.mean(sup_encoder_loss)
            sup_decoder_loss = torch.mean(sup_decoder_loss)

            sup_loss = sup_encoder_loss + sup_decoder_loss

            # compute unsupervised loss
            try:
                unlabeled_examples = next(unlabeled_examples_iter)
            except StopIteration:
                # if finished unlabeled data stream, restart it
                unlabeled_examples_iter = unlabeled_data.batch_iter(batch_size=args.batch_size, shuffle=True)
                unlabeled_examples = next(unlabeled_examples_iter)

            unlabeled_examples = [e for e in unlabeled_examples if len(e.tgt_actions) <= args.decode_max_time_step]

            try:
                unsup_encoder_loss, unsup_decoder_loss, unsup_baseline_loss, meta_data = structVAE.get_unsupervised_loss(
                    unlabeled_examples)

                nan = False
                if nn_utils.isnan(sup_loss.data):
                    print('Nan in sup_loss')
                    nan = True
                if nn_utils.isnan(unsup_encoder_loss.data):
                    print('Nan in unsup_encoder_loss!', file=sys.stderr)
                    nan = True
                if nn_utils.isnan(unsup_decoder_loss.data):
                    print('Nan in unsup_decoder_loss!', file=sys.stderr)
                    nan = True
                if nn_utils.isnan(unsup_baseline_loss.data):
                    print('Nan in unsup_baseline_loss!', file=sys.stderr)
                    nan = True

                if nan:
                    # torch.save((unsup_encoder_loss, unsup_decoder_loss, unsup_baseline_loss, meta_data), 'nan_data.bin')
                    continue

                report_unsup_encoder_loss += unsup_encoder_loss.sum().data[0]
                report_unsup_decoder_loss += unsup_decoder_loss.sum().data[0]
                report_unsup_baseline_loss += unsup_baseline_loss.sum().data[0]
                report_unsup_examples += unsup_encoder_loss.size(0)
            except ValueError as e:
                print(str(e), file=sys.stderr)  # e.message is Python 2 only
                continue
            # except Exception as e:
            #     print('********** Error **********', file=sys.stderr)
            #     print('batch labeled examples: ', file=sys.stderr)
            #     for example in labeled_examples:
            #         print('%s %s' % (example.idx, ' '.join(example.src_sent)), file=sys.stderr)
            #     print('batch unlabeled examples: ', file=sys.stderr)
            #     for example in unlabeled_examples:
            #         print('%s %s' % (example.idx, ' '.join(example.src_sent)), file=sys.stderr)
            #     print(e.message, file=sys.stderr)
            #     traceback.print_exc(file=sys.stderr)
            #     for k, v in meta_data.iteritems():
            #         print('%s: %s' % (k, v), file=sys.stderr)
            #     print('********** Error **********', file=sys.stderr)
            #     continue

            unsup_loss = torch.mean(unsup_encoder_loss) + torch.mean(unsup_decoder_loss) + torch.mean(unsup_baseline_loss)

            loss = sup_loss + args.unsup_loss_weight * unsup_loss
            loss.backward()

            # clip gradient
            grad_norm = torch.nn.utils.clip_grad_norm(structVAE.parameters(), args.clip_grad)
            optimizer.step()

            if train_iter % args.log_every == 0:
                print('[Iter %d] supervised: encoder loss=%.5f, decoder loss=%.5f' %
                      (train_iter,
                       report_encoder_loss / report_examples,
                       report_decoder_loss / report_examples), file=sys.stderr)

                print('[Iter %d] unsupervised: encoder loss=%.5f, decoder loss=%.5f, baseline loss=%.5f' %
                      (train_iter,
                       report_unsup_encoder_loss / report_unsup_examples,
                       report_unsup_decoder_loss / report_unsup_examples,
                       report_unsup_baseline_loss / report_unsup_examples), file=sys.stderr)

                # print('[Iter %d] unsupervised: baseline=%.5f, raw learning signal=%.5f, learning signal=%.5f' %
                #       (train_iter,
                #        meta_data['baseline'].mean().data[0],
                #        meta_data['raw_learning_signal'].mean().data[0],
                #        meta_data['learning_signal'].mean().data[0]), file=sys.stderr)

                if isinstance(structVAE, StructVAE_LMBaseline):
                    print('[Iter %d] baseline: source LM b_lm_weight: %.3f, b: %.3f' %
                          (train_iter, structVAE.b_lm_weight.data[0], structVAE.b.data[0]), file=sys.stderr)

                samples = meta_data['samples']
                for v in meta_data.values():  # itervalues() is Python 2 only
                    if isinstance(v, Variable): v.cpu()

                for i, sample in enumerate(samples[:15]):
                    print('\t[%s] Source: %s' % (sample.idx, ' '.join(sample.src_sent)), file=sys.stderr)
                    print('\t[%s] Code: \n%s' % (sample.idx, sample.tgt_code), file=sys.stderr)
                    ref_example = [e for e in unlabeled_examples if e.idx == int(sample.idx[:sample.idx.index('-')])][0]
                    print('\t[%s] Gold Code: \n%s' % (sample.idx, ref_example.tgt_code), file=sys.stderr)
                    print('\t[%s] Log p(z|x): %f' % (sample.idx, meta_data['encoding_scores'][i].data[0]), file=sys.stderr)
                    print('\t[%s] Log p(x|z): %f' % (sample.idx, meta_data['reconstruction_scores'][i].data[0]), file=sys.stderr)
                    print('\t[%s] KL term: %f' % (sample.idx, meta_data['kl_term'][i].data[0]), file=sys.stderr)
                    print('\t[%s] Prior: %f' % (sample.idx, meta_data['prior'][i].data[0]), file=sys.stderr)
                    print('\t[%s] baseline: %f' % (sample.idx, meta_data['baseline'][i].data[0]), file=sys.stderr)
                    print('\t[%s] Raw Learning Signal: %f' % (sample.idx, meta_data['raw_learning_signal'][i].data[0]), file=sys.stderr)
                    print('\t[%s] Learning Signal - baseline: %f' % (sample.idx, meta_data['learning_signal'][i].data[0]), file=sys.stderr)
                    print('\t[%s] Encoder Loss: %f' % (sample.idx, unsup_encoder_loss[i].data[0]), file=sys.stderr)
                    print('\t**************************', file=sys.stderr)

                report_encoder_loss = report_decoder_loss = report_examples = 0.
                report_unsup_encoder_loss = report_unsup_decoder_loss = report_unsup_baseline_loss = report_unsup_examples = 0.

        print('[Epoch %d] epoch elapsed %ds' % (epoch, time.time() - epoch_begin), file=sys.stderr)

        # perform validation
        print('[Epoch %d] begin validation' % epoch, file=sys.stderr)
        eval_start = time.time()
        eval_results = evaluation.evaluate(dev_set.examples, encoder, args, verbose=True)
        dev_acc = eval_results['accuracy']
        print('[Epoch %d] code generation accuracy=%.5f took %ds' % (epoch, dev_acc, time.time() - eval_start), file=sys.stderr)

        is_better = history_dev_scores == [] or dev_acc > max(history_dev_scores)
        history_dev_scores.append(dev_acc)

        # model_file = args.save_to + '.iter%d.bin' % train_iter
        # print('save model to [%s]' % model_file, file=sys.stderr)
        # structVAE.save(model_file)

        if is_better:
            patience = 0
            model_file = args.save_to + '.bin'
            print('save currently the best model ..', file=sys.stderr)
            print('save model to [%s]' % model_file, file=sys.stderr)
            structVAE.save(model_file)
            # also save the optimizers' state
            torch.save(optimizer.state_dict(), args.save_to + '.optim.bin')
        elif epoch == args.max_epoch:
            print('reached max epoch, stop!', file=sys.stderr)
            exit(0)
        elif patience < args.patience:
            patience += 1
            print('hit patience %d' % patience, file=sys.stderr)

            if patience == args.patience:
                num_trial += 1
                print('hit #%d trial' % num_trial, file=sys.stderr)
                if num_trial == args.max_num_trial:
                    print('early stop!', file=sys.stderr)
                    exit(0)

                # decay lr, and restore from previously best checkpoint
                lr = optimizer.param_groups[0]['lr'] * args.lr_decay
                print('load previously best model and decay learning rate to %f' % lr, file=sys.stderr)

                # load best model's parameters
                structVAE.load_parameters(args.save_to + '.bin')
                if args.cuda: structVAE = structVAE.cuda()

                # load optimizers
                if args.reset_optimizer:
                    print('reset to a new infer_optimizer', file=sys.stderr)
                    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, structVAE.parameters()), lr=lr)
                else:
                    print('restore parameters of the optimizers', file=sys.stderr)
                    optimizer.load_state_dict(torch.load(args.save_to + '.optim.bin'))

                # set new lr
                for param_group in optimizer.param_groups:
                    param_group['lr'] = lr

                # reset patience
                patience = 0
def log_semi(args):
    print('loading VAE at %s' % args.load_model, file=sys.stderr)
    fname, ext = os.path.splitext(args.load_model)
    encoder_path = fname + '.encoder' + ext
    decoder_path = fname + '.decoder' + ext

    vae_params = torch.load(args.load_model, map_location=lambda storage, loc: storage)
    encoder_params = torch.load(encoder_path, map_location=lambda storage, loc: storage)
    decoder_params = torch.load(decoder_path, map_location=lambda storage, loc: storage)

    transition_system = encoder_params['transition_system']
    vae_params['args'].cuda = encoder_params['args'].cuda = decoder_params['args'].cuda = args.cuda

    encoder = Parser(encoder_params['args'], encoder_params['vocab'], transition_system)
    decoder = Reconstructor(decoder_params['args'], decoder_params['vocab'], transition_system)

    if vae_params['args'].prior == 'lstm':
        prior = LSTMPrior.load(vae_params['args'].load_prior,
                               transition_system=decoder_params['transition_system'],
                               cuda=args.cuda)
        print('loaded prior at %s' % vae_params['args'].load_prior, file=sys.stderr)
        # freeze prior parameters
        for p in prior.parameters():
            p.requires_grad = False
        prior.eval()
    else:
        prior = UniformPrior()

    if vae_params['args'].baseline == 'mlp':
        structVAE = StructVAE(encoder, decoder, prior, vae_params['args'])
    elif vae_params['args'].baseline == 'src_lm' or vae_params['args'].baseline == 'src_lm_and_linear':
        src_lm = LSTMLanguageModel.load(vae_params['args'].load_src_lm)
        print('loaded source LM at %s' % vae_params['args'].load_src_lm, file=sys.stderr)
        Baseline = StructVAE_LMBaseline if args.baseline == 'src_lm' else StructVAE_SrcLmAndLinearBaseline
        structVAE = Baseline(encoder, decoder, prior, src_lm, vae_params['args'])
    else:
        raise ValueError('unknown baseline')

    structVAE.load_parameters(args.load_model)
    structVAE.train()
    if args.cuda: structVAE.cuda()

    unlabeled_data = Dataset.from_bin_file(args.unlabeled_file)   # pretend they are un-labeled!

    print('*** begin sampling ***', file=sys.stderr)
    start_time = time.time()
    train_iter = 0
    log_entries = []
    for unlabeled_examples in unlabeled_data.batch_iter(batch_size=args.batch_size, shuffle=False):
        unlabeled_examples = [e for e in unlabeled_examples if len(e.tgt_actions) <= args.decode_max_time_step]
        train_iter += 1

        try:
            unsup_encoder_loss, unsup_decoder_loss, unsup_baseline_loss, meta_data = structVAE.get_unsupervised_loss(
                unlabeled_examples)
        except ValueError as e:
            print(str(e), file=sys.stderr)  # e.message is Python 2 only
            continue

        samples = meta_data['samples']
        for v in meta_data.values():  # itervalues() is Python 2 only
            if isinstance(v, Variable): v.cpu()

        for i, sample in enumerate(samples):
            ref_example = [e for e in unlabeled_examples if e.idx == int(sample.idx[:sample.idx.index('-')])][0]
            log_entry = {
                'sample': sample,
                'ref_example': ref_example,
                'log_p_z_x': meta_data['encoding_scores'][i].data[0],
                'log_p_x_z': meta_data['reconstruction_scores'][i].data[0],
                'kl': meta_data['kl_term'][i].data[0],
                'prior': meta_data['prior'][i].data[0],
                'baseline': meta_data['baseline'][i].data[0],
                'learning_signal': meta_data['raw_learning_signal'][i].data[0],
                'learning_signal - baseline': meta_data['learning_signal'][i].data[0],
                'encoder_loss': unsup_encoder_loss[i].data[0],
                'decoder_loss': unsup_decoder_loss[i].data[0]
            }
            log_entries.append(log_entry)

    print('done! took %d s' % (time.time() - start_time), file=sys.stderr)
    # use a context manager so the file handle is closed after dumping
    with open(args.save_to, 'wb') as f:
        pkl.dump(log_entries, f)
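# --- Hedged helper (not from the original source): read the pickle written by
# log_semi() and summarise one field. Key names follow the log_entry dict above.
def summarize_semi_log(path):
    import pickle

    with open(path, 'rb') as f:
        entries = pickle.load(f)
    signals = [entry['learning_signal'] for entry in entries]
    return len(entries), sum(signals) / max(len(signals), 1)

# n, mean_raw_signal = summarize_semi_log('semi_log.pkl')   # path is illustrative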
    if not isinstance(res, list): res = [res]
    res = [replace_objid_to_str(doc) for doc in res]
    res = [flatten_dict(doc) for doc in res]
    return res


if __name__ == '__main__':
    from config import CONFIG
    from model.parser import Parser
    import pprint
    import os

    parser = Parser(config=CONFIG['XML'])
    DB = DataBase(host=CONFIG['DB']['HOST'], port=CONFIG['DB']['PORT'])
    pp = pprint.PrettyPrinter(indent=4)

    colCursor, fsCursor = DB.get_db('test_db', 'test_col')

    data_dir = [
        '/Users/scott/Desktop/data/saxs/analysis_proper/results/',
        '/Users/scott/Desktop/data/saxs/analysis_proper/thumbnails',
        '/Users/scott/Desktop/data/saxs/tiff'
    ]
    test_files = [
        'C67_GD2-69-6_th0.110_1929.1s_T200.006C_5.00s_61288_saxs.xml',
        'C67_GD2-69-6_th0.110_1929.1s_T200.006C_5.00s_61288_saxs.jpg',
        'C67_GD2-69-6_th0.110_1929.1s_T200.006C_5.00s_61288_saxs.tiff'