def update_archive(thermoml_path=None):
    """Use RSS feeds to find and download any missing ThermoML XML files
    from the ThermoML archive.

    Parameters
    ----------
    thermoml_path : str, optional, default=None
        If specified, use this path to store ThermoML XML files.
        If None, use the THERMOML_PATH environment variable.
    """
    if thermoml_path is None:
        if "THERMOML_PATH" in os.environ:
            thermoml_path = os.environ["THERMOML_PATH"]
        else:
            raise KeyError("You must either specify thermoml_path or the THERMOML_PATH environment variable.")

    for key, url in THERMOML_FEEDS.items():
        feed = feedparser.parse(url)
        for entry in feed["entries"]:
            link = entry["link"]
            base_filename = urllib_parse.urlsplit(link).path
            base_filename = base_filename[1:]  # Strip the leading slash so os.path.join will work
            filename = os.path.join(thermoml_path, base_filename)
            make_path(filename)
            if os.path.exists(filename):
                print("Already downloaded %s from %s" % (filename, link))
            else:
                print("Fetching %s from %s" % (filename, link))
                urllib.request.urlretrieve(link, filename)
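# NOTE: every snippet in this section relies on a `make_path` helper that is not
# defined here, and the call sites show several different signatures. For the
# usage in update_archive above (pass a target *file* path so its parent
# directories exist before downloading), a minimal sketch might look like the
# following; this is an assumption for illustration, not the project's actual utility.
import os

def make_path(filename):
    """Create the parent directory of `filename` if it is missing (hypothetical sketch)."""
    directory = os.path.dirname(os.path.abspath(filename))
    if not os.path.isdir(directory):
        os.makedirs(directory)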
def build(self, ff_name, water_name): out_filename = self.get_initial_pdb_filename(ff_name, water_name) utils.make_path(out_filename) if os.path.exists(out_filename): return if self.pdb_filename is not None: fixer = pdbfixer.PDBFixer(filename=self.pdb_filename) else: fixer = pdbfixer.PDBFixer(pdbid=self.pdb_id) fixer.findMissingResidues() fixer.findNonstandardResidues() fixer.replaceNonstandardResidues() fixer.findMissingAtoms() fixer.addMissingAtoms() fixer.removeHeterogens(True) fixer.addMissingHydrogens(pH=self.pH) n_chains = len(list(fixer.topology.chains())) chains_to_remove = np.setdiff1d(np.arange(n_chains), self.keep_chains) fixer.removeChains(chains_to_remove) app.PDBFile.writeFile(fixer.topology, fixer.positions, open(out_filename, 'w'))
def production(self, ff_name, water_name): equil_pdb_filename = self.get_equil_pdb_filename(ff_name, water_name) production_dcd_filename = self.get_production_dcd_filename(ff_name, water_name) production_protein_dcd_filename = self.get_production_protein_dcd_filename(ff_name, water_name) utils.make_path(production_dcd_filename) if os.path.exists(production_protein_dcd_filename): return ff = app.ForceField('%s.xml' % ff_name, '%s.xml' % water_name) traj = md.load(equil_pdb_filename) top, bonds = traj.top.to_dataframe() atom_indices = top.index[top.chainID == 0].values pdb = app.PDBFile(equil_pdb_filename) system = ff.createSystem(pdb.topology, nonbondedMethod=app.PME, nonbondedCutoff=self.cutoff, constraints=app.HBonds) integrator = mm.LangevinIntegrator(self.temperature, self.friction, self.timestep) system.addForce(mm.MonteCarloBarostat(self.pressure, self.temperature, self.barostat_frequency)) simulation = app.Simulation(pdb.topology, system, integrator) simulation.context.setPositions(pdb.positions) simulation.context.setVelocitiesToTemperature(self.temperature) print('Production.') simulation.reporters.append(md.reporters.DCDReporter(production_protein_dcd_filename, self.protein_output_frequency, atomSubset=atom_indices)) simulation.reporters.append(app.DCDReporter(production_dcd_filename, self.output_frequency)) simulation.step(self.n_steps)
def build(self, ff_name, water_name):
    out_filename = self.get_initial_pdb_filename(ff_name, water_name)
    utils.make_path(out_filename)
    if os.path.exists(out_filename):
        return
    pdbbuilder.build_pdb(self.sequence, out_filename, self.N_cap, self.C_cap, pH=self.pH)
def __init__(self):
    settings = dict(
        debug=True,
        static_path=make_path("static"),
        template_path=make_path("template")
    )
    tornado.web.Application.__init__(self, controllers.routes, **settings)
    self.db = Session
    self.init_gsm()
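# NOTE: the Tornado application above uses `make_path` differently again: it turns the
# names "static" and "template" into paths for the settings dict rather than creating
# directories. A plausible sketch, assuming paths are resolved relative to the module
# that defines the helper (the resolution rule is a guess, not taken from the source).
import os

def make_path(name):
    # Hypothetical: resolve `name` relative to this module's directory,
    # so the app can be started from any working directory.
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), name)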
def production(self): utils.make_path('production/') self.production_dcd_filename = "production/"+self.identifier +"_production.dcd" self.production_pdb_filename = "production/"+self.identifier +"_production.pdb" self.production_data_filename = "production/"+self.identifier +"_production.csv" utils.make_path(self.production_dcd_filename) if os.path.exists(self.production_pdb_filename): return if self.ran_equilibrate: pdb = app.PDBFile(self.equil_pdb_filename) topology = pdb.topology positions = pdb.positions else: positions = self.packed_trj.openmm_positions(0) topology = self.packed_trj.top.to_openmm() topology.setUnitCellDimensions(mm.Vec3(*self.packed_trj.unitcell_lengths[0]) * u.nanometer) ff = self.ffxml system = ff.createSystem(topology, nonbondedMethod=app.PME, nonbondedCutoff=self.cutoff, constraints=app.HBonds) integrator = mm.LangevinIntegrator(self.temperature, self.friction, self.timestep) system.addForce(mm.MonteCarloBarostat(self.pressure, self.temperature, self.barostat_frequency)) simulation = app.Simulation(topology, system, integrator) simulation.context.setPositions(positions) if not self.ran_equilibrate: print('Minimizing.') simulation.minimizeEnergy() simulation.context.setVelocitiesToTemperature(self.temperature) print('Production.') simulation.reporters.append(app.DCDReporter(self.production_dcd_filename, self.output_frequency)) simulation.reporters.append(app.StateDataReporter(self.production_data_filename, self.output_data_frequency, step=True, potentialEnergy=True, temperature=True, density=True)) converged = False while not converged: simulation.step(self.n_steps) d = pd.read_csv(self.production_data_filename, names=["step", "U", "Temperature", "Density"], skiprows=1) density_ts = np.array(d.Density) [t0, g, Neff] = ts.detectEquilibration(density_ts, nskip=1000) density_ts = density_ts[t0:] density_mean_stderr = density_ts.std() / np.sqrt(Neff) if density_mean_stderr < self.stderr_tolerance: converged = True del(simulation) if self.ran_equilibrate: traj = md.load(self.production_dcd_filename, top=self.equil_pdb_filename)[-1] else: traj = md.load(self.production_dcd_filename, top=self.box_pdb_filename)[-1] traj.save(self.production_pdb_filename)
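# NOTE: the production run above repeats simulation.step(self.n_steps) until the standard
# error of the equilibrated density drops below self.stderr_tolerance. The convergence test,
# isolated from the OpenMM machinery, is sketched below; the CSV column names, nskip=1000,
# and pymbar's detectEquilibration are taken directly from the snippet, while the standalone
# function itself is only an illustrative rearrangement.
import numpy as np
import pandas as pd
from pymbar import timeseries as ts

def density_converged(csv_filename, stderr_tolerance):
    """Return True once the mean density is known to within stderr_tolerance."""
    d = pd.read_csv(csv_filename, names=["step", "U", "Temperature", "Density"], skiprows=1)
    density_ts = np.array(d.Density)
    # Discard the unequilibrated prefix and estimate the effective sample size.
    t0, g, Neff = ts.detectEquilibration(density_ts, nskip=1000)
    density_ts = density_ts[t0:]
    density_mean_stderr = density_ts.std() / np.sqrt(Neff)
    return density_mean_stderr < stderr_tolerance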
def equilibrate(self, ff_name, water_name): input_pdb_filename = self.get_initial_pdb_filename(ff_name, water_name) equil_pdb_filename = self.get_equil_pdb_filename(ff_name, water_name) equil_dcd_filename = self.get_equil_dcd_filename(ff_name, water_name) equil_protein_pdb_filename = self.get_equil_protein_pdb_filename(ff_name, water_name) utils.make_path(equil_pdb_filename) if os.path.exists(equil_pdb_filename): return ff = app.ForceField('%s.xml' % ff_name, '%s.xml' % water_name) pdb = app.PDBFile(input_pdb_filename) modeller = app.Modeller(pdb.topology, pdb.positions) modeller.addSolvent(ff, model=water_mapping[water_name], padding=self.padding, ionicStrength=self.ionic_strength) topology = modeller.getTopology() positions = modeller.getPositions() system = ff.createSystem(topology, nonbondedMethod=app.PME, nonbondedCutoff=self.cutoff, constraints=app.HBonds) integrator = mm.LangevinIntegrator(self.temperature, self.equil_friction, self.equil_timestep) system.addForce(mm.MonteCarloBarostat(self.pressure, self.temperature, self.barostat_frequency)) platform = mm.Platform.getPlatformByName("CUDA") platform.setPropertyDefaultValue("CudaDeviceIndex", os.environ["CUDA_VISIBLE_DEVICES"]) simulation = app.Simulation(topology, system, integrator, platform=platform) simulation.context.setPositions(positions) print('Minimizing.') simulation.minimizeEnergy() simulation.context.setVelocitiesToTemperature(self.temperature) print('Equilibrating.') simulation.reporters.append(app.PDBReporter(equil_pdb_filename, self.n_equil_steps - 1)) simulation.reporters.append(app.DCDReporter(equil_dcd_filename, self.equil_output_frequency)) simulation.step(self.n_equil_steps) del simulation del system traj = md.load(equil_dcd_filename, top=equil_pdb_filename)[-1] traj.save(equil_pdb_filename) top, bonds = traj.top.to_dataframe() atom_indices = top.index[top.chainID == 0].values traj.restrict_atoms(atom_indices) traj.save(equil_protein_pdb_filename)
def build(self): utils.make_path('monomers/') utils.make_path('boxes/') utils.make_path('ffxml/') self.monomer_pdb_filenames = ["monomers/"+string+".pdb" for string in self.cas_strings] self.box_pdb_filename = "boxes/" + self.identifier + ".pdb" self.ffxml_filename = "ffxml/" + '_'.join(self.cas_strings) + ".xml" utils.make_path(self.box_pdb_filename) rungaff = False if not os.path.exists(self.ffxml_filename): rungaff = True if not os.path.exists(self.box_pdb_filename): for filename in self.monomer_pdb_filenames: if not os.path.exists(filename): rungaff = True if rungaff: self.smiles_strings = [] for mlc in self.cas_strings: self.smiles_strings.append(resolve(mlc, 'smiles')) oemlcs = [] with gaff2xml.utils.enter_temp_directory(): # Avoid dumping 50 antechamber files in local directory. for smiles_string in self.smiles_strings: m = gaff2xml.openeye.smiles_to_oemol(smiles_string) m = gaff2xml.openeye.get_charges(m, strictStereo=False, keep_confs=1) oemlcs.append(m) ligand_trajectories, ffxml = gaff2xml.openeye.oemols_to_ffxml(oemlcs) if not os.path.exists(self.ffxml_filename): outfile = open(self.ffxml_filename, 'w') outfile.write(ffxml.read()) outfile.close() ffxml.seek(0) for k, ligand_traj in enumerate(ligand_trajectories): pdb_filename = self.monomer_pdb_filenames[k] if not os.path.exists(pdb_filename): ligand_traj.save(pdb_filename) self.ffxml = app.ForceField(self.ffxml_filename) if "7732-18-5" in self.cas_strings: self.ffxml.loadFile("tip3p.xml") if not os.path.exists(self.box_pdb_filename): self.packed_trj = gaff2xml.packmol.pack_box(self.monomer_pdb_filenames, self.n_monomers) self.packed_trj.save(self.box_pdb_filename) else: self.packed_trj = md.load(self.box_pdb_filename)
def equilibrate(self): self.ran_equilibrate = True utils.make_path('equil/') self.equil_dcd_filename = "equil/"+self.identifier +"_equil.dcd" self.equil_pdb_filename = "equil/"+self.identifier +"_equil.pdb" utils.make_path(self.equil_pdb_filename) if os.path.exists(self.equil_pdb_filename): return positions = self.packed_trj.openmm_positions(0) topology = self.packed_trj.top.to_openmm() topology.setUnitCellDimensions(mm.Vec3(*self.packed_trj.unitcell_lengths[0]) * u.nanometer) ff = self.ffxml system = ff.createSystem(topology, nonbondedMethod=app.PME, nonbondedCutoff=self.cutoff, constraints=app.HBonds) integrator = mm.LangevinIntegrator(self.temperature, self.equil_friction, self.equil_timestep) system.addForce(mm.MonteCarloBarostat(self.pressure, self.temperature, self.barostat_frequency)) simulation = app.Simulation(topology, system, integrator) simulation.context.setPositions(positions) print('Minimizing.') simulation.minimizeEnergy() simulation.context.setVelocitiesToTemperature(self.temperature) print('Equilibrating.') simulation.reporters.append(app.DCDReporter(self.equil_dcd_filename, self.equil_output_frequency)) simulation.step(self.n_equil_steps) # Re-write a better PDB with correct box sizes. traj = md.load(self.equil_dcd_filename, top=self.box_pdb_filename)[-1] traj.save(self.equil_pdb_filename)
def log(): global best_score print("Logging") tr_logits, tr_cost = iter_apply( trX[:n_valid], trM[:n_valid], trY[:n_valid]) va_logits, va_cost = iter_apply(vaX, vaM, vaY) tr_cost = tr_cost / len(trY[:n_valid]) va_cost = va_cost / n_valid tr_acc = accuracy_score(trY[:n_valid], np.argmax(tr_logits, 1)) * 100. va_acc = accuracy_score(vaY, np.argmax(va_logits, 1)) * 100. logger.log(n_epochs=n_epochs, n_updates=n_updates, tr_cost=tr_cost, va_cost=va_cost, tr_acc=tr_acc, va_acc=va_acc) print('%d %d %.3f %.3f %.2f %.2f' % (n_epochs, n_updates, tr_cost, va_cost, tr_acc, va_acc)) if submit: score = va_acc if score > best_score: best_score = score path = os.path.join(save_dir, desc, 'best_params') chainer.serializers.save_npz(make_path(path), model)
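# NOTE: here `make_path` is nested directly inside
# chainer.serializers.save_npz(make_path(path), model), which only works if the helper
# returns the path it was given after ensuring the directories exist. A sketch consistent
# with that call pattern (an assumption about the helper, not its actual source):
import os

def make_path(filename):
    # Hypothetical: create the parent directory if needed, then hand the
    # path back so the call can be nested inside a save/load function.
    directory = os.path.dirname(filename)
    if directory and not os.path.exists(directory):
        os.makedirs(directory)
    return filename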
def save(save_path, postfix):
    ps = sess.run(params)
    make_path(save_path)
    joblib.dump(ps, save_path + "/model" + str(postfix) + ".pkl")
avg_params = ema.apply(tf.trainable_variables()) train = tf.group(train, avg_params) if not hps.profile: _, ema_loss, ema_states = model(X, S, Y, hps, train=False, ema=ema) # Logging timestamp = time.strftime('r%Y_%m_%d_%H_%M_%S') log_file = os.path.join(hps.logdir, 'lm', timestamp, "log.txt") json_file = os.path.join(hps.logdir, 'lm', timestamp, "json.txt") if os.path.exists(log_file): # avoid 2 jobs sharing log (quick and dirty fix) print(log_file, "already exists, exiting.") exit() make_path(log_file) logging.basicConfig(format='%(asctime)s [%(levelname)s] %(message)s', filename=log_file, level=logging.DEBUG) logging.getLogger().addHandler( logging.StreamHandler()) # Print logs to stderr as well hps.num_params = str(num_trainable_params("model0")) print_trainable_params("model0") json_header = {} for key in sorted(hps.__dict__.keys()): if type(hps.__dict__[key]) in (str, int, float, type, tf.DType): logging.info(str(key) + ': ' + str(hps.__dict__[key])) json_header[str(key)] = str(hps.__dict__[key])
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences]))
            )
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of lists containing indices
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make path for storing log and model if it does not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                        iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
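# NOTE: in the NER training scripts, make_path(FLAGS) receives the whole flags object and is
# expected to create the output directories if they are missing (it is even called twice,
# harmlessly, in several of these snippets). A minimal sketch under that assumption;
# `ckpt_path` is read off FLAGS elsewhere in the scripts, while the literal "result" and
# "log" directory names are inferred from 'result/ner_predict.utf8' and
# os.path.join("log", FLAGS.log_file). This is illustrative, not the repo's actual utility.
import os

def make_path(params):
    # Hypothetical: ensure the checkpoint, result and log directories exist.
    for path in [params.ckpt_path, "result", "log"]:
        if not os.path.isdir(path):
            os.makedirs(path)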
def evaluate_testDataSet(): # load data sets train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros) dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros) test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros) # Use selected tagging scheme (IOB / IOBES) update_tag_scheme(train_sentences, FLAGS.tag_schema) update_tag_scheme(test_sentences, FLAGS.tag_schema) # create maps if not exist if not os.path.isfile(FLAGS.map_file): # create dictionary for word if FLAGS.pre_emb: dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0] dico_chars, char_to_id, id_to_char = augment_with_pretrained( dico_chars_train.copy(), FLAGS.emb_file, list( itertools.chain.from_iterable([[w[0] for w in s] for s in test_sentences]))) else: _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower) # Create a dictionary and a mapping for tags _t, tag_to_id, id_to_tag = tag_mapping(train_sentences) with open(FLAGS.map_file, "wb") as f: pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f) else: with open(FLAGS.map_file, "rb") as f: char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) # prepare data, get a collection of list containing index train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower) dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower) test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower) print("%i / %i / %i sentences in train / dev / test." % (len(train_data), len(dev_data), len(test_data))) dev_manager = BatchManager(dev_data, 100) test_manager = BatchManager(test_data, 100) # make path for store log and model if not exist make_path(FLAGS) if os.path.isfile(FLAGS.config_file): config = load_config(FLAGS.config_file) else: config = config_model(char_to_id, tag_to_id) save_config(config, FLAGS.config_file) make_path(FLAGS) log_path = os.path.join("log", FLAGS.log_file) logger = get_logger(log_path) print_config(config, logger) # limit GPU memory tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True with tf.Session(config=tf_config) as sess: model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger) for i in range(100): best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger) if best: save_model(sess, model, FLAGS.ckpt_path, logger) evaluate(sess, model, "test", test_manager, id_to_tag, logger)
'synt1bow': synt1_bow, 'synt2bow': synt2_bow} print("==== loading data ====") num = 1000000 para_data = h5py.File(os.path.join(args.data_dir, 'data.h5'), 'r') train_idxs, valid_idxs = random_split(range(num), [num-5000, 5000], generator=torch.Generator().manual_seed(args.seed)) print(f"number of train examples: {len(train_idxs)}") print(f"number of valid examples: {len(valid_idxs)}") train_loader = DataLoader(train_idxs, batch_size=args.train_batch_size, shuffle=True) valid_loader = DataLoader(valid_idxs, batch_size=args.valid_batch_size, shuffle=False) print("==== preparing data ====") make_path(args.cache_dir) tokenizer = BartTokenizer.from_pretrained('facebook/bart-base', cache_dir=args.cache_dir) with open('synt_vocab.pkl', 'rb') as f: synt_vocab = pickle.load(f) dataset = prepare_dataset(para_data, tokenizer, num) print("==== loading model ====") config = BartConfig.from_pretrained('facebook/bart-base', cache_dir=args.cache_dir) config.word_dropout = args.word_dropout config.max_sent_len = args.max_sent_len config.max_synt_len = args.max_synt_len bart = BartModel.from_pretrained('facebook/bart-base', cache_dir=args.cache_dir) model = ParaBart(config)
def save(save_path):
    ps = sess.run(params)
    make_path(osp.dirname(save_path))
    joblib.dump(ps, save_path)
shuffle=False) # load model model = SynPG(len(dictionary), 300, word_dropout=args.word_dropout) model.load_state_dict(torch.load(args.model_path)) optimizer = torch.optim.Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) criterion = nn.CrossEntropyLoss(ignore_index=dictionary.word2idx["<pad>"]) model = model.cuda() criterion = criterion.cuda() # create folders make_path(args.model_dir) make_path(args.output_dir) print("==== start training ====") for epoch in range(1, args.n_epoch + 1): # training train(epoch, model, train_data, valid_data, train_loader, valid_loader, optimizer, criterion, dictionary, bpe, args) # save model torch.save( model.state_dict(), os.path.join(args.model_dir, "synpg_epoch{:02d}.pt".format(epoch))) # shuffle training data train_loader = DataLoader(train_idxs, batch_size=args.batch_size, shuffle=True)
def train(): train_sentences = load_sentences(FLAGS.train_file) dev_sentences = load_sentences(FLAGS.dev_file) test_sentences = load_sentences(FLAGS.test_file) update_tag_scheme(train_sentences, FLAGS.tag_schema) update_tag_scheme(test_sentences, FLAGS.tag_schema) update_tag_scheme(dev_sentences, FLAGS.tag_schema) if not os.path.isfile(FLAGS.map_file): if FLAGS.pre_emb: dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0] dico_chars, char_to_id, id_to_char = augment_with_pretrained( dico_chars_train.copy(), FLAGS.emb_file, list( itertools.chain.from_iterable([[w[0] for w in s] for s in test_sentences]))) else: _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower) _t, tag_to_id, id_to_tag = tag_mapping(train_sentences) with open(FLAGS.map_file, 'wb') as f: pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f) else: with open(FLAGS.map_file, 'rb') as f: char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id) dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id) test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id) train_manager = BatchManager(train_data, FLAGS.batch_size, FLAGS.num_steps) dev_manager = BatchManager(dev_data, 100, FLAGS.num_steps) test_manager = BatchManager(test_data, 100, FLAGS.num_steps) make_path(FLAGS) if os.path.isfile(FLAGS.config_file): config = load_config(FLAGS.config_file) else: config = config_model(char_to_id, tag_to_id) save_config(config, FLAGS.config_file) make_path(FLAGS) log_path = os.path.join("log", FLAGS.log_file) logger = get_logger(log_path) print_config(config, logger) os.environ["CUDA_VISIBLE_DEVICES"] = "3" gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9) tf_config = tf.ConfigProto(gpu_options=gpu_options) tf_config.gpu_options.allow_growth = True steps_per_epoch = train_manager.len_data with tf.Session(config=tf_config) as sess: model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger) logger.info("start training") loss = [] for i in range(75): for batch in train_manager.iter_batch(shuffle=True): step, batch_loss = model.run_step(sess, True, batch) loss.append(batch_loss) if step % FLAGS.steps_check == 0: iteration = step // steps_per_epoch + 1 logger.info("iteration:{} step:{}/{},".format( iteration, step % steps_per_epoch, steps_per_epoch)) loss = [] best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger) if best: save_model(sess, model, FLAGS.ckpt_path, logger) evaluate(sess, model, "test", test_manager, id_to_tag, logger)
def find_storage_space(pth, identifier=ORIGINAL_IDENTIFIER):
    '''Find a new path with the identifier'''
    name, ext = pth.splitext()
    return make_path(name + identifier + ext, sep='').abspath()
def test_make_path(self):
    self.assertFalse(make_path(__file__).exists())
def train():
    # load data sets
    datasets = load_sentences(FLAGS.train_file, FLAGS.lower)
    np.random.seed(1)
    np.random.shuffle(datasets)
    train_sentences = datasets[:15000]
    test_sentences = datasets[15000:]

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        char_to_id, _ = elmo_char_mapping(FLAGS.elmo_vocab)
        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of lists containing indices
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i sentences in train / dev." % (len(train_data), len(test_data)))

    elmo_batcher = get_batcher()
    train_manager = BatchManager(train_data, FLAGS.batch_size, elmo_batcher)
    test_manager = BatchManager(test_data, FLAGS.batch_size, elmo_batcher)

    # make path for storing log and model if it does not exist
    make_path(FLAGS)
    # if os.path.isfile(FLAGS.config_file):
    #     config = load_config(FLAGS.config_file)
    # else:
    config = config_model(tag_to_id)
    # save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto(allow_soft_placement=True)
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        elmo_model = load_elmo()
        model = create_model(sess, Model, FLAGS.ckpt_path, elmo_model, config, logger)
        # ckpt_file = tf.train.latest_checkpoint(FLAGS.ckpt_path)
        # if not ckpt_file:
        #     model.lr /= 100  # train the first epoch with a very small lr
        logger.info("start training")
        loss = []
        f1score_lis = [0]
        for i in range(FLAGS.max_epoch):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, NER loss:{:>9.6f}".format(
                        iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "test", test_manager, id_to_tag, logger)  # this step writes the ner_predict.utf8 file
            f1score = BIO_F1score(predict='result/ner_predict.utf8')
            logger.info('BIOf1score:{}'.format(f1score))
            f1score_lis.append(f1score)
            # if i == 0:
            #     model.lr = FLAGS.lr  # after the first epoch, restore lr to its initial value
            if best and f1score_lis[-1] > f1score_lis[-2]:
                save_model(sess, model, FLAGS.ckpt_path, logger, step)
            else:
                model.lr *= 0.95
                logger.info('lr:{}'.format(model.lr))
                ckpt = tf.train.get_checkpoint_state(FLAGS.ckpt_path)
                if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
                    logger.info("Reading model parameters from %s" % ckpt.model_checkpoint_path)
                    model.saver.restore(sess, ckpt.model_checkpoint_path)
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Choose the tagging scheme (IOB / IOBES); IOBES is used by default
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    if not os.path.isfile(FLAGS.map_file):
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            # e.g. {'S-LOC': 10, 'E-LOC': 3, 'B-ORG': 4, 'S-PER': 11, 'S-ORG': 12, 'O': 0,
            #       'E-ORG': 5, 'I-LOC': 6, 'I-PER': 7, 'I-ORG': 1, 'B-PER': 8, 'B-LOC': 2, 'E-PER': 9}
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # convert the sentences into numeric data
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), len(test_data)))

    # sequences shorter than the batch length are padded with 0
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # GPU settings
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                        iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    # the loss is averaged and reported every steps_check steps, then reset
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
load_openai_pretrained_model(model, n_ctx=n_ctx, n_special=n_special) if device_id >= 0: cuda.cupy.random.seed(seed) model.to_gpu() lm_head.to_gpu() clf_head.to_gpu() n_updates = 0 n_epochs = 0 if dataset != 'stsb': trYt = trY if submit: path = os.path.join(save_dir, desc, 'best_params') chainer.serializers.save_npz(make_path(path), model) best_score = 0 for i in range(n_iter): print("running epoch", i) run_epoch() n_epochs += 1 log() if submit: path = os.path.join(save_dir, desc, 'best_params') chainer.serializers.load_npz(make_path(path), model) predict() if analysis: if dataset == 'rocstories': rocstories_analysis( data_dir, os.path.join( submission_dir, filenames[dataset]), os.path.join(
def train(): # load data sets train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros) dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros) test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros) # Use selected tagging scheme (IOB / IOBES) update_tag_scheme(train_sentences, FLAGS.tag_schema) update_tag_scheme(test_sentences, FLAGS.tag_schema) # create maps if not exist if not os.path.isfile(FLAGS.map_file): # create dictionary for word if FLAGS.pre_emb: dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0] dico_chars, char_to_id, id_to_char = augment_with_pretrained( dico_chars_train.copy(), FLAGS.emb_file, list(itertools.chain.from_iterable( [[w[0] for w in s] for s in test_sentences]) ) ) else: _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower) # Create a dictionary and a mapping for tags _t, tag_to_id, id_to_tag = tag_mapping(train_sentences) with open(FLAGS.map_file, "wb") as f: pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f) else: with open(FLAGS.map_file, "rb") as f: char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) # prepare data, get a collection of list containing index train_data = prepare_dataset( train_sentences, char_to_id, tag_to_id, FLAGS.lower ) dev_data = prepare_dataset( dev_sentences, char_to_id, tag_to_id, FLAGS.lower ) test_data = prepare_dataset( test_sentences, char_to_id, tag_to_id, FLAGS.lower ) print("%i / %i / %i sentences in train / dev / test." % ( len(train_data), 0, len(test_data))) train_manager = BatchManager(train_data, FLAGS.batch_size) dev_manager = BatchManager(dev_data, 100) test_manager = BatchManager(test_data, 100) # make path for store log and model if not exist make_path(FLAGS) if os.path.isfile(FLAGS.config_file): config = load_config(FLAGS.config_file) else: config = config_model(char_to_id, tag_to_id) save_config(config, FLAGS.config_file) make_path(FLAGS) log_path = os.path.join("log", FLAGS.log_file) logger = get_logger(log_path) print_config(config, logger) # limit GPU memory tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True steps_per_epoch = train_manager.len_data with tf.Session(config=tf_config) as sess: model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger) logger.info("start training") loss = [] for i in range(100): for batch in train_manager.iter_batch(shuffle=True): #print batch step, batch_loss = model.run_step(sess, True, batch) #print step loss.append(batch_loss) if step % FLAGS.steps_check == 0: iteration = step // steps_per_epoch + 1 logger.info("iteration:{} step:{}/{}, " "NER loss:{:>9.6f}".format( iteration, step%steps_per_epoch, steps_per_epoch, np.mean(loss))) loss = [] best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger) if best: save_model(sess, model, FLAGS.ckpt_path, logger) evaluate(sess, model, "test", test_manager, id_to_tag, logger)
def production(self): utils.make_path('production/') self.production_dcd_filename = "production/" + self.identifier + "_production.dcd" self.production_pdb_filename = "production/" + self.identifier + "_production.pdb" self.production_data_filename = "production/" + self.identifier + "_production.csv" utils.make_path(self.production_dcd_filename) if os.path.exists(self.production_pdb_filename): return if self.ran_equilibrate: pdb = app.PDBFile(self.equil_pdb_filename) topology = pdb.topology positions = pdb.positions else: positions = self.packed_trj.openmm_positions(0) topology = self.packed_trj.top.to_openmm() topology.setUnitCellDimensions( mm.Vec3(*self.packed_trj.unitcell_lengths[0]) * u.nanometer) ff = self.ffxml system = ff.createSystem(topology, nonbondedMethod=app.PME, nonbondedCutoff=self.cutoff, constraints=app.HBonds) integrator = mm.LangevinIntegrator(self.temperature, self.friction, self.timestep) system.addForce( mm.MonteCarloBarostat(self.pressure, self.temperature, self.barostat_frequency)) simulation = app.Simulation(topology, system, integrator) simulation.context.setPositions(positions) if not self.ran_equilibrate: print('Minimizing.') simulation.minimizeEnergy() simulation.context.setVelocitiesToTemperature(self.temperature) print('Production.') simulation.reporters.append( app.DCDReporter(self.production_dcd_filename, self.output_frequency)) simulation.reporters.append( app.StateDataReporter(self.production_data_filename, self.output_data_frequency, step=True, potentialEnergy=True, temperature=True, density=True)) converged = False while not converged: simulation.step(self.n_steps) d = pd.read_csv(self.production_data_filename, names=["step", "U", "Temperature", "Density"], skiprows=1) density_ts = np.array(d.Density) [t0, g, Neff] = ts.detectEquilibration(density_ts, nskip=1000) density_ts = density_ts[t0:] density_mean_stderr = density_ts.std() / np.sqrt(Neff) if density_mean_stderr < self.stderr_tolerance: converged = True del (simulation) if self.ran_equilibrate: traj = md.load(self.production_dcd_filename, top=self.equil_pdb_filename)[-1] else: traj = md.load(self.production_dcd_filename, top=self.box_pdb_filename)[-1] traj.save(self.production_pdb_filename)
def train():
    # load data sets: each corpus is returned as [['char', 'tag'], ...] pairs
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # the data handling is done by loader.py
    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:  # whether to use previously trained character embeddings
            # dico_chars_train receives only dico (note the trailing [0]), i.e. the set of unique characters in the training data
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable([[w[0] for w in s] for s in test_sentences]))
                # chain.from_iterable(iterables) is an alternate chain constructor that flattens the iterables,
                # so this list is the set of characters appearing in test_sentences
            )
            # dico_chars is now the train-set dictionary augmented with the test-set characters that appear in wiki_100
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
            # pickle serialization lets us persist the in-memory objects to a file
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of lists containing indices
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    # each xxx_data entry stores, per sentence: [characters, character ids,
    # tag ids (or a list of the 'O' tag id with the same length as chars, depending on train=True/False), tags]
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)  # the default batch_size is 20
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)
    # three BatchManager instances are created; each holds batch_data and len_data
    # batch_data: the data sorted by sentence length and split into batches, with every batch padded to a uniform length
    # len_data: the number of batches

    # make path for storing log and model if it does not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()  # tf.ConfigProto is typically used when creating a session, to configure its parameters
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                        iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
def train(): # load data sets # 句子集合 = [[句子1],[句子2],[句子3]],句子1 = [我 O,在 O,。。。] #<class 'list'>: [['海', 'O'], ['钓', 'O'], ['比', 'O'], ['赛', 'O'], ['地', 'O'], ['点', 'O'], ['在', 'O'], ['厦', 'B-LOC'], ['门', 'I-LOC'], ['与', 'O'], ['金', 'B-LOC'], ['门', 'I-LOC'], ['之', 'O'], ['间', 'O'], ['的', 'O'], ['海', 'O'], ['域', 'O'], ['。', 'O']] # train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros) # dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros) # test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros) from xlnet_base.xlnet_data_utils import XLNetDataUtils sp_model = spm.SentencePieceProcessor() sp_model.Load('./chinese_xlnet_base_L-12_H-768_A-12/spiece.model') train_data = XLNetDataUtils(sp_model, batch_size=FLAGS.batch_size, entry="train") dev_data = XLNetDataUtils(sp_model, batch_size=FLAGS.batch_size, entry="dev") test_data = XLNetDataUtils(sp_model, batch_size=FLAGS.batch_size, entry="test") dev_batch = dev_data.iteration() def datapadding(data): alldatalist = [] datalist = data.data max_length = 64 for i in range(len(datalist)): tmpdatalist = [] token = datalist[i][0] segmentid = datalist[i][1] inputid = datalist[i][2] inputmask = datalist[i][3] labellist = datalist[i][4] #token label if len(labellist) < max_length: for i in range(max_length - len(token)): labellist.append(0) elif len(labellist) > max_length: tmplabellist = [] for i in range(max_length): tmplabellist.append(labellist[i]) labellist = tmplabellist #segmentid inputid inputmask if len(segmentid) < max_length: for i in range(max_length - len(segmentid)): segmentid.append(0) inputid.append(0) inputmask.append(0) elif len(segmentid) > max_length: tmpsegmentid = [] tmpinputid = [] tmpinputmask = [] for i in range(max_length): tmpsegmentid.append(segmentid[i]) tmpinputid.append(inputid[i]) tmpinputmask.append(inputmask[i]) segmentid = tmpsegmentid inputid = tmpinputid inputmask = tmpinputmask tmpdatalist.append(token) tmpdatalist.append(segmentid) tmpdatalist.append(inputid) tmpdatalist.append(inputmask) tmpdatalist.append(labellist) alldatalist.append(tmpdatalist) return alldatalist ftraindata = datapadding(train_data) fdevdata = datapadding(dev_data) ftestdata = datapadding(test_data) print(len(ftraindata)) print(len(fdevdata)) print(len(ftestdata)) # traindata = { # "batch_size": train_data.batch_size, # "input_size": train_data.input_size, # "vocab": train_data.vocab, # "tag_map": train_data.tag_map, # } # devdata = { # "batch_size": dev_data.batch_size, # "input_size": dev_data.input_size, # "vocab": dev_data.vocab, # "tag_map": dev_data.tag_map, # } # testdata = { # "batch_size": test_data.batch_size, # "input_size": test_data.input_size, # "vocab": test_data.vocab, # "tag_map": test_data.tag_map, # } # if not os.path.exists("./model/train_data_map.pkl"): # f = open("./model/train_data_map.pkl", "wb") # pickle.dump(traindata, f) # f.close() # if not os.path.exists("./model/dev_data_map.pkl"): # f = open("./model/dev_data_map.pkl", "wb") # pickle.dump(devdata, f) # f.close() # if not os.path.exists("./model/test_data_map.pkl"): # f = open("./model/test_data_map.pkl", "wb") # pickle.dump(testdata, f) # f.close() # Use selected tagging scheme (IOB / IOBES) #update_tag_scheme(train_sentences, FLAGS.tag_schema) #update_tag_scheme(test_sentences, FLAGS.tag_schema) # create maps if not exist if not os.path.isfile(FLAGS.map_file): # Create a dictionary and a mapping for tags ''' _t:{'O': 869087, 'B-LOC': 16571, 'I-LOC': 22531, 'B-PER': 8144, 'I-PER': 15881, 'B-ORG': 9277, 
'I-ORG': 37689, '[SEP]': 8, '[CLS]': 10} id_to_tag:{0: 'O', 1: 'I-ORG', 2: 'I-LOC', 3: 'B-LOC', 4: 'I-PER', 5: 'B-ORG', 6: 'B-PER', 7: '[CLS]', 8: '[SEP]'} tag_to_id:{'O': 0, 'I-ORG': 1, 'I-LOC': 2, 'B-LOC': 3, 'I-PER': 4, 'B-ORG': 5, 'B-PER': 6, '[CLS]': 7, '[SEP]': 8} ''' tag_to_id = train_data.tag_map id_to_tag = {v: k for k, v in tag_to_id.items()} with open(FLAGS.map_file, "wb") as f: pickle.dump([tag_to_id, id_to_tag], f) else: with open(FLAGS.map_file, "rb") as f: tag_to_id, id_to_tag = pickle.load(f) # prepare data, get a collection of list containing index ''' [['在', '这', '里', '恕', '弟', '不', '恭', '之', '罪', ',', '敢', '在', '尊', '前', '一', '诤', ':', '前', '人', '论', '书', ',', '每', '曰', '“', '字', '字', '有', '来', '历', ',', '笔', '笔', '有', '出', '处', '”', ',', '细', '读', '公', '字', ',', '何', '尝', '跳', '出', '前', '人', '藩', '篱', ',', '自', '隶', '变', '而', '后', ',', '直', '至', '明', '季', ',', '兄', '有', '何', '新', '出', '?'], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1762, 6821, 7027, 2609, 2475, 679, 2621, 722, 5389, 8024, 3140, 1762, 2203, 1184, 671, 6420, 8038, 1184, 782, 6389, 741, 8024, 3680, 3288, 100, 2099, 2099, 3300, 3341, 1325, 8024, 5011, 5011, 3300, 1139, 1905, 100, 8024, 5301, 6438, 1062, 2099, 8024, 862, 2214, 6663, 1139, 1184, 782, 5974, 5075, 8024, 5632, 7405, 1359, 5445, 1400, 8024, 4684, 5635, 3209, 2108, 8024, 1040, 3300, 862, 3173, 1139, 8043, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] ''' # train_data = prepare_dataset( # train_sentences, FLAGS.max_seq_len, tag_to_id, FLAGS.lower # ) # dev_data = prepare_dataset( # dev_sentences, FLAGS.max_seq_len, tag_to_id, FLAGS.lower # ) # test_data = prepare_dataset( # test_sentences, FLAGS.max_seq_len, tag_to_id, FLAGS.lower # ) print("%i / %i / %i sentences in train / dev / test." 
% (len(train_data.data), len(dev_data.data), len(test_data.data))) train_manager = BatchManager(ftraindata, FLAGS.batch_size) dev_manager = BatchManager(fdevdata, FLAGS.batch_size) test_manager = BatchManager(ftestdata, FLAGS.batch_size) # make path for store log and model if not exist make_path(FLAGS) if os.path.isfile(FLAGS.config_file): config = load_config(FLAGS.config_file) else: config = config_model(tag_to_id) save_config(config, FLAGS.config_file) make_path(FLAGS) log_path = os.path.join("log", FLAGS.log_file) logger = get_logger(log_path) print_config(config, logger) # limit GPU memory tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True steps_per_epoch = train_manager.len_data with tf.Session(config=tf_config) as sess: model = create_model(sess, Model, FLAGS.ckpt_path, config, logger) logger.info("start training") loss = [] for i in range(100): for batch in train_manager.iter_batch(shuffle=True): step, batch_loss = model.run_step(sess, True, batch) loss.append(batch_loss) if step % FLAGS.steps_check == 0: iteration = step // steps_per_epoch + 1 logger.info("iteration:{} step:{}/{}, " "NER loss:{:>9.6f}".format( iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss))) loss = [] best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger) if best: save_model(sess, model, FLAGS.ckpt_path, logger, global_steps=step) evaluate(sess, model, "test", test_manager, id_to_tag, logger)
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    # check and normalize the tag annotations of the data sets
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)

    # create maps if not exist
    # build the char_to_id, id_to_char, tag_to_id, id_to_tag dictionaries from the data sets and store them as a pkl file
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for word
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]
            # augment (extend) the character dictionary with the pretrained embedding vocabulary, then return the character/index mappings
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        # get the tag/index mappings
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        # with open('maps.txt', 'w', encoding='utf8') as f1:
        #     f1.writelines(str(char_to_id) + " " + id_to_char + " " + str(tag_to_id) + " " + id_to_tag + '\n')
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # extract sentence features
    # prepare data, get a collection of lists containing indices
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test." % (
        len(train_data), len(dev_data), len(test_data)))

    # build single batches that can be fed to the model
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make path for storing log and model if it does not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    # number of iterations needed for one full pass over the training set
    steps_per_epoch = train_manager.len_data
    with tf.Session(config=tf_config) as sess:
        # model creation is the core code of this project
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
        logger.info("start training")
        loss = []
        with tf.device("/gpu:0"):
            for i in range(100):
                for batch in train_manager.iter_batch(shuffle=True):
                    step, batch_loss = model.run_step(sess, True, batch)
                    loss.append(batch_loss)
                    if step % FLAGS.steps_check == 0:
                        iteration = step // steps_per_epoch + 1
                        logger.info("iteration:{} step:{}/{}, "
                                    "NER loss:{:>9.6f}".format(
                            iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                        loss = []

                # best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
                if i % 7 == 0:
                    save_model(sess, model, FLAGS.ckpt_path, logger)
compute_loss_fct = MultipleChoiceLossCompute(criterion, criterion, args.lm_coef, model_opt) load_openai_pretrained_model(dh_model.transformer, n_ctx=n_ctx, n_special=n_special) dh_model.to(device) dh_model = nn.DataParallel(dh_model) n_updates = 0 n_epochs = 0 if dataset != 'stsb': trYt = trY if submit: path = os.path.join(save_dir, desc, 'best_params') torch.save(dh_model.state_dict(), make_path(path)) best_score = 0 for i in range(args.n_iter): print("running epoch", i) run_epoch() n_epochs += 1 log(save_dir, desc) if submit: path = os.path.join(save_dir, desc, 'best_params') dh_model.load_state_dict(torch.load(path)) predict(dataset, args.submission_dir) if args.analysis: rocstories_analysis( data_dir, os.path.join(args.submission_dir, 'ROCStories.tsv'), os.path.join(log_dir, 'rocstories.jsonl'))
def train(self): make_path(self.FLAGS) logger = get_logger(self.FLAGS.logfile_path) # load data sets # use generator to avoid memory oversize train_sentences = SentenceGenerator(self.FLAGS.train_file, self.FLAGS.zeros) logger.info("Train sentence generator is initialized") dev_sentences = SentenceGenerator(self.FLAGS.dev_file, self.FLAGS.zeros) logger.info("Dev sentence generator is initialized") # create maps if not exist if not tf.gfile.Exists(self.FLAGS.mapfile_path): # create dictionary for word _, char_to_id, id_to_char = char_mapping(train_sentences(), self.FLAGS.lower) logger.info("Created dictionary of word from train data") with tf.gfile.GFile(self.FLAGS.mapfile_path, "wb") as f: pickle.dump([char_to_id, id_to_char], f) else: with tf.gfile.GFile(self.FLAGS.mapfile_path, "rb") as f: char_to_id, id_to_char = pickle.load(f) logger.info("Load dictionary from existed map file") if not tf.gfile.Exists(self.FLAGS.vocabfile_path): with tf.gfile.GFile(self.FLAGS.vocabfile_path, "w") as file: for word in char_to_id: file.write(word + "\n") logger.info("Created vocabulary file") # load config and print it if tf.gfile.Exists(self.FLAGS.configfile_path): config = load_config(self.FLAGS.configfile_path) else: config = self.config(char_to_id) save_config(config, self.FLAGS.configfile_path) print_config(config, logger) # prepare data # get char_based, char_index_based, segs_based, tag_index_based sentences # use generator to avoid memory oversize train_manager = BatchManager(train_sentences, config['batch_size'], config['lower'], char_to_id, True) logger.info("Train manager is initialized") dev_manager = BatchManager(dev_sentences, 100, config['lower'], char_to_id, False) logger.info("Dev manager is initialized") logger.info("{} / {} sentences in train /dev.".format( len(train_sentences), len(dev_sentences))) # limit GPU memory tf_config = tf.ConfigProto() tf_config.gpu_options.allow_growth = True # tf_config.log_device_placement = True steps_per_epoch = train_manager.len_data # how many batches in an epoch with tf.Session(config=tf_config) as sess: model = create_model(sess, Model, self.FLAGS.ckpt_path, logger) logger.info("start training") loss = [] lr = config["lr"] sample_prob_initial = config["sample_prob"] for i in range(self.FLAGS.max_epoch): tf.assign(model.global_epoch, i).eval() for iter_turn, batch in enumerate(train_manager.iter_batch()): sample_prob = max( 0.3, sample_prob_initial - (i * 500 + iter_turn) * 0.1 / 100.0) step, batch_loss = model.run_step(True, batch, lr, sample_prob) loss.append(batch_loss) if step % self.FLAGS.steps_check == 0: iteration = step // steps_per_epoch + 1 logger.info( "iteration:{} step:{}/{}, NER loss:{:>9.6f}, Training Sample prob is now {:>4.2f}" .format(iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss), sample_prob)) loss = [] if step % self.FLAGS.steps_eval == 0: self.evaluate(model, "dev", dev_manager, id_to_char, logger) dev_manager.reset(dev_sentences()) logger.info( "Epoch {} is finished, reset dev_manager".format( i)) if (i + 1) % 2 == 0: save_model(sess, model, self.FLAGS.ckpt_path + u"/" + str(i), logger) # reset BatchManager train_manager.reset(train_sentences()) logger.info( "Epoch {} is finished, reset train_manager".format(i)) lr = max(0.001, lr / 1.5) logger.info( "Epoch {} is finished, rescale learing rate to {}".format( i, lr))
config["lr"] = FLAGS.lr config["tag_schema"] = FLAGS.tag_schema config["pre_emb"] = FLAGS.pre_emb config["zeros"] = FLAGS.zeros config["lower"] = FLAGS.lower return config with open(FLAGS.map_file, "rb") as f: if pyversion == 'three': char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) else: char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f, protocol=2) # make path for store log and model if not exist make_path(FLAGS) if os.path.isfile(FLAGS.config_file): config = load_config(FLAGS.config_file) else: config = config_model(char_to_id, tag_to_id) save_config(config, FLAGS.config_file) make_path(FLAGS) app = Flask(__name__) log_path = os.path.join("log", FLAGS.log_file) logger = get_logger(log_path) tf_config = tf.ConfigProto() sess = tf.Session(config=tf_config) sess.run(tf.global_variables_initializer()) model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)  # training set: 101218 sentences
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)  # dev set: 7827 sentences
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)  # test set: 16804 sentences

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)  # convert IOB tags to IOBES
    update_tag_scheme(test_sentences, FLAGS.tag_schema)  # convert IOB tags to IOBES
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)  # convert IOB tags to IOBES

    # create maps if not exist
    if not os.path.isfile(FLAGS.map_file):  # check whether maps.pkl exists
        # create dictionary for word
        if FLAGS.pre_emb:  # whether to use pretrained character embeddings; the test set contains characters not in the training set
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]  # character-frequency statistics, kept in dico_chars
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(),
                FLAGS.emb_file,
                list(
                    itertools.chain.from_iterable(  # flatten into a single list
                        [[w[0] for w in s] for s in test_sentences])  # w[0] is a character
                ))  # build a dictionary over the characters (and one over the words)
        else:  # an id for every character and every tag
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags (an id for every tag)
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)  # count frequencies, sort, write to file
        # with open('maps.txt', 'w', encoding='utf8') as f1:
        #     f1.writelines(str(char_to_id) + " " + id_to_char + " " + str(tag_to_id) + " " + id_to_tag + '\n')
        with open(FLAGS.map_file, "wb") as f:  # persist the mappings
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of lists containing indices
    train_data = prepare_dataset(  # turn characters/words into numeric features
        train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test."
          % (len(train_data), len(dev_data), len(test_data)))

    train_manager = BatchManager(train_data, FLAGS.batch_size)  # the training set is iterated in batches of 60 sentences
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make path for storing log and model if it does not exist
    make_path(FLAGS)  # create the log, result and ckpt directories
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)  # ids for the characters and for the tags
        save_config(config, FLAGS.config_file)  # a new config_file is generated whenever the data changes
    make_path(FLAGS)  # create the log, result and ckpt directories used by the model

    log_path = os.path.join("log", FLAGS.log_file)  # path of the log file
    logger = get_logger(log_path)  # define the log format
    print_config(config, logger)  # write the config to the log

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True  # let the GPU allocate memory on demand
    # tf_config.gpu_options.per_process_gpu_memory_fraction: caps the share of GPU memory used
    steps_per_epoch = train_manager.len_data  # total number of batches per epoch
    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger)  # model initialization finished
        logger.info("start training")
        loss = []
        # with tf.device("/gpu:0"): commented out when no GPU is available; the CNN requires sentences of equal length
        for i in range(100):  # number of training epochs
            for batch in train_manager.iter_batch(shuffle=True):  # draw batches in random order
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                        iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss)))
                    loss = []

            # best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)  # save only if better than the previous model
            if i % 7 == 0:
                save_model(sess, model, FLAGS.ckpt_path, logger)
def save(save_path):
    ps = sess.run(params)
    make_path(save_path)
    joblib.dump(ps, save_path)
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    # update_tag_scheme(train_sentences, FLAGS.tag_schema)
    # update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if they do not exist
    if not os.path.isfile(FLAGS.map_file):
        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of lists containing indices
    train_data = prepare_dataset(train_sentences, FLAGS.max_seq_len, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, FLAGS.max_seq_len, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, FLAGS.max_seq_len, tag_to_id, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test."
          % (len(train_data), len(dev_data), len(test_data)))

    train_len = len(train_data)
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, FLAGS.batch_size)
    test_manager = BatchManager(test_data, FLAGS.batch_size)

    # make paths for the log and model directories if they do not exist
    make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(tag_to_id)
        save_config(config, FLAGS.config_file)
    make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data

    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, config, logger)
        logger.info("start training")
        loss = []
        from tqdm import tqdm  # progress bar over the training batches
        for i in range(FLAGS.max_epoch):
            for batch in tqdm(train_manager.iter_batch(shuffle=True)):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            print("save result epoch:", i, "***************************************************")
            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger, i)
            if i >= 8:
                save_model(sess, model, FLAGS.ckpt_path, logger, global_steps=step)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger, i)
def train():
    # load data sets
    # sentences have the form [['在', 'O'], ['厦', 'B-LOC'], ['门', 'I-LOC'], ...]
    # train_sentences = loader.load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    # dev_sentences = loader.load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    # test_sentences = loader.load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)
    train_sentences = loader.load_folder_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    dev_sentences = loader.load_folder_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = loader.load_folder_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES);
    # update_tag_scheme does not change the sentences much
    loader.update_tag_scheme(train_sentences, FLAGS.tag_schema)
    loader.update_tag_scheme(test_sentences, FLAGS.tag_schema)

    os.environ["CUDA_VISIBLE_DEVICES"] = "0"

    # create maps if they do not exist: if maps.pkl is missing, read the
    # training data to build char_to_id and tag_to_id
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for characters
        if FLAGS.pre_emb:
            dico_chars_train = loader.char_mapping(train_sentences, FLAGS.lower)[0]
            dico_chars, char_to_id, id_to_char = loader.augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = loader.char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = loader.tag_mapping(train_sentences)
        print('tag_to_id: ', tag_to_id)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)
        print('tag_to_id: ', tag_to_id)

    # prepare data, get a collection of lists containing indices
    train_data = loader.prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = loader.prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = loader.prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    print("%i / %i / %i sentences in train / dev / test."
          % (len(train_data), len(dev_data), len(test_data)))

    train_manager = data_utils.BatchManager(train_data, FLAGS.batch_size)
    dev_manager = data_utils.BatchManager(dev_data, 100)
    test_manager = data_utils.BatchManager(test_data, 100)

    # make paths for the log and model directories if they do not exist
    utils.make_path(FLAGS)
    if os.path.isfile(FLAGS.config_file):
        config = utils.load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        utils.save_config(config, FLAGS.config_file)
    utils.make_path(FLAGS)

    log_path = os.path.join("log", FLAGS.log_file)  # ./log/train.log
    logger = utils.get_logger(log_path)
    utils.print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data

    with tf.Session(config=tf_config) as sess:
        model = utils.create_model(sess, Model, FLAGS.ckpt_path,
                                   data_utils.load_word2vec, config,
                                   id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(FLAGS.iterations):  # for i in range(10):
            logger.info('epoch: {}'.format(i))
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                utils.save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)
def save(path):
    ps = sess.run(params)
    joblib.dump(ps, make_path(path))
def train(): """ train函数:传入数据、处理数据、模型训练、输出测试集f1值 :return: """ # load data sets传入数据集,做基本处理包括转小写、换0、去除空格提取word等,将训练集word和tag放在list中。 .dev_file用作cross validation train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros) # FLAGS.zeros = False # train_sentences格式 ['厦', 'B-LOC'], ['门', 'I-LOC'], ['与', 'O'], ['金', 'B-LOC'], ['门', 'I-LOC'] dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros) test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros) # Use selected tagging scheme (IOB / IOBES) 将IOB格式标签转换文IOBES。I:中间,O:其他,B:开始 | E:结束,S:单个 # 调用loder.py中的update_tag_scheme函数进行tag转换,在此函数内又调用data_utils.py中的iob_iobes函数转换tag update_tag_scheme(train_sentences, FLAGS.tag_schema) update_tag_scheme(test_sentences, FLAGS.tag_schema) update_tag_scheme(dev_sentences, FLAGS.tag_schema) # create maps if not exist 创建词映射字典 if not os.path.isfile(FLAGS.map_file): # create dictionary for word if FLAGS.pre_emb: # 数据增强 添加预训练词向量到训练字典中 dico_chars_train = char_mapping( train_sentences, FLAGS.lower )[0] # 调用loader.py中的char_mapping函数,只输出一个被转换为小写的数据集字典,frequency降序排列 dico_chars, char_to_id, id_to_char = augment_with_pretrained( # 调用loader.py中的augment_with_pretrained函数 # 添加原字典中没有的pretrain字符到原字典中,pretrain必须在test集中有出现过 dico_chars_train.copy(), FLAGS.emb_file, list( itertools.chain.from_iterable([[w[0] for w in s] for s in test_sentences ]) # 使用test集作为预训练词向量的基准 )) else: _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower) # _c是无序字典,即列出了每个key出现的次数。char_to_id是有序字典,但是value不是frequency,是序号,但key排列顺序是按frequence降序 # Create a dictionary and a mapping for tags _t, tag_to_id, id_to_tag = tag_mapping( train_sentences) # 调用loader.py中的tag_mapping函数创建tag字典,_t是不重复tag的字典 # tag_to_id: {'O': 0, 'S-MISC': 1, 'B-ORG': 2, 'B-PER': 3, 'E-ORG': 4, 'E-PER': 5, 'S-LOC': 6, 'S-ORG': 7, 'I-PER': 8, 'S-PER': 9} with open(FLAGS.map_file, "wb") as f: pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f) # 将上述字典保存到map file中 else: with open(FLAGS.map_file, "rb") as f: char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f) # prepare data, get a collection of list containing index train_data = prepare_dataset( # 调用loader.py中的prepare_dataset函数生成 训练集word字符——word的frequency——分词后的word特征——标签的frequency train_sentences, char_to_id, tag_to_id, FLAGS.lower) dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower) test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower) print("%i / %i / %i sentences in train / dev / test." 
% (len(train_data), len(dev_data), len(test_data))) # 生成bach_size大小 可以调用batch_data和len_data两个内置变量 # BatchManager用来统一输入的训练数据的array的长度 train_manager = BatchManager( train_data, FLAGS.batch_size) # data_utils.py传入BatchManager类 dev_manager = BatchManager(dev_data, 100) test_manager = BatchManager(test_data, 100) # make path for store log and model if not exist make_path(FLAGS) if os.path.isfile(FLAGS.config_file): config = load_config(FLAGS.config_file) else: config = config_model(char_to_id, tag_to_id) # output配置文件config_file save_config(config, FLAGS.config_file) make_path(FLAGS) log_path = os.path.join("log", FLAGS.log_file) logger = get_logger(log_path) print_config(config, logger) # 打印生成的log并储存在文件夹内 # 迭代原理 训练 loss值如何产生 # limit GPU memory tf_config = tf.compat.v1.ConfigProto() tf_config.gpu_options.allow_growth = True # 英文:steps_per_epoch = 703 即一共需要处理的训练数据批次量, steps_per_epoch * 20 = 总共的句子个数 # 中文:steps_per_epoch = 1044 steps_per_epoch = train_manager.len_data # 开始训练模型 with tf.compat.v1.Session( config=tf_config ) as sess: # 使用tf.Session激活配置参数,使用utils.py中create_model函数下session.run执行 # 创建模型框架,包括init函数中定义的模型各个层参数和相应函数调用,先生成num_chars * 100的word embedding权重矩阵 model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config, id_to_char, logger) # 调用utils.py中的Model类创建模型,传入训练字典。调用data_utils中的load_word2vec函数 logger.info("start training") loss = [] # 这层循环的意义是共训练模型100次,不断传入train和验证集来调整模型的参数,得到最优F1值。括号内的range(100)可调参 for i in range(100): # 开始训练模型 传入数据集 # 先在模型中根据batch创建输入字典feed_dict,每20个一组,包括每句话的word id,每句话的word feature,每句话tag id # 依次执行模型每一层,从embedding layer开始 # 生成词向量,按批次传入参数包括每句话的char id;每句话feature和是否存在句子维度的预先定义值,生成120维包含所有训练数据的词向量 # 用dropout随机去除部分词向量防止过拟合,将词向量喂给CNN模型进行卷积训练。 for batch in train_manager.iter_batch( shuffle=True): # iter_batch:data_utils.py中的iter_batch函数 # batch是产生随机顺序的句子,输出上述array # batch组成:4个大list,每个list包含: # 1. 随机输出的所有句子,['Fairview', ',', 'Texas', ',', '$', '1.82', 'million', 'deal', 'Baa1', '-'], # 2. word出现在字典中的位置。 # 3. 每句话对应的表征word长度特征的list。 # 4. 每句话对应的tag在tag字典中出现的位置 step, batch_loss = model.run_step(sess, True, batch) # loss:60.648315 76.53908 54.006336 108.96472 # step从1开始增加,每100次输出一次当前loss值 loss.append(batch_loss) # 5个batch输出一次loss值,step=100,总batch if step % FLAGS.steps_check == 0: # 每迭代100次输出一次loss, iteration = step // steps_per_epoch + 1 logger.info("iteration:{} step:{}/{}, " "NER loss:{:>9.6f}".format( iteration, step % steps_per_epoch, steps_per_epoch, np.mean(loss))) loss = [] best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger) if best: save_model(sess, model, FLAGS.ckpt_path, logger) evaluate(sess, model, "test", test_manager, id_to_tag, logger)
def train():
    # load data sets
    train_sentences = load_sentences(FLAGS.train_file, FLAGS.lower, FLAGS.zeros)
    # "sentences[0]:[['我', 'O'], ['要', 'O'], ['看', 'O'], ['乌', 'B-SLOC'], ['鲁', 'I-SLOC'], ['木', 'I-SLOC'], ['齐', 'I-SLOC'], ['市', 'I-SLOC'], ['第', 'I-SLOC'], ['四', 'I-SLOC'], ['十', 'I-SLOC'], ['九', 'I-SLOC'], ['中', 'I-SLOC'], ['学', 'I-SLOC'], ['东', 'I-SLOC'], ['门', 'I-SLOC'], ['去', 'O'], ['乌', 'B-ELOC'], ['鲁', 'I-ELOC'], ['木', 'I-ELOC'], ['齐', 'I-ELOC'], ['推', 'I-ELOC'], ['拿', 'I-ELOC'], ['职', 'I-ELOC'], ['业', 'I-ELOC'], ['学', 'I-ELOC'], ['校', 'I-ELOC'], ['南', 'I-ELOC'], ['门', 'I-ELOC'], ['沿', 'O'], ['西', 'B-ROAD'], ['虹', 'I-ROAD'], ['东', 'I-ROAD'], ['路', 'I-ROAD'], ['的', 'O'], ['监', 'B-TYPE'], ['控', 'I-TYPE']]"
    dev_sentences = load_sentences(FLAGS.dev_file, FLAGS.lower, FLAGS.zeros)
    test_sentences = load_sentences(FLAGS.test_file, FLAGS.lower, FLAGS.zeros)

    # Use selected tagging scheme (IOB / IOBES)
    update_tag_scheme(train_sentences, FLAGS.tag_schema)
    # print("train_sentences[0]:{}".format(train_sentences[0]))
    # "train_sentences[0]:[['我', 'O'], ['要', 'O'], ['看', 'O'], ['乌', 'B-SLOC'], ['鲁', 'I-SLOC'], ['木', 'I-SLOC'], ['齐', 'I-SLOC'], ['市', 'I-SLOC'], ['第', 'I-SLOC'], ['四', 'I-SLOC'], ['十', 'I-SLOC'], ['九', 'I-SLOC'], ['中', 'I-SLOC'], ['学', 'I-SLOC'], ['东', 'I-SLOC'], ['门', 'E-SLOC'], ['去', 'O'], ['乌', 'B-ELOC'], ['鲁', 'I-ELOC'], ['木', 'I-ELOC'], ['齐', 'I-ELOC'], ['推', 'I-ELOC'], ['拿', 'I-ELOC'], ['职', 'I-ELOC'], ['业', 'I-ELOC'], ['学', 'I-ELOC'], ['校', 'I-ELOC'], ['南', 'I-ELOC'], ['门', 'E-ELOC'], ['沿', 'O'], ['西', 'B-ROAD'], ['虹', 'I-ROAD'], ['东', 'I-ROAD'], ['路', 'E-ROAD'], ['的', 'O'], ['监', 'B-TYPE'], ['控', 'E-TYPE']]"
    update_tag_scheme(dev_sentences, FLAGS.tag_schema)
    update_tag_scheme(test_sentences, FLAGS.tag_schema)

    # create maps if they do not exist
    # print("map_file:{}".format(FLAGS.map_file))   # map_file: maps.pkl
    # print("pre_emb:{}".format(FLAGS.pre_emb))     # pre_emb: False
    if not os.path.isfile(FLAGS.map_file):
        # create dictionary for characters
        if FLAGS.pre_emb:
            dico_chars_train = char_mapping(train_sentences, FLAGS.lower)[0]  # character -> count dict
            dico_chars, char_to_id, id_to_char = augment_with_pretrained(
                dico_chars_train.copy(), FLAGS.emb_file,
                list(itertools.chain.from_iterable(
                    [[w[0] for w in s] for s in test_sentences])))
        else:
            _c, char_to_id, id_to_char = char_mapping(train_sentences, FLAGS.lower)

        # Create a dictionary and a mapping for tags
        _t, tag_to_id, id_to_tag = tag_mapping(train_sentences)
        with open(FLAGS.map_file, "wb") as f:
            pickle.dump([char_to_id, id_to_char, tag_to_id, id_to_tag], f)
    else:
        with open(FLAGS.map_file, "rb") as f:
            char_to_id, id_to_char, tag_to_id, id_to_tag = pickle.load(f)

    # prepare data, get a collection of lists containing indices
    train_data = prepare_dataset(train_sentences, char_to_id, tag_to_id, FLAGS.lower)
    dev_data = prepare_dataset(dev_sentences, char_to_id, tag_to_id, FLAGS.lower)
    test_data = prepare_dataset(test_sentences, char_to_id, tag_to_id, FLAGS.lower)
    # Note: the dev count is hard-coded to 0 in this print statement.
    print("%i / %i / %i sentences in train / dev / test."
          % (len(train_data), 0, len(test_data)))
    # '3027 / 0 / 361 sentences in train / dev / test.'

    # print("batch_size:{}".format(FLAGS.batch_size))  # batch_size: 20
    train_manager = BatchManager(train_data, FLAGS.batch_size)
    dev_manager = BatchManager(dev_data, 100)
    test_manager = BatchManager(test_data, 100)

    # make paths for the log and model directories if they do not exist
    make_path(FLAGS)
    # print("config_file:{}".format(FLAGS.config_file))  # config_file: config_file
    if os.path.isfile(FLAGS.config_file):
        config = load_config(FLAGS.config_file)
    else:
        config = config_model(char_to_id, tag_to_id)
        save_config(config, FLAGS.config_file)

    log_path = os.path.join("log", FLAGS.log_file)  # log_path: log/train.log
    logger = get_logger(log_path)
    print_config(config, logger)

    # limit GPU memory
    tf_config = tf.ConfigProto()
    tf_config.gpu_options.allow_growth = True
    steps_per_epoch = train_manager.len_data
    # print("steps_per_epoch:{}".format(steps_per_epoch))  # steps_per_epoch: 152

    with tf.Session(config=tf_config) as sess:
        model = create_model(sess, Model, FLAGS.ckpt_path, load_word2vec, config,
                             id_to_char, logger)
        logger.info("start training")
        loss = []
        for i in range(100):
            for batch in train_manager.iter_batch(shuffle=True):
                step, batch_loss = model.run_step(sess, True, batch)
                loss.append(batch_loss)
                # print("steps_check:{}".format(FLAGS.steps_check))  # steps_check: 100
                if step % FLAGS.steps_check == 0:
                    iteration = step // steps_per_epoch + 1
                    logger.info("iteration:{} step:{}/{}, "
                                "NER loss:{:>9.6f}".format(
                                    iteration, step % steps_per_epoch,
                                    steps_per_epoch, np.mean(loss)))
                    loss = []

            best = evaluate(sess, model, "dev", dev_manager, id_to_tag, logger)
            if best:
                save_model(sess, model, FLAGS.ckpt_path, logger)
            evaluate(sess, model, "test", test_manager, id_to_tag, logger)

        export(model, sess, "ner", "export_model")
def log_test_pr_curve(self, epoch, entity_ids_test, labels_test, probs_test,
                      negative_label_idx, label_encoder=None):
    bag_ids = [e1 + '_' + e2 for e1, e2 in entity_ids_test]
    bag_to_mention_mapping = defaultdict(set)
    for idx, bag_id in enumerate(bag_ids):
        bag_to_mention_mapping[bag_id].add(idx)

    num_relation_facts = 0
    Prediction = namedtuple('Prediction', [
        'score', 'is_correct', 'bag_id', 'predicted_label_idx',
        'bag_label_idxs', 'predicted_label', 'bag_labels', 'bag_size'
    ])
    predictions = []
    for bag_id, mention_idxs in bag_to_mention_mapping.items():
        # Aggregate and count the labels per bag without the negative label
        bag_labels = set(labels_test[list(mention_idxs)])
        bag_labels.discard(negative_label_idx)
        num_relation_facts += len(bag_labels)
        bag_size = len(mention_idxs)

        # Use max to aggregate the mention probabilities in the bag
        mention_probs = probs_test[list(mention_idxs)]
        bag_probs = np.max(mention_probs, axis=0)

        # For each bag and positive relation create a prediction
        for relation_idx, relation_prob in enumerate(bag_probs):
            if relation_idx == negative_label_idx:
                continue

            if len(bag_labels) == 0:
                bag_labels_str = 'NA'
                bag_label_idxs_str = negative_label_idx
            else:
                if label_encoder:
                    decoded_bag_labels = [
                        label_encoder.get_item_for_index(idx)
                        for idx in bag_labels
                    ]
                    bag_labels_str = ', '.join(decoded_bag_labels)
                else:
                    bag_labels_str = ''
                bag_label_idxs_str = ', '.join(
                    [str(lbl) for lbl in bag_labels])

            if label_encoder:
                predicted_label_str = label_encoder.get_item_for_index(relation_idx)
            else:
                predicted_label_str = ""
            predicted_label_idx_str = str(relation_idx)

            is_correct = relation_idx in bag_labels
            predictions.append(
                Prediction(score=relation_prob,
                           is_correct=is_correct,
                           bag_id=bag_id,
                           predicted_label_idx=predicted_label_idx_str,
                           bag_label_idxs=bag_label_idxs_str,
                           predicted_label=predicted_label_str,
                           bag_labels=bag_labels_str,
                           bag_size=bag_size))

    predictions = sorted(predictions, key=attrgetter('score'), reverse=True)

    correct = 0
    precision_values = []
    recall_values = []
    for idx, prediction in enumerate(predictions):
        if prediction.is_correct:
            correct += 1
        precision_values.append(correct / (idx + 1))
        recall_values.append(correct / num_relation_facts)

    def precision_at(n):
        return (sum([prediction.is_correct
                     for prediction in predictions[:n]]) / n) * 100

    pr_metrics = {
        'P/R AUC': auc(x=recall_values, y=precision_values),
        'Precision@100': precision_at(100),
        'Precision@200': precision_at(200),
        'Precision@500': precision_at(500)
    }

    predictions_dir = join(self._base_path, 'predictions', 'test')

    pr_metrics_file_path = join(predictions_dir,
                                'pr_metrics_epoch_{}.jsonl'.format(epoch))
    with open(make_path(pr_metrics_file_path), 'w', encoding='utf-8') as pr_metrics_file:
        pr_metrics_file.write(json.dumps(pr_metrics) + '\n')

    pr_predictions_file = join(
        predictions_dir, 'predictions_pr_curve_epoch_{}.tsv'.format(epoch))
    with open(make_path(pr_predictions_file), 'w') as pr_pred_file:
        tuple_attrs = [
            'score', 'is_correct', 'bag_id', 'predicted_label_idx',
            'bag_label_idxs', 'predicted_label', 'bag_labels', 'bag_size'
        ]
        pr_pred_file.write("\t".join(tuple_attrs) + "\n")
        for prediction in predictions:
            pred_values = attrgetter(*tuple_attrs)(prediction)
            pred_values = [str(val) for val in pred_values]
            pr_pred_file.write("\t".join(pred_values) + "\n")

    np.save(join(predictions_dir, 'pr_curve_y_epoch_{}.npy'.format(epoch)),
            precision_values)
    np.save(join(predictions_dir, 'pr_curve_x_epoch_{}.npy'.format(epoch)),
            recall_values)
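The precision/recall bookkeeping above walks the score-sorted predictions and accumulates cumulative precision and recall. The toy example below reproduces that bookkeeping on made-up data, assuming the unqualified `auc` above is sklearn.metrics.auc; the variable names and values are illustrative only.

import numpy as np
from sklearn.metrics import auc

scores = np.array([0.9, 0.8, 0.6, 0.4, 0.2])          # already sorted, descending
is_correct = np.array([1, 0, 1, 1, 0], dtype=float)   # whether each prediction hits a true fact
num_facts = 4                                          # total relation facts in this toy "test set"

correct = np.cumsum(is_correct)
ranks = np.arange(1, len(scores) + 1)
precision_values = correct / ranks      # precision after each prediction
recall_values = correct / num_facts     # recall after each prediction

print("P/R AUC:", auc(x=recall_values, y=precision_values))
print("Precision@3:", precision_values[2] * 100)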
# declare the loss function and the optimizer
criterion = nn.CrossEntropyLoss(reduce=False)  # TODO check loss functions
model_opt = OpenAIAdam(model.parameters(),
                       lr=lr,
                       schedule=lr_schedule,
                       warmup=lr_warmup,
                       t_total=n_updates_total,
                       b1=b1,
                       b2=b2,
                       e=e,
                       l2=l2,
                       vector_l2=vector_l2,
                       max_grad_norm=max_grad_norm)
compute_loss_fct = LossCompute(criterion, lm_coef, model_opt)

# this part will be changed for multi-GPU support
model.to(device)
lm_head.to(device)

n_updates = 0
n_epochs = 0
make_path(os.path.join(save_dir, desc, 'temp.txt'))

# repeat for n_iter epochs
while n_epochs < n_iter:
    iters = 0
    # split into train and validation sets
    _trX, _trV = get_train_valid(tr_data)
    start_ind = 0
    end_ind = start_ind + n_batch_size
    while True:
        cur_batch = _trX[start_ind:end_ind]
        print("epoch ", n_epochs, "iter ", iters)
        trX, trM = transform_code(cur_batch)
        # forward pass and backprop
        run_epoch(trX, trM)
        iters += 1
        start_ind = end_ind
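The snippet builds an unreduced cross-entropy (`reduce=False`) and hands it to a LossCompute wrapper together with a mask `trM`. The sketch below only shows the usual way a per-token loss is masked and averaged; the tensor shapes and names are illustrative assumptions, not the original LossCompute implementation.

import torch
import torch.nn as nn

# reduction='none' is the current spelling of reduce=False: keep per-token losses.
criterion = nn.CrossEntropyLoss(reduction='none')

logits = torch.randn(4, 10, 50, requires_grad=True)   # (batch, seq_len, vocab)
targets = torch.randint(0, 50, (4, 10))               # token ids
mask = torch.ones(4, 10)                              # 1 for real tokens, 0 for padding

# Flatten, compute the per-token loss, zero out padded positions,
# and average over the real tokens only.
per_token = criterion(logits.view(-1, 50), targets.view(-1)).view(4, 10)
loss = (per_token * mask).sum() / mask.sum()
loss.backward()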
def __init__(self, path, **kwargs):
    if 'time' not in kwargs:
        kwargs['time'] = time.time()
    self.f_log = open(make_path(path), 'w')
    self.f_log.write(json.dumps(kwargs) + '\n')
def save(path):
    save_path = saver.save(sess, make_path(path))
    print('saved the best model to %s' % save_path)
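Across these snippets, `make_path` is used in two ways: with a FLAGS-like object to create the standard output directories, and with a file path whose parent directories are created before the path is returned (so the result can be passed straight to open(), joblib.dump() or saver.save()). The sketch below is only a plausible reconstruction inferred from those call sites; the function and attribute names are assumptions, not the actual project utilities.

import os

def make_path_for_file(path):
    # Create the parent directory of `path` if needed, then return `path`
    # so the call can be nested inside open()/dump()/save().
    directory = os.path.dirname(path)
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)
    return path

def make_flag_paths(flags, names=("result_path", "ckpt_path", "log_path")):
    # Create the standard output directories referenced by a FLAGS object;
    # the attribute names here are illustrative assumptions.
    for name in names:
        path = getattr(flags, name, None)
        if path and not os.path.isdir(path):
            os.makedirs(path)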