# Explicit imports for modules used below (sys, time, codecs, nltk, vocabulary were not imported in the original snippet).
import sys
import time
import codecs
import nltk
import vocabulary
from lda import lda_gibbs_sampling
from datetime import datetime
# sklearn.cross_validation was removed in scikit-learn 0.20; model_selection is the current location of these helpers.
from sklearn.model_selection import train_test_split, StratifiedKFold
from nltk.stem import WordNetLemmatizer
from sklearn.datasets import dump_svmlight_file, load_svmlight_file
from sklearn.utils import shuffle
from functions import *

path2training = sys.argv[1]
training = codecs.open(path2training, 'r', encoding='utf8').read().splitlines()

topics = int(sys.argv[2])
alpha, beta = 0.5 / float(topics), 0.5 / float(topics)

voca_en = vocabulary.Vocabulary(set(nltk.corpus.stopwords.words('english')),
                                WordNetLemmatizer(), excluds_stopwords=True)

ldaTrainingData = change_raw_2_lda_input(training, voca_en, True)
ldaTrainingData = voca_en.cut_low_freq(ldaTrainingData, 1)

classificationData, y = load_classification_data(sys.argv[3], sys.argv[4])
classificationData = change_raw_2_lda_input(classificationData, voca_en, False)
classificationData = voca_en.cut_low_freq(classificationData, 1)

iterations = 201
start = time.time()
final_acc, final_mif, final_perpl, final_ar, final_nmi, final_p, final_r, final_f = [], [], [], [], [], [], [], []
for j in range(5):
    perpl, cnt, acc, mif, ar, nmi, p, r, f = [], 0, [], [], [], [], [], [], []
    lda = lda_gibbs_sampling(K=topics,
def run_train(args): if args.numpy_seed is not None: print("Setting numpy random seed to {}...".format(args.numpy_seed)) np.random.seed(args.numpy_seed) torch.manual_seed(args.numpy_seed) print("Loading training trees from {}...".format(args.train_path)) train_treebank = trees.load_trees(args.train_path) print("Loaded {:,} training examples.".format(len(train_treebank))) print("Loading development trees from {}...".format(args.dev_path)) dev_treebank = trees.load_trees(args.dev_path) print("Loaded {:,} development examples.".format(len(dev_treebank))) print("Processing trees for training...") train_parse = [tree.convert() for tree in train_treebank] print("Constructing vocabularies...") tag_vocab = vocabulary.Vocabulary() tag_vocab.index(parse.START) tag_vocab.index(parse.STOP) word_vocab = vocabulary.Vocabulary() word_vocab.index(parse.START) word_vocab.index(parse.STOP) word_vocab.index(parse.UNK) label_vocab = vocabulary.Vocabulary() label_vocab.index(()) for tree in train_parse: nodes = [tree] while nodes: node = nodes.pop() if isinstance(node, trees.InternalParseNode): label_vocab.index(node.label) nodes.extend(reversed(node.children)) else: tag_vocab.index(node.tag) word_vocab.index(node.word) tag_vocab.freeze() word_vocab.freeze() label_vocab.freeze() def print_vocabulary(name, vocab): special = {parse.START, parse.STOP, parse.UNK} print("{} ({:,}): {}".format( name, vocab.size, sorted(value for value in vocab.values if value in special) + sorted(value for value in vocab.values if value not in special))) if args.print_vocabs: print_vocabulary("Tag", tag_vocab) print_vocabulary("Word", word_vocab) print_vocabulary("Label", label_vocab) print("Initializing model...") # model = dy.ParameterCollection() if args.parser_type == "top-down": parser = parse.TopDownParser( # model, tag_vocab, word_vocab, label_vocab, args.tag_embedding_dim, args.word_embedding_dim, args.lstm_layers, args.lstm_dim, args.label_hidden_dim, args.split_hidden_dim, args.dropout, ) # else: # parser = parse.ChartParser( # model, # tag_vocab, # word_vocab, # label_vocab, # args.tag_embedding_dim, # args.word_embedding_dim, # args.lstm_layers, # args.lstm_dim, # args.label_hidden_dim, # args.dropout, # ) # trainer = dy.AdamTrainer(model) optimizer = torch.optim.Adam(parser.parameters()) total_processed = 0 current_processed = 0 check_every = len(train_parse) / args.checks_per_epoch best_dev_fscore = -np.inf best_dev_model_path = None start_time = time.time() def check_dev(): nonlocal best_dev_fscore nonlocal best_dev_model_path dev_start_time = time.time() dev_predicted = [] for tree in dev_treebank: # dy.renew_cg() parser.eval() sentence = [(leaf.tag, leaf.word) for leaf in tree.leaves()] predicted, _ = parser.parse(sentence) dev_predicted.append(predicted.convert()) dev_fscore = evaluate.evalb(args.evalb_dir, dev_treebank, dev_predicted) print("dev-fscore {} " "dev-elapsed {} " "total-elapsed {}".format( dev_fscore, format_elapsed(dev_start_time), format_elapsed(start_time), )) if dev_fscore.fscore > best_dev_fscore: if best_dev_model_path is not None: # for ext in [".data", ".meta"]: # path = best_dev_model_path + ext # if os.path.exists(path): # print("Removing previous model file {}...".format(path)) # os.remove(path) path = best_dev_model_path if os.path.exists(path): print("Removing previous model file {}...".format(path)) os.remove(path) best_dev_fscore = dev_fscore.fscore # best_dev_model_path = "{}_dev={:.2f}".format( best_dev_model_path = "{}_dev={:.2f}.pth".format( args.model_path_base, dev_fscore.fscore) 
print("Saving new best model to {}...".format(best_dev_model_path)) # dy.save(best_dev_model_path, [parser]) torch.save(parser, best_dev_model_path) for epoch in itertools.count(start=1): if args.epochs is not None and epoch > args.epochs: break np.random.shuffle(train_parse) epoch_start_time = time.time() for start_index in range(0, len(train_parse), args.batch_size): # dy.renew_cg() optimizer.zero_grad() parser.train() batch_losses = [] for tree in train_parse[start_index:start_index + args.batch_size]: sentence = [(leaf.tag, leaf.word) for leaf in tree.leaves()] if args.parser_type == "top-down": _, loss = parser.parse(sentence, tree, args.explore) # else: # _, loss = parser.parse(sentence, tree) batch_losses.append(loss) total_processed += 1 current_processed += 1 # batch_loss = dy.average(batch_losses) # batch_loss_value = batch_loss.scalar_value() batch_loss = torch.stack(batch_losses).mean() assert batch_loss.data.numel() == 1 batch_loss_value = batch_loss.data[0] batch_loss.backward() # trainer.update() optimizer.step() print("epoch {:,} " "batch {:,}/{:,} " "processed {:,} " "batch-loss {:.4f} " "epoch-elapsed {} " "total-elapsed {}".format( epoch, start_index // args.batch_size + 1, int(np.ceil(len(train_parse) / args.batch_size)), total_processed, batch_loss_value, format_elapsed(epoch_start_time), format_elapsed(start_time), )) if current_processed >= check_every: current_processed -= check_every check_dev()
import vocabulary
import inference_wrapper
import configuration
import h5py
import tensorflow as tf
import caption_generator
import math
import json
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"

conf = configuration.MyConfig()
vocab = vocabulary.Vocabulary("data/dic.txt")

file = h5py.File("data/feat.hdf5", 'r')
encoded_images = file['valid_set']
valid_list_file = "data/valid_list.txt"

train_step = conf.train_step
checkpoint_steps = conf.original_train_steps + (train_step - 1) * conf.interval_train_steps
check_point_path = "train_log/{}.ckpt".format(checkpoint_steps)

model = inference_wrapper.InferenceWrapper()
restore_fn = model.build_graph_from_config(configuration.ModelConfig(), check_point_path)

sess = tf.InteractiveSession()
restore_fn(sess)
def _read_training_classification_data(file_in): """Read the data for classification""" with open(file_in, 'r') as f: # (1) Class Number class_num = int(f.readline()[:-1]) # (2) Vocabularies # NP1 np1_num = int(f.readline()[:-1]) np1_voc = vocabulary.Vocabulary() for i in range(0, np1_num): np1_voc.add(f.readline()[:-1]) f.readline() # Blank line # VP vp_num = int(f.readline()[:-1]) vp_voc = vocabulary.Vocabulary() for i in range(0, vp_num): vp_voc.add(f.readline()[:-1]) f.readline() # Blank line # NP2 np2_num = int(f.readline()[:-1]) np2_voc = vocabulary.Vocabulary() for i in range(0, np2_num): np2_voc.add(f.readline()[:-1]) f.readline() # Blank line # (3) Priors for classes if class_num == int(f.readline()[:-1]): class_prior_prob = probability.Probability(1, class_num) for i in range(0, class_num): class_prior_prob.set_value(0, i, float(f.readline()[:-1])) else: print('Class Number Does Not Match in File {0}'.format(file_in)) f.readline() # Blank line # (4) Vocabularies' Probability # NP1 if np1_num == int(f.readline()[:-1]): np1_prob = probability.Probability(np1_num, class_num) for i in range(0, np1_num): f.readline() # class number for j in range(0, class_num): np1_prob.set_value(i, j, float(f.readline()[:-1])) f.readline() # Blank line else: print('NP1 Number Does Not Match in File {0}'.format(file_in)) f.readline() # Blank line # VP if vp_num == int(f.readline()[:-1]): vp_prob = probability.Probability(vp_num, class_num) for i in range(0, vp_num): f.readline() # class number for j in range(0, class_num): vp_prob.set_value(i, j, float(f.readline()[:-1])) f.readline() # Blank line else: print('VP Number Does Not Match in File {0}'.format(file_in)) f.readline() # Blank line # NP2 if np2_num == int(f.readline()[:-1]): np2_prob = probability.Probability(np2_num, class_num) for i in range(0, np2_num): f.readline() # class number for j in range(0, class_num): np2_prob.set_value(i, j, float(f.readline()[:-1])) f.readline() # Blank line else: print('NP2 Number Does Not Match in File {0}'.format(file_in)) f.readline() # Blank line # (5) Classes' Transition Matrix if class_num == int(f.readline()[:-1]): transition_prob = probability.Probability( class_num, class_num) # current class given previous class for i in range(0, class_num): f.readline() # Class number for j in range(0, class_num): transition_prob.set_value(i, j, float(f.readline()[:-1])) f.readline() # Blank line f.readline() else: print('Class Number Does Not Match in File {0}'.format(file_in)) # (6) Length Distribution # Currently not calculated. TO be Added return [ class_num, np1_voc, vp_voc, np2_voc, np1_prob, vp_prob, np2_prob, class_prior_prob, transition_prob ]
def loadAndPreprocessData(): ''' Read all the words to create a vocabulary ''' all_tokens = [] indir = '../preprocess/subset/' for root, dirs, filenames in os.walk(indir): for filename in filenames: if filename.startswith('canonicalized_words_'): with open(indir + filename, 'r') as f: for line in f.readlines(): w = line.rstrip() if w != '': all_tokens.append(w) print 'Processed all tokens: ', len(all_tokens) tokens_dict = Counter() for w in all_tokens: if w.startswith('DG') and w.endswith('DG'): w = 'DG' tokens_dict[w] += 1 ''' Remove noisy tokens - see notebook for exploratory analysis The first ~2500 tokens when sorted by key are noisy like "!!!!" or "* * * *" - for eg, the end of a chapter ''' noisy_tokens = sorted(tokens_dict)[0:2507] print 'Identified noisy tokens - some examples: ', noisy_tokens[0:30] ''' Clean up the tokens now that we know the noisy tokens and then generate the vocab ''' noisy_tokens = set(noisy_tokens) words = [w for w in all_tokens if w not in noisy_tokens] # TODO: Should make V configurable V = 50000 vocab = vocabulary.Vocabulary((word for word in words), size=V) print 'Vocabulary created with size: ', vocab.size ''' Read in the sentences already parsed from the ~3000 books Gutenberg subset ''' sents = [] indir = '../preprocess/subset/' books = [] for root, dirs, filenames in os.walk(indir): for filename in filenames: if filename.startswith('parsed_sents_'): with open(indir + filename, 'r') as f: for line in f.readlines(): sents.append(line.rstrip()) print 'Parsed sentences loaded into memory: ', len(sents) print 'The 10,000th sentence is: ', sents[10000] ''' Prepare training and test sentences ''' split = 0.8 shuffle = True sentences = np.array(sents, dtype=object) fmt = (len(sentences), sum(map(len, sentences))) print "Loaded %d sentences (%g tokens)" % fmt if shuffle: rng = np.random.RandomState(shuffle) rng.shuffle(sentences) # in-place train_frac = 0.8 split_idx = int(train_frac * len(sentences)) train_sentences = sentences[:split_idx] test_sentences = sentences[split_idx:] fmt = (len(train_sentences), sum(map(len, train_sentences))) print "Training set: %d sentences (%d tokens)" % fmt fmt = (len(test_sentences), sum(map(len, test_sentences))) print "Test set: %d sentences (%d tokens)" % fmt ''' Apply the vocab to the train and test sentences and convert words to ids to start training ''' ## Preprocess sentences ## convert words to ids based on the vocab wordset created above ## Do this in batches to avoid crashes due to insufficient memory batch_size = 50000 num_of_batches = int(round(len(train_sentences) / batch_size)) print 'Preprocessing train sentences - number of batches: ', num_of_batches train_id_batches = [] start = 0 end = start + batch_size for i in range(num_of_batches): if i % 15 is 0: print 'Completed Batches: ', i train_id_batches.append( utils.preprocess_sentences(train_sentences[start:end], vocab)) start = end end += batch_size # flatten the lists for 1D tensor temp = utils.flatten(train_id_batches) train_ids = utils.flatten(temp) train_ids = np.array(train_ids) print 'Train sentences converted to their IDs including start, end token and unknown word token' # repeat the same with test data batch_size = 50000 num_of_batches = int(round(len(test_sentences) / batch_size)) if num_of_batches > 10: num_of_batches = 10 print 'Preprocessing test sentences - number of batches: ', num_of_batches test_id_batches = [] start = 0 end = start + batch_size for i in range(num_of_batches): print 'Batch: ', i test_id_batches.append( 
utils.preprocess_sentences(test_sentences[start:end], vocab)) start = end end += batch_size test_ids = utils.flatten(utils.flatten(test_id_batches)) test_ids = np.array(test_ids) print 'Test sentences converted to their IDs including start, end token and unknown word token' max_time = 40 batch_size = 64 learning_rate = 0.01 num_epochs = 3 # Model parameters model_params = dict(V=vocab.size, H=100, softmax_ns=200, num_layers=1) TF_SAVEDIR = "tf_saved" checkpoint_filename = os.path.join(TF_SAVEDIR, "rnnlm") trained_filename = os.path.join(TF_SAVEDIR, "rnnlm_trained") # Will print status every this many seconds print_interval = 120 # Clear old log directory shutil.rmtree("tf_summaries", ignore_errors=True) lm = rnnlm.RNNLM(**model_params) lm.BuildCoreGraph() lm.BuildTrainGraph() # Explicitly add global initializer and variable saver to LM graph with lm.graph.as_default(): initializer = tf.global_variables_initializer() saver = tf.train.Saver() # Clear old log directory shutil.rmtree(TF_SAVEDIR, ignore_errors=True) if not os.path.isdir(TF_SAVEDIR): os.makedirs(TF_SAVEDIR) with tf.Session(graph=lm.graph) as session: # Seed RNG for repeatability tf.set_random_seed(42) session.run(initializer) bi = utils.batch_generator(train_ids, batch_size, max_time) for epoch in xrange(1, num_epochs + 1): t0_epoch = time.time() #bi = utils.batch_generator(train_ids, batch_size, max_time) print "[epoch %d] Starting epoch %d" % (epoch, epoch) #### YOUR CODE HERE #### # Run a training epoch. run_epoch(lm, session, bi, train=True, learning_rate=learning_rate) #### END(YOUR CODE) #### print "[epoch %d] Completed in %s" % ( epoch, utils.pretty_timedelta(since=t0_epoch)) # Save a checkpoint saver.save(session, checkpoint_filename, global_step=epoch) ## # score_dataset will run a forward pass over the entire dataset # and report perplexity scores. This can be slow (around 1/2 to # 1/4 as long as a full epoch), so you may want to comment it out # to speed up training on a slow machine. Be sure to run it at the # end to evaluate your score. print("[epoch %d]" % epoch), score_dataset(lm, session, train_ids, name="Train set") print("[epoch %d]" % epoch), score_dataset(lm, session, test_ids, name="Test set") print "" # Save final model saver.save(session, trained_filename)
def main(config):
    count_words(config, False)
    preprocess_names(config)
    count_words(config, True)
    vocabulary.Vocabulary(config, load=False).write()
    prep_data(config)
def run_train(args): args.numpy_seed = seed if args.numpy_seed is not None: print("Setting numpy random seed to {}...".format(args.numpy_seed)) np.random.seed(args.numpy_seed) if args.trial == 1: args.train_path = 'data/trial.txt' args.dev_path = 'data/trial.txt' args.test_path = 'data/trial.txt' # args.train_path = args.train_path.replace('[*]', args.treetype) # args.dev_path = args.dev_path.replace('[*]', args.treetype) # args.test_path = args.test_path.replace('[*]', args.treetype) print("Loading training trees from {}...".format(args.train_path)) train_chunk_insts = util.read_chunks(args.train_path, args.normal) print("Loaded {:,} training examples.".format(len(train_chunk_insts))) print("Loading development trees from {}...".format(args.dev_path)) dev_chunk_insts = util.read_chunks(args.dev_path, args.normal) print("Loaded {:,} development examples.".format(len(dev_chunk_insts))) print("Loading test trees from {}...".format(args.test_path)) test_chunk_insts = util.read_chunks(args.test_path, args.normal) print("Loaded {:,} test examples.".format(len(test_chunk_insts))) # print("Processing trees for training...") # train_parse = [tree.convert() for tree in train_treebank] print("Constructing vocabularies...") tag_vocab = vocabulary.Vocabulary() tag_vocab.index(parse.START) tag_vocab.index(parse.STOP) tag_vocab.index(parse.XX) word_vocab = vocabulary.Vocabulary() word_vocab.index(parse.START) word_vocab.index(parse.STOP) word_vocab.index(parse.UNK) word_vocab.index(parse.NUM) for x, chunks in train_chunk_insts + dev_chunk_insts + test_chunk_insts: for ch in x: word_vocab.index(ch) label_vocab = vocabulary.Vocabulary() label_vocab.index(()) label_list = util.load_label_list(args.labellist_path) #'data/labels.txt') for item in label_list: label_vocab.index((item, )) if args.nontlabelstyle != 1: for item in label_list: label_vocab.index((item + "'", )) if args.nontlabelstyle == 1: label_vocab.index((parse.EMPTY, )) tag_vocab.freeze() word_vocab.freeze() label_vocab.freeze() latent_tree = latent.latent_tree_builder(label_vocab, args.RBTlabel, args.nontlabelstyle) def print_vocabulary(name, vocab): special = {parse.START, parse.STOP, parse.UNK} print("{} ({:,}): {}".format( name, vocab.size, sorted(value for value in vocab.values if value in special) + sorted(value for value in vocab.values if value not in special))) if args.print_vocabs: print_vocabulary("Tag", tag_vocab) print_vocabulary("Word", word_vocab) print_vocabulary("Label", label_vocab) print("Initializing model...") pretrain = {'giga': 'data/giga.vec100', 'none': 'none'} pretrainemb = util.load_pretrain(pretrain[args.pretrainemb], args.word_embedding_dim, word_vocab) model = dy.ParameterCollection() if args.parser_type == "chartdyRBTC": parser = parse.ChartDynamicRBTConstraintParser( model, tag_vocab, word_vocab, label_vocab, args.tag_embedding_dim, args.word_embedding_dim, args.lstm_layers, args.lstm_dim, args.label_hidden_dim, args.dropout, (args.pretrainemb, pretrainemb), args.chunkencoding, args.trainc == 1, True, (args.zerocostchunk == 1), ) else: print('Model is not valid!') exit() if args.loadmodel != 'none': tmp = dy.load(args.loadmodel, model) parser = tmp[0] print('Model is loaded from ', args.loadmodel) trainer = dy.AdamTrainer(model) total_processed = 0 current_processed = 0 check_every = len(train_chunk_insts) / args.checks_per_epoch best_dev_fscore = -np.inf best_dev_model_path = None start_time = time.time() def check_dev(): nonlocal best_dev_fscore nonlocal best_dev_model_path dev_start_time = time.time() 
dev_predicted = [] #dev_gold = [] #dev_gold = latent_tree.build_latent_trees(dev_chunk_insts) dev_gold = [] for inst in dev_chunk_insts: chunks = util.inst2chunks(inst) dev_gold.append(chunks) for x, chunks in dev_chunk_insts: dy.renew_cg() #sentence = [(leaf.tag, leaf.word) for leaf in tree.leaves()] sentence = [(parse.XX, ch) for ch in x] predicted, _ = parser.parse(sentence) dev_predicted.append(predicted.convert().to_chunks()) #dev_fscore = evaluate.evalb(args.evalb_dir, dev_gold, dev_predicted, args.expname + '.dev.') #evalb dev_fscore = evaluate.eval_chunks2(args.evalb_dir, dev_gold, dev_predicted, output_filename=args.expname + '.dev.txt') # evalb print("dev-fscore {} " "dev-elapsed {} " "total-elapsed {}".format( dev_fscore, format_elapsed(dev_start_time), format_elapsed(start_time), )) if dev_fscore.fscore > best_dev_fscore: if best_dev_model_path is not None: for ext in [".data", ".meta"]: path = best_dev_model_path + ext if os.path.exists(path): print( "Removing previous model file {}...".format(path)) os.remove(path) best_dev_fscore = dev_fscore.fscore best_dev_model_path = "{}_dev={:.2f}".format( args.model_path_base + "_" + args.expname, dev_fscore.fscore) print("Saving new best model to {}...".format(best_dev_model_path)) dy.save(best_dev_model_path, [parser]) test_start_time = time.time() test_predicted = [] #test_gold = latent_tree.build_latent_trees(test_chunk_insts) test_gold = [] for inst in test_chunk_insts: chunks = util.inst2chunks(inst) test_gold.append(chunks) ftreelog = open(args.expname + '.test.predtree.txt', 'w', encoding='utf-8') for x, chunks in test_chunk_insts: dy.renew_cg() #sentence = [(leaf.tag, leaf.word) for leaf in tree.leaves()] sentence = [(parse.XX, ch) for ch in x] predicted, _ = parser.parse(sentence) pred_tree = predicted.convert() ftreelog.write(pred_tree.linearize() + '\n') test_predicted.append(pred_tree.to_chunks()) ftreelog.close() #test_fscore = evaluate.evalb(args.evalb_dir, test_chunk_insts, test_predicted, args.expname + '.test.') test_fscore = evaluate.eval_chunks2(args.evalb_dir, test_gold, test_predicted, output_filename=args.expname + '.test.txt') # evalb print("epoch {:,} " "test-fscore {} " "test-elapsed {} " "total-elapsed {}".format( epoch, test_fscore, format_elapsed(test_start_time), format_elapsed(start_time), )) train_trees = latent_tree.build_dynamicRBT_trees(train_chunk_insts) train_trees = [(x, tree.convert(), chunks, latentscope) for x, tree, chunks, latentscope in train_trees] for epoch in itertools.count(start=1): if args.epochs is not None and epoch > args.epochs: break np.random.shuffle(train_chunk_insts) epoch_start_time = time.time() for start_index in range(0, len(train_chunk_insts), args.batch_size): dy.renew_cg() batch_losses = [] for x, tree, chunks, latentscope in train_trees[ start_index:start_index + args.batch_size]: discard = False for chunk in chunks: length = chunk[2] - chunk[1] if length > args.maxllimit: discard = True break if discard: continue print('discard') sentence = [(parse.XX, ch) for ch in x] if args.parser_type == "top-down": _, loss = parser.parse(sentence, tree, args.explore) else: _, loss = parser.parse(sentence, tree, chunks, latentscope) batch_losses.append(loss) total_processed += 1 current_processed += 1 batch_loss = dy.average(batch_losses) batch_loss_value = batch_loss.scalar_value() batch_loss.backward() trainer.update() print("Epoch {:,} " "batch {:,}/{:,} " "processed {:,} " "batch-loss {:.4f} " "epoch-elapsed {} " "total-elapsed {}".format( epoch, start_index // args.batch_size + 
1, int(np.ceil(len(train_chunk_insts) / args.batch_size)), total_processed, batch_loss_value, format_elapsed(epoch_start_time), format_elapsed(start_time), ), flush=True) if current_processed >= check_every: current_processed -= check_every if epoch > 7: check_dev()
def main():
    import optparse
    import vocabulary
    import lda
    import lda_cvb0

    parser = optparse.OptionParser()
    parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)", default="0:100")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
    parser.add_option("--df", dest="df", type="int", help="threshold of document frequency to cut words", default=10)
    (options, args) = parser.parse_args()

    corpus = vocabulary.load_corpus(options.corpus)
    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]
    if options.df > 0:
        docs = voca.cut_low_freq(docs, options.df)

    # Hold out every 10th token of each document as a test set.
    train_docs = [[x for i, x in enumerate(doc) if i % 10 != 0] for doc in docs]
    test_docs = [[x for i, x in enumerate(doc) if i % 10 == 0] for doc in docs]
    test_docs_wf = conv_word_freq(test_docs)

    f = FileOutput("lda_test2")
    f.out("corpus=%d, words=%d, K=%d, a=%f, b=%f" %
          (len(docs), len(voca.vocas), options.K, options.alpha, options.beta))

    lda_learning(f, lda_cvb0.LDA_CVB0, False, options, train_docs, test_docs_wf, voca)
    lda_learning(f, lda_cvb0.LDA_CVB0, True, options, train_docs, test_docs_wf, voca)
    lda_learning(f, lda.LDA, False, options, train_docs, test_docs, voca, 2)
    lda_learning(f, lda.LDA, True, options, train_docs, test_docs, voca, 2)
def perplexity(self):
    """Compute the per-word perplexity of the training corpus."""
    phi = self.worddist()
    log_per = 0
    Kalpha = self.K * self.alpha
    for m, doc in enumerate(self.docs):
        theta = self.n_m_z[m, :] / (len(doc) + Kalpha)
        for w in doc:
            log_per -= numpy.log(numpy.inner(phi[:, w], theta))
    return numpy.exp(log_per / self.N)


if __name__ == '__main__':
    voca = vocabulary.Vocabulary()
    #docs = [ voca.doc_to_ids(doc) for doc in vocabulary.read_from('corpus_1') ]
    corpus = vocabulary.read_from('corpus_1')
    lda = LDA(K=10, alpha=0.5, beta=0.5)
    lda.set_corpus(corpus)
    for i in range(20):
        lda.inference()
        print(lda.perplexity())
    phi = lda.worddist()
    print(phi[0])
    print(lda.n_z)
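# Note (added): perplexity() above is the standard LDA per-word perplexity,
#   exp( -(1/N) * sum_m sum_{w in doc_m} log( sum_z phi[z, w] * theta_m[z] ) ),
# where theta_m is the smoothed topic mixture n_m_z[m] / (len(doc_m) + K * alpha)
# and N is the total number of tokens. A minimal standalone sketch of the same
# quantity (the names docs/phi/thetas are hypothetical, not from this file):
import numpy

def corpus_perplexity(docs, phi, thetas):
    """Per-word perplexity given word-id docs, a [K, V] topic-word matrix phi,
    and one topic-mixture vector theta per document."""
    n_tokens = sum(len(doc) for doc in docs)
    log_lik = 0.0
    for doc, theta in zip(docs, thetas):
        for w in doc:
            # probability of token w under the document's topic mixture
            log_lik += numpy.log(numpy.inner(phi[:, w], theta))
    return numpy.exp(-log_lik / n_tokens)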
def main(_): #convert jpg image(s) into iamge representations using alexnet: filenames = [ os.path.join(image_dir, f) for f in [ 'overly-attached-girlfriend.jpg', 'high-expectations-asian-father.jpg', 'foul-bachelor-frog.jpg', 'stoner-stanley.jpg', 'y-u-no.jpg', 'willy-wonka.jpg', 'futurama-fry.jpg', 'success-kid.jpg', 'one-does-not-simply.jpg', 'bad-luck-brian.jpg', 'first-world-problems.jpg', 'philosoraptor.jpg', 'what-if-i-told-you.jpg', 'TutorPP.jpg' ] ] print(filenames) tf.logging.info("Running caption generation on %d files matching %s", len(filenames), FLAGS.input_files) #mean of imagenet dataset in BGR imagenet_mean = np.array([104., 117., 124.], dtype=np.float32) #placeholder for input and dropout rate x_Alex = tf.placeholder(tf.float32, [1, 227, 227, 3]) keep_prob_Alex = tf.placeholder(tf.float32) #create model with default config ( == no skip_layer and 1000 units in the last layer) modelAlex = AlexNet(x_Alex, keep_prob_Alex, 1000, [], ['fc7', 'fc8'], 512) #maybe need to put fc8 in skip_layers #define activation of last layer as score score = modelAlex.fc6 meme_embeddings = [] with tf.Session() as sess: # Initialize all variables sess.run(tf.global_variables_initializer()) # Load the pretrained weights into the model modelAlex.load_initial_weights(sess) for i, meme in enumerate(filenames): img = Image.open(meme) try: img.thumbnail((227, 227), Image.ANTIALIAS) #img = img.resize((227,227)) #use img.thumbnail for square images, img.resize for non square assert np.shape(img) == (227, 227, 3) except AssertionError: img = img.resize((227, 227)) print('sizing error') # Subtract the ImageNet mean img = img - imagenet_mean #should probably change this # Reshape as needed to feed into model img = img.reshape((1, 227, 227, 3)) meme_vector = sess.run(score, feed_dict={ x_Alex: img, keep_prob_Alex: 1 }) #[1,4096] meme_vector = np.reshape(meme_vector, [4096]) assert np.shape(meme_vector) == (4096, ) #now have np embeddings to feed for inference meme_embeddings.append(meme_vector) with open('Captions.txt', 'r') as f: data_captions = f.readlines() data_captions = [s.lower() for s in data_captions] # Build the inference graph. g = tf.Graph() with g.as_default(): model = inference_wrapper.InferenceWrapper() restore_fn = model.build_graph_from_config(configuration.ModelConfig(), FLAGS.checkpoint_path) g.finalize() # Create the vocabulary. vocab = vocabulary.Vocabulary(FLAGS.vocab_file) #filenames = [] #for file_pattern in FLAGS.input_files.split(","): #filenames.extend(tf.gfile.Glob(file_pattern)) #tf.logging.info("Running caption generation on %d files matching %s", #len(filenames), FLAGS.input_files) with tf.Session(graph=g) as sess: # Load the model from checkpoint. restore_fn(sess) # Prepare the caption generator. Here we are implicitly using the default # beam search parameters. See caption_generator.py for a description of the # available beam search parameters. generator = caption_generator.CaptionGenerator(model, vocab) num_in_data_total = 0 num_captions = 0 for i, meme in enumerate(meme_embeddings): #with tf.gfile.GFile(filename, "rb") as f: #image = f.read() captions = generator.beam_search(sess, meme) print("Captions for image %s:" % os.path.basename(filenames[i])) num_in_data = 0 for i, caption in enumerate(captions): # Ignore begin and end words. 
sentence = [ vocab.id_to_word(w) for w in caption.sentence[1:-1] ] sentence = " ".join(sentence) in_data = 0 if b_any(sentence in capt for capt in data_captions): in_data = 1 num_in_data += 1 num_in_data_total += 1 num_captions += 1 else: num_captions += 1 print(" %d) %s (p=%f) [in data = %d]" % (i, sentence, math.exp(caption.logprob), in_data)) print("number of captions in data = %d" % (num_in_data)) print("(total number of captions in data = %d) percent in data = %f" % (num_in_data_total, (num_in_data_total / num_captions)))
def load_or_create_model(args, parses_for_vocab):
    components = args.model_path_base.split('/')
    directory = '/'.join(components[:-1])
    if os.path.isdir(directory):
        relevant_files = [f for f in os.listdir(directory) if f.startswith(components[-1])]
    else:
        relevant_files = []
    assert len(relevant_files) <= 2, "Multiple possibilities {}".format(relevant_files)

    if len(relevant_files) > 0:
        print("Loading model from {}...".format(args.model_path_base))
        model = dy.ParameterCollection()
        [parser] = dy.load(args.model_path_base, model)
    else:
        assert parses_for_vocab is not None
        print("Constructing vocabularies using train parses...")

        tag_vocab = vocabulary.Vocabulary()
        tag_vocab.index(parse.START)
        tag_vocab.index(parse.STOP)

        word_vocab = vocabulary.Vocabulary()
        word_vocab.index(parse.START)
        word_vocab.index(parse.STOP)
        word_vocab.index(parse.UNK)

        label_vocab = vocabulary.Vocabulary()
        label_vocab.index(())

        for tree in parses_for_vocab:
            nodes = [tree]
            while nodes:
                node = nodes.pop()
                if isinstance(node, trees.InternalParseNode):
                    label_vocab.index(node.label)
                    nodes.extend(reversed(node.children))
                else:
                    assert isinstance(node, LeafParseNode)
                    tag_vocab.index(node.tag)
                    word_vocab.index(node.word)

        tag_vocab.freeze()
        word_vocab.freeze()
        label_vocab.freeze()

        print("Initializing model...")
        model = dy.ParameterCollection()
        parser = parse.Parser(
            model,
            tag_vocab,
            word_vocab,
            label_vocab,
            None,
            args.word_embedding_dim,
            args.lstm_layers,
            args.lstm_dim,
            args.label_hidden_dim,
            None,
            args.dropout,
            not args.no_elmo
        )

    return parser, model
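# Usage note (added): load_or_create_model() either restores a saved DyNet
# parser whose files share the model_path_base prefix, or builds vocabularies
# from the supplied parses and initializes a fresh parse.Parser. A sketch of a
# typical call site (the path and the args namespace are hypothetical):
#
#     train_parse = [tree.convert() for tree in trees.load_trees("data/train.trees")]
#     parser, model = load_or_create_model(args, train_parse)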
def main(): t1 = time.time() import optparse import vocabulary global out_dir parser = optparse.OptionParser() parser.add_option("-f", dest="filename", help="corpus filename") parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)") parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.1) parser.add_option("--eta", dest="eta", type="float", help="parameter eta", default=0.2) parser.add_option("-k", dest="K", type="int", help="number of topics", default=20) parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100) parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False) parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False) parser.add_option("--seed", dest="seed", type="int", help="random seed") parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0) #parser.add_option("--setup", dest="setup", help="setup details", default="uniform") parser.add_option("--dataset", dest="did", help="setup details : Dataset-1/Dataset-2/Dataset-3", default="Dataset-1") (options, args) = parser.parse_args() if not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-c)") if options.filename: if options.did == 'Dataset-1': corpus, doc_ids, event_list, total_no_word = vocabulary.load_file( options.filename) else: corpus, doc_ids, event_list, total_no_word = vocabulary.load_file_reuter( options.filename) else: corpus = vocabulary.load_corpus(options.corpus) if not corpus: parser.error("corpus range(-c) forms 'start:end'") if options.seed != None: numpy.random.seed(options.seed) voca = vocabulary.Vocabulary(options.stopwords) docs = [voca.doc_to_ids(doc) for doc in corpus] if options.df > 0: docs = voca.cut_low_freq(docs, options.df) if event_list is not None: options.K = options.K #len(event_list) suffix = datetime.now().strftime('%Y-%m-%d_%H:%M:%S') #out_dir = '%s/all_words/Topic_%d_alpha_%f_eta_%f_iter_%d/%s' %(out_dir,options.K,options.alpha, options.eta, options.iteration, suffix) #out_dir = '%s/Dataset-1/Topic_%d_alpha_%f_eta_%f_iter_%d/%s' %(out_dir,options.K,options.alpha, options.eta, options.iteration, suffix) out_dir = '%s/%s/Topic_%d_alpha_%f_eta_%f_iter_%d/%s' % ( out_dir, options.did, options.K, options.alpha, options.eta, options.iteration, suffix) #out_dir = '%s/Reuters-21578/R-8-train-train_no-stop/Topic_%d_alpha_%f_eta_%f_iter_%d/%s' %(out_dir,options.K,options.alpha, options.eta, options.iteration, suffix) #out_dir = '%s/20-Newsgroup/20-Newsgroup_train-train_all_term/Topic_%d_alpha_%f_eta_%f_iter_%d/%s' %(out_dir,options.K,options.alpha, options.eta, options.iteration, suffix) print('out_dir: ', out_dir) try: os.makedirs(out_dir) except Exception as e: print(' %s Dir exist ' % (out_dir)) print('E MSG : ', e) lda = LDA(options.K, options.alpha, options.eta, docs, doc_ids, voca.size(), options.smartinit) t_int = time.time() #print 'Intialization time : %f' %(t_int-t1) flog = '%s/log_file.txt' % (out_dir) f = open(flog, 'w') f.write( "corpus(# of doc)=%d, no of event = %d , Uniq words=%d, Toal # of word =%d, K=%d, a=%f, b=%f , iteration = %d \n" % (len(corpus), len(event_list), len(voca.vocas), total_no_word, options.K, options.alpha, options.eta, options.iteration)) f.close() print("corpus=%d, no of event =%d , uniq words=%d, K=%d, a=%f, b=%f" % (len(corpus), len(event_list), len( 
voca.vocas), options.K, options.alpha, options.eta)), #import cProfile #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile') lda_learning(lda, options.iteration, voca) t2 = time.time() print(' Total time taken : %f ' % (t2 - t1)) flog = '%s/log_file.txt' % (out_dir) f = open(flog, 'a') f.write(' TOtal time taken : %f ' % (t2 - t1)) f.close()
def training_model(main_path, type, config_file, from_date, to_date, customer):
    #logging.basicConfig(filename=logCustomer, level=logging.INFO)
    #lg.configureLogger(QIUserLogger, customer, "training")
    #
    QIUserLogger.info("-----------------------------------------------------------------")
    QIUserLogger.info("------------------------Training Start---------------------------")
    #
    QIUserLogger.info("** Initialization start... **")
    main_path = main_path
    type = type
    config_file = config_file
    from_date = from_date
    to_date = to_date
    QIUserLogger.info(" MainPath - " + str(main_path))
    QIUserLogger.info(" Type - " + str(type))
    QIUserLogger.info(" ConfigFile - " + str(config_file))
    QIUserLogger.info(" FromDate - " + str(from_date))
    QIUserLogger.info(" ToDate - " + str(to_date))
    #
    QIUserLogger.info("** Initialization End **")
    try:
        QIUserLogger.info("1 - Load Configurations")
        QIUserLogger.info(" ** Config for Classification")
        # Load config files
        configModel = cg.Config()
        configModel.configFromFile(config_file)
        configModel.main_path = main_path
        configModel.updateDataOfMainPath(config_file, main_path)
        dataL = dt.Data(configModel)
        #
        QIUserLogger.info("2 - Login In API")
        # Login to API
        configConnection = con.ConfigConnection()
        dir_path = os.path.dirname(os.path.realpath(__file__))
        configConnection.configFromFile(dir_path + "/config/" + customer + "/connector_config.json")
        connector = con.Connector(configConnection)
        # Create persistent session
        Reqsess = requests.session()
        # Log in
        connector.login(Reqsess)

        QIUserLogger.info("3 - GET TICKETS FROM API")
        params = "closedfrom=" + str(from_date) + "&closedto=" + str(to_date) + \
            "&maxnum=" + str(configConnection.max_tickets_to_get)
        #params = {"closedfrom": from_date, "closedto": to_date, "maxnum" : configConnection.max_tickets_to_get}
        responseTicket = connector.getTickets(Reqsess, params)

        if len(responseTicket) > 0:
            rTicket = []
            for t in responseTicket:
                rTicket.append(t['description'])
            #
            id2lab = dict(zip(configModel.labels_map.values(), configModel.labels_map.keys()))
            #
            gather_tickets, gather_targets = gatherData(type, responseTicket, configModel, id2lab)
            #
            QIUserLogger.info("4 - REMOVE STOP WORDS FROM NEW TICKETS")
            tok = tk.Tokenizer(gather_tickets)
            tok.tokenizeTickets()
            tickets_to_lower = tok.toLower()
            gather_tickets, gather_targets = tok.removeStopWordsToString(tickets_to_lower, gather_targets)

            QIUserLogger.info("5 - GET STORED DATA TICKETS")
            tickets_train = dataL.loadDataInArray(configModel.data_path + "/tickets.txt", configModel.csv_encoding)
            targets_train = dataL.loadDataInArray(configModel.data_path + "/targets.txt")
            #
            # Count if we reached the threshold
            QIUserLogger.info("6 - MERGE THE DATA - STORED AND GATHERED")
            max_length = configModel.max_num_tickets
            len_gather_tickets = len(gather_tickets)
            len_tickets = len(tickets_train)
            # Retrain on the whole dataset instead of doing transfer learning,
            # so that the vocabulary is always up to date.
            tickets = tickets_train + gather_tickets
            targets = targets_train + gather_targets
            reached_dim = len_gather_tickets + len_tickets
            if reached_dim > max_length:
                elem_to_cut = reached_dim - max_length
                # Cut out the first elem_to_cut elements.
                # (The original assigned tickets to merged_targets and targets to
                #  merged_tickets, swapping the two lists; fixed here.)
                merged_tickets = tickets[elem_to_cut:]
                merged_targets = targets[elem_to_cut:]
                tickets = merged_tickets
                targets = merged_targets
                reached_dim = max_length

            QIUserLogger.info("7 - REMOVE IDENTICAL TICKETS")
            #tickets, targets = ut.removeIdenticalTickets(tickets, targets)
            tickets, targets = ut.removeIdenticalTicketsFromNew(tickets, targets, len_tickets, reached_dim)

            QIUserLogger.info("8 - SAVING MERGED DATA")
            dataL.writeArrayInFileCompleteDataPath(tickets, configModel.data_path + '/tickets.txt', "utf-8")
            dataL.writeArrayInFileCompleteDataPath(targets, configModel.data_path + '/targets.txt', "utf-8")
            #
            QIUserLogger.info("9 - EXTRACT WORDS FROM TICKETS")
            words = tok.extractWordsTicketString(tickets)
            #
            QIUserLogger.info("10 - BUILD NEW VOCABULARY")
            # Create vocabulary
            voc = vc.Vocabulary(configModel)
            dictionary, reverse_dict = voc.build_dictionary(words, configModel.labels)
            voc.saveDictionary(dictionary, "vocabulary")
            QIUserLogger.info("*** Vocabulary saved")
            #
            QIUserLogger.info("11 -- SPLIT DATA IN TRAINING AND TEST DATASET")
            tickets_training, tickets_test, Target_training, Target_test = ut.get_train_and_test(tickets, targets)
            dataL.writeArrayInFileCompleteDataPath(tickets_training, configModel.data_path + '/tickets_training.txt', "utf-8")
            dataL.writeArrayInFileCompleteDataPath(Target_training, configModel.data_path + '/targets_training.txt', "utf-8")
            dataL.writeArrayInFileCompleteDataPath(tickets_test, configModel.data_path + '/tickets_test.txt', "utf-8")
            dataL.writeArrayInFileCompleteDataPath(Target_test, configModel.data_path + '/targets_test.txt', "utf-8")
            #
            QIUserLogger.info("12 - CREATE TICKETS AND TARGETS SEQUENCES")
            # Create sequences and one-hot vectors for the target
            tickets_training_sequences = dataL.createDataSequenceTicketsString(tickets_training, dictionary)
            oneHotVectorTarget_training = dataL.transformInOneHotVector(configModel.labels, Target_training)
            #
            QIUserLogger.info("13 - FILTER OUT DATA - Removing Token OOV")
            filtdata = fd.FilterData(configModel, configModel.labels)
            tickets_training_sequences, oneHotVectorTarget_training, trash = filtdata.removeTokenOOV(
                tickets_training_sequences, oneHotVectorTarget_training, dictionary)
            QIUserLogger.info(" *** Classe Cestino in Training : " + str(len(trash)))
            #
            #QIUserLogger.info(" -- Split Training | Test Dataset")
            #tickets_training_sequences, tickets_test_sequences, oneHotVectorTarget_training, oneHotVectorTarget_test = ut.get_train_and_test(tickets_training_sequences, oneHotVectorTarget_training)
            #
            QIUserLogger.info("14 - SAVING TRAINING SEQUENCES")
            dataL.writeArrayInFileCompleteDataPath(
                tickets_training_sequences, configModel.data_sequences_path + '/tickets_training.txt', "utf-8")
            dataL.writeArrayInFileCompleteDataPath(
                oneHotVectorTarget_training, configModel.data_sequences_path + '/target_training.txt', "utf-8")
            QIUserLogger.info(" *** Training Size : " + str(len(tickets_training_sequences)) + "\n")

            if configModel.use_pretrained_embs:
                QIUserLogger.info(" *** Use pretrained Words Embedding")
                skip = sk.SkipgramModel(configModel)
                skipgramModel = skip.get_skipgram()
                skipgramEmbedding = skip.getCustomEmbeddingMatrix(skipgramModel, reverse_dict)
                configModel.skipgramEmbedding = skipgramEmbedding

            # Start training
            QIUserLogger.info("15 - START TRAINING")
            ml.runTraining(configModel,
                           tickets_training_sequences,
                           oneHotVectorTarget_training,
                           configModel.labels)
            QIUserLogger.info("============ End =============")
        else:
            QIUserLogger.info("No New Tickets found. There is no need of a new training.")
        # Log out
        connector.logout(Reqsess)
        #
    except Exception as e:
        print(str(e))
        QIUserLogger.error("Error in training_model " + str(e))
def main():
    import optparse
    import vocabulary

    parser = optparse.OptionParser()
    parser.add_option("-f", dest="filename", help="corpus filename")
    parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5)
    parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5)
    parser.add_option("--lamda", dest="lamda", type="float", help="parameter lamda", default=0.5)
    parser.add_option("-k", dest="K", type="int", help="number of topics", default=20)
    parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100)
    parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False)
    parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False)
    parser.add_option("--seed", dest="seed", type="int", help="random seed")
    parser.add_option("--df", dest="df", type="int", help="threshold of document frequency to cut words", default=0)
    (options, args) = parser.parse_args()

    if not options.filename:
        parser.error("need corpus filename(-f)")

    if options.filename:
        (pids, tids) = vocabulary.load_file(options.filename)
    if options.seed != None:
        numpy.random.seed(options.seed)

    # voca is the object which stores the data structures needed by LDA
    voca = vocabulary.Vocabulary(options.stopwords)
    docs = voca.PT_to_idlist(pids, tids)
    #print docs
    size_of_vocab = max(tids) + 1

    lda = BLDA(options.K, options.alpha, options.beta, options.lamda, docs, size_of_vocab, options.smartinit)
    #print "corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(corpus), len(voca.vocas), options.K, options.alpha, options.beta)
    blda_learning(lda, options.iteration)
def mainTrainModelOnApertureWithSequenceFeatures(): print("============ Start =============\n") print("1 - Load Configuration\n") config = cg.Config() dataL = dt.Data(config) print("2 - Load Data and Targets\n") tickets_training, tickets_test, targets_training, targets_test = loadAndSplit( ) map_labels = dataL.loadMapFromJson(config.data_path + "map_labels.json") labels = dataL.getfirstLevelTargets(map_labels['map']) print("3 - Preprocess Data\n") tickets_training_tl, targets_training, words = preprocessData( tickets_training, targets_training, labels) tickets_test_tl, targets_test, w_ = preprocessData(tickets_test, targets_test, labels) print("4 - Build Vocabulary\n") # Create Vocabulary voc = vc.Vocabulary(config) dictionary, reverse_dict = voc.build_dictionary(words, labels) voc.saveDictionary(dictionary, "vocabulary") print("5 - Create Ticket Sequences and Targets Hot Vectors\n") #Create Sequences and HotVectors for the Target tickets_training_sequences = dataL.createDataSequence( tickets_training_tl, dictionary) oneHotVectorTarget_training = dataL.transformInOneHotVector( labels, targets_training) tickets_test_sequences = dataL.createDataSequence(tickets_test_tl, dictionary) oneHotVectorTarget_test = dataL.transformInOneHotVector( labels, targets_test) print("6 - Create Ticket Feature Sequences") #Create Sequences Features tickets_feature_sequences = dataL.extractFeatures(tickets_training_tl, dictionary) tickets_feature_test_sequences = dataL.createDataSequence( tickets_test_tl, dictionary) print("6 - Filter Data - Removeing Token OOV\n") filtdata = fd.FilterData(config, labels) tickets_training_sequences, oneHotVectorTarget_training, tickets_feature_sequences_training, trash = filtdata.removeTokenOOVwithSequenceFeatures( tickets_training_sequences, oneHotVectorTarget_training, tickets_feature_sequences, dictionary) print("*** Classe Cestino in Training : " + str(len(trash))) tickets_test_sequences, oneHotVectorTarget_test, tickets_feature_test_sequences, trash = filtdata.removeTokenOOVwithSequenceFeatures( tickets_test_sequences, oneHotVectorTarget_test, tickets_feature_test_sequences, dictionary) print("*** Classe Cestino in Test : " + str(len(trash))) print("7 - Generate Training and Testing Dataset\n") dataL.writeArrayStringInFile(tickets_training_sequences, 'parsed_sequences/tickets_training.txt', "utf-8") dataL.writeArrayStringInFile(tickets_test_sequences, 'parsed_sequences/tickets_test.txt', "utf-8") dataL.writeArrayStringInFile(oneHotVectorTarget_training, 'parsed_sequences/target_training.txt', "utf-8") dataL.writeArrayStringInFile(oneHotVectorTarget_test, 'parsed_sequences/target_test.txt', "utf-8") print("*** Training Size : " + str(len(tickets_training_sequences)) + "\n") if config.use_pretrained_embs: print("*** Uso pretrained Words Embedding\n") skip = sk.SkipgramModel(config) skipgramModel = skip.get_skipgram() skipgramEmbedding = skip.getCustomEmbeddingMatrix( skipgramModel, reverse_dict) config.skipgramEmbedding = skipgramEmbedding print("8 - Start Training\n") ml.runTrainingWithFeatureSequence(config, tickets_training_sequences, oneHotVectorTarget_training, labels, tickets_feature_sequences_training) print("============ End =============\n")
def build_vocab(corpus, V=10000):
    token_feed = (canonicalize_word(w) for w in corpus.words())
    vocab = vocabulary.Vocabulary(token_feed, size=V)
    return vocab
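# Usage sketch (added): build_vocab() expects an NLTK-style corpus reader that
# exposes .words(); vocabulary.Vocabulary here keeps the V most frequent
# canonicalized tokens. A minimal example against the Brown corpus, assuming
# canonicalize_word and the vocabulary module are importable as in the snippet
# above, and that the Vocabulary object exposes .size as elsewhere in this file:
import nltk

nltk.download('brown', quiet=True)  # fetch the corpus if it is not cached yet
brown_vocab = build_vocab(nltk.corpus.brown, V=10000)
print(brown_vocab.size)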
def trainPriority(): print("============ Start =============\n") print("1 - Load Configuration\n") config = cg.Config() config.configFromFile("config/priority_config.json") dataL = dt.Data(config) print("2 - Load Data and Targets\n") tickets_training = dataL.loadDataInArray( config.main_path + "onlyAperturaPriority/tickets_training.txt", config.csv_encoding) tickets_test = dataL.loadDataInArray( config.main_path + "onlyAperturaPriority/tickets_test.txt", config.csv_encoding) targets_training = dataL.loadDataInArray( config.main_path + "onlyAperturaPriority/targets_training.txt", config.csv_encoding) targets_test = dataL.loadDataInArray( config.main_path + "onlyAperturaPriority/targets_test.txt", config.csv_encoding) labels = ["1", "2", "3", "4", "5"] print("3 - Preprocess Data\n") tickets_training_tl, targets_training, words = preprocessData( tickets_training, targets_training, labels) tickets_test_tl, targets_test, w_ = preprocessData(tickets_test, targets_test, labels) if config.loadOrbuild_dictionary == "build": print("4 - Build Vocabulary\n") # Create Vocabulary voc = vc.Vocabulary(config) dictionary, reverse_dict = voc.build_dictionary(words, labels) voc.saveDictionary(dictionary, "vocabulary") print("*** Vocabulary saved \n") else: print("4 - Load Vocabulary\n") # Load Existing Vocabulary voc = vc.Vocabulary(config) dictionary = voc.loadDictionary("vocabulary") reverse_dict = voc.getReverseDictionary(dictionary) print("5 - Create Ticket Sequences and Targets Hot Vectors\n") # Create Sequences and HotVectors for the Target tickets_training_sequences = dataL.createDataSequence( tickets_training_tl, dictionary) oneHotVectorTarget_training = dataL.transformInOneHotVector( labels, targets_training) tickets_test_sequences = dataL.createDataSequence(tickets_test_tl, dictionary) oneHotVectorTarget_test = dataL.transformInOneHotVector( labels, targets_test) print("6 - Filter Data - Removeing Token OOV\n") filtdata = fd.FilterData(config, labels) tickets_training_sequences, oneHotVectorTarget_training, trash = filtdata.removeTokenOOV( tickets_training_sequences, oneHotVectorTarget_training, dictionary) print(" *** Classe Cestino in Training : " + str(len(trash)) + "\n") tickets_test_sequences, oneHotVectorTarget_test, trash = filtdata.removeTokenOOV( tickets_test_sequences, oneHotVectorTarget_test, dictionary) print(" *** Classe Cestino in Test : " + str(len(trash)) + "\n") print("7 - Generate Training and Testing Dataset\n") dataL.writeArrayInFileCompleteDataPath( tickets_training_sequences, config.data_sequences_path + '/tickets_training.txt', "utf-8") dataL.writeArrayInFileCompleteDataPath( tickets_test_sequences, config.data_sequences_path + '/tickets_test.txt', "utf-8") dataL.writeArrayInFileCompleteDataPath( oneHotVectorTarget_training, config.data_sequences_path + '/target_training.txt', "utf-8") dataL.writeArrayInFileCompleteDataPath( oneHotVectorTarget_test, config.data_sequences_path + '/target_test.txt', "utf-8") print(" *** Training Size : " + str(len(tickets_training_sequences)) + "\n") print(" *** Test Size : " + str(len(tickets_test_sequences)) + "\n") if config.use_pretrained_embs: print(" *** Use pretrained Words Embedding\n") skip = sk.SkipgramModel(config) skipgramModel = skip.get_skipgram() skipgramEmbedding = skip.getCustomEmbeddingMatrix( skipgramModel, reverse_dict) config.skipgramEmbedding = skipgramEmbedding print("8 - Start Training\n") ml.runTraining(config, tickets_training_sequences, oneHotVectorTarget_training, labels) print("============ End =============\n")
def run_train(args, hparams): # if args.numpy_seed is not None: # print("Setting numpy random seed to {}...".format(args.numpy_seed)) # np.random.seed(args.numpy_seed) # # # Make sure that pytorch is actually being initialized randomly. # # On my cluster I was getting highly correlated results from multiple # # runs, but calling reset_parameters() changed that. A brief look at the # # pytorch source code revealed that pytorch initializes its RNG by # # calling std::random_device, which according to the C++ spec is allowed # # to be deterministic. # seed_from_numpy = np.random.randint(2147483648) # print("Manual seed for pytorch:", seed_from_numpy) # torch.manual_seed(seed_from_numpy) now_time = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') log_file_name = os.path.join(args.log_dir, 'log-' + now_time) logging.basicConfig( format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', datefmt='%m/%d/%Y %H:%M:%S', filename=log_file_name, filemode='w', level=logging.INFO) logger = logging.getLogger(__name__) console_handler = logging.StreamHandler() logger.addHandler(console_handler) logger = logging.getLogger(__name__) random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) hparams.set_from_args(args) logger.info("Hyperparameters:") logger.info(hparams.print()) logger.info("Loading training trees from {}...".format(args.train_path)) if hparams.predict_tags and args.train_path.endswith('10way.clean'): logger.info( "WARNING: The data distributed with this repository contains " "predicted part-of-speech tags only (not gold tags!) We do not " "recommend enabling predict_tags in this configuration.") train_treebank = trees.load_trees(args.train_path) if hparams.max_len_train > 0: train_treebank = [ tree for tree in train_treebank if len(list(tree.leaves())) <= hparams.max_len_train ] logger.info("Loaded {:,} training examples.".format(len(train_treebank))) logger.info("Loading development trees from {}...".format(args.dev_path)) dev_treebank = trees.load_trees(args.dev_path) if hparams.max_len_dev > 0: dev_treebank = [ tree for tree in dev_treebank if len(list(tree.leaves())) <= hparams.max_len_dev ] logger.info("Loaded {:,} development examples.".format(len(dev_treebank))) logger.info("Loading test trees from {}...".format(args.test_path)) test_treebank = trees.load_trees(args.test_path) if hparams.max_len_dev > 0: test_treebank = [ tree for tree in test_treebank if len(list(tree.leaves())) <= hparams.max_len_dev ] logger.info("Loaded {:,} test examples.".format(len(test_treebank))) logger.info("Processing trees for training...") train_parse = [tree.convert() for tree in train_treebank] dev_parse = [tree.convert() for tree in dev_treebank] test_parse = [tree.convert() for tree in test_treebank] logger.info("Constructing vocabularies...") tag_vocab = vocabulary.Vocabulary() tag_vocab.index(tokens.START) tag_vocab.index(tokens.STOP) tag_vocab.index(tokens.TAG_UNK) word_vocab = vocabulary.Vocabulary() word_vocab.index(tokens.START) word_vocab.index(tokens.STOP) word_vocab.index(tokens.UNK) label_vocab = vocabulary.Vocabulary() label_vocab.index(()) char_set = set() for tree in train_parse: nodes = [tree] while nodes: node = nodes.pop() if isinstance(node, trees.InternalParseNode): label_vocab.index(node.label) nodes.extend(reversed(node.children)) else: tag_vocab.index(node.tag) word_vocab.index(node.word) char_set |= set(node.word) char_vocab = vocabulary.Vocabulary() # If codepoints are small (e.g. 
Latin alphabet), index by codepoint directly highest_codepoint = max(ord(char) for char in char_set) if highest_codepoint < 512: if highest_codepoint < 256: highest_codepoint = 256 else: highest_codepoint = 512 # This also takes care of constants like tokens.CHAR_PAD for codepoint in range(highest_codepoint): char_index = char_vocab.index(chr(codepoint)) assert char_index == codepoint else: char_vocab.index(tokens.CHAR_UNK) char_vocab.index(tokens.CHAR_START_SENTENCE) char_vocab.index(tokens.CHAR_START_WORD) char_vocab.index(tokens.CHAR_STOP_WORD) char_vocab.index(tokens.CHAR_STOP_SENTENCE) for char in sorted(char_set): char_vocab.index(char) tag_vocab.freeze() word_vocab.freeze() label_vocab.freeze() char_vocab.freeze() # -------- ngram vocab ------------ ngram_vocab = vocabulary.Vocabulary() ngram_vocab.index(()) ngram_finder = FindNgrams(min_count=hparams.ngram_threshold) def get_sentence(parse): sentences = [] for tree in parse: sentence = [] for leaf in tree.leaves(): sentence.append(leaf.word) sentences.append(sentence) return sentences sentence_list = get_sentence(train_parse) if not args.cross_domain: sentence_list.extend(get_sentence(dev_parse)) # sentence_list.extend(get_sentence(test_parse)) if hparams.ngram_type == 'freq': logger.info('ngram type: freq') ngram_finder.count_ngram(sentence_list, hparams.ngram) elif hparams.ngram_type == 'pmi': logger.info('ngram type: pmi') ngram_finder.find_ngrams_pmi(sentence_list, hparams.ngram, hparams.ngram_freq_threshold) else: raise ValueError() ngram_type_count = [0 for _ in range(hparams.ngram)] for w, c in ngram_finder.ngrams.items(): ngram_type_count[len(list(w)) - 1] += 1 for _ in range(c): ngram_vocab.index(w) logger.info(str(ngram_type_count)) ngram_vocab.freeze() ngram_count = [0 for _ in range(hparams.ngram)] for sentence in sentence_list: for n in range(len(ngram_count)): length = n + 1 for i in range(len(sentence)): gram = tuple(sentence[i:i + length]) if gram in ngram_finder.ngrams: ngram_count[n] += 1 logger.info(str(ngram_count)) # -------- ngram vocab ------------ def print_vocabulary(name, vocab): special = {tokens.START, tokens.STOP, tokens.UNK} logger.info("{} ({:,}): {}".format( name, vocab.size, sorted(value for value in vocab.values if value in special) + sorted(value for value in vocab.values if value not in special))) if args.print_vocabs: print_vocabulary("Tag", tag_vocab) print_vocabulary("Word", word_vocab) print_vocabulary("Label", label_vocab) print_vocabulary("Ngram", ngram_vocab) logger.info("Initializing model...") load_path = None if load_path is not None: logger.info(f"Loading parameters from {load_path}") info = torch_load(load_path) parser = SAPar_model.SAChartParser.from_spec(info['spec'], info['state_dict']) else: parser = SAPar_model.SAChartParser( tag_vocab, word_vocab, label_vocab, char_vocab, ngram_vocab, hparams, ) print("Initializing optimizer...") trainable_parameters = [ param for param in parser.parameters() if param.requires_grad ] trainer = torch.optim.Adam(trainable_parameters, lr=1., betas=(0.9, 0.98), eps=1e-9) if load_path is not None: trainer.load_state_dict(info['trainer']) pytorch_total_params = sum(p.numel() for p in parser.parameters() if p.requires_grad) logger.info('# of trainable parameters: %d' % pytorch_total_params) def set_lr(new_lr): for param_group in trainer.param_groups: param_group['lr'] = new_lr assert hparams.step_decay, "Only step_decay schedule is supported" warmup_coeff = hparams.learning_rate / hparams.learning_rate_warmup_steps scheduler = 
torch.optim.lr_scheduler.ReduceLROnPlateau( trainer, 'max', factor=hparams.step_decay_factor, patience=hparams.step_decay_patience, verbose=True, ) def schedule_lr(iteration): iteration = iteration + 1 if iteration <= hparams.learning_rate_warmup_steps: set_lr(iteration * warmup_coeff) clippable_parameters = trainable_parameters grad_clip_threshold = np.inf if hparams.clip_grad_norm == 0 else hparams.clip_grad_norm logger.info("Training...") total_processed = 0 current_processed = 0 check_every = len(train_parse) / args.checks_per_epoch best_eval_fscore = -np.inf test_fscore_on_dev = -np.inf best_eval_scores = None best_eval_model_path = None best_eval_processed = 0 start_time = time.time() def check_eval(eval_treebank, ep, flag='dev'): # nonlocal best_eval_fscore # nonlocal best_eval_model_path # nonlocal best_eval_processed dev_start_time = time.time() eval_predicted = [] for dev_start_index in range(0, len(eval_treebank), args.eval_batch_size): subbatch_trees = eval_treebank[dev_start_index:dev_start_index + args.eval_batch_size] subbatch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()] for tree in subbatch_trees] predicted, _ = parser.parse_batch(subbatch_sentences) del _ eval_predicted.extend([p.convert() for p in predicted]) eval_fscore = evaluate.evalb(args.evalb_dir, eval_treebank, eval_predicted) logger.info(flag + ' eval ' 'epoch {} ' "fscore {} " "elapsed {} " "total-elapsed {}".format( ep, eval_fscore, format_elapsed(dev_start_time), format_elapsed(start_time), )) return eval_fscore def save_model(eval_fscore, remove_model): nonlocal best_eval_fscore nonlocal best_eval_model_path nonlocal best_eval_processed nonlocal best_eval_scores if best_eval_model_path is not None: extensions = [".pt"] for ext in extensions: path = best_eval_model_path + ext if os.path.exists(path) and remove_model: logger.info( "Removing previous model file {}...".format(path)) os.remove(path) best_eval_fscore = eval_fscore.fscore best_eval_scores = eval_fscore best_eval_model_path = "{}_eval={:.2f}_{}".format( args.model_path_base, eval_fscore.fscore, now_time) best_eval_processed = total_processed logger.info( "Saving new best model to {}...".format(best_eval_model_path)) torch.save( { 'spec': parser.spec, 'state_dict': parser.state_dict(), # 'trainer' : trainer.state_dict(), }, best_eval_model_path + ".pt") for epoch in itertools.count(start=1): if args.epochs is not None and epoch > args.epochs: break np.random.shuffle(train_parse) epoch_start_time = time.time() for start_index in range(0, len(train_parse), args.batch_size): trainer.zero_grad() schedule_lr(total_processed // args.batch_size) batch_loss_value = 0.0 batch_trees = train_parse[start_index:start_index + args.batch_size] batch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()] for tree in batch_trees] batch_num_tokens = sum( len(sentence) for sentence in batch_sentences) for subbatch_sentences, subbatch_trees in parser.split_batch( batch_sentences, batch_trees, args.subbatch_max_tokens): _, loss = parser.parse_batch(subbatch_sentences, subbatch_trees) if hparams.predict_tags: loss = loss[0] / len( batch_trees) + loss[1] / batch_num_tokens else: loss = loss / len(batch_trees) loss_value = float(loss.data.cpu().numpy()) batch_loss_value += loss_value if loss_value > 0: loss.backward() del loss total_processed += len(subbatch_trees) current_processed += len(subbatch_trees) grad_norm = torch.nn.utils.clip_grad_norm_(clippable_parameters, grad_clip_threshold) trainer.step() print("epoch {:,} " "batch {:,}/{:,} " 
"processed {:,} " "batch-loss {:.4f} " "grad-norm {:.4f} " "epoch-elapsed {} " "total-elapsed {}".format( epoch, start_index // args.batch_size + 1, int(np.ceil(len(train_parse) / args.batch_size)), total_processed, batch_loss_value, grad_norm, format_elapsed(epoch_start_time), format_elapsed(start_time), )) if current_processed >= check_every: current_processed -= check_every dev_fscore = check_eval(dev_treebank, epoch, flag='dev') test_fscore = check_eval(test_treebank, epoch, flag='test') if dev_fscore.fscore > best_eval_fscore: save_model(dev_fscore, remove_model=True) test_fscore_on_dev = test_fscore # adjust learning rate at the end of an epoch if (total_processed // args.batch_size + 1) > hparams.learning_rate_warmup_steps: scheduler.step(best_eval_fscore) if (total_processed - best_eval_processed) > args.patients \ + ((hparams.step_decay_patience + 1) * hparams.max_consecutive_decays * len(train_parse)): logger.info( "Terminating due to lack of improvement in eval fscore.") logger.info("best dev {} test {}".format( best_eval_scores, test_fscore_on_dev, )) break
def main(): import optparse parser = optparse.OptionParser() parser.add_option("-f", dest="filename", help="corpus filename") parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)") parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=numpy.random.gamma(1, 1)) parser.add_option("--gamma", dest="gamma", type="float", help="parameter gamma", default=numpy.random.gamma(1, 1)) parser.add_option("--beta", dest="base", type="float", help="parameter of beta measure H", default=0.5) parser.add_option("-k", dest="K", type="int", help="initial number of topics", default=1) parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=10) parser.add_option("-s", dest="stopwords", type="int", help="0=exclude stop words, 1=include stop words", default=1) parser.add_option("--seed", dest="seed", type="int", help="random seed") parser.add_option("--df", dest="df", type="int", help="threshold of document frequency to cut words", default=0) (options, args) = parser.parse_args() if not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-c)") if options.seed is not None: numpy.random.seed(options.seed) import vocabulary if options.filename: corpus = vocabulary.load_file(options.filename) else: corpus = vocabulary.load_corpus(options.corpus) if not corpus: parser.error("corpus range(-c) forms 'start:end'") voca = vocabulary.Vocabulary(options.stopwords == 0) docs = [voca.doc_to_ids(doc) for doc in corpus] if options.df > 0: docs = voca.cut_low_freq(docs, options.df) hdplda = HDPLDA(options.alpha, options.gamma, options.base, docs, voca.size()) print "corpus=%d words=%d alpha=%.3f gamma=%.3f base=%.3f stopwords=%d" % ( len(corpus), len(voca.vocas), options.alpha, options.gamma, options.base, options.stopwords) #hdplda.dump() #import cProfile #cProfile.runctx('hdplda_learning(hdplda, options.iteration)', globals(), locals(), 'hdplda.profile') hdplda_learning(hdplda, options.iteration) """
def main(): import os import pickle import optparse parser = optparse.OptionParser() parser.add_option("-m", dest="model", help="model filename") parser.add_option("-f", dest="filename", help="corpus filename") parser.add_option("-b", dest="corpus", help="using range of Brown corpus' files(start:end)") parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.1) parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.01) parser.add_option("--eta", dest="eta", type="float", help="parameter eta", default=100) parser.add_option("-k", dest="K", type="int", help="number of topics", default=20) parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=10) parser.add_option("--seed", dest="seed", type="int", help="random seed") parser.add_option("--df", dest="df", type="int", help="threshold of document frequency to cut words", default=0) parser.add_option( "-c", dest="constraint", help="add constraint (wordlist which should belong to the same topic)") parser.add_option("-u", "--unassign", dest="unassign", help="unassign method (all/doc/term/none)", default="none") (options, args) = parser.parse_args() numpy.random.seed(options.seed) if options.model and os.path.exists(options.model): with open(options.model, "rb") as f: lda, voca = pickle.load(f) elif not (options.filename or options.corpus): parser.error( "need corpus filename(-f) or corpus range(-b) or model(-m)") else: import vocabulary if options.filename: corpus = vocabulary.load_file(options.filename) else: corpus = vocabulary.load_corpus(options.corpus) if not corpus: parser.error("corpus range(-b) forms 'start:end'") voca = vocabulary.Vocabulary() docs = [voca.doc_to_ids(doc) for doc in corpus] if options.df > 0: docs = voca.cut_low_freq(docs, options.df) lda = ITM(options.K, options.alpha, options.beta, options.eta, docs, voca.size()) param = (len(lda.docs), len(voca.vocas), options.K, options.alpha, options.beta, options.eta) print "corpus=%d, words=%d, K=%d, a=%f, b=%f, eta=%f" % param if options.constraint: if options.unassign == "all": add_constraint = lda.add_constraint_all elif options.unassign == "doc": add_constraint = lda.add_constraint_doc elif options.unassign == "term": add_constraint = lda.add_constraint_term elif options.unassign == "none": add_constraint = lda.add_constraint_none else: parser.error("unassign method(-u) must be all/doc/term/none") wordlist = options.constraint.split(',') idlist = [voca.vocas_id[w] for w in wordlist] print "\n== add constraint ==" for w, v in zip(idlist, wordlist): print "%s [%s]" % (v, ",".join(str(x) for x in lda.n_k_w[:, w])) add_constraint(idlist) lda.verify_topic() #import cProfile #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile') lda_learning(lda, options.iteration, voca) with open(options.model, "wb") as f: pickle.dump((lda, voca), f)
def run_train(args, hparams): if args.numpy_seed is not None: print("Setting numpy random seed to {}...".format(args.numpy_seed)) np.random.seed(args.numpy_seed) # Make sure that pytorch is actually being initialized randomly. # On my cluster I was getting highly correlated results from multiple # runs, but calling reset_parameters() changed that. A brief look at the # pytorch source code revealed that pytorch initializes its RNG by # calling std::random_device, which according to the C++ spec is allowed # to be deterministic. seed_from_numpy = np.random.randint(2147483648) print("Manual seed for pytorch:", seed_from_numpy) torch.manual_seed(seed_from_numpy) hparams.set_from_args(args) print("Hyperparameters:") hparams.print() print("Loading training trees from {}...".format(args.train_path)) if hparams.predict_tags and args.train_path.endswith('10way.clean'): print("WARNING: The data distributed with this repository contains " "predicted part-of-speech tags only (not gold tags!) We do not " "recommend enabling predict_tags in this configuration.") train_treebank = trees.load_trees(args.train_path) if hparams.max_len_train > 0: train_treebank = [ tree for tree in train_treebank if len(list(tree.leaves())) <= hparams.max_len_train ] print("Loaded {:,} training examples.".format(len(train_treebank))) print("Loading development trees from {}...".format(args.dev_path)) dev_treebank = trees.load_trees(args.dev_path) if hparams.max_len_dev > 0: dev_treebank = [ tree for tree in dev_treebank if len(list(tree.leaves())) <= hparams.max_len_dev ] print("Loaded {:,} development examples.".format(len(dev_treebank))) print("Processing trees for training...") train_parse = [tree.convert() for tree in train_treebank] print("Constructing vocabularies...") tag_vocab = vocabulary.Vocabulary() tag_vocab.index(tokens.START) tag_vocab.index(tokens.STOP) tag_vocab.index(tokens.TAG_UNK) word_vocab = vocabulary.Vocabulary() word_vocab.index(tokens.START) word_vocab.index(tokens.STOP) word_vocab.index(tokens.UNK) label_vocab = vocabulary.Vocabulary() label_vocab.index(()) char_set = set() for tree in train_parse: nodes = [tree] while nodes: node = nodes.pop() if isinstance(node, trees.InternalParseNode): label_vocab.index(node.label) nodes.extend(reversed(node.children)) else: tag_vocab.index(node.tag) word_vocab.index(node.word) char_set |= set(node.word) char_vocab = vocabulary.Vocabulary() # If codepoints are small (e.g. 
Latin alphabet), index by codepoint directly highest_codepoint = max(ord(char) for char in char_set) if highest_codepoint < 512: if highest_codepoint < 256: highest_codepoint = 256 else: highest_codepoint = 512 # This also takes care of constants like tokens.CHAR_PAD for codepoint in range(highest_codepoint): char_index = char_vocab.index(chr(codepoint)) assert char_index == codepoint else: char_vocab.index(tokens.CHAR_UNK) char_vocab.index(tokens.CHAR_START_SENTENCE) char_vocab.index(tokens.CHAR_START_WORD) char_vocab.index(tokens.CHAR_STOP_WORD) char_vocab.index(tokens.CHAR_STOP_SENTENCE) for char in sorted(char_set): char_vocab.index(char) tag_vocab.freeze() word_vocab.freeze() label_vocab.freeze() char_vocab.freeze() def print_vocabulary(name, vocab): special = {tokens.START, tokens.STOP, tokens.UNK} print("{} ({:,}): {}".format( name, vocab.size, sorted(value for value in vocab.values if value in special) + sorted(value for value in vocab.values if value not in special))) if args.print_vocabs: print_vocabulary("Tag", tag_vocab) print_vocabulary("Word", word_vocab) print_vocabulary("Label", label_vocab) print("Initializing model...") load_path = None if load_path is not None: print(f"Loading parameters from {load_path}") info = torch_load(load_path) parser = parse_nk.NKChartParser.from_spec(info['spec'], info['state_dict']) else: parser = parse_nk.NKChartParser( tag_vocab, word_vocab, label_vocab, char_vocab, hparams, ) print("Initializing optimizer...") trainable_parameters = [ param for param in parser.parameters() if param.requires_grad ] trainer = torch.optim.Adam(trainable_parameters, lr=1., betas=(0.9, 0.98), eps=1e-9) if load_path is not None: trainer.load_state_dict(info['trainer']) def set_lr(new_lr): for param_group in trainer.param_groups: param_group['lr'] = new_lr assert hparams.step_decay, "Only step_decay schedule is supported" warmup_coeff = hparams.learning_rate / hparams.learning_rate_warmup_steps scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( trainer, 'max', factor=hparams.step_decay_factor, patience=hparams.step_decay_patience, verbose=True, ) def schedule_lr(iteration): iteration = iteration + 1 if iteration <= hparams.learning_rate_warmup_steps: set_lr(iteration * warmup_coeff) clippable_parameters = trainable_parameters grad_clip_threshold = np.inf if hparams.clip_grad_norm == 0 else hparams.clip_grad_norm print("Training...") total_processed = 0 current_processed = 0 check_every = len(train_parse) / args.checks_per_epoch best_dev_fscore = -np.inf best_dev_model_path = None best_dev_processed = 0 start_time = time.time() def check_dev(): nonlocal best_dev_fscore nonlocal best_dev_model_path nonlocal best_dev_processed dev_start_time = time.time() dev_predicted = [] for dev_start_index in range(0, len(dev_treebank), args.eval_batch_size): subbatch_trees = dev_treebank[dev_start_index:dev_start_index + args.eval_batch_size] subbatch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()] for tree in subbatch_trees] predicted, _ = parser.parse_batch(subbatch_sentences) del _ dev_predicted.extend([p.convert() for p in predicted]) dev_fscore = evaluate.evalb(args.evalb_dir, dev_treebank, dev_predicted) print("dev-fscore {} " "dev-elapsed {} " "total-elapsed {}".format( dev_fscore, format_elapsed(dev_start_time), format_elapsed(start_time), )) if dev_fscore.fscore > best_dev_fscore: if best_dev_model_path is not None: extensions = [".pt"] for ext in extensions: path = best_dev_model_path + ext if os.path.exists(path): print( "Removing previous model 
file {}...".format(path)) os.remove(path) best_dev_fscore = dev_fscore.fscore best_dev_model_path = "{}_dev={:.2f}".format( args.model_path_base, dev_fscore.fscore) best_dev_processed = total_processed print("Saving new best model to {}...".format(best_dev_model_path)) torch.save( { 'spec': parser.spec, 'state_dict': parser.state_dict(), 'trainer': trainer.state_dict(), }, best_dev_model_path + ".pt") for epoch in itertools.count(start=1): if args.epochs is not None and epoch > args.epochs: break np.random.shuffle(train_parse) epoch_start_time = time.time() for start_index in range(0, len(train_parse), args.batch_size): trainer.zero_grad() schedule_lr(total_processed // args.batch_size) batch_loss_value = 0.0 batch_trees = train_parse[start_index:start_index + args.batch_size] batch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()] for tree in batch_trees] batch_num_tokens = sum( len(sentence) for sentence in batch_sentences) for subbatch_sentences, subbatch_trees in parser.split_batch( batch_sentences, batch_trees, args.subbatch_max_tokens): _, loss = parser.parse_batch(subbatch_sentences, subbatch_trees) if hparams.predict_tags: loss = loss[0] / len( batch_trees) + loss[1] / batch_num_tokens else: loss = loss / len(batch_trees) loss_value = float(loss.data.cpu().numpy()) batch_loss_value += loss_value if loss_value > 0: loss.backward() del loss total_processed += len(subbatch_trees) current_processed += len(subbatch_trees) grad_norm = torch.nn.utils.clip_grad_norm_(clippable_parameters, grad_clip_threshold) trainer.step() print("epoch {:,} " "batch {:,}/{:,} " "processed {:,} " "batch-loss {:.4f} " "grad-norm {:.4f} " "epoch-elapsed {} " "total-elapsed {}".format( epoch, start_index // args.batch_size + 1, int(np.ceil(len(train_parse) / args.batch_size)), total_processed, batch_loss_value, grad_norm, format_elapsed(epoch_start_time), format_elapsed(start_time), )) if current_processed >= check_every: current_processed -= check_every check_dev() # adjust learning rate at the end of an epoch if (total_processed // args.batch_size + 1) > hparams.learning_rate_warmup_steps: scheduler.step(best_dev_fscore) if (total_processed - best_dev_processed) > ( (hparams.step_decay_patience + 1) * hparams.max_consecutive_decays * len(train_parse)): print("Terminating due to lack of improvement in dev fscore.") break
'images/9.JPG', 'images/10.JPG', ] nbr_images = len(imlist) featlist = [imlist[i][:-3] + 'sift' for i in range(nbr_images)] for i in range(nbr_images): # print featlist[i] # print imlist[i] sift.process_image(imlist[i], featlist[i]) """ imagename = "/opt/cv/images/2603.JPG" from PIL import Image im = Image.open(imagename).convert('L') im.save('/opt/cv/images/tmp.pgm') sift /opt/cv/images/tmp.pgm --output /opt/cv/images/2603.sift --edge-thresh 10 --peak-thresh 5 """ # print "ok" # exit() voc = vocabulary.Vocabulary('ukbenchtest') voc.train(featlist, 1000, 10) with open('vocabulary.pkl', 'wb') as f: pickle.dump(voc, f) print('vocabulary is:', voc.name, voc.nbr_words)
positivos = [] for file in dir: w = open("learn2/positivos/" + file).read() positivos.append(w) nn_corpus = [sentence.split(' ') for sentence in positivos] nnn_corpus = [sentence.split(' ') for sentence in negativos] nn_corpus += nnn_corpus classes = [] for i in range(50): classes.append(1) for i in range(50): classes.append(0) voca = vocabulary.Vocabulary("stopwords.txt") docs = [voca.doc_to_ids(doc) for doc in nn_corpus] NB = bernoulliNB.BernoulliNB(voca, docs, classes) def gg(): print ":B" def classify(comment): comment = comment.split(' ') tst_bow = voca.doc_to_ids_no_add(comment) print tst_bow return NB.apply(classes, voca, tst_bow)
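# bernoulliNB above is a local module whose interface is not shown here. As a hedged point of
# comparison only, the same positive/negative comment classifier could be expressed with
# scikit-learn's BernoulliNB over binary word-presence features. classify_sklearn is a
# hypothetical helper mirroring classify() above, and it assumes the positivos/negativos
# lists built earlier in this script:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB

train_texts = positivos + negativos
train_labels = [1] * len(positivos) + [0] * len(negativos)

vectorizer = CountVectorizer(binary=True)          # presence/absence features
X_train = vectorizer.fit_transform(train_texts)
sk_nb = BernoulliNB()
sk_nb.fit(X_train, train_labels)

def classify_sklearn(comment):
    return int(sk_nb.predict(vectorizer.transform([comment]))[0])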
def shared_tree_constrained_inference(self, testing_hps): np.set_printoptions(precision=4, suppress=True) testing_sent_tokens_1 = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10] testing_sent_tokens_1 = [str(t) for t in testing_sent_tokens_1] testing_sent_edu_ids_1 = [0, 0, 1, 1, 2, 2, 3, 3, 4, 4] testing_sent_parent_ids_1 = [1, 1, 2, 2, -1, -1, 2, 2, 3, 3] testing_sent_tokens_2 = [1, 2, 3, 4, 5, 6, 7, 8, 9] testing_sent_tokens_2 = [str(t) for t in testing_sent_tokens_2] testing_sent_edu_ids_2 = [5, 5, 5, 6, 7, 7, 7, 8, 8] testing_sent_parent_ids_2 = [6, 6, 6, -1, 6, 6, 6, 6, 6] dummy_abstract_sentence_1 = [ "0", "0", "0", "0", "0", "0", "0", "0", "0", "0" ] dummy_abstract_sentence_2 = ["0", "0", "0"] extract_labels_1 = [0 for _ in testing_sent_tokens_1] extract_labels_2 = [0 for _ in testing_sent_tokens_2] vocab_word_indices = range(11) dummy_vocab_indices = dict([(str(i), i) for i in vocab_word_indices]) stem_indices = dummy_vocab_indices word_stems = dict([(str(w), str(w)) for w in vocab_word_indices]) is_stop = [False for _ in vocab_word_indices] vocab = vocabulary.Vocabulary(dummy_vocab_indices, stem_indices, word_stems, is_stop, testing_hps.vocab_size) if testing_hps.single_sentence_concat: combined_toks = [["0"] + testing_sent_tokens_1 + testing_sent_tokens_2] combined_labels = [[1] + extract_labels_1 + extract_labels_2] combined_edu_ids = [ -1 ] + testing_sent_edu_ids_1 + testing_sent_edu_ids_2 combined_parent_ids = [ -2 ] + testing_sent_parent_ids_1 + testing_sent_parent_ids_2 combined_edu_ids = [[i + 1 for i in combined_edu_ids]] combined_parent_ids = [[i + 1 for i in combined_parent_ids]] nyt_ex_1 = data.SummaryExample(0, combined_toks, combined_edu_ids, combined_parent_ids, combined_labels, dummy_abstract_sentence_1) nyt_ex_2 = data.SummaryExample(1, combined_toks, combined_edu_ids, combined_parent_ids, combined_labels, dummy_abstract_sentence_2) else: nyt_ex_1 = data.SummaryExample( 0, [testing_sent_tokens_1, testing_sent_tokens_2], [testing_sent_edu_ids_1, testing_sent_edu_ids_2], [testing_sent_parent_ids_1, testing_sent_parent_ids_2], [extract_labels_1, extract_labels_2], dummy_abstract_sentence_1) nyt_ex_2 = data.SummaryExample( 1, [testing_sent_tokens_1, testing_sent_tokens_2], [testing_sent_edu_ids_1, testing_sent_edu_ids_2], [testing_sent_parent_ids_1, testing_sent_parent_ids_2], [extract_labels_1, extract_labels_2], dummy_abstract_sentence_2) with self.test_session() as session: tf.set_random_seed(12) model_inp = ffttci.TreeInferenceInputs(testing_hps) ex_batch = ffttci.TreeInferenceBatch(testing_hps, model_inp, [nyt_ex_1, nyt_ex_2], vocab) inferencer = ffttci.TreeConstrainedInferencer() logit_shape = [testing_hps.batch_size, testing_hps.num_art_steps] word_logits = tf.constant(np.full(logit_shape, 0.0), dtype=tf.float32, shape=logit_shape) margs, samples, logz = inferencer.do_tree_inference( testing_hps, model_inp, word_logits) margs = tf.reshape(margs, [testing_hps.batch_size, -1]) grad_logz = tf.gradients(logz, word_logits)[0] margs_np, samples_np, logz_np, grad_logz_np = session.run( [margs, samples, logz, grad_logz], ex_batch.feeds) emp_marg = np.average(samples_np, axis=1) emp_marg = np.reshape(emp_marg, [testing_hps.batch_size, -1]) # sampled marginals should be pretty close to marginals calculated from BP self.assertNDArrayNear(margs_np, emp_marg, 0.05) # gradient of logz should be _very_ close to marginals calculated from BP self.assertNDArrayNear(margs_np, grad_logz_np, 0.001) # for k=3 example, logz should equal log(3) self.assertNear(1.08961229, logz_np[1], 0.01)
def run_train(args, hparams): if args.numpy_seed is not None: print("Setting numpy random seed to {}...".format(args.numpy_seed)) np.random.seed(args.numpy_seed) # Make sure that pytorch is actually being initialized randomly. # On my cluster I was getting highly correlated results from multiple # runs, but calling reset_parameters() changed that. A brief look at the # pytorch source code revealed that pytorch initializes its RNG by # calling std::random_device, which according to the C++ spec is allowed # to be deterministic. seed_from_numpy = np.random.randint(2147483648) print("Manual seed for pytorch:", seed_from_numpy) torch.manual_seed(seed_from_numpy) hparams.set_from_args(args) print("Hyperparameters:") hparams.print() train_path = args.train_ptb_path dev_path = args.dev_ptb_path dep_train_path = args.dep_train_ptb_path dep_dev_path = args.dep_dev_ptb_path if hparams.dataset == 'ctb': train_path = args.train_ctb_path dev_path = args.dev_ctb_path dep_train_path = args.dep_train_ctb_path dep_dev_path = args.dep_dev_ctb_path dep_reader = CoNLLXReader(dep_train_path) print('Reading dependency parsing data from %s' % dep_train_path) dep_dev_reader = CoNLLXReader(dep_dev_path) print('Reading dependency parsing data from %s' % dep_dev_path) counter = 0 dep_sentences = [] dep_data = [] dep_heads = [] dep_types = [] inst = dep_reader.getNext() while inst is not None: inst_size = inst.length() if hparams.max_len_train > 0 and inst_size - 1 > hparams.max_len_train: inst = dep_reader.getNext() continue counter += 1 if counter % 10000 == 0: print("reading data: %d" % counter) sent = inst.sentence dep_data.append((sent.words, inst.postags, inst.heads, inst.types)) #dep_sentences.append([(tag, word) for i, (word, tag) in enumerate(zip(sent.words, sent.postags))]) dep_sentences.append(sent.words) dep_heads.append(inst.heads) dep_types.append(inst.types) inst = dep_reader.getNext() dep_reader.close() print("Total number of data: %d" % counter) dep_dev_data = [] dev_inst = dep_dev_reader.getNext() dep_dev_headid = np.zeros([3000,300],dtype=int) dep_dev_type = [] dep_dev_word = [] dep_dev_pos = [] dep_dev_lengs = np.zeros(3000, dtype=int) cun = 0 while dev_inst is not None: inst_size = dev_inst.length() if hparams.max_len_dev > 0 and inst_size - 1> hparams.max_len_dev: dev_inst = dep_dev_reader.getNext() continue dep_dev_lengs[cun] = inst_size sent = dev_inst.sentence dep_dev_data.append((sent.words, dev_inst.postags, dev_inst.heads, dev_inst.types)) for i in range(inst_size): dep_dev_headid[cun][i] = dev_inst.heads[i] dep_dev_type.append(dev_inst.types) dep_dev_word.append(sent.words) dep_dev_pos.append(sent.postags) #dep_sentences.append([(tag, word) for i, (word, tag) in enumerate(zip(sent.words, sent.postags))]) dev_inst = dep_dev_reader.getNext() cun = cun + 1 dep_dev_reader.close() print("Loading training trees from {}...".format(train_path)) train_treebank = trees.load_trees(train_path, dep_heads, dep_types, dep_sentences) if hparams.max_len_train > 0: train_treebank = [tree for tree in train_treebank if len(list(tree.leaves())) <= hparams.max_len_train] print("Loaded {:,} training examples.".format(len(train_treebank))) print("Loading development trees from {}...".format(dev_path)) dev_treebank = trees.load_trees(dev_path, dep_dev_headid, dep_dev_type, dep_dev_word) if hparams.max_len_dev > 0: dev_treebank = [tree for tree in dev_treebank if len(list(tree.leaves())) <= hparams.max_len_dev] print("Loaded {:,} development examples.".format(len(dev_treebank))) print("Processing trees for 
training...") train_parse = [tree.convert() for tree in train_treebank] dev_parse = [tree.convert() for tree in dev_treebank] count_wh("train data:", train_parse, dep_heads, dep_types) count_wh("dev data:", dev_parse, dep_dev_headid, dep_dev_type) print("Constructing vocabularies...") tag_vocab = vocabulary.Vocabulary() tag_vocab.index(Zparser.START) tag_vocab.index(Zparser.STOP) tag_vocab.index(Zparser.TAG_UNK) word_vocab = vocabulary.Vocabulary() word_vocab.index(Zparser.START) word_vocab.index(Zparser.STOP) word_vocab.index(Zparser.UNK) label_vocab = vocabulary.Vocabulary() label_vocab.index(()) sublabels = [Zparser.Sub_Head] label_vocab.index(tuple(sublabels)) type_vocab = vocabulary.Vocabulary() char_set = set() for i, tree in enumerate(train_parse): const_sentences = [leaf.word for leaf in tree.leaves()] assert len(const_sentences) == len(dep_sentences[i]) nodes = [tree] while nodes: node = nodes.pop() if isinstance(node, trees.InternalParseNode): label_vocab.index(node.label) if node.type is not Zparser.ROOT:#not include root type type_vocab.index(node.type) nodes.extend(reversed(node.children)) else: tag_vocab.index(node.tag) word_vocab.index(node.word) type_vocab.index(node.type) char_set |= set(node.word) char_vocab = vocabulary.Vocabulary() #char_vocab.index(tokens.CHAR_PAD) # If codepoints are small (e.g. Latin alphabet), index by codepoint directly highest_codepoint = max(ord(char) for char in char_set) if highest_codepoint < 512: if highest_codepoint < 256: highest_codepoint = 256 else: highest_codepoint = 512 # This also takes care of constants like tokens.CHAR_PAD for codepoint in range(highest_codepoint): char_index = char_vocab.index(chr(codepoint)) assert char_index == codepoint else: char_vocab.index(tokens.CHAR_UNK) char_vocab.index(tokens.CHAR_START_SENTENCE) char_vocab.index(tokens.CHAR_START_WORD) char_vocab.index(tokens.CHAR_STOP_WORD) char_vocab.index(tokens.CHAR_STOP_SENTENCE) for char in sorted(char_set): char_vocab.index(char) tag_vocab.freeze() word_vocab.freeze() label_vocab.freeze() char_vocab.freeze() type_vocab.freeze() punctuation = hparams.punctuation punct_set = punctuation def print_vocabulary(name, vocab): special = {tokens.START, tokens.STOP, tokens.UNK} print("{} ({:,}): {}".format( name, vocab.size, sorted(value for value in vocab.values if value in special) + sorted(value for value in vocab.values if value not in special))) if args.print_vocabs: print_vocabulary("Tag", tag_vocab) print_vocabulary("Word", word_vocab) print_vocabulary("Label", label_vocab) print_vocabulary("Char", char_vocab) print_vocabulary("Type", type_vocab) print("Initializing model...") load_path = None if load_path is not None: print(f"Loading parameters from {load_path}") info = torch_load(load_path) parser = Zparser.ChartParser.from_spec(info['spec'], info['state_dict']) else: parser = Zparser.ChartParser( tag_vocab, word_vocab, label_vocab, char_vocab, type_vocab, hparams, ) print("Initializing optimizer...") trainable_parameters = [param for param in parser.parameters() if param.requires_grad] trainer = torch.optim.Adam(trainable_parameters, lr=1., betas=(0.9, 0.98), eps=1e-9) if load_path is not None: trainer.load_state_dict(info['trainer']) def set_lr(new_lr): for param_group in trainer.param_groups: param_group['lr'] = new_lr assert hparams.step_decay, "Only step_decay schedule is supported" warmup_coeff = hparams.learning_rate / hparams.learning_rate_warmup_steps scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( trainer, 'max', factor=hparams.step_decay_factor, 
patience=hparams.step_decay_patience, verbose=True, ) def schedule_lr(iteration): iteration = iteration + 1 if iteration <= hparams.learning_rate_warmup_steps: set_lr(iteration * warmup_coeff) clippable_parameters = trainable_parameters grad_clip_threshold = np.inf if hparams.clip_grad_norm == 0 else hparams.clip_grad_norm print("Training...") total_processed = 0 current_processed = 0 check_every = len(train_parse) / args.checks_per_epoch best_dev_score = -np.inf best_model_path = None model_name = hparams.model_name print("This is ", model_name) start_time = time.time() def check_dev(epoch_num): nonlocal best_dev_score nonlocal best_model_path dev_start_time = time.time() parser.eval() dev_predicted = [] for dev_start_index in range(0, len(dev_treebank), args.eval_batch_size): subbatch_trees = dev_treebank[dev_start_index:dev_start_index+args.eval_batch_size] subbatch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()] for tree in subbatch_trees] predicted, _,= parser.parse_batch(subbatch_sentences) del _ dev_predicted.extend([p.convert() for p in predicted]) dev_fscore = evaluate.evalb(args.evalb_dir, dev_treebank, dev_predicted) print( "dev-fscore {} " "dev-elapsed {} " "total-elapsed {}".format( dev_fscore, format_elapsed(dev_start_time), format_elapsed(start_time), ) ) dev_pred_head = [[leaf.father for leaf in tree.leaves()] for tree in dev_predicted] dev_pred_type = [[leaf.type for leaf in tree.leaves()] for tree in dev_predicted] assert len(dev_pred_head) == len(dev_pred_type) assert len(dev_pred_type) == len(dep_dev_type) stats, stats_nopunc, stats_root, num_inst = dep_eval.eval(len(dev_pred_head), dep_dev_word, dep_dev_pos, dev_pred_head, dev_pred_type, dep_dev_headid, dep_dev_type, dep_dev_lengs, punct_set=punct_set, symbolic_root=False) dev_ucorr, dev_lcorr, dev_total, dev_ucomlpete, dev_lcomplete = stats dev_ucorr_nopunc, dev_lcorr_nopunc, dev_total_nopunc, dev_ucomlpete_nopunc, dev_lcomplete_nopunc = stats_nopunc dev_root_corr, dev_total_root = stats_root dev_total_inst = num_inst print( 'W. 
Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%' % ( dev_ucorr, dev_lcorr, dev_total, dev_ucorr * 100 / dev_total, dev_lcorr * 100 / dev_total, dev_ucomlpete * 100 / dev_total_inst, dev_lcomplete * 100 / dev_total_inst)) print( 'Wo Punct: ucorr: %d, lcorr: %d, total: %d, uas: %.2f%%, las: %.2f%%, ucm: %.2f%%, lcm: %.2f%%' % ( dev_ucorr_nopunc, dev_lcorr_nopunc, dev_total_nopunc, dev_ucorr_nopunc * 100 / dev_total_nopunc, dev_lcorr_nopunc * 100 / dev_total_nopunc, dev_ucomlpete_nopunc * 100 / dev_total_inst, dev_lcomplete_nopunc * 100 / dev_total_inst)) print('Root: corr: %d, total: %d, acc: %.2f%%' % ( dev_root_corr, dev_total_root, dev_root_corr * 100 / dev_total_root)) dev_uas = dev_ucorr_nopunc * 100 / dev_total_nopunc dev_las = dev_lcorr_nopunc * 100 / dev_total_nopunc if dev_fscore.fscore + dev_las > best_dev_score: if best_model_path is not None: extensions = [".pt"] for ext in extensions: path = best_model_path + ext if os.path.exists(path): print("Removing previous model file {}...".format(path)) os.remove(path) best_dev_score = dev_fscore.fscore + dev_las best_model_path = "{}_best_dev={:.2f}_devuas={:.2f}_devlas={:.2f}".format( args.model_path_base, dev_fscore.fscore, dev_uas, dev_las) print("Saving new best model to {}...".format(best_model_path)) torch.save({ 'spec': parser.spec, 'state_dict': parser.state_dict(), 'trainer': trainer.state_dict(), }, best_model_path + ".pt") for epoch in itertools.count(start=1): if args.epochs is not None and epoch > args.epochs: break #check_dev(epoch) np.random.shuffle(train_parse) epoch_start_time = time.time() for start_index in range(0, len(train_parse), args.batch_size): trainer.zero_grad() schedule_lr(total_processed // args.batch_size) parser.train() batch_loss_value = 0.0 batch_trees = train_parse[start_index:start_index + args.batch_size] batch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()] for tree in batch_trees] for subbatch_sentences, subbatch_trees in parser.split_batch(batch_sentences, batch_trees, args.subbatch_max_tokens): _, loss = parser.parse_batch(subbatch_sentences, subbatch_trees) loss = loss / len(batch_trees) loss_value = float(loss.data.cpu().numpy()) batch_loss_value += loss_value if loss_value > 0: loss.backward() del loss total_processed += len(subbatch_trees) current_processed += len(subbatch_trees) grad_norm = torch.nn.utils.clip_grad_norm_(clippable_parameters, grad_clip_threshold) trainer.step() print( "epoch {:,} " "batch {:,}/{:,} " "processed {:,} " "batch-loss {:.4f} " "grad-norm {:.4f} " "epoch-elapsed {} " "total-elapsed {}".format( epoch, start_index // args.batch_size + 1, int(np.ceil(len(train_parse) / args.batch_size)), total_processed, batch_loss_value, grad_norm, format_elapsed(epoch_start_time), format_elapsed(start_time), ) ) if current_processed >= check_every: current_processed -= check_every check_dev(epoch) # adjust learning rate at the end of an epoch if hparams.step_decay: if (total_processed // args.batch_size + 1) > hparams.learning_rate_warmup_steps: scheduler.step(best_dev_score)
def run_train(args, hparams): if args.numpy_seed is not None: print("Setting numpy random seed to {}...".format(args.numpy_seed)) np.random.seed(args.numpy_seed) seed_from_numpy = np.random.randint(2147483648) print("Manual seed for pytorch:", seed_from_numpy) torch.manual_seed(seed_from_numpy) hparams.set_from_args(args) print("Hyperparameters:") hparams.print() train_path = args.train_ptb_path dev_path = args.dev_ptb_path if hparams.dataset == "ctb": train_path = args.train_ctb_path dev_path = args.dev_ctb_path print("Loading training trees from {}...".format(train_path)) train_treebank = trees.load_trees(train_path) if hparams.max_len_train > 0: train_treebank = [ tree for tree in train_treebank if len(list(tree.leaves())) <= hparams.max_len_train ] print("Loaded {:,} training examples.".format(len(train_treebank))) print("Loading development trees from {}...".format(dev_path)) dev_treebank = trees.load_trees(dev_path) if hparams.max_len_dev > 0: dev_treebank = [ tree for tree in dev_treebank if len(list(tree.leaves())) <= hparams.max_len_dev ] print("Loaded {:,} development examples.".format(len(dev_treebank))) print("Processing trees for training...") train_parse = [tree.convert() for tree in train_treebank] dev_parse = [tree.convert() for tree in dev_treebank] print("Constructing vocabularies...") tag_vocab = vocabulary.Vocabulary() tag_vocab.index(Lparser.START) tag_vocab.index(Lparser.STOP) tag_vocab.index(Lparser.TAG_UNK) word_vocab = vocabulary.Vocabulary() word_vocab.index(Lparser.START) word_vocab.index(Lparser.STOP) word_vocab.index(Lparser.UNK) label_vocab = vocabulary.Vocabulary() label_vocab.index(()) char_set = set() for tree in train_parse: nodes = [tree] while nodes: node = nodes.pop() if isinstance(node, trees.InternalParseNode): label_vocab.index(node.label) nodes.extend(reversed(node.children)) else: tag_vocab.index(node.tag) word_vocab.index(node.word) char_set |= set(node.word) char_vocab = vocabulary.Vocabulary() # If codepoints are small (e.g. 
Latin alphabet), index by codepoint directly highest_codepoint = max(ord(char) for char in char_set) if highest_codepoint < 512: if highest_codepoint < 256: highest_codepoint = 256 else: highest_codepoint = 512 # This also takes care of constants like tokens.CHAR_PAD for codepoint in range(highest_codepoint): char_index = char_vocab.index(chr(codepoint)) assert char_index == codepoint else: char_vocab.index(tokens.CHAR_UNK) char_vocab.index(tokens.CHAR_START_SENTENCE) char_vocab.index(tokens.CHAR_START_WORD) char_vocab.index(tokens.CHAR_STOP_WORD) char_vocab.index(tokens.CHAR_STOP_SENTENCE) for char in sorted(char_set): char_vocab.index(char) tag_vocab.freeze() word_vocab.freeze() label_vocab.freeze() char_vocab.freeze() def print_vocabulary(name, vocab): special = {tokens.START, tokens.STOP, tokens.UNK} print("{} ({:,}): {}".format( name, vocab.size, sorted(value for value in vocab.values if value in special) + sorted(value for value in vocab.values if value not in special), )) if args.print_vocabs: print_vocabulary("Tag", tag_vocab) print_vocabulary("Word", word_vocab) print_vocabulary("Label", label_vocab) print_vocabulary("Char", char_vocab) print("Initializing model...") load_path = None if load_path is not None: print("Loading parameters from {}".format(load_path)) info = torch_load(load_path) parser = Lparser.ChartParser.from_spec(info["spec"], info["state_dict"]) else: parser = Lparser.ChartParser( tag_vocab, word_vocab, label_vocab, char_vocab, hparams, ) print("Initializing optimizer...") trainable_parameters = [ param for param in parser.parameters() if param.requires_grad ] trainer = torch.optim.Adam(trainable_parameters, lr=1.0, betas=(0.9, 0.98), eps=1e-9) if load_path is not None: trainer.load_state_dict(info["trainer"]) def set_lr(new_lr): for param_group in trainer.param_groups: param_group["lr"] = new_lr assert hparams.step_decay, "Only step_decay schedule is supported" warmup_coeff = hparams.learning_rate / hparams.learning_rate_warmup_steps scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau( trainer, "max", factor=hparams.step_decay_factor, patience=hparams.step_decay_patience, verbose=True, ) def schedule_lr(iteration): iteration = iteration + 1 if iteration <= hparams.learning_rate_warmup_steps: set_lr(iteration * warmup_coeff) clippable_parameters = trainable_parameters grad_clip_threshold = (np.inf if hparams.clip_grad_norm == 0 else hparams.clip_grad_norm) print("Training...") total_processed = 0 current_processed = 0 check_every = len(train_parse) / args.checks_per_epoch best_dev_fscore = -np.inf best_model_path = None model_name = hparams.model_name best_dev_processed = 0 print("This is ", model_name) start_time = time.time() def check_dev(epoch_num): nonlocal best_dev_fscore nonlocal best_model_path nonlocal best_dev_processed dev_start_time = time.time() parser.eval() dev_predicted = [] for dev_start_index in range(0, len(dev_treebank), args.eval_batch_size): subbatch_trees = dev_treebank[dev_start_index:dev_start_index + args.eval_batch_size] subbatch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()] for tree in subbatch_trees] ( predicted, _, ) = parser.parse_batch(subbatch_sentences) del _ dev_predicted.extend([p.convert() for p in predicted]) dev_fscore = evaluate.evalb(args.evalb_dir, dev_treebank, dev_predicted) print("\n" "dev-fscore {} " "dev-elapsed {} " "total-elapsed {}".format(dev_fscore, format_elapsed(dev_start_time), format_elapsed(start_time))) if dev_fscore.fscore > best_dev_fscore: if best_model_path is not None: extensions = 
[".pt"] for ext in extensions: path = best_model_path + ext if os.path.exists(path): print( "Removing previous model file {}...".format(path)) os.remove(path) best_dev_fscore = dev_fscore.fscore best_model_path = "{}_best_dev={:.2f}".format( args.model_path_base, dev_fscore.fscore) best_dev_processed = total_processed print("Saving new best model to {}...".format(best_model_path)) torch.save( { "spec": parser.spec, "state_dict": parser.state_dict(), "trainer": trainer.state_dict(), }, best_model_path + ".pt", ) for epoch in itertools.count(start=1): if args.epochs is not None and epoch > args.epochs: break np.random.shuffle(train_parse) epoch_start_time = time.time() for start_index in range(0, len(train_parse), args.batch_size): trainer.zero_grad() schedule_lr(total_processed // args.batch_size) parser.train() batch_loss_value = 0.0 batch_trees = train_parse[start_index:start_index + args.batch_size] batch_sentences = [[(leaf.tag, leaf.word) for leaf in tree.leaves()] for tree in batch_trees] for subbatch_sentences, subbatch_trees in parser.split_batch( batch_sentences, batch_trees, args.subbatch_max_tokens): _, loss = parser.parse_batch(subbatch_sentences, subbatch_trees) loss = loss / len(batch_trees) loss_value = float(loss.data.cpu().numpy()) batch_loss_value += loss_value if loss_value > 0: loss.backward() del loss total_processed += len(subbatch_trees) current_processed += len(subbatch_trees) grad_norm = torch.nn.utils.clip_grad_norm_(clippable_parameters, grad_clip_threshold) trainer.step() print( "\r" "epoch {:,} " "batch {:,}/{:,} " "processed {:,} " "batch-loss {:.4f} " "grad-norm {:.4f} " "epoch-elapsed {} " "total-elapsed {}".format( epoch, start_index // args.batch_size + 1, int(np.ceil(len(train_parse) / args.batch_size)), total_processed, batch_loss_value, grad_norm, format_elapsed(epoch_start_time), format_elapsed(start_time), ), end="", ) sys.stdout.flush() if current_processed >= check_every: current_processed -= check_every check_dev(epoch) # adjust learning rate at the end of an epoch if hparams.step_decay: if (total_processed // args.batch_size + 1) > hparams.learning_rate_warmup_steps: scheduler.step(best_dev_fscore)
def main(): import optparse import vocabulary parser = optparse.OptionParser() parser.add_option("-f", dest="filename", help="corpus filename", default='ap.txt') # use any sample .txt for training parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)") parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5) parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5) parser.add_option("-k", dest="K", type="int", help="number of topics", default=2) parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=10) parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialization of parameters", default=False) parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False) parser.add_option("--seed", dest="seed", type="int", help="random seed") parser.add_option("--df", dest="df", type="int", help="threshold of document frequency to cut words", default=0) (options, args) = parser.parse_args() if not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-c)") if options.filename: corpus = vocabulary.load_file(options.filename) else: corpus = vocabulary.load_corpus( options.corpus) # this loads 'corpora/wordnet'; install it first if not corpus: parser.error("corpus range(-c) forms 'start:end'") if options.seed is not None: numpy.random.seed(options.seed) voca = vocabulary.Vocabulary(options.stopwords) docs = [voca.doc_to_ids(doc) for doc in corpus] if options.df > 0: docs = voca.cut_low_freq(docs, options.df) lda = LDA(options.K, options.alpha, options.beta, docs, voca.size(), options.smartinit) print( "corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(corpus), len(voca.vocas), options.K, options.alpha, options.beta)) #import cProfile #cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile') lda_learning(lda, options.iteration, voca)
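# LDA and lda_learning come from the accompanying topic-model code and are not shown here.
# As a hedged illustration of the collapsed Gibbs update such a model typically performs,
# each token's topic is resampled with probability proportional to
# (n_dk + alpha) * (n_kw + beta) / (n_k + beta * V). Minimal sketch, illustrative only:
import numpy as np

def gibbs_lda(docs, V, K, alpha, beta, iterations, rng=np.random):
    """docs: list of lists of word ids in [0, V). Returns unnormalized topic-word counts."""
    n_dk = np.zeros((len(docs), K))          # topic counts per document
    n_kw = np.zeros((K, V))                  # word counts per topic
    n_k = np.zeros(K)                        # total tokens per topic
    z = []
    for d, doc in enumerate(docs):           # random initial assignment
        zd = rng.randint(K, size=len(doc))
        z.append(zd)
        for w, k in zip(doc, zd):
            n_dk[d, k] += 1; n_kw[k, w] += 1; n_k[k] += 1
    for _ in range(iterations):
        for d, doc in enumerate(docs):
            for i, w in enumerate(doc):
                k = z[d][i]                  # remove the token's current assignment
                n_dk[d, k] -= 1; n_kw[k, w] -= 1; n_k[k] -= 1
                p = (n_dk[d] + alpha) * (n_kw[:, w] + beta) / (n_k + beta * V)
                k = rng.multinomial(1, p / p.sum()).argmax()
                z[d][i] = k                  # resample and restore counts
                n_dk[d, k] += 1; n_kw[k, w] += 1; n_k[k] += 1
    return n_kw + beta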
""" import os import cPickle as pickle import tic import sift import vocabulary from imtools import get_imageList imlist = get_imageList("../local/data/JianDa1") imcount = len(imlist) print imlist print imcount featlist = [imlist[i][:-3] + 'sift' for i in range(imcount)] tic.k('Start') for i in range(imcount): if not os.path.exists(featlist[i]): sift.process_image(imlist[i], featlist[i]) tic.k('sift loaded') voc = vocabulary.Vocabulary('JianDa1') # ukbenchtest voc.train(featlist, k=imcount, subsampling=10) tic.k('train loaded') # 保存词汇 #imagepkl = r"pickle\vocabulary.pkl" imagepkl = r"../static\pickle\jianda1.pkl" with open(imagepkl, 'wb') as f: pickle.dump(voc, f) print imagepkl, 'is:', voc.name, voc.word_count
# # The most important thing that you will likely wind up changing here is the `feature_processor` code. You'll see the two lists of features being used (the word features and the list features). As you add more features, you will need to add those features to these arguments so they actually get checked against tokens in the input. # In[6]: default_tokenizer = lambda i: tagged_contexts(tagtools.bies_tagged_tokens(i)) default_token_view = lambda i: i[0] default_feature_processor = make_cxt_feature_processor( [all_digits, lonely_initial, identity_feature], [is_empty]) def default_features(vocab): return lambda data: vocab bib_features = vocabulary.Vocabulary() bib_data = tagtools.DataManager(reference_train_file, reference_test_file, reference_dev_file, reference_xml_item_keyword, default_tokenizer, default_token_view, default_features(bib_features), default_feature_processor) # Load the data from the file system # In[7]: bib_data.initialize() # Look at how we're analyzing a typical item. #
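# For example, to try an additional word-level feature you could define a new predicate and
# register it next to the existing ones. contains_hyphen below is a hypothetical feature,
# not part of the original feature set; it only illustrates where new features plug in.

def contains_hyphen(token):
    """Hypothetical word feature: fires when the token contains a hyphen."""
    return '-' in token

extended_feature_processor = make_cxt_feature_processor(
    [all_digits, lonely_initial, identity_feature, contains_hyphen],  # word features
    [is_empty])                                                       # list features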
print "Cleaned sample! Final rows ", len(sample_final) #######SPLITING TRAIN TEST rows = random.sample(sample_final.index, 2000) sample_train = sample_final.drop(rows) sample_test = sample_final.ix[rows] print "Final test and train sizes:", len(sample_test), len(sample_train) cleaned_reviews_train = sample_train['text'] cleaned_reviews_test = sample_test['text'] print("Sample cleaned!") ############ SETTING THE VARIABLES #Text voca = v.Vocabulary() docs = voca.read_corpus(cleaned_reviews_train) docs_test = voca.new_corpus(cleaned_reviews_test) if options.model == 'slda': # Supervised Y_train = sample_train.stars if options.model == 'dmr': feat_orig = np.reshape(sample_train.stars, (len(sample_train.stars), 1)) #Features sample_train.columns = [s.encode('utf-8') for s in sample_train.columns] features_biz = sample_train.filter(regex='biz_') features_biz = features_biz.drop('biz_name', axis=1) #veeecs = features_biz #vecs = np.array([[v for v in vec] for vec in vecs], dtype=np.float32) feat_biz = np.array(features_biz)