def build_vocab(self, embed_file: str = None) -> Vocab:
    """Build the vocabulary for the data set.

    Args:
        embed_file (str, optional): The file path of the pre-trained
            word embedding vectors. Defaults to None.

    Returns:
        vocab.Vocab: The vocab object.
    """
    # Count word frequencies over the source and target texts.
    word_counts = Counter()
    count_words(word_counts,
                [sample['src'] + sample['tgt'] for sample in self.samples])
    vocab = Vocab()
    # Filter the vocabulary by keeping only the top k tokens in terms of
    # word frequency in the data set, where k is the maximum vocab size
    # set in "config.py".
    for word, count in word_counts.most_common(config.max_vocab_size):
        vocab.add_words([word])
    if embed_file is not None:
        count = vocab.load_embeddings(embed_file)
        logging.info("%d pre-trained embeddings loaded." % count)
    with open(config.vocab, "wb") as f:
        pickle.dump(vocab, f)
    return vocab
def build_vocab(self, embed_file: str = None) -> Vocab:
    """Build the vocabulary for the data set.

    Args:
        embed_file (str, optional): The file path of the pre-trained
            word embedding vectors. Defaults to None.

    Returns:
        vocab.Vocab: The vocab object.
    """
    # Count word frequencies over the source and target texts.
    word_counts = Counter()
    count_words(word_counts, [src + tgt for src, tgt in self.pairs])
    vocab = Vocab()
    # Filter the vocabulary by keeping only the top k tokens in terms of
    # word frequency in the data set, where k is the maximum vocab size
    # set in "config.py".
    for word, count in word_counts.most_common(config.max_vocab_size):
        vocab.add_words([word])
    if embed_file is not None:
        count = vocab.load_embeddings(embed_file)
        print("%d pre-trained embeddings loaded." % count)
    return vocab
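# A minimal sketch of the count_words helper that the two build_vocab variants
# above call: it updates a Counter in place with the tokens of every sentence.
# This is an assumption about the helper's behaviour; the real implementation
# is not shown in this file.
from collections import Counter
from typing import Iterable, List


def count_words(counter: Counter, text_list: Iterable[List[str]]) -> None:
    # Update `counter` in place with every token of every tokenized sentence.
    for sentence in text_list:
        counter.update(sentence)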
def load_temario(self, version):
    '''
    TEMARIO: the reference summary size is given as a number of words or
    sentences of the final summary.
    CSTNews: the summary size is 70% of the word count of the document
    with the highest weight.
    '''
    print("temario :)")
    corpus_dictionary = dict()
    if version == 'temario_v1':
        path = corpus_dir[version]
        path_sumarios = summaries_dir[version]
        documents = os.listdir(path)
        sumarios = os.listdir(path_sumarios)
        for i in documents:
            docPath = path + '/' + i
            # print(docPath)
            document_name = i[3:]
            document_name = document_name[:-4]
            document_sentences = read_document(docPath, self.language)
            class_labels_ml = None
            if self.dictionary_class_labels is not None:
                class_labels_ml = self.dictionary_class_labels[document_name]
            # Modified to also tag whether or not each sentence belongs to
            # the reference summary.
            naive_tagged_sentences = naive_tag(document_sentences,
                                               class_labels_ml)
            # print(naive_tagged_sentences)
            # corpus_dictionary[document_name] = [document_sentences]
            corpus_dictionary[document_name] = [naive_tagged_sentences]
        for i in sumarios:
            summPath = path_sumarios + i
            # print(summPath)
            summary_name = i[4:]
            summary_name = summary_name[:-4]
            size_summary = count_words(summPath, self.language)
            value = corpus_dictionary[summary_name]
            # Append the summary size to the document entry.
            value.append(size_summary)
            corpus_dictionary[summary_name] = value
    else:
        print('version 2')
    return corpus_dictionary
def load_cst_news(self, version):
    print("cst news :)")
    corpus_dictionary = dict()
    if version == 'cstnews_v1':
        path = corpus_dir[version]
        clusters = os.listdir(path)
        special = '.DS_Store'
        if special in clusters:
            clusters.remove(special)
        for i in clusters:
            sub_path = path + i + '/' + corpus_dir['textosFonte']
            documents = os.listdir(sub_path)
            if special in documents:
                documents.remove(special)
            allSentences = []
            document_lengths = []
            # top_sentences = []
            index = 1
            for j in documents:
                document = sub_path + j
                document_sentences = read_document(document, self.language)
                class_labels_ml = None
                if self.dictionary_class_labels is not None:
                    class_labels_ml = self.dictionary_class_labels[i]
                # for k in range(3):
                #     top_sentences.append(document_sentences[k])
                document_size = count_words(document, self.language)
                document_lengths.append(document_size)
                taggedSentences = tag_sentence(document_sentences, index,
                                               class_labels_ml)
                # print(taggedSentences)
                index += 1
                allSentences.extend(taggedSentences)
            # The summary size is 30% of the word count of the longest
            # document in the cluster.
            size_cluster = max(document_lengths)
            size_summary = (30 * size_cluster) / 100
            # corpus_dictionary[i] = [allSentences, size_summary, top_sentences]
            corpus_dictionary[i] = [allSentences, size_summary]
    else:
        print('version 2')
    # corpus = ['dictionary with the names and the data',
    #           'loaded unprocessed corpus',
    #           'vector of summary sizes']
    return corpus_dictionary
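# Worked example of the summary-size rule used in load_cst_news above: the
# cluster's summary budget is 30% of the word count of the longest source
# document. The numbers below are illustrative only, not taken from the corpus.
document_lengths = [412, 388, 506]
size_cluster = max(document_lengths)      # 506 words in the longest document
size_summary = (30 * size_cluster) / 100  # 151.8-word summary budget
print(size_summary)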
def post(self):
    url = self.get_argument('url', '')
    words = []
    if url:
        words = utils.count_words(url)
        utils.store_words(words, self.db)
    # Find the highest word count (renamed from `max` so the builtin is not
    # shadowed) and clamp it to at least 5 so the bucketing stays well defined.
    max_count = 0
    for word in words:
        if word['counter'] > max_count:
            max_count = word['counter']
    if max_count < 5:
        max_count = 5

    def size(counter):
        return min(int(ceil(counter / ceil(max_count / 5))), 5)

    self.render("words.html", words=words, url=url, size=size)
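# Standalone sketch of the tag-size bucketing used in post() above, so the
# mapping can be checked outside the request handler. The function and
# parameter names are hypothetical; only the arithmetic mirrors the handler.
from math import ceil


def bucket(counter: int, max_count: int) -> int:
    # Map a raw word count onto one of five display sizes (1..5).
    max_count = max(max_count, 5)
    return min(int(ceil(counter / ceil(max_count / 5))), 5)


# Example: with a maximum count of 20, a word seen 7 times falls in bucket 2.
assert bucket(7, 20) == 2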
def ma_lo_data_set(path: str, n_words: int) -> Tuple[Dict, Dict, Dict[str, int]]:
    '''
    Make, load and return the data set from a structured data folder.

    :param path: path to the data set
    :type path: str
    :param n_words: number of most frequent words to keep
    :type n_words: int
    :return: the training set, the test set and the vocabulary
    '''
    word_counter = utils.count_words(path)
    voc = utils.most_rep(word_counter, n_words)
    data = utils.load_data(path)
    np.random.shuffle(data)
    # Hold out the last 5000 shuffled examples as the test set.
    train_set, test_set = data[:-5000], data[-5000:]
    return train_set, test_set, voc
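# Hypothetical usage of ma_lo_data_set; the folder path and vocabulary size
# below are placeholders, not values taken from the original project.
if __name__ == '__main__':
    train_set, test_set, voc = ma_lo_data_set('data/corpus', n_words=10000)
    print('train:', len(train_set), 'test:', len(test_set))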
def run():
    print("Read train data...")
    train_data = utils.concat_sets(
        utils.read_and_parse(config.DATA_TRAINING_POS_REVIEW,
                             parsers.WordsParser),
        utils.read_and_parse(config.DATA_TRAINING_NEG_REVIEW,
                             parsers.WordsParser),
        is_join=True,
        is_shuffle=True)

    print("Read test data...")
    test_data = utils.concat_sets(
        utils.read_and_parse(config.DATA_TEST_POS_REVIEW,
                             parsers.WordsParser),
        utils.read_and_parse(config.DATA_TEST_NEG_REVIEW,
                             parsers.WordsParser),
        is_join=True,
        is_shuffle=True)

    # Unpack the concatenated sets. This assumes concat_sets returns parallel
    # (ids, texts, sentiments) sequences, which is how they are used below;
    # adjust the unpacking if its actual return shape differs.
    train_ids, train_texts, train_sentiments = train_data
    test_ids, test_texts, test_sentiments = test_data

    print('Creating the bag of words...')
    # Note that CountVectorizer comes with its own options to automatically do
    # preprocessing, tokenization, and stop word removal. For each of these,
    # instead of specifying "None", it is possible to use a built-in method or
    # a custom function; in this example, however, custom parsers are used for
    # data cleaning.
    vectorizer = CountVectorizer(analyzer='word',
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=5000)

    print('Cleaning and parsing the train set movie reviews...')
    # Get a bag of words for the training set, and convert to a numpy array.
    # Example result:
    #   train_texts -> [[1, 3], [1, 2], [3, 1], ...]
    train_texts = vectorizer.fit_transform(train_texts).toarray()

    print('Cleaning and parsing the test set movie reviews...')
    # Get a bag of words for the test set, and convert to a numpy array.
    # Example result:
    #   test_texts -> [[1, 3], [1, 2], [3, 1], ...]
    test_texts = vectorizer.transform(test_texts).toarray()

    print('Training the Random Forest...')
    n_estimators = 100
    # Example result:
    #   test_sentiments_predicted_rf -> [1, 0, 1...]
    test_sentiments_predicted_rf = classifiers_sk.random_forest(
        train_texts, train_sentiments, test_texts,
        n_estimators=n_estimators)

    print('Training the Naive Bayes Gaussian...')
    # Example result:
    #   test_sentiments_predicted_nbg -> [1, 0, 1...]
    test_sentiments_predicted_nbg = classifiers_sk.naive_bayes_gaussian(
        train_texts, train_sentiments, test_texts)

    print('Training the Naive Bayes Multinomial...')
    # Example result:
    #   test_sentiments_predicted_nbm -> [1, 0, 1...]
    test_sentiments_predicted_nbm = classifiers_sk.naive_bayes_multinomial(
        train_texts, train_sentiments, test_texts)

    print('Training the Naive Bayes Bernoulli...')
    # Example result:
    #   test_sentiments_predicted_nbb -> [1, 0, 1...]
    test_sentiments_predicted_nbb = classifiers_sk.naive_bayes_bernoulli(
        train_texts, train_sentiments, test_texts)

    print('Training the k-Nearest Neighbors...')
    n_neighbors = 100
    # Example result:
    #   test_sentiments_predicted_knn -> [1, 0, 1...]
    test_sentiments_predicted_knn = classifiers_sk.k_nearest_neighbors(
        train_texts, train_sentiments, test_texts,
        n_neighbors=n_neighbors)

    print('Accuracy of the Random Forest: {accuracy}'.format(
        accuracy=utils.calculate_accuracy(test_sentiments,
                                          test_sentiments_predicted_rf)))
    print('Accuracy of the Naive Bayes Gaussian: {accuracy}'.format(
        accuracy=utils.calculate_accuracy(test_sentiments,
                                          test_sentiments_predicted_nbg)))
    print('Accuracy of the Naive Bayes Multinomial: {accuracy}'.format(
        accuracy=utils.calculate_accuracy(test_sentiments,
                                          test_sentiments_predicted_nbm)))
    print('Accuracy of the Naive Bayes Bernoulli: {accuracy}'.format(
        accuracy=utils.calculate_accuracy(test_sentiments,
                                          test_sentiments_predicted_nbb)))
    print('Accuracy of the k-Nearest Neighbors: {accuracy}'.format(
        accuracy=utils.calculate_accuracy(test_sentiments,
                                          test_sentiments_predicted_knn)))

    filename_sklearn_rf = 'bag-of-words-sklearn-rf-model.csv'
    filename_sklearn_nbg = 'bag-of-words-sklearn-nbg-model.csv'
    filename_sklearn_nbm = 'bag-of-words-sklearn-nbm-model.csv'
    filename_sklearn_nbb = 'bag-of-words-sklearn-nbb-model.csv'
    filename_sklearn_knn = 'bag-of-words-sklearn-knn-model.csv'
    filename_summary = 'bag-of-words-summary.txt'

    print('Wrote Random Forest results to {filename}'.format(
        filename=filename_sklearn_rf))
    utils.write_results_to_csv(test_ids, test_sentiments,
                               test_sentiments_predicted_rf,
                               filename_sklearn_rf)
    print('Wrote Naive Bayes Gaussian results to {filename}'.format(
        filename=filename_sklearn_nbg))
    utils.write_results_to_csv(test_ids, test_sentiments,
                               test_sentiments_predicted_nbg,
                               filename_sklearn_nbg)
    print('Wrote Naive Bayes Multinomial results to {filename}'.format(
        filename=filename_sklearn_nbm))
    utils.write_results_to_csv(test_ids, test_sentiments,
                               test_sentiments_predicted_nbm,
                               filename_sklearn_nbm)
    print('Wrote Naive Bayes Bernoulli results to {filename}'.format(
        filename=filename_sklearn_nbb))
    utils.write_results_to_csv(test_ids, test_sentiments,
                               test_sentiments_predicted_nbb,
                               filename_sklearn_nbb)
    print('Wrote k-Nearest Neighbors results to {filename}'.format(
        filename=filename_sklearn_knn))
    utils.write_results_to_csv(test_ids, test_sentiments,
                               test_sentiments_predicted_knn,
                               filename_sklearn_knn)

    print('Wrote summary results to {filename}'.format(
        filename=filename_summary))
    with open(filename_summary, "w") as file_summary:
        print('Size of train dataset: {size}'.format(size=len(train_ids)),
              file=file_summary)
        print('Size of test dataset: {size}'.format(size=len(test_ids)),
              file=file_summary)
        print('\n', file=file_summary)
        print('Number of trees in Random Forest: {trees}'.format(
            trees=n_estimators), file=file_summary)
        print('Number of neighbors in KNN: {neighbors}'.format(
            neighbors=n_neighbors), file=file_summary)
        print('\n', file=file_summary)
        print('Accuracy of the Random Forest sklearn: {accuracy}'.format(
            accuracy=utils.calculate_accuracy(
                test_sentiments, test_sentiments_predicted_rf)),
            file=file_summary)
        print('Accuracy of the Naive Bayes Gaussian sklearn: {accuracy}'.format(
            accuracy=utils.calculate_accuracy(
                test_sentiments, test_sentiments_predicted_nbg)),
            file=file_summary)
        print('Accuracy of the Naive Bayes Multinomial sklearn: {accuracy}'.format(
            accuracy=utils.calculate_accuracy(
                test_sentiments, test_sentiments_predicted_nbm)),
            file=file_summary)
        print('Accuracy of the Naive Bayes Bernoulli sklearn: {accuracy}'.format(
            accuracy=utils.calculate_accuracy(
                test_sentiments, test_sentiments_predicted_nbb)),
            file=file_summary)
        print('Accuracy of the k-Nearest Neighbors sklearn: {accuracy}'.format(
            accuracy=utils.calculate_accuracy(
                test_sentiments, test_sentiments_predicted_knn)),
            file=file_summary)
        print('\n', file=file_summary)
        print('Count of each word in train dataset: {counts}'.format(
            counts=utils.count_words(vectorizer.get_feature_names(),
                                     train_texts)),
            file=file_summary)
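# The accuracy helper used throughout run() is not shown in this file. A
# minimal sketch of what utils.calculate_accuracy is assumed to compute (the
# fraction of predictions that match the reference labels):
from typing import Sequence


def calculate_accuracy(y_true: Sequence[int], y_pred: Sequence[int]) -> float:
    # Fraction of positions where the predicted label equals the true label.
    assert len(y_true) == len(y_pred)
    correct = sum(1 for t, p in zip(y_true, y_pred) if t == p)
    return correct / len(y_true)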
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--batchsize', '-b', type=int, default=20,
                        help='Number of examples in each mini-batch')
    parser.add_argument('--bproplen', '-l', type=int, default=35,
                        help='Number of words in each mini-batch '
                             '(= length of truncated BPTT)')
    parser.add_argument('--epoch', '-e', type=int, default=39,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--gradclip', '-c', type=float, default=5,
                        help='Gradient norm threshold to clip')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--test', action='store_true',
                        help='Use tiny datasets for quick tests')
    parser.set_defaults(test=False)
    parser.add_argument('--unit', '-u', type=int, default=650,
                        help='Number of LSTM units in each layer')
    parser.add_argument('--layer', type=int, default=2)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--share-embedding', action='store_true')
    parser.add_argument('--blackout', action='store_true')
    parser.add_argument('--adaptive-softmax', action='store_true')
    parser.add_argument('--dataset', default='ptb',
                        choices=['ptb', 'wikitext-2', 'wikitext-103'])
    parser.add_argument('--vocab')
    parser.add_argument('--log-interval', type=int, default=500)
    parser.add_argument('--validation-interval', '--val-interval',
                        type=int, default=30000)
    parser.add_argument('--decay-if-fail', action='store_true')
    args = parser.parse_args()
    print(json.dumps(args.__dict__, indent=2))

    if not os.path.isdir(args.out):
        os.mkdir(args.out)

    def evaluate(raw_model, iter):
        model = raw_model.copy()  # to use different state
        model.reset_state()  # initialize state
        sum_perp = 0
        count = 0
        xt_batch_seq = []
        one_pack = args.batchsize * args.bproplen * 2
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            for batch in copy.copy(iter):
                xt_batch_seq.append(batch)
                count += 1
                if len(xt_batch_seq) >= one_pack:
                    x_seq_batch, t_seq_batch = utils.convert_xt_batch_seq(
                        xt_batch_seq, args.gpu)
                    loss = model.forward_seq_batch(
                        x_seq_batch, t_seq_batch, normalize=1.)
                    sum_perp += loss.data
                    xt_batch_seq = []
            if xt_batch_seq:
                x_seq_batch, t_seq_batch = utils.convert_xt_batch_seq(
                    xt_batch_seq, args.gpu)
                loss = model.forward_seq_batch(
                    x_seq_batch, t_seq_batch, normalize=1.)
                sum_perp += loss.data
        return np.exp(float(sum_perp) / count)

    if args.vocab:
        vocab = json.load(open(args.vocab))
        print('vocab is loaded', args.vocab)
        print('vocab =', len(vocab))
    else:
        vocab = None

    if args.dataset == 'ptb':
        train, val, test = chainer.datasets.get_ptb_words()
        n_vocab = max(train) + 1  # train is just an array of integers
    else:
        train, val, test, vocab = utils.get_wikitext_words_and_vocab(
            name=args.dataset, vocab=vocab)
        n_vocab = len(vocab)

    if args.test:
        train = train[:100]
        val = val[:100]
        test = test[:100]

    print('#train tokens =', len(train))
    print('#valid tokens =', len(val))
    print('#test tokens =', len(test))
    print('#vocab =', n_vocab)

    # Create the dataset iterators
    train_iter = utils.ParallelSequentialIterator(train, args.batchsize)
    val_iter = utils.ParallelSequentialIterator(val, 1, repeat=False)
    test_iter = utils.ParallelSequentialIterator(test, 1, repeat=False)

    # Prepare an RNNLM model
    if args.blackout:
        counts = utils.count_words(train)
        assert(len(counts) == n_vocab)
    else:
        counts = None
    model = nets.RNNForLM(n_vocab, args.unit, args.layer, args.dropout,
                          share_embedding=args.share_embedding,
                          blackout_counts=counts,
                          adaptive_softmax=args.adaptive_softmax)
    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    # Set up an optimizer
    # optimizer = chainer.optimizers.SGD(lr=1.0)
    # optimizer = chainer.optimizers.Adam(alpha=1e-3, beta1=0.)
    optimizer = chainer.optimizers.Adam(alpha=1e-3)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.gradclip))
    # optimizer.add_hook(chainer.optimizer.WeightDecay(1e-6))

    sum_perp = 0
    count = 0
    iteration = 0
    is_new_epoch = 0
    best_val_perp = 1000000.
    best_epoch = 0
    start = time.time()
    log_interval = args.log_interval
    validation_interval = args.validation_interval
    print('iter/epoch', len(train) // (args.bproplen * args.batchsize))
    print('Training start')
    while train_iter.epoch < args.epoch:
        iteration += 1
        xt_batch_seq = []
        if np.random.rand() < 0.01:
            model.reset_state()
        for i in range(args.bproplen):
            batch = train_iter.__next__()
            xt_batch_seq.append(batch)
            is_new_epoch += train_iter.is_new_epoch
            count += 1
        x_seq_batch, t_seq_batch = utils.convert_xt_batch_seq(
            xt_batch_seq, args.gpu)
        loss = model.forward_seq_batch(
            x_seq_batch, t_seq_batch, normalize=args.batchsize)
        sum_perp += loss.data
        model.cleargrads()  # Clear the parameter gradients
        loss.backward()  # Backprop
        loss.unchain_backward()  # Truncate the graph
        optimizer.update()  # Update the parameters
        del loss

        if iteration % log_interval == 0:
            time_str = time.strftime('%Y-%m-%d %H-%M-%S')
            mean_speed = (count // args.bproplen) / (time.time() - start)
            print('\ti {:}\tperp {:.3f}\t\t| TIME {:.3f}i/s ({})'.format(
                iteration, np.exp(float(sum_perp) / count),
                mean_speed, time_str))
            sum_perp = 0
            count = 0
            start = time.time()

        # if is_new_epoch:
        if iteration % validation_interval == 0:
            tmp = time.time()
            val_perp = evaluate(model, val_iter)
            time_str = time.strftime('%Y-%m-%d %H-%M-%S')
            print('Epoch {:}: val perp {:.3f}\t\t| TIME [{:.3f}s] ({})'.format(
                train_iter.epoch, val_perp, time.time() - tmp, time_str))
            if val_perp < best_val_perp:
                best_val_perp = val_perp
                best_epoch = train_iter.epoch
                serializers.save_npz(
                    os.path.join(args.out, 'best.model'), model)
            elif args.decay_if_fail:
                if hasattr(optimizer, 'alpha'):
                    optimizer.alpha *= 0.5
                    optimizer.alpha = max(optimizer.alpha, 1e-7)
                else:
                    optimizer.lr *= 0.5
                    optimizer.lr = max(optimizer.lr, 1e-7)
            start += (time.time() - tmp)
            if not args.decay_if_fail:
                if hasattr(optimizer, 'alpha'):
                    optimizer.alpha *= 0.85
                else:
                    optimizer.lr *= 0.85
            print('\t*lr = {:.8f}'.format(
                optimizer.alpha if hasattr(optimizer, 'alpha')
                else optimizer.lr))
            is_new_epoch = 0

    # Evaluate on test dataset
    print('test')
    print('load best model at epoch {}'.format(best_epoch))
    print('valid perplexity: {}'.format(best_val_perp))
    serializers.load_npz(os.path.join(args.out, 'best.model'), model)
    test_perp = evaluate(model, test_iter)
    print('test perplexity: {}'.format(test_perp))
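# For reference, the perplexity reported by evaluate() and the training log
# above is the exponential of an averaged cross-entropy loss. A standalone
# sketch of that relationship, assuming the accumulated losses are average
# cross-entropies in natural-log units (function name is hypothetical):
import numpy as np


def perplexity(avg_losses):
    # avg_losses: one averaged cross-entropy value per evaluated chunk.
    return float(np.exp(np.mean(avg_losses)))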
import utils

words = [
    'how', 'often', 'does', 'each', 'string', 'occur', 'in', 'this',
    'list', '?'
]

word2freq = utils.count_words(words)

print('word2freq', word2freq)
print('x', utils.x)
print('python', utils.python)
import utils

words = ['how', 'often', 'does', 'each', 'string', 'occur', 'in', 'this',
         'list', '?']

word2freq = utils.count_words(words)

print('word2freq', word2freq)
print('x', utils.x)
print('python', utils.python)
from utils import count_words
from utils import x
from utils import python

words = ['how', 'often', 'does', 'each', 'string', 'occur', 'in', 'this',
         'list', '?']

word2freq = count_words(words)

print('word2freq', word2freq)
print('x', x)
print('python', python)
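# The three small scripts above call count_words on a list of strings and
# expect a word-to-frequency mapping back. A minimal sketch of such a helper,
# assuming that behaviour (the real utils module is not shown here):
from collections import Counter
from typing import Dict, List


def count_words(words: List[str]) -> Dict[str, int]:
    # Map each distinct string to the number of times it occurs in the list.
    return dict(Counter(words))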
def init(
    session: str,
    text: str,
    topk: int,
    top_p: int,
    mode: str,
    ratio: float,
    return_text_to_speech=False,
    return_titles_and_links=False,
) -> Tuple[Any, ...]:
    output_fn = None
    about_fn = None
    titles_urls_fn = None
    pids: Optional[List[int]] = None
    context: Optional[str] = None

    if session == 'SentenceSimilarity':
        output = engine_api.similar(text, top_p=top_p)
        if output is not None:
            # Render similar sentences line by line.
            pids = output.pids.squeeze(0).tolist()
            sentences = output.sentences
            output_fn = render_similar(st, sentences)
            nsids, npids = output.sids.size, len(set(pids))
            about_fn = render_about(st, nsids, npids)

    elif session == 'QuestionAnswering':
        num_words = count_words(text, min_word_length=2)
        if num_words < MIN_VALID_WORDS:
            e = 'Text needs to be at least {} words long, and not {}'
            st.sidebar.error(e.format(MIN_VALID_WORDS, num_words))
        else:
            # Do not cache outputs from the user's questions.
            output = engine_api.answer(text, topk, top_p, mode, ratio)
            if output is not None:
                with st.spinner('Fetching results...'):
                    pids = output.pids.squeeze(0).tolist()
                    context = output.context
                    answer = output.a[output.topk(0)]
                    output_fn = render_answer(st, text, answer, context)
                    n0, n1 = output.sids.size, len(output.c)
                    nsids = f'**({n0}-{n0 - n1})**'
                    npids = f'**{len(set(pids))}**'
                    about_fn = render_about(st, nsids, npids)
            else:
                e = 'There was an ⚠ issue in trying to answer your question.'
                st.sidebar.error(e)

    elif session == 'Demo':
        # Cache the outputs from the demo questions.
        output = cache_api_answer(text, topk, top_p, mode, ratio)
        pids = output.pids.squeeze(0).tolist()
        context = output.context
        answer = output.a[output.topk(0)]
        output_fn = render_answer(st, text, answer, context)
        n0, n1 = output.sids.size, len(output.c)
        nsids = f'**({n0}-{n0 - n1})**'
        npids = f'**{len(set(pids))}**'
        about_fn = render_about(st, nsids, npids)

    if return_titles_and_links and pids is not None:
        try:
            titles_urls = meta_reader.load_urls(pids)
        except Exception as e:
            print(f'Loading titles and urls raised an exception {e}')
        else:
            titles_urls_fn = render_titles_urls(st, titles_urls)

    if return_text_to_speech and TTS_PORT is not None and context is not None:
        msg = 'Fetching synthesized text from IBM Watson. Please wait ⌛..'
        with st.spinner(msg):
            try:
                audio = engine_api.tts(context, prob=0.99, port=TTS_PORT)
            except Exception as e:
                print(f'TTS loading raised an exception, {e}')
                st.error('There was an issue with text-to-speech service 🤔.')
            else:
                st.audio(audio['audio_file_path'], format='audio/wav')

    return output_fn, about_fn, titles_urls_fn
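# init() above guards the QuestionAnswering branch with a word-count check.
# A minimal sketch of a count_words helper with a min_word_length filter,
# assuming that is what the guard relies on (hypothetical implementation,
# not the project's actual helper):
def count_words(text: str, min_word_length: int = 1) -> int:
    # Count whitespace-separated tokens at least `min_word_length` chars long.
    return sum(1 for w in text.split() if len(w) >= min_word_length)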
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--batchsize', '-b', type=int, default=32,
                        help='Number of examples in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=5,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gradclip', '-c', type=float, default=10,
                        help='Gradient norm threshold to clip')
    parser.add_argument('--unit', '-u', type=int, default=650,
                        help='Number of LSTM units in each layer')
    parser.add_argument('--layer', type=int, default=2)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--share-embedding', action='store_true')
    parser.add_argument('--blackout', action='store_true')
    parser.add_argument('--adaptive-softmax', action='store_true')
    parser.add_argument('--log-interval', type=int, default=500)
    parser.add_argument('--validation-interval', '--val-interval',
                        type=int, default=30000)
    parser.add_argument('--decay-if-fail', action='store_true')
    parser.add_argument('--vocab', required=True)
    parser.add_argument('--train-path', '--train', required=True)
    parser.add_argument('--valid-path', '--valid', required=True)
    parser.add_argument('--resume')
    parser.add_argument('--resume-rnn')
    parser.add_argument('--resume-wordemb')
    parser.add_argument('--resume-wordemb-vocab')
    parser.add_argument('--init-output-by-embed', action='store_true')
    parser.add_argument('--language-model', action='store_true')
    parser.add_argument('--rnn', default='gru', choices=['lstm', 'gru'])
    args = parser.parse_args()
    print(json.dumps(args.__dict__, indent=2))

    vocab = json.load(open(args.vocab))
    n_vocab = len(vocab)
    print('vocab is loaded', args.vocab)
    print('vocab =', n_vocab)

    if args.language_model:
        train = chain_utils.SequenceChainDataset(
            args.train_path, vocab, chain_length=1)
        valid = chain_utils.SequenceChainDataset(
            args.valid_path, vocab, chain_length=1)
    else:
        train = chain_utils.SequenceChainDataset(
            args.train_path, vocab, chain_length=2)
        valid = chain_utils.SequenceChainDataset(
            args.valid_path, vocab, chain_length=2)

    print('#train =', len(train))
    print('#valid =', len(valid))
    print('#vocab =', n_vocab)

    # Create the dataset iterators
    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    valid_iter = chainer.iterators.SerialIterator(valid, args.batchsize,
                                                  repeat=False, shuffle=False)

    # Prepare an RNNLM model
    if args.blackout:
        counts = utils.count_words(train)
        assert(len(counts) == n_vocab)
    else:
        counts = None

    if args.language_model:
        model = nets.SentenceLanguageModel(
            n_vocab, args.unit, args.layer, args.dropout,
            rnn=args.rnn,
            share_embedding=args.share_embedding,
            blackout_counts=counts,
            adaptive_softmax=args.adaptive_softmax)
    else:
        model = nets.SkipThoughtModel(
            n_vocab, args.unit, args.layer, args.dropout,
            rnn=args.rnn,
            share_embedding=args.share_embedding,
            blackout_counts=counts,
            adaptive_softmax=args.adaptive_softmax)
    print('RNN unit is {}'.format(args.rnn))

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    # Set up an optimizer
    # optimizer = chainer.optimizers.SGD(lr=1.0)
    # optimizer = chainer.optimizers.Adam(alpha=1e-3, beta1=0.)
    optimizer = chainer.optimizers.Adam(alpha=1e-3)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.gradclip))
    # optimizer.add_hook(chainer.optimizer.WeightDecay(1e-6))

    iter_per_epoch = len(train) // args.batchsize
    log_trigger = (iter_per_epoch // 100, 'iteration')
    eval_trigger = (log_trigger[0] * 50, 'iteration')  # every half epoch
    updater = training.StandardUpdater(
        train_iter, optimizer,
        converter=chain_utils.convert_sequence_chain,
        device=args.gpu,
        loss_func=model.calculate_loss)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(extensions.Evaluator(
        valid_iter, model,
        converter=chain_utils.convert_sequence_chain,
        device=args.gpu,
        eval_func=model.calculate_loss),
        trigger=eval_trigger)
    """
    trainer.extend(utils.SentenceEvaluater(
        model, valid, vocab, 'val/',
        batchsize=args.batchsize,
        device=args.gpu, k=args.beam,
        print_sentence_mod=args.print_sentence_mod),
        trigger=eval_trigger)
    """
    record_trigger = training.triggers.MinValueTrigger(
        'validation/main/perp', trigger=eval_trigger)
    trainer.extend(extensions.snapshot_object(
        model, 'best_model.npz'), trigger=record_trigger)

    trainer.extend(extensions.LogReport(trigger=log_trigger),
                   trigger=log_trigger)
    if args.language_model:
        keys = ['epoch', 'iteration',
                'main/perp', 'validation/main/perp', 'elapsed_time']
    else:
        keys = ['epoch', 'iteration',
                'main/perp', 'main/FWperp', 'main/BWperp',
                'validation/main/perp', 'elapsed_time']
    trainer.extend(extensions.PrintReport(keys), trigger=log_trigger)
    trainer.extend(extensions.ProgressBar(update_interval=50))

    print('iter/epoch', iter_per_epoch)
    print('Training start')
    trainer.run()
from utils import create_words_frequency, count_words

create_words_frequency()
print(count_words())