def build_vocab(self, embed_file: str = None) -> Vocab:
        """Build the vocabulary for the data set.

        Args:
            embed_file (str, optional): The file path of the pre-trained
                word embedding vectors. Defaults to None.

        Returns:
            vocab.Vocab: The vocab object.
        """
        # word frequency
        word_counts = Counter()
        count_words(word_counts,
                    [sample['src'] + sample['tgt'] for sample in self.samples])
        vocab = Vocab()
        # Filter the vocabulary by keeping only the top k tokens in terms of
        # word frequency in the data set, where k is the maximum vocab size set
        # in "config.py".
        for word, count in word_counts.most_common(config.max_vocab_size):
            vocab.add_words([word])
        if embed_file is not None:
            count = vocab.load_embeddings(embed_file)
            logging.info("%d pre-trained embeddings loaded." % count)
            
        with open(config.vocab, "wb") as f:
            pickle.dump(vocab, f)

        return vocab
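
A minimal, self-contained sketch of the frequency cut-off used above; Vocab and config are project-specific objects, so plain stand-ins are used here:

from collections import Counter

samples = [
    {'src': ['the', 'cat', 'sat'], 'tgt': ['a', 'cat']},
    {'src': ['the', 'dog'], 'tgt': ['the', 'dog', 'ran']},
]
max_vocab_size = 3  # stands in for config.max_vocab_size

word_counts = Counter()
for sample in samples:
    word_counts.update(sample['src'] + sample['tgt'])

# Keep only the k most frequent tokens.
kept = [word for word, _ in word_counts.most_common(max_vocab_size)]
print(kept)  # ['the', 'cat', 'dog']
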
Example #2
    def build_vocab(self, embed_file: str = None) -> Vocab:
        """Build the vocabulary for the data set.

        Args:
            embed_file (str, optional): The file path of the pre-trained
                word embedding vectors. Defaults to None.

        Returns:
            vocab.Vocab: The vocab object.
        """
        # word frequency
        word_counts = Counter()
        count_words(word_counts, [src + tgt for src, tgt in self.pairs])
        vocab = Vocab()
        # Filter the vocabulary by keeping only the top k tokens in terms of
        # word frequency in the data set, where k is the maximum vocab size set
        # in "config.py".
        for word, count in word_counts.most_common(config.max_vocab_size):
            vocab.add_words([word])
        if embed_file is not None:
            count = vocab.load_embeddings(embed_file)
            print("%d pre-trained embeddings loaded." % count)

        return vocab
    def load_temario(self, version):
        '''
        TEMARIO: number of words or sentences of the final (reference) summary.
        CSTNews: 70% of the word count of the highest-weight document.
        '''
        print("temario :)")
        corpus_dictionary = dict()

        if version == 'temario_v1':
            path = corpus_dir[version]
            path_sumarios = summaries_dir[version]
            documents = os.listdir(path)
            sumarios = os.listdir(path_sumarios)

            for i in documents:
                docPath = path + '/' + i
                # print docPath
                document_name = i[3:]
                document_name = document_name[:-4]

                document_sentences = read_document(docPath, self.language)
                class_labels_ml = None
                if self.dictionary_class_labels is not None:
                    class_labels_ml = self.dictionary_class_labels[
                        document_name]
                naive_tagged_sentences = naive_tag(
                    document_sentences, class_labels_ml
                )  # modified to also tag whether each sentence is part of the summary or not

                #print naive_tagged_sentences

                #corpus_dictionary[document_name] = [document_sentences]
                corpus_dictionary[document_name] = [naive_tagged_sentences]

            for i in sumarios:
                summPath = path_sumarios + i
                # print summPath
                summary_name = i[4:]
                summary_name = summary_name[:-4]
                size_summary = count_words(summPath, self.language)

                value = corpus_dictionary[summary_name]  # size_summary
                value.append(size_summary)
                corpus_dictionary[summary_name] = value

        else:
            print('version 2')

        return corpus_dictionary
    def load_cst_news(self, version):
        print "cst news :)"
        corpus_dictionary = dict()
        if version == 'cstnews_v1':
            path = corpus_dir[version]
            clusters = os.listdir(path)
            special = '.DS_Store'
            if special in clusters: clusters.remove(special)
            for i in clusters:
                sub_path = path + i + '/' + corpus_dir['textosFonte']
                documents = os.listdir(sub_path)
                if special in documents: documents.remove(special)

                allSentences = []
                document_lenghts = []
                #top_sentences = []
                index = 1

                for j in documents:
                    document = sub_path + j
                    document_sentences = read_document(document, self.language)
                    class_labels_ml = None
                    if self.dictionary_class_labels is not None:
                        class_labels_ml = self.dictionary_class_labels[i]

                    #for k in  range(3):
                    #    top_sentences.append(document_sentences[k])

                    document_size = count_words(document, self.language)
                    document_lenghts.append(document_size)

                    taggedSentences = tag_sentence(document_sentences, index,
                                                   class_labels_ml)
                    #print taggedSentences

                    index += 1

                    allSentences.extend(taggedSentences)

                size_cluster = max(document_lenghts)
                size_summary = (30 * size_cluster) // 100  # 30% of the longest document (integer word count)
                #corpus_dictionary[i] = [allSentences, size_summary, top_sentences]
                corpus_dictionary[i] = [allSentences, size_summary]

        else:
            print('version 2')

    # corpus = ['dictionary with names and the data', 'raw (unprocessed) loaded corpus', 'vector of summary sizes']
        return corpus_dictionary
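
The summary budget in the CSTNews branch above is a fixed compression rate: 30% of the word count of the longest document in the cluster. A quick, self-contained check of that arithmetic with made-up lengths:

document_lengths = [480, 620, 530]         # word counts of a cluster's source documents
size_cluster = max(document_lengths)       # 620
size_summary = (30 * size_cluster) // 100  # 186 words allowed for the summary
print(size_cluster, size_summary)
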
Example #5
    def post(self):
        url = self.get_argument('url', '')
        words = []
        if url:
            words = utils.count_words(url)
            utils.store_words(words, self.db)

        max_count = 0
        for word in words:
            if word['counter'] > max_count:
                max_count = word['counter']
        if max_count < 5:
            max_count = 5

        def size(counter):
            return min(int(ceil(counter / ceil(max_count / 5))), 5)

        self.render("words.html", words=words, url=url, size=size)
def ma_lo_data_set(path: str,
                   n_words: int) -> Tuple[Dict, Dict, Dict[str, int]]:
    '''
    Make, load, and return the dataset from a structured data folder.

    :param path: path to the dataset
    :type path: str
    :param n_words: number of most important words to keep in the vocabulary
    :type n_words: int
    :return: the train set, the test set, and the vocabulary
    :rtype: tuple
    '''
    word_counter = utils.count_words(path)

    voc = utils.most_rep(word_counter, n_words)

    data = utils.load_data(path)
    np.random.shuffle(data)

    train_set, test_set = data[:-5000], data[-5000:]

    return train_set, test_set, voc
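
A toy check of the shuffle-and-split step above, with a smaller tail than the real 5000-item test split:

import numpy as np

data = np.arange(20)
np.random.shuffle(data)
train_set, test_set = data[:-5], data[-5:]  # same slicing pattern as data[:-5000] / data[-5000:]
print(len(train_set), len(test_set))        # 15 5
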
def run():
    print("Read train data...")

    train_data = utils.concat_sets(
        utils.read_and_parse(config.DATA_TRAINING_POS_REVIEW,
                             parsers.WordsParser),
        utils.read_and_parse(config.DATA_TRAINING_NEG_REVIEW,
                             parsers.WordsParser),
        is_join=True,
        is_shuffle=True)

    print("Read test data...")

    test_data = utils.concat_sets(
        utils.read_and_parse(config.DATA_TEST_POS_REVIEW, parsers.WordsParser),
        utils.read_and_parse(config.DATA_TEST_NEG_REVIEW, parsers.WordsParser),
        is_join=True,
        is_shuffle=True)

    # NOTE: the calls below reference train_ids/train_texts/train_sentiments and
    # test_ids/test_texts/test_sentiments; assuming utils.concat_sets returns
    # (ids, texts, sentiments) tuples, they are unpacked here so the snippet runs.
    train_ids, train_texts, train_sentiments = train_data
    test_ids, test_texts, test_sentiments = test_data

    print('Creating the bag of words...')

    # note that CountVectorizer comes with its own options
    # to automatically do preprocessing, tokenization, and stop word removal
    # for each of these, instead of specifying "None",
    # it's possible to use a built-in method or custom function,
    # however, in this example, data cleaning is handled by the custom parsers
    vectorizer = CountVectorizer(analyzer='word',
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 max_features=5000)

    print('Cleaning and parsing the train set movie reviews...')

    # get a bag of words for the training set, and convert to a numpy array
    # example result:
    # train_texts -> [[1, 3], [1, 2], [3, 1], ...]
    train_texts = vectorizer.fit_transform(train_texts).toarray()

    print('Cleaning and parsing the test set movie reviews...')

    # get a bag of words for the test set, and convert to a numpy array
    # example result:
    # test_texts -> [[1, 3], [1, 2], [3, 1], ...]
    test_texts = vectorizer.transform(test_texts).toarray()

    print('Training the Random Forest...')
    n_estimators = 100
    # example result:
    # test_sentiments_predicted_rf -> [1, 0, 1...]
    test_sentiments_predicted_rf = classifiers_sk.random_forest(
        train_texts, train_sentiments, test_texts, n_estimators=n_estimators)

    print('Training the Naive Bayes Gaussian...')
    # example result:
    # test_sentiments_predicted_nbg -> [1, 0, 1...]
    test_sentiments_predicted_nbg = classifiers_sk.naive_bayes_gaussian(
        train_texts, train_sentiments, test_texts)

    print('Training the Naive Bayes Multinomial...')
    # example result:
    # test_sentiments_predicted_nbm -> [1, 0, 1...]
    test_sentiments_predicted_nbm = classifiers_sk.naive_bayes_multinomial(
        train_texts, train_sentiments, test_texts)

    print('Training the Naive Bayes Bernoulli...')
    # example result:
    # test_sentiments_predicted_nbb -> [1, 0, 1...]
    test_sentiments_predicted_nbb = classifiers_sk.naive_bayes_bernoulli(
        train_texts, train_sentiments, test_texts)

    print('Training the k-Nearest Neighbors...')
    n_neighbors = 100
    # example result:
    # test_sentiments_predicted_knn -> [1, 0, 1...]
    test_sentiments_predicted_knn = classifiers_sk.k_nearest_neighbors(
        train_texts, train_sentiments, test_texts, n_neighbors=n_neighbors)

    print('Accuracy of the Random Forest: {accuracy}'.format(
        accuracy=utils.calculate_accuracy(test_sentiments,
                                          test_sentiments_predicted_rf)))

    print('Accuracy of the Naive Bayes Gaussian: {accuracy}'.format(
        accuracy=utils.calculate_accuracy(test_sentiments,
                                          test_sentiments_predicted_nbg)))

    print('Accuracy of the Naive Bayes Multinomial: {accuracy}'.format(
        accuracy=utils.calculate_accuracy(test_sentiments,
                                          test_sentiments_predicted_nbm)))

    print('Accuracy of the Naive Bayes Bernoulli: {accuracy}'.format(
        accuracy=utils.calculate_accuracy(test_sentiments,
                                          test_sentiments_predicted_nbb)))

    print('Accuracy of the k-Nearest Neighbors: {accuracy}'.format(
        accuracy=utils.calculate_accuracy(test_sentiments,
                                          test_sentiments_predicted_knn)))

    filename_sklearn_rf = 'bag-of-words-sklearn-rf-model.csv'
    filename_sklearn_nbg = 'bag-of-words-sklearn-nbg-model.csv'
    filename_sklearn_nbm = 'bag-of-words-sklearn-nbm-model.csv'
    filename_sklearn_nbb = 'bag-of-words-sklearn-nbb-model.csv'
    filename_sklearn_knn = 'bag-of-words-sklearn-knn-model.csv'
    filename_summary = 'bag-of-words-summary.txt'

    print('Wrote Random Forest results to {filename}'.format(
        filename=filename_sklearn_rf))
    utils.write_results_to_csv(test_ids, test_sentiments,
                               test_sentiments_predicted_rf,
                               filename_sklearn_rf)

    print('Wrote Naive Bayes Gaussian results to {filename}'.format(
        filename=filename_sklearn_nbg))
    utils.write_results_to_csv(test_ids, test_sentiments,
                               test_sentiments_predicted_nbg,
                               filename_sklearn_nbg)

    print('Wrote Naive Bayes Multinomial results to {filename}'.format(
        filename=filename_sklearn_nbm))
    utils.write_results_to_csv(test_ids, test_sentiments,
                               test_sentiments_predicted_nbm,
                               filename_sklearn_nbm)

    print('Wrote Naive Bayes Bernoulli results to {filename}'.format(
        filename=filename_sklearn_nbb))
    utils.write_results_to_csv(test_ids, test_sentiments,
                               test_sentiments_predicted_nbb,
                               filename_sklearn_nbb)

    print('Wrote k-Nearest Neighbors results to {filename}'.format(
        filename=filename_sklearn_knn))
    utils.write_results_to_csv(test_ids, test_sentiments,
                               test_sentiments_predicted_knn,
                               filename_sklearn_knn)

    print('Wrote summary results to {filename}'.format(
        filename=filename_summary))

    with open(filename_summary, "w") as file_summary:
        print('Size of train dataset: {size}'.format(size=len(train_ids)),
              file=file_summary)

        print('Size of test dataset: {size}'.format(size=len(test_ids)),
              file=file_summary)

        print('\n', file=file_summary)

        print('Number of trees in Random Forest: {trees}'.format(
            trees=n_estimators),
              file=file_summary)

        print('Number of neighbors in KNN: {neighbors}'.format(
            neighbors=n_neighbors),
              file=file_summary)

        print('\n', file=file_summary)

        print('Accuracy of the Random Forest sklearn: {accuracy}'.format(
            accuracy=utils.calculate_accuracy(test_sentiments,
                                              test_sentiments_predicted_rf)),
              file=file_summary)

        print(
            'Accuracy of the Naive Bayes Gaussian sklearn: {accuracy}'.format(
                accuracy=utils.calculate_accuracy(
                    test_sentiments, test_sentiments_predicted_nbg)),
            file=file_summary)

        print('Accuracy of the Naive Bayes Multinomial sklearn: {accuracy}'.
              format(accuracy=utils.calculate_accuracy(
                  test_sentiments, test_sentiments_predicted_nbm)),
              file=file_summary)

        print(
            'Accuracy of the Naive Bayes Bernoulli sklearn: {accuracy}'.format(
                accuracy=utils.calculate_accuracy(
                    test_sentiments, test_sentiments_predicted_nbb)),
            file=file_summary)

        print('Accuracy of the k-Nearest Neighbors sklearn: {accuracy}'.format(
            accuracy=utils.calculate_accuracy(test_sentiments,
                                              test_sentiments_predicted_knn)),
              file=file_summary)

        print('\n', file=file_summary)

        print('Count of each word in train dataset: {counts}'.format(
            counts=utils.count_words(vectorizer.get_feature_names(),
                                     train_texts)),
              file=file_summary)
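
The run() function above hides CountVectorizer and the sklearn classifiers behind project helpers (utils, parsers, classifiers_sk). A minimal, self-contained sketch of the same bag-of-words plus Random Forest flow using sklearn directly, on toy data rather than the review corpus:

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score

train_texts = ["great movie loved it", "terrible plot awful acting",
               "wonderful touching film", "boring and bad"]
train_sentiments = [1, 0, 1, 0]
test_texts = ["loved the acting", "awful boring film"]
test_sentiments = [1, 0]

# Bag of words: fit the vocabulary on the training texts only.
vectorizer = CountVectorizer(analyzer='word', max_features=5000)
X_train = vectorizer.fit_transform(train_texts).toarray()
X_test = vectorizer.transform(test_texts).toarray()

clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, train_sentiments)
predicted = clf.predict(X_test)
print('accuracy:', accuracy_score(test_sentiments, predicted))
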
Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--batchsize', '-b', type=int, default=20,
                        help='Number of examples in each mini-batch')
    parser.add_argument('--bproplen', '-l', type=int, default=35,
                        help='Number of words in each mini-batch '
                             '(= length of truncated BPTT)')
    parser.add_argument('--epoch', '-e', type=int, default=39,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--gradclip', '-c', type=float, default=5,
                        help='Gradient norm threshold to clip')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--resume', '-r', default='',
                        help='Resume the training from snapshot')
    parser.add_argument('--test', action='store_true',
                        help='Use tiny datasets for quick tests')
    parser.set_defaults(test=False)
    parser.add_argument('--unit', '-u', type=int, default=650,
                        help='Number of LSTM units in each layer')
    parser.add_argument('--layer', type=int, default=2)
    parser.add_argument('--dropout', type=float, default=0.5)
    parser.add_argument('--share-embedding', action='store_true')
    parser.add_argument('--blackout', action='store_true')
    parser.add_argument('--adaptive-softmax', action='store_true')
    parser.add_argument('--dataset', default='ptb',
                        choices=['ptb', 'wikitext-2', 'wikitext-103'])
    parser.add_argument('--vocab')
    parser.add_argument('--log-interval', type=int, default=500)
    parser.add_argument('--validation-interval', '--val-interval',
                        type=int, default=30000)
    parser.add_argument('--decay-if-fail', action='store_true')

    args = parser.parse_args()
    print(json.dumps(args.__dict__, indent=2))

    if not os.path.isdir(args.out):
        os.mkdir(args.out)

    def evaluate(raw_model, data_iter):
        model = raw_model.copy()  # to use different state
        model.reset_state()  # initialize state
        sum_perp = 0
        count = 0
        xt_batch_seq = []
        one_pack = args.batchsize * args.bproplen * 2
        with chainer.using_config('train', False), chainer.no_backprop_mode():
            for batch in copy.copy(data_iter):
                xt_batch_seq.append(batch)
                count += 1
                if len(xt_batch_seq) >= one_pack:
                    x_seq_batch, t_seq_batch = utils.convert_xt_batch_seq(
                        xt_batch_seq, args.gpu)
                    loss = model.forward_seq_batch(
                        x_seq_batch, t_seq_batch, normalize=1.)
                    sum_perp += loss.data
                    xt_batch_seq = []
            if xt_batch_seq:
                x_seq_batch, t_seq_batch = utils.convert_xt_batch_seq(
                    xt_batch_seq, args.gpu)
                loss = model.forward_seq_batch(
                    x_seq_batch, t_seq_batch, normalize=1.)
                sum_perp += loss.data
        return np.exp(float(sum_perp) / count)

    if args.vocab:
        vocab = json.load(open(args.vocab))
        print('vocab is loaded', args.vocab)
        print('vocab =', len(vocab))
    else:
        vocab = None

    if args.dataset == 'ptb':
        train, val, test = chainer.datasets.get_ptb_words()
        n_vocab = max(train) + 1  # train is just an array of integers
    else:
        train, val, test, vocab = utils.get_wikitext_words_and_vocab(
            name=args.dataset, vocab=vocab)
        n_vocab = len(vocab)
    if args.test:
        train = train[:100]
        val = val[:100]
        test = test[:100]
    print('#train tokens =', len(train))
    print('#valid tokens =', len(val))
    print('#test tokens =', len(test))
    print('#vocab =', n_vocab)

    # Create the dataset iterators
    train_iter = utils.ParallelSequentialIterator(train, args.batchsize)
    val_iter = utils.ParallelSequentialIterator(val, 1, repeat=False)
    test_iter = utils.ParallelSequentialIterator(test, 1, repeat=False)

    # Prepare an RNNLM model
    if args.blackout:
        counts = utils.count_words(train)
        assert(len(counts) == n_vocab)
    else:
        counts = None
    model = nets.RNNForLM(n_vocab, args.unit, args.layer, args.dropout,
                          share_embedding=args.share_embedding,
                          blackout_counts=counts,
                          adaptive_softmax=args.adaptive_softmax)

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    # Set up an optimizer
    # optimizer = chainer.optimizers.SGD(lr=1.0)
    # optimizer = chainer.optimizers.Adam(alpha=1e-3, beta1=0.)
    optimizer = chainer.optimizers.Adam(alpha=1e-3)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.gradclip))
    # optimizer.add_hook(chainer.optimizer.WeightDecay(1e-6))

    sum_perp = 0
    count = 0
    iteration = 0
    is_new_epoch = 0
    best_val_perp = 1000000.
    best_epoch = 0
    start = time.time()

    log_interval = args.log_interval
    validation_interval = args.validation_interval
    print('iter/epoch', len(train) // (args.bproplen * args.batchsize))
    print('Training start')
    while train_iter.epoch < args.epoch:
        iteration += 1
        xt_batch_seq = []
        if np.random.rand() < 0.01:
            model.reset_state()

        for i in range(args.bproplen):
            batch = train_iter.__next__()
            xt_batch_seq.append(batch)
            is_new_epoch += train_iter.is_new_epoch
            count += 1
        x_seq_batch, t_seq_batch = utils.convert_xt_batch_seq(
            xt_batch_seq, args.gpu)
        loss = model.forward_seq_batch(
            x_seq_batch, t_seq_batch, normalize=args.batchsize)

        sum_perp += loss.data
        model.cleargrads()  # Clear the parameter gradients
        loss.backward()  # Backprop
        loss.unchain_backward()  # Truncate the graph
        optimizer.update()  # Update the parameters
        del loss

        if iteration % log_interval == 0:
            time_str = time.strftime('%Y-%m-%d %H-%M-%S')
            mean_speed = (count // args.bproplen) / (time.time() - start)
            print('\ti {:}\tperp {:.3f}\t\t| TIME {:.3f}i/s ({})'.format(
                iteration, np.exp(float(sum_perp) / count), mean_speed, time_str))
            sum_perp = 0
            count = 0
            start = time.time()

        # if is_new_epoch:
        if iteration % validation_interval == 0:
            tmp = time.time()
            val_perp = evaluate(model, val_iter)
            time_str = time.strftime('%Y-%m-%d %H-%M-%S')
            print('Epoch {:}: val perp {:.3f}\t\t| TIME [{:.3f}s] ({})'.format(
                train_iter.epoch, val_perp, time.time() - tmp, time_str))
            if val_perp < best_val_perp:
                best_val_perp = val_perp
                best_epoch = train_iter.epoch
                serializers.save_npz(os.path.join(
                    args.out, 'best.model'), model)
            elif args.decay_if_fail:
                if hasattr(optimizer, 'alpha'):
                    optimizer.alpha *= 0.5
                    optimizer.alpha = max(optimizer.alpha, 1e-7)
                else:
                    optimizer.lr *= 0.5
                    optimizer.lr = max(optimizer.lr, 1e-7)
            start += (time.time() - tmp)
            if not args.decay_if_fail:
                if hasattr(optimizer, 'alpha'):
                    optimizer.alpha *= 0.85
                else:
                    optimizer.lr *= 0.85
            print('\t*lr = {:.8f}'.format(
                optimizer.alpha if hasattr(optimizer, 'alpha') else optimizer.lr))
            is_new_epoch = 0

    # Evaluate on test dataset
    print('test')
    print('load best model at epoch {}'.format(best_epoch))
    print('valid perplexity: {}'.format(best_val_perp))
    serializers.load_npz(os.path.join(args.out, 'best.model'), model)
    test_perp = evaluate(model, test_iter)
    print('test perplexity: {}'.format(test_perp))
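
The logged value np.exp(float(sum_perp) / count) is the exponential of the average accumulated loss, i.e. perplexity. A tiny numeric illustration with made-up per-step losses:

import numpy as np

step_losses = [4.6, 4.2, 3.9, 3.7]  # per-step cross-entropy in nats
perplexity = np.exp(sum(step_losses) / len(step_losses))
print(round(perplexity, 1))  # 60.3
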
Example #9
import utils

words = [
    'how', 'often', 'does', 'each', 'string', 'occur', 'in', 'this', 'list',
    '?'
]

word2freq = utils.count_words(words)
print('word2freq', word2freq)

print('x', utils.x)
print('python', utils.python)
Example #11
from utils import count_words
from utils import x
from utils import python


words = ['how', 'often', 'does', 'each', 'string', 'occur', 'in', 'this', 'list', '?']

word2freq = count_words(words)
print('word2freq', word2freq)


print('x', x)
print('python', python)


Example #12
def init(
    session: str,
    text: str,
    topk: int,
    top_p: int,
    mode: str,
    ratio: float,
    return_text_to_speech=False,
    return_titles_and_links=False,
) -> Tuple[Any, ...]:

    output_fn = None
    about_fn = None
    titles_urls_fn = None
    pids: Optional[List[int]] = None
    context: Optional[str] = None

    if session == 'SentenceSimilarity':
        output = engine_api.similar(text, top_p=top_p)
        if output is not None:
            # render similar sentences line by line.
            pids = output.pids.squeeze(0).tolist()
            sentences = output.sentences
            output_fn = render_similar(st, sentences)
            nsids, npids = output.sids.size, len(set(pids))
            about_fn = render_about(st, nsids, npids)

    elif session == 'QuestionAnswering':
        num_words = count_words(text, min_word_length=2)
        if num_words < MIN_VALID_WORDS:
            e = 'Text needs to be at least {} words long, and not {}'
            st.sidebar.error(e.format(MIN_VALID_WORDS, num_words))
        else:
            # Do not cache outputs from user's questions.
            output = engine_api.answer(text, topk, top_p, mode, ratio)
            if output is not None:
                with st.spinner('Fetching results...'):
                    pids = output.pids.squeeze(0).tolist()
                    context = output.context
                    answer = output.a[output.topk(0)]
                    output_fn = render_answer(st, text, answer, context)

                    n0, n1 = output.sids.size, len(output.c)
                    nsids = f'**({n0}-{n0 - n1})**'
                    npids = f'**{len(set(pids))}**'
                    about_fn = render_about(st, nsids, npids)
            else:
                e = 'There was an ⚠ issue in trying to answer your question.'
                st.sidebar.error(e)

    elif session == 'Demo':
        # Cache the outputs from the demo questions.
        output = cache_api_answer(text, topk, top_p, mode, ratio)
        pids = output.pids.squeeze(0).tolist()
        context = output.context
        answer = output.a[output.topk(0)]
        output_fn = render_answer(st, text, answer, context)

        n0, n1 = output.sids.size, len(output.c)
        nsids = f'**({n0}-{n0 - n1})**'
        npids = f'**{len(set(pids))}**'
        about_fn = render_about(st, nsids, npids)

    if return_titles_and_links and pids is not None:
        try:
            titles_urls = meta_reader.load_urls(pids)
        except Exception as e:
            print(f'Loading titles and urls raised an exception {e}')
        else:
            titles_urls_fn = render_titles_urls(st, titles_urls)

    if return_text_to_speech and TTS_PORT is not None and context is not None:
        msg = 'Fetching synthesized text from IBM Watson. Please wait ⌛..'
        with st.spinner(msg):
            try:
                audio = engine_api.tts(context, prob=0.99, port=TTS_PORT)
            except Exception as e:
                print(f'TTS loading raised an exception, {e}')
                st.error('There was an issue with text-to-speech service 🤔.')
            else:
                st.audio(audio['audio_file_path'], format='audio/wav')

    return output_fn, about_fn, titles_urls_fn
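
count_words(text, min_word_length=2) above is used only to gate very short questions. The helper itself is not shown; a plausible minimal version (an assumption, not the project's implementation) would be:

def count_words(text: str, min_word_length: int = 1) -> int:
    # Count whitespace-separated tokens that meet a minimum length, so that
    # one-character noise tokens do not count toward the MIN_VALID_WORDS gate.
    return sum(1 for token in text.split() if len(token) >= min_word_length)

print(count_words("is COVID-19 transmissible by air ?", min_word_length=2))  # 5
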
Example #13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gpu', '-g', type=int, default=-1,
                        help='GPU ID (negative value indicates CPU)')
    parser.add_argument('--out', '-o', default='result',
                        help='Directory to output the result')
    parser.add_argument('--batchsize', '-b', type=int, default=32,
                        help='Number of examples in each mini-batch')
    parser.add_argument('--epoch', '-e', type=int, default=5,
                        help='Number of sweeps over the dataset to train')
    parser.add_argument('--gradclip', '-c', type=float, default=10,
                        help='Gradient norm threshold to clip')

    parser.add_argument('--unit', '-u', type=int, default=650,
                        help='Number of LSTM units in each layer')
    parser.add_argument('--layer', type=int, default=2)
    parser.add_argument('--dropout', type=float, default=0.5)

    parser.add_argument('--share-embedding', action='store_true')
    parser.add_argument('--blackout', action='store_true')
    parser.add_argument('--adaptive-softmax', action='store_true')

    parser.add_argument('--log-interval',
                        type=int, default=500)
    parser.add_argument('--validation-interval', '--val-interval',
                        type=int, default=30000)
    parser.add_argument('--decay-if-fail', action='store_true')

    parser.add_argument('--vocab', required=True)
    parser.add_argument('--train-path', '--train', required=True)
    parser.add_argument('--valid-path', '--valid', required=True)

    parser.add_argument('--resume')
    parser.add_argument('--resume-rnn')
    parser.add_argument('--resume-wordemb')
    parser.add_argument('--resume-wordemb-vocab')
    parser.add_argument('--init-output-by-embed', action='store_true')

    parser.add_argument('--language-model', action='store_true')
    parser.add_argument('--rnn', default='gru', choices=['lstm', 'gru'])

    args = parser.parse_args()
    print(json.dumps(args.__dict__, indent=2))

    vocab = json.load(open(args.vocab))
    n_vocab = len(vocab)
    print('vocab is loaded', args.vocab)
    print('vocab =', n_vocab)

    if args.language_model:
        train = chain_utils.SequenceChainDataset(
            args.train_path, vocab, chain_length=1)
        valid = chain_utils.SequenceChainDataset(
            args.valid_path, vocab, chain_length=1)
    else:
        train = chain_utils.SequenceChainDataset(
            args.train_path, vocab, chain_length=2)
        valid = chain_utils.SequenceChainDataset(
            args.valid_path, vocab, chain_length=2)

    print('#train =', len(train))
    print('#valid =', len(valid))
    print('#vocab =', n_vocab)

    # Create the dataset iterators
    train_iter = chainer.iterators.SerialIterator(train, args.batchsize)
    valid_iter = chainer.iterators.SerialIterator(valid, args.batchsize,
                                                  repeat=False, shuffle=False)

    # Prepare an RNNLM model
    if args.blackout:
        counts = utils.count_words(train)
        assert(len(counts) == n_vocab)
    else:
        counts = None

    if args.language_model:
        model = nets.SentenceLanguageModel(
            n_vocab, args.unit, args.layer, args.dropout,
            rnn=args.rnn,
            share_embedding=args.share_embedding,
            blackout_counts=counts,
            adaptive_softmax=args.adaptive_softmax)
    else:
        model = nets.SkipThoughtModel(
            n_vocab, args.unit, args.layer, args.dropout,
            rnn=args.rnn,
            share_embedding=args.share_embedding,
            blackout_counts=counts,
            adaptive_softmax=args.adaptive_softmax)
    print('RNN unit is {}'.format(args.rnn))

    if args.gpu >= 0:
        chainer.cuda.get_device_from_id(args.gpu).use()
        model.to_gpu()

    # Set up an optimizer
    # optimizer = chainer.optimizers.SGD(lr=1.0)
    # optimizer = chainer.optimizers.Adam(alpha=1e-3, beta1=0.)
    optimizer = chainer.optimizers.Adam(alpha=1e-3)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(args.gradclip))
    # optimizer.add_hook(chainer.optimizer.WeightDecay(1e-6))

    iter_per_epoch = len(train) // args.batchsize
    log_trigger = (iter_per_epoch // 100, 'iteration')
    eval_trigger = (log_trigger[0] * 50, 'iteration')  # every half epoch

    updater = training.StandardUpdater(
        train_iter, optimizer,
        converter=chain_utils.convert_sequence_chain, device=args.gpu,
        loss_func=model.calculate_loss)
    trainer = training.Trainer(updater, (args.epoch, 'epoch'), out=args.out)

    trainer.extend(extensions.Evaluator(
        valid_iter, model,
        converter=chain_utils.convert_sequence_chain, device=args.gpu,
        eval_func=model.calculate_loss),
        trigger=eval_trigger)
    """
    trainer.extend(utils.SentenceEvaluater(
        model, valid, vocab, 'val/',
        batchsize=args.batchsize,
        device=args.gpu,
        k=args.beam,
        print_sentence_mod=args.print_sentence_mod),
        trigger=eval_trigger)
    """
    record_trigger = training.triggers.MinValueTrigger(
        'validation/main/perp',
        trigger=eval_trigger)
    trainer.extend(extensions.snapshot_object(
        model, 'best_model.npz'),
        trigger=record_trigger)

    trainer.extend(extensions.LogReport(trigger=log_trigger),
                   trigger=log_trigger)

    if args.language_model:
        keys = [
            'epoch', 'iteration',
            'main/perp',
            'validation/main/perp',
            'elapsed_time']
    else:
        keys = [
            'epoch', 'iteration',
            'main/perp',
            'main/FWperp',
            'main/BWperp',
            'validation/main/perp',
            'elapsed_time']
    trainer.extend(extensions.PrintReport(keys),
                   trigger=log_trigger)
    trainer.extend(extensions.ProgressBar(update_interval=50))

    print('iter/epoch', iter_per_epoch)
    print('Training start')

    trainer.run()
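
The trigger arithmetic above logs roughly every 1% of an epoch and evaluates every half epoch. For example, with 1,000,000 training chains and a batch size of 32:

iter_per_epoch = 1_000_000 // 32                    # 31250 iterations per epoch
log_trigger = (iter_per_epoch // 100, 'iteration')  # log every 312 iterations
eval_trigger = (log_trigger[0] * 50, 'iteration')   # evaluate every 15600 iterations
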
Example #14
from utils import create_words_frequency, count_words

create_words_frequency()
print(count_words())