Code Example #1
import argparse
from os import path

# load_word_embeddings, Vocabulary, and run are project-local helpers
# assumed to be in scope.
def main(unused):
    parser = argparse.ArgumentParser()
    parser.add_argument("embedding_path")
    parser.add_argument("vocabulary_dir")
    parser.add_argument("--training_dir")
    parser.add_argument("--validation_dir")
    parser.add_argument("--decode_dir")
    parser.add_argument("--decode_out_dir")
    parser.add_argument("--mode",
                        choices=["train", "validate", "decode"],
                        default="train")
    parser.add_argument("--logdir")
    parser.add_argument("--batch_size", type=int, default=30)
    parser.add_argument("--validation_interval", type=int, default=20000)
    parser.add_argument("--beam_width", type=int, default=5)
    parser.add_argument("--max_output_length", type=int, default=32)
    parser.add_argument("--target_vocabulary_size", type=int, default=20000)
    parser.add_argument("--synthetic", action="store_true")
    parser.add_argument("--allow_gpu_growth", action="store_true")
    parser.add_argument("--collect_run_metadata", action="store_true")
    parser.add_argument("--log_weight_images", action="store_true")
    options = parser.parse_args()

    if options.mode == "decode":
        # Batching not supported in decoding
        options.batch_size = 1

    embedding_words, word_dict, word_embedding_dim = load_word_embeddings(
        options.embedding_path)

    vocabulary = Vocabulary()

    summary_vocabulary_path = path.join(options.vocabulary_dir,
                                        "summary_vocabulary.txt")

    vocabulary.add_from_file(
        summary_vocabulary_path,
        options.target_vocabulary_size - len(vocabulary.words))

    document_vocabulary_path = path.join(options.vocabulary_dir,
                                         "document_vocabulary.txt")

    # Add the most common words from the document vocabulary
    vocabulary.add_from_file(document_vocabulary_path, 150000)

    # Add additional common words from the loaded embeddings
    vocabulary.add_words(embedding_words[:100000])

    run(options, word_dict, word_embedding_dim, vocabulary)
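These examples all drive a project-specific Vocabulary class that this page does not show. As a reading aid, here is a minimal stand-in sketching the interface the calls imply (a constructor with an optional name, add_word, add_words, add_from_file, and a words attribute); the names come from the examples, but the semantics are assumptions, not the real implementation:

class Vocabulary:
    """Illustrative stand-in for the Vocabulary class used in these examples."""

    def __init__(self, name=None):
        self.name = name
        self.words = []       # unique words in insertion order
        self._seen = set()

    def add_word(self, word):
        # Deduplicate while preserving insertion order.
        if word not in self._seen:
            self._seen.add(word)
            self.words.append(word)

    def add_words(self, words):
        for word in words:
            self.add_word(word)

    def add_from_file(self, filepath, limit):
        # Assumed file format: one word per line, most frequent first.
        with open(filepath, encoding="utf-8") as f:
            for i, line in enumerate(f):
                if i >= limit:
                    break
                self.add_word(line.strip())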
Code Example #2
def build_vocabs():
    train, dev, test = load_boknilev()
    # Flatten the records of all three splits into one list of samples.
    samples = [
        s for r in train + dev + test
        for s in boknilev_record_to_hcpd_samples(r)
    ]

    # next_pos tags of head candidates; None stands in for a missing value.
    gold_pos_vocab = Vocabulary('GOLD_POS')
    gold_pos_vocab.add_words(
        set([hc.next_pos for s in samples for hc in s.x.head_cands]))
    gold_pos_vocab.add_word(None)

    # Surface forms of head candidates, pp words, and child words.
    words_vocab = Vocabulary('WORDS')
    words_vocab.add_words(
        set([hc.word for s in samples for hc in s.x.head_cands]))
    words_vocab.add_words(set([s.x.pp.word for s in samples]))
    words_vocab.add_words(set([s.x.child.word for s in samples]))
    words_vocab.add_word(None)

    # Map every observed word form to its lemma.
    words_to_lemmas = {}
    words_to_lemmas.update({s.x.child.word: s.x.child.lemma for s in samples})
    words_to_lemmas.update(
        {hc.word: hc.lemma
         for s in samples for hc in s.x.head_cands})

    return [gold_pos_vocab, words_vocab, words_to_lemmas]
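The samples list above is built with a nested comprehension that flattens a list of records into a flat list of samples. A self-contained illustration of the same pattern, independent of the loader:

# The outer loop walks the records, the inner loop walks each record's
# expansion, and the result is one flat list.
records = [["s1", "s2"], [], ["s3"]]
flat = [s for r in records for s in r]
assert flat == ["s1", "s2", "s3"]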
Code Example #3
    def _create_and_save_vocab(self, word_sequences, vocab_threshold,
                               model_dir, vocab_filename):
        """Create a Vocabulary instance from a list of word sequences, and save it to disk.

        Args:
            word_sequences: List of word sequences (sentence(s)) to use as basis for the vocabulary.

            vocab_threshold: Minimum number of times any word must appear within word_sequences
                in order to be included in the vocabulary.

            model_dir: Directory to save the vocabulary file to.

            vocab_filename: File name of the vocabulary file.
        """
        vocabulary = Vocabulary()
        for word_seq in word_sequences:
            vocabulary.add_words(word_seq.split())
        vocabulary.compile(vocab_threshold)

        vocab_filepath = path.join(model_dir, vocab_filename)
        vocabulary.save(vocab_filepath)
        return vocabulary
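Vocabulary.compile(vocab_threshold) is not shown on this page; per the docstring, it keeps only words that occur at least vocab_threshold times. A stand-alone sketch of that thresholding with collections.Counter (an assumption about the method's behavior, not its actual code):

from collections import Counter

def compile_words(words, vocab_threshold):
    # Count every occurrence, then keep words meeting the threshold.
    counts = Counter(words)
    return [w for w, c in counts.items() if c >= vocab_threshold]

print(compile_words(["a", "b", "a", "c", "a", "b"], 2))  # ['a', 'b']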
Code Example #4
    def _create_and_save_vocab(self, word_sequences, vocab_threshold,
                               model_dir, vocab_filename, embeddings_dir,
                               normalize_imported_vocab, vocab_import_mode):
        """Create a Vocabulary instance from a list of word sequences, and save it to disk.

        Args:
            word_sequences: List of word sequences (sentence(s)) to use as basis for the vocabulary.

            vocab_threshold: Minimum number of times any word must appear within word_sequences 
                in order to be included in the vocabulary.

            model_dir: Directory to save the vocabulary file to.

            vocab_filename: File name of the vocabulary file.

            embeddings_dir: Optional directory from which to import an external vocabulary & embeddings.
                If provided, the external vocabulary will be imported and processed according to vocab_import_mode.
                If None, only the generated vocabulary will be used.

            normalize_imported_vocab: See VocabularyImporter.import_vocabulary

            vocab_import_mode: If embeddings_dir is specified, this flag indicates if the dataset vocabulary should be generated
                and used in combination with the external vocabulary according to the rules of VocabularyImportMode.
        """
        vocabulary = None
        if embeddings_dir is None or vocab_import_mode != VocabularyImportMode.External:
            vocabulary = Vocabulary()
            for word_seq in word_sequences:
                vocabulary.add_words(word_seq.split())
            vocabulary.compile(vocab_threshold)

        vocabulary_import_stats = None
        if embeddings_dir is not None:
            vocabulary_importer = vocabulary_importer_factory.get_vocabulary_importer(
                embeddings_dir)
            vocabulary, vocabulary_import_stats = vocabulary_importer.import_vocabulary(
                embeddings_dir, normalize_imported_vocab, vocab_import_mode,
                vocabulary)

        vocab_filepath = path.join(model_dir, vocab_filename)
        vocabulary.save(vocab_filepath)
        return vocabulary, vocabulary_import_stats
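A hypothetical call from inside the same class, to make the simpler of the two code paths concrete (argument values are illustrative only): with embeddings_dir=None the dataset vocabulary is built and saved as-is, and vocab_import_mode is never consulted.

# Hypothetical invocation; training_lines is an assumed list of
# sentence strings.
vocab, stats = self._create_and_save_vocab(
    word_sequences=training_lines,
    vocab_threshold=2,
    model_dir="models/run1",
    vocab_filename="vocab.txt",
    embeddings_dir=None,   # no external embeddings: dataset vocabulary only
    normalize_imported_vocab=True,
    vocab_import_mode=None)
# stats is None on this path, since nothing was imported.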
Code Example #5
def build_vocabs():
    tasks = [
        '.'.join([id, syn]) for id in ['autoid', 'goldid']
        for syn in ['autosyn', 'goldsyn']
    ]
    stypes = ['train', 'dev', 'test']

    loader = StreusleLoader()
    STREUSLE_BASE = os.environ.get(
        'STREUSLE_BASE'
    ) or '/cs/usr/aviramstern/nlp/datasets/streusle_v4/release'
    all_files = [
        STREUSLE_BASE + '/' + stype + '/streusle.ud_' + stype + '.' + task +
        '.json' for task in tasks for stype in stypes
    ]
    records = sum([loader.load(f, input_format='json') for f in all_files], [])
    samples = [streusle_record_to_lstm_model_sample(r) for r in records]

    pp_vocab = Vocabulary('PREPS')
    pp_vocab.add_words(
        set([
            x.token for s in samples for x, y in zip(s.xs, s.ys)
            if any([y.supersense_role, y.supersense_func])
        ]))

    ner_vocab = Vocabulary('NERS')
    ner_vocab.add_words(
        set([x.ner for s in samples for x, y in zip(s.xs, s.ys)]))
    ner_vocab.add_word(None)

    lemmas_vocab = Vocabulary('LEMMAS')
    lemmas_vocab.add_words(
        set([x.lemma for s in samples for x, y in zip(s.xs, s.ys)]))

    ud_dep_vocab = Vocabulary('UD_DEPS')
    ud_dep_vocab.add_words(
        set([x.ud_dep for s in samples for x, y in zip(s.xs, s.ys)]))
    ud_dep_vocab.add_word(None)

    ud_xpos_vocab = Vocabulary('UD_XPOS')
    ud_xpos_vocab.add_words(
        set([x.ud_xpos for s in samples for x, y in zip(s.xs, s.ys)]))
    ud_xpos_vocab.add_word(None)

    token_vocab = Vocabulary('TOKENS')
    token_vocab.add_words(
        set([x.token for s in samples for x, y in zip(s.xs, s.ys)]))

    govobj_config_vocab = Vocabulary('GOVOBJ_CONFIGS')
    govobj_config_vocab.add_words(
        set([x.govobj_config for s in samples for x, y in zip(s.xs, s.ys)]))

    pss_vocab = Vocabulary('PSS')
    pss_vocab.add_words(supersense_repo.PREPOSITION_SUPERSENSES_SET)
    pss_vocab.add_word(None)

    # A separate name for lexical categories, so the PSS vocabulary above is
    # not clobbered before it is returned.
    lexcat_vocab = Vocabulary('LEXCAT')
    lexcat_vocab.add_words(
        set([x.lexcat for s in samples for x, y in zip(s.xs, s.ys)]))

    return [
        pp_vocab, ner_vocab, lemmas_vocab, ud_dep_vocab, ud_xpos_vocab,
        token_vocab, pss_vocab, govobj_config_vocab, lexcat_vocab
    ]
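records above is flattened with sum(list_of_lists, []), which works but copies the accumulator on every addition (quadratic in the number of files); itertools.chain.from_iterable is the usual linear alternative. Both on toy data:

import itertools

per_file_records = [[1, 2], [3], [4, 5]]
assert sum(per_file_records, []) == [1, 2, 3, 4, 5]
assert list(itertools.chain.from_iterable(per_file_records)) == [1, 2, 3, 4, 5]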
Code Example #6
def main():
    """
    Drives the program.

    """

    vocabulary = Vocabulary()

    running = True
    while running:
        menu_list = [
            "Check a word", "Add a word", "Add words", "Remove a word",
            "Remove words", "Print the word list",
            "Print the word list with definition", "Test your vocabulary",
            "Test your grammar"
        ]

        choice = user_input(
            menu_list, "Welcome to Learn English App!",
            "What would you like to do (Type 'quit' to exit the application): "
        )

        if choice == "quit":
            running = False
        elif choice == "1":
            the_word = input("What word you want to check? : ")
            check_dictionary(vocabulary, the_word)
        elif choice == "2":
            new_word = input("Add a new word: ")
            vocabulary.add_a_word(new_word)
            if new_word.strip() == "":
                print("Error. Your word only contains whitespace.")
        elif choice == "3":
            new_words = input("Add new words (separate by commas): ")
            vocabulary.add_words(new_words)
        elif choice == "4":
            try:  # EXCEPTION HANDLING
                delete_word = input("Remove a word: ")
                vocabulary.remove_a_word(delete_word)
            except KeyError:
                print("Error: That word is not in the vocabulary.")
        elif choice == "5":
            try:  # EXCEPTION HANDLING
                words = input(
                    "Enter the words you would like to remove (separate by commas): "
                )
                vocabulary.remove_words(words)
            except KeyError:
                print(
                    "Error: One or more of your words are not in the vocabulary."
                )
        elif choice == "6":
            if len(vocabulary.sort_words()) == 0:
                print(
                    "\nYour list is empty! Add some words to your list first!")
            else:
                print("\nHere is your word list")
                print_lists(vocabulary.sort_words())
        elif choice == "7":
            if len(vocabulary.sort_words()) == 0:
                print(
                    "\nYour list is empty! Add some words to your list first!")
            else:
                print("\nHere is your word list with definition")
                print_list_with_definition(vocabulary.sort_words())
        elif choice == "8":
            try:  # EXCEPTION HANDLING
                test_yourself(vocabulary)
            except IndexError:
                print("The question list is Empty")
        elif choice == "9":
            grammar_test()
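Several branches above read input and then validate it. A small helper of this shape (hypothetical, not part of the example) would keep the validate-before-mutate pattern in one place:

def read_nonempty(prompt):
    """Prompt until the user types something that is not just whitespace."""
    while True:
        text = input(prompt).strip()
        if text:
            return text
        print("Error. Your input only contains whitespace.")

# e.g.: vocabulary.add_a_word(read_nonempty("Add a new word: "))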