class Corpus(object):
    def __init__(self):
        self.documents = []
        self.vocab = Vocabulary()
        self.frozen = False

    def add(self, name, tokens):
        if not self.frozen:
            w = [self.vocab[x] for x in tokens]
            self.documents.append(Document(self, name, w))

    def freeze(self):
        for doc in self.documents:
            doc.freeze()
        self.vocab.stop_growth()
        self.frozen = True

    def __iter__(self):
        return iter(self.documents)

    def __len__(self):
        return len(self.documents)

    @classmethod
    def load(cls, filename):
        # Pickles must be read in binary mode, matching the 'wb' used in save().
        return pickle.load(file(filename, 'rb'))

    def save(self, filename):
        pickle.dump(self, file(filename, 'wb'))
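# A minimal usage sketch for the Corpus class above, assuming Vocabulary maps
# tokens to integer ids via __getitem__ and Document takes (corpus, name, ids).
# The file name is hypothetical.
corpus = Corpus()
corpus.add('doc1', ['the', 'quick', 'brown', 'fox'])
corpus.add('doc2', ['the', 'lazy', 'dog'])
corpus.freeze()            # no further vocabulary growth; documents freeze too
corpus.save('corpus.pkl')
assert len(Corpus.load('corpus.pkl')) == 2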
def cleanUpText(self, text):
    cleanedWords = []
    # lowercase the text and split on spaces
    words = text.lower().split(' ')
    # get vocabulary
    vocab = Vocabulary()
    for word in words:
        # skip Portuguese stopwords
        # TODO: implement tokenizers for other languages
        if word not in vocab.getPTStopWords():
            cleanedWords.append(word)
    return cleanedWords
def __extract_vocabularies_from_data(self, classes):
    vocabularies = set()
    for c in classes:
        strings = self.__access_strings(c, '/train')
        vocabulary = Vocabulary(strings)
        curr_vocabulary = vocabulary.get_vocabulary()
        self.__write_vocabulary(c, curr_vocabulary)
        vocabularies |= curr_vocabulary  # set union: merge this class's vocabulary
    return sorted(vocabularies)
class Corpus(object):
    def __init__(self, documents=None, vocab=None, frozen=None):
        # `is not None` checks keep explicitly passed empty/falsy arguments intact.
        self.documents = documents if documents is not None else []
        self.vocab = vocab if vocab is not None else Vocabulary()
        self.frozen = frozen if frozen is not None else False

    def add(self, name, tokens):
        if not self.frozen:
            w = [self.vocab[x] for x in tokens]
            self.documents.append(Document(self, name, w))

    def freeze(self):
        for doc in self.documents:
            doc.freeze()
        self.vocab.stop_growth()
        self.frozen = True

    def __getitem__(self, i):
        return self.documents[i]

    def __getslice__(self, i, j):
        # Python 2 only; Python 3 routes slices through __getitem__.
        return Corpus(self.documents[i:j], self.vocab, self.frozen)

    def __iter__(self):
        return iter(self.documents)

    def __len__(self):
        return len(self.documents)

    @classmethod
    def load(cls, filename):
        # Binary mode is required for pickles, matching the "wb" in save().
        return pickle.load(file(filename, "rb"))

    def save(self, filename):
        pickle.dump(self, file(filename, "wb"))
class VocabularyTest(unittest.TestCase):
    def setUp(self):
        self.vocabulary = Vocabulary()
        self.vocabulary.load('testdata/vocabulary.dat', 'testdata/custom_words')
        pprint.pprint(self.vocabulary.trie)
        pprint.pprint(self.vocabulary.words)

    def test_vocabulary(self):
        self.assertIn(u'英雄三国', self.vocabulary.words.keys())
        self.assertIn(u'魔鬼代言人', self.vocabulary.words.keys())
        self.assertIn(u'黄河水利委员会', self.vocabulary.words.keys())
        self.assertNotIn(u'十大伪歌手', self.vocabulary.words.keys())
        self.assertNotIn(u'走路太牛', self.vocabulary.words.keys())

        self.assertEqual('n', self.vocabulary.get_pos(u'英雄三国'))
        self.assertEqual('n', self.vocabulary.get_pos(u'魔鬼代言人'))
        self.assertEqual('nt', self.vocabulary.get_pos(u'黄河水利委员会'))
        self.assertEqual('UNK', self.vocabulary.get_pos(u'十大伪歌手'))
        self.assertEqual('UNK', self.vocabulary.get_pos(u'走路太牛'))

    def test_gen_DAG(self):
        pprint.pprint(self.vocabulary.gen_DAG(
            u'《英雄三国》是由网易历时四年自主研发运营的一款英雄对战竞技网游。'))
def test_pronunciation_valid_phrase(self):
    current_result = vb.pronunciation("hippopotamus")
    result = '[{"rawType": "ahd-legacy", "raw": "(hĭpˌə-pŏtˈə-məs)", "seq": 0}, {"rawType": "arpabet", "raw": "HH IH2 P AH0 P AA1 T AH0 M AH0 S", "seq": 0}]'
    expected_result = json.loads(result)
    if sys.version_info[:2] <= (2, 7):
        self.assertItemsEqual(current_result, expected_result)
    else:
        self.assertCountEqual(current_result, expected_result)
def test_antonym_valid_phrase_2(self):
    current_result = vb.antonym("respect")
    result = '{"text": ["disesteem", "disrespect"]}'
    expected_result = json.loads(result)
    if sys.version_info[:2] <= (2, 7):
        self.assertItemsEqual(current_result, expected_result)
    else:
        self.assertCountEqual(current_result, expected_result)
def test_translate_valid_phrase(self):
    current_result = vb.translate("hummus", "en", "es")
    result = '[{"text": "hummus", "seq": 0}]'
    # Compare against the parsed structure, as the other tests do; re-dumping
    # with json.dumps would compare the result against its JSON text.
    expected_result = json.loads(result)
    if sys.version_info[:2] <= (2, 7):
        self.assertItemsEqual(current_result, expected_result)
    else:
        self.assertCountEqual(current_result, expected_result)

def test_partOfSpeech_valid_phrase_2(self):
    current_result = vb.part_of_speech("rapidly")
    result = '[{"text": "adverb", "example:": "With speed; in a rapid manner.", "seq": 0}]'
    expected_result = json.loads(result)
    if sys.version_info[:2] <= (2, 7):
        self.assertItemsEqual(current_result, expected_result)
    else:
        self.assertCountEqual(current_result, expected_result)

def test_partOfSpeech_valid_phrase_1(self):
    current_result = vb.part_of_speech("hello")
    result = '[{"text": "interjection", "example:": "Used to greet someone, answer the telephone, or express surprise.", "seq": 0}]'
    expected_result = json.loads(result)
    if sys.version_info[:2] <= (2, 7):
        self.assertItemsEqual(current_result, expected_result)
    else:
        self.assertCountEqual(current_result, expected_result)

def test_hyphenation_valid_phrase(self):
    current_result = vb.hyphenation("hippopotamus")
    result = '[{"seq": 0, "text": "hip", "type": "secondary stress"}, {"seq": 1, "text": "po"}, {"seq": 2, "text": "pot", "type": "stress"}, {"seq": 3, "text": "a"}, {"seq": 4, "text": "mus"}]'
    expected_result = json.loads(result)
    if sys.version_info[:2] <= (2, 7):
        self.assertItemsEqual(current_result, expected_result)
    else:
        self.assertCountEqual(current_result, expected_result)

def test_usageExamples_valid_phrase(self):
    current_result = vb.usage_example("hillock")
    result = '[{"seq": 0, "text": "I went to the to of the hillock to look around."}]'
    expected_result = json.loads(result)
    if sys.version_info[:2] <= (2, 7):
        self.assertItemsEqual(current_result, expected_result)
    else:
        self.assertCountEqual(current_result, expected_result)

def test_synonym_valid_phrase(self):
    current_result = vb.synonym("repudiate")
    result = '[{"seq": 0, "text": "deny"}]'
    expected_result = json.loads(result)
    if sys.version_info[:2] <= (2, 7):
        self.assertItemsEqual(current_result, expected_result)
    else:
        self.assertCountEqual(current_result, expected_result)
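# Hedged sketch: the version branch above recurs in every test. A small helper
# like this (hypothetical, not part of the original suite) would keep the
# assertions DRY, e.g. assert_unordered_equal(self, current_result, expected_result).
def assert_unordered_equal(test_case, actual, expected):
    # assertItemsEqual was renamed to assertCountEqual in Python 3.
    if sys.version_info[:2] <= (2, 7):
        test_case.assertItemsEqual(actual, expected)
    else:
        test_case.assertCountEqual(actual, expected)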
def synonyms(word):
    try:
        synonyms = ''
        result = json.loads(vb.synonym(word))
        for res in result:
            synonyms += res['text'] + ','
        return synonyms[:-1] + '\n'
    except Exception:
        return "N/A"
def meaning(word):
    try:
        parts = ''
        result = json.loads(vb.part_of_speech(word))
        for res in result:
            parts += res['text'] + ':' + res[u'example:'] + '\n\n'
        return parts
    except Exception:
        return "N/A"
def translate(text):
    try:
        translation = ''
        result = json.loads(vb.translate(text, "en", "hi"))
        for res in result:
            translation += res['text'] + ','
        return translation[:-1] + '\n'
    except Exception:
        return "N/A"
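# Hedged usage sketch for the three wrappers above, assuming `vb` is the
# imported `vocabulary` module and `json` is imported; the word is arbitrary.
word = 'benevolent'
print('Synonyms: ' + synonyms(word))
print('Meanings: ' + meaning(word))
print('Hindi: ' + translate(word))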
def main(): """.""" from vocabulary import Vocabulary from attribute import Attribute from attribute_structure import AttributeStructure from attribute_system import AttributeSystem vocabulary = Vocabulary(['C'], [], ['V']) a = Attribute("a", []) b = Attribute("b", []) astr = AttributeStructure(a, b) objs = ['a', 'b', 'c'] attribute_system = AttributeSystem(astr, objs) C = ConstantAssignment(vocabulary, attribute_system, {'C': 'a'}) print C._vocabulary vocabulary.add_constant("C2") print C._vocabulary
def get_example(word):
    try:
        examples = json.loads(vb.usage_example(word))
        example = ''
        limit = min(3, len(examples))
        for i in range(limit):
            example += examples[i]['text'] + '...'
        return example
    except Exception, e:
        print e, '\nFlag example'
        return ""
def setUp(self):
    self.document = Document(20)
    self.vocabulary = Vocabulary()
    self.vocabulary.load("../testdata/vocabulary.dat")
    self.model = Model(20)
    self.model.load('../testdata/lda_model')
    self.doc_tokens = ['macbook', 'ipad',     # exist in vocabulary and model
                       'mac os x', 'chrome',  # only exist in vocabulary
                       'nokia', 'null']       # nonexistent
class MaxProbSegmenterTest(unittest.TestCase):
    def setUp(self):
        self.vocabulary = Vocabulary()
        self.vocabulary.load('../data/vocabulary.dat')
        self.hmm_segmenter = HMMSegmenter()
        self.hmm_segmenter.load('../data/hmm_segment_model')
        self.max_prob_segmenter = MaxProbSegmenter(
            self.vocabulary, self.hmm_segmenter)

    def call_segment(self, text):
        for word in self.max_prob_segmenter.segment(text):
            print word + '/\t',
        print ''

    def test_segment(self):
        fp = open('testdata/document.dat', 'rb')
        for text in fp.readlines():
            self.call_segment(text.strip())
        fp.close()
def usage_example(word):
    try:
        examples = ''
        result = json.loads(vb.usage_example(word))
        for res in result:
            examples += res['text'] + '\n\n'
        if len(examples) < 300:
            return examples
        else:
            return examples[:300]
    except Exception:
        return "N/A"
def get_meaning(word):
    try:
        meaning = json.loads(vb.meaning(word))
        means = ''
        limit = min(3, len(meaning))
        for i in range(limit):
            means += meaning[i]['text'] + ';'
        return means
    except Exception, e:
        print e
        return ""
def __init__(self, args, src_file, trg_file):
    self.src_vocabulary = Vocabulary()
    self.src_vocabulary.make_dictionary(src_file)
    self.trg_vocabulary = Vocabulary()
    self.trg_vocabulary.make_dictionary(trg_file)
    self.src_size = len(self.src_vocabulary.wtoi)
    self.embed_size = args.embed_size
    self.hidden_size = args.hidden_size
    self.trg_size = len(self.trg_vocabulary.wtoi)
    super(EncoderDecoder, self).__init__(
        # encoder
        w_xe=F.EmbedID(self.src_size, self.embed_size),
        w_ep=F.Linear(self.embed_size, self.hidden_size * 4),
        w_pp=F.Linear(self.hidden_size, self.hidden_size * 4),
        # decoder
        w_ey=F.EmbedID(self.trg_size, self.embed_size),
        w_qe=F.Linear(self.embed_size, self.hidden_size * 4),
        w_qq=F.Linear(self.hidden_size, self.hidden_size * 4),
        w_yq=F.Linear(self.hidden_size, self.trg_size),
    )
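# Hedged construction sketch for the EncoderDecoder above: the constructor
# only reads args.embed_size and args.hidden_size, so a plain Namespace
# suffices. The corpus paths are hypothetical.
from argparse import Namespace

args = Namespace(embed_size=256, hidden_size=512)
model = EncoderDecoder(args, 'data/train.src', 'data/train.trg')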
def get_context(text):
    """
    Try to get a usage context for a card.
    :param text:
    """
    try:
        m = json.loads(vb.usage_example(text))
        if len(m) > 0:
            return m[0]['text']
        return u''
    except Exception as ex:
        error(u'', ex)
        return u''
def get_meaning(text, lang):
    """
    Try to get a meaning for a card.
    :param text:
    :param lang:
    :return:
    """
    try:
        m = json.loads(vb.meaning(text, lang, lang))
        if len(m) > 0:
            return m[0]['text']
        return u''
    except Exception as ex:
        error(u'', ex)
        return u''
def generate_dataset(items, slots, voca: Vocabulary):
    dataset = Dataset()
    for item in items:
        vectors = []
        for word in item[0].split():
            vectors.append(voca.get(word))
        labels = []
        for tag in item[1].split():
            # One-hot label over the slot inventory.
            value = np.zeros([len(slots)], dtype=np.float32)
            value[slots.index(tag)] = 1
            labels.append(value)
        dataset.add(item[0], item[1], vectors, labels)
    return dataset
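# Hedged usage sketch: each item pairs a sentence with space-separated slot
# tags, one tag per token, and `slots` lists every tag name. `voca` is assumed
# to be a loaded Vocabulary whose get() returns a word vector.
items = [('book a flight', 'O O B-object')]
slots = ['O', 'B-object']
dataset = generate_dataset(items, slots, voca)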
def open(self, corpus_dir):
    self.root_dir = corpus_dir
    if not path.isdir(corpus_dir):
        os.mkdir(corpus_dir)

    self.meta_dir = self.root_dir + "/meta"
    self.samples_dir = self.root_dir + "/samples"
    if not path.isdir(self.samples_dir):
        os.mkdir(self.samples_dir)

    self.vocabulary_dir = self.root_dir + "/vocabulary"
    self.vocabulary = Vocabulary(self.vocabulary_dir)

    self.categories_dir = self.root_dir + "/categories"
    self.categories = Categories(self.categories_dir)
    self.categories.load_categories()
    self.categories.print_categories()
def test_meaning_valid_phrase(self):
    current_result = vb.meaning("humming")
    result = '[{"seq": 0, "text": "Present participle of hum."}]'
    expected_result = json.loads(result)
    if sys.version_info[:2] <= (2, 7):  # Python 2
        self.assertItemsEqual(current_result, expected_result)
    else:  # Python 3
        # assertItemsEqual() was renamed to assertCountEqual().
        # Why am I not using assertEqual() here? References:
        # - http://stackoverflow.com/a/7473137/3834059
        # - https://docs.python.org/2/library/unittest.html#unittest.TestCase.assertItemsEqual
        # - https://docs.python.org/3/library/unittest.html?highlight=assertcountequal#unittest.TestCase.assertCountEqual
        self.assertCountEqual(current_result, expected_result)
def main():
    os.makedirs(os.path.join(args.logdir, 'models'))
    vocab = Vocabulary(os.path.join(args.wiki_preprocess, 'entity_vocab.txt'))
    print(f"# entity in dataset: {len(vocab)}")

    if not os.path.exists(args.cache):
        STOPWORD_PATH = os.path.join(args.dataroot, "previous/stopwords.txt")
        SYMBOL_PATH = os.path.join(args.dataroot, "previous/symbols.txt")
        with open(STOPWORD_PATH, 'r') as f:
            stop_words = set([line.strip() for line in f])
        with open(SYMBOL_PATH, 'r') as f:
            symbols = set([line.strip() for line in f])
        stop_words = stop_words.union(symbols)

        # Pre-trained word embedding
        wiki2vec = Wikipedia2Vec.load(args.wiki2vec)

        context_entity_word_co_occur_path = os.path.join(
            args.wiki_preprocess, 'context_entity_word_co_occur.txt')
        context_positive_words = filter_positive_words(
            context_entity_word_co_occur_path,
            stop_words,
            wiki2vec,
            vocab,
        )
        page_entity_word_co_occur_path = os.path.join(
            args.wiki_preprocess, 'page_entity_word_co_occur.txt')
        page_positive_words = filter_positive_words(
            page_entity_word_co_occur_path,
            stop_words,
            wiki2vec,
            vocab,
        )

        word_count_path = os.path.join(args.wiki_preprocess, 'word_count.json')
        negative_words, negative_freqs = \
            filter_negative_words(
                word_count_path,
                stop_words,
                wiki2vec,
                freq_power=0.6
            )

        (page_positive_words, context_positive_words,
         negative_words, vecs) = get_reduced_embedding(page_positive_words,
                                                       context_positive_words,
                                                       negative_words,
                                                       wiki2vec)
        del wiki2vec
        os.makedirs(os.path.dirname(args.cache), exist_ok=True)
        pickle.dump((page_positive_words, context_positive_words,
                     negative_words, negative_freqs, vecs),
                    open(args.cache, 'wb'))
    else:
        print(f"Load cache {args.cache}")
        (page_positive_words, context_positive_words, negative_words,
         negative_freqs, vecs) = pickle.load(open(args.cache, 'rb'))

    page_non_empty_index = [
        i for i, positive_words in enumerate(page_positive_words)
        if len(positive_words) != 0
    ]
    context_non_empty_index = [
        i for i, positive_words in enumerate(context_positive_words)
        if len(positive_words) != 0
    ]
    non_empty_index = set(page_non_empty_index + context_non_empty_index)
    print(f'# entity in vocab  : {len(vocab) - 1:d}')
    print(f'# non empty page   : {len(page_non_empty_index):d}')
    print(f'# non empty context: {len(context_non_empty_index):d}')
    print(f'# non empty        : {len(non_empty_index):d}')

    word_embedding = nn.Embedding.from_pretrained(torch.tensor(vecs))
    word_embedding = word_embedding.to(device)

    entity_embedding = nn.Embedding(len(vocab) - 1, vecs.shape[1])
    nn.init.normal_(entity_embedding.weight, mean=0, std=1.)
    with torch.no_grad():
        # Zero out entities that have no positive words at all.
        for idx in range(len(vocab) - 1):
            if idx not in non_empty_index:
                entity_embedding.weight[idx] = 0.
    entity_embedding = entity_embedding.to(device)

    optimizer = torch.optim.Adagrad(entity_embedding.parameters(), lr=args.lr)

    dataset = ContrastiveDataset(page_positive_words, negative_freqs,
                                 negative_words, args.positive_num,
                                 args.negative_num)
    dataset = Subset(dataset, page_non_empty_index)
    writer = SummaryWriter(os.path.join(args.logdir, 'phase1'))
    print('Phase 1')
    train(word_embedding, entity_embedding, optimizer, dataset, writer,
          start_epochs=1, end_epochs=args.phase1_epochs)

    dataset = ContrastiveDataset(context_positive_words, negative_freqs,
                                 negative_words, args.positive_num,
                                 args.negative_num)
    dataset = Subset(dataset, context_non_empty_index)
    writer = SummaryWriter(os.path.join(args.logdir, 'phase2'))
    print('Phase 2')
    train(word_embedding, entity_embedding, optimizer, dataset, writer,
          start_epochs=args.phase1_epochs + 1, end_epochs=args.phase2_epochs)
class Main:
    def main(self):
        clearCli()
        self.vocabulary = Vocabulary()
        vocabulary = self.vocabulary
        vocabulary.buildVocabulary()
        isValidCommand = True
        while True:
            quiz = Quiz(vocabulary)
            clearCli()
            print(CLI.main_menu)
            if not isValidCommand:
                print(CLI.invalid_command)
            command = input()
            isValidCommand = command in ['sa', 's', 'sl', 'q', 'j1', 'j2',
                                         'j3', 'la', 't', 'o']
            if isValidCommand:
                if command == 'sa':
                    print('Starting quiz!\n\n')
                    language = self.selectLanguage()
                    quiz.startall(language)
                elif command == 's':
                    numQuestions = self.selectNumQuestions()
                    language = self.selectLanguage()
                    print('Starting quiz!\n\n')
                    quiz.start(language, numQuestions)
                elif command == 'sl':
                    startLesson, endLesson = self.selectLessons()
                    language = self.selectLanguage()
                    print('Starting quiz!\n\n')
                    quiz.start(language, startLesson=startLesson, endLesson=endLesson)
                elif command == 'j1':
                    language = self.selectLanguage()
                    print('Starting quiz for Japanese 1 vocabulary!\n\n')
                    quiz.start(language, startLesson=1, endLesson=10)
                elif command == 'j2':
                    language = self.selectLanguage()
                    print('Starting quiz for Japanese 2 vocabulary!\n\n')
                    quiz.start(language, startLesson=11, endLesson=20)
                elif command == 'j3':
                    language = self.selectLanguage()
                    print('Starting quiz for Japanese 3 vocabulary!\n\n')
                    quiz.start(language, startLesson=21, endLesson=32)
                elif command == 'o':
                    print('Starting open ended quiz')
                    startLesson, endLesson = self.selectLessons()
                    quiz.start_open_ended(startLesson=startLesson, endLesson=endLesson)
                elif command == 'q':
                    print('Quitting program')
                    break
                elif command == 'la':
                    print('Listing all vocabulary')
                    vocabulary.printWholeVocabulary()
                elif command == 't':
                    print('Test')
                    self.testFunction()

    def testFunction(self):
        kksi = kakasi()
        kksi.setMode("J", "H")
        conv = kksi.getConverter()
        all_hiragana = 'がくせい'
        partial_hiragana1 = '学せい'
        partial_hiragana2 = 'がく生'
        all_kanji = '学生'
        print(conv.do(all_hiragana))
        print(conv.do(partial_hiragana1))
        print(conv.do(partial_hiragana2))
        print(conv.do(all_kanji))
        print(conv.do(all_hiragana) == conv.do(partial_hiragana1)
              == conv.do(partial_hiragana2) == conv.do(all_kanji))
        input()

    def selectNumQuestions(self):
        clearCli()
        print('How many questions?')
        while True:
            value = input()
            isANumber = value.isnumeric()
            isNumberWithinVocabSize = isANumber and \
                1 <= int(value) <= self.vocabulary.getVocabularySize()
            if isNumberWithinVocabSize:
                break
            clearCli()
            print('How many questions?')
            if not isANumber:
                print('Not a number')
            elif isANumber and not isNumberWithinVocabSize:
                print('Invalid value. Vocabulary size is',
                      self.vocabulary.getVocabularySize())
        numQuestions = int(value)
        return numQuestions

    def selectLanguage(self):
        clearCli()
        print('Language of questions? (jp/en)')
        while True:
            value = input()
            isValidInput = value == 'jp' or value == 'en'
            if isValidInput:
                break
            clearCli()
            print('Language of questions? (jp/en)')
            print(CLI.invalid_command)
        return value

    def selectLessons(self):
        clearCli()
        print('Do you want to do a (s)ingle or a (r)ange of lessons? (s/r)')
        while True:
            value = input()
            isValidInput = value == 's' or value == 'r'
            if isValidInput:
                break
            clearCli()
            print('Do you want to do a (s)ingle or a (r)ange of lessons? (s/r)')
            print(CLI.invalid_command)
        if value == 's':
            startLesson, endLesson = self.selectSingleLesson()
        elif value == 'r':
            startLesson, endLesson = self.selectRangeOfLessons()
        return startLesson, endLesson

    def selectSingleLesson(self):
        clearCli()
        print('Type lesson number?')
        while True:
            value = input()
            isValidInput = self.vocabulary.hasLesson(int(value))
            if isValidInput:
                print('Valid lesson', value)
                break
            clearCli()
            print('Type lesson number?')
            print('Lesson does not exist')
        selectedLesson = value
        return selectedLesson, selectedLesson

    def selectRangeOfLessons(self):
        clearCli()
        print('Type start lesson number?')
        while True:
            value = input()
            isValidInput = self.vocabulary.hasLesson(int(value))
            if isValidInput:
                print('Valid lesson', value)
                break
            clearCli()
            print('Type start lesson number?')
            print('Lesson does not exist')
        startLesson = value
        clearCli()
        print('Start lesson: ', startLesson)
        print('Type end lesson number?')
        while True:
            value = input()
            isLargerThanStart = int(value) > int(startLesson)
            doesLessonExist = self.vocabulary.hasLesson(int(value))
            isValidInput = isLargerThanStart and doesLessonExist
            if isValidInput:
                print('Valid lesson', value)
                break
            clearCli()
            print('Start lesson: ', startLesson)
            print('Type end lesson number?')
            if not doesLessonExist:
                print('Lesson does not exist')
            elif not isLargerThanStart:
                print('End lesson should be greater than start lesson')
        endLesson = value
        assert int(endLesson) > int(startLesson)
        return startLesson, endLesson
def main(args):
    assert FLAGS.training_data_loader, "--training_data_loader is required"
    assert FLAGS.vocab_file, "--vocab_file is required"
    assert FLAGS.train_dir, "--train_dir is required"

    model_config = configuration.ModelConfig()
    training_config = configuration.TrainingConfig()

    print('Loading vocabulary file...')
    vocab = Vocabulary(FLAGS.vocab_file)
    vocab_size = vocab.get_vocabulary_size()

    # Assign parameters to model configuration.
    model_config.vocab_size = vocab_size
    training_config.train_dir = FLAGS.train_dir
    training_config.num_iterations = FLAGS.number_of_steps
    training_config.log_every_n_steps = FLAGS.log_every_n_steps
    training_config.validation_loss_every_n_steps = FLAGS.validation_loss_every_n_steps

    # Create training directory.
    if not tf.gfile.IsDirectory(training_config.train_dir):
        tf.logging.info("Creating training directory: %s",
                        training_config.train_dir)
        tf.gfile.MakeDirs(training_config.train_dir)

    # Build the TensorFlow graph.
    g = tf.Graph()
    with g.as_default():
        print('Building LSTM decoder model...')
        if not FLAGS.repeated_feed_images:
            model = LSTMDecoder(model_config, mode="train")
        else:
            model = LSTMDecoderRepeatedImageFeed(model_config, mode="train")
        model.build()

        # Setup learning rate decay.
        num_batches_per_epoch = (training_config.num_examples_per_epoch /
                                 model_config.batch_size)
        decay_steps = int(num_batches_per_epoch *
                          training_config.num_epochs_per_decay)
        global_step = tf.Variable(0, name='global_step', trainable=False)
        learning_rate = tf.train.exponential_decay(
            training_config.initial_learning_rate,
            global_step,
            decay_steps=decay_steps,
            decay_rate=training_config.learning_rate_decay_factor,
            staircase=True)
        tf.summary.scalar('learning_rate', learning_rate)

        # Setup optimizer.
        optimizer = tf.train.AdamOptimizer(learning_rate)
        train = optimizer.minimize(model.total_loss, global_step=global_step)

        # Setup summary.
        all_summary = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/train')
        val_writer = tf.summary.FileWriter(FLAGS.summary_dir + '/val')

        # Create saver.
        saver = tf.train.Saver(
            max_to_keep=training_config.max_checkpoints_to_keep)

        # Initialize variables.
        print('Initializing variables...')
        init = tf.global_variables_initializer()
        sess = tf.Session()
        sess.run(init)

        print('Initializing data loader for training set...')
        start = time.time()
        data_loader_train = DataLoader()
        data_loader_train.load(FLAGS.training_data_loader)
        end = time.time()
        time_elapsed = end - start
        print('Finished initializing data loader (time elapsed: %f)' % time_elapsed)

        print('Initializing data loader for validation set...')
        start = time.time()
        data_loader_val = DataLoader()
        data_loader_val.load(FLAGS.validation_data_loader)
        end = time.time()
        time_elapsed = end - start
        print('Finished initializing data loader (time elapsed: %f)' % time_elapsed)

        print('Start training...')
        # Stochastic gradient descent over sampled mini-batches.
        for i in range(training_config.num_iterations):
            print('Sampling mini-batch...')
            image_features, input_sequence, input_mask, target_sequence = \
                data_loader_train.segmental_sampling(
                    batch_size=training_config.batch_size,
                    num_segments=model_config.num_segments)
            _, total_loss, summary = sess.run(
                (train, model.total_loss, all_summary),
                feed_dict={
                    "input_features:0": image_features,
                    "input_feed:0": input_sequence,
                    "input_mask:0": input_mask,
                    "target_sequences:0": target_sequence
                })
            train_writer.add_summary(summary, i)

            # Logging.
            if i % training_config.log_every_n_steps == 0:
                print('[%d/%d] loss: %f' % (i, training_config.num_iterations,
                                            total_loss))

            # Save model.
            if i % training_config.save_every_n_steps == 0:
                print('Saving model at step %d...' % i)
                saver.save(sess, FLAGS.train_dir + '/model', global_step=i)

            # Evaluate the loss on the validation set at every epoch.
            if i % training_config.validation_loss_every_n_steps == 0:
                image_features, input_sequence, input_mask, target_sequence = \
                    data_loader_val.segmental_sampling(
                        batch_size=training_config.batch_size,
                        num_segments=model_config.num_segments)
                total_loss, summary = sess.run(
                    (model.total_loss, all_summary),
                    feed_dict={
                        "input_features:0": image_features,
                        "input_feed:0": input_sequence,
                        "input_mask:0": input_mask,
                        "target_sequences:0": target_sequence
                    })
                val_writer.add_summary(summary, i)
def load_view(view_name: str):
    """Return a given view from a UI file."""
    return ui.load_view(os.path.join(UI_DIR, view_name))


if __name__ == '__main__':
    # This `builtins` trick fixes a problem where launching the script from
    # the home screen can cause multiple instances to run at once.
    # https://forum.omz-software.com/topic/4097/home-screen-alias-is-script-already-running/
    try:
        (vocab, jinja2env, lookup_view, word_view, compact_word_view,
         about_view, container) = builtins.wordroom
    except (AttributeError, ValueError):
        container = None
    if isinstance(container, ui.View) and container.on_screen:
        pass  # reuse the original globals
    else:
        # initialize new globals
        vocab = Vocabulary(data_file=VOCABULARY_FILE)
        jinja2env = Environment(loader=FileSystemLoader(HTML_DIR))
        lookup_view = load_view('lookup')
        word_view = load_view('word')
        compact_word_view = load_view('word')
        about_view = load_view('about')
        container = AdaptiveView(lookup_view, word_view)
        container.name = 'WordRoom'
        container.present('fullscreen', hide_title_bar=True)
        builtins.wordroom = (vocab, jinja2env, lookup_view, word_view,
                             compact_word_view, about_view, container)
    # if appex.is_running_extension():
    #     load_word_view(appex.get_text())
def chat(self, question, chat_settings):
    """Chat with the chatbot model by predicting an answer to a question.

    'question' and 'answer' in this context are generic terms for the
    interactions in a dialog exchange and can be statements, remarks, queries,
    requests, or any other type of dialog speech. For example:
        Question: "How are you?"    Answer: "Fine."
        Question: "That's great."   Answer: "Yeah."

    Args:
        question: The input question for which the model should predict an answer.
        chat_settings: The ChatSettings instance containing the chat settings
            and inference hyperparameters.

    Returns:
        q_with_hist: question with history if
            chat_settings.show_question_context = True, otherwise None.
        answers: array of answer beams if chat_settings.show_all_beams = True,
            otherwise the single selected answer.
    """
    # Process the question by cleaning it and converting it to an integer encoded vector.
    question = Vocabulary.clean_text(question)
    question = self.input_vocabulary.words2ints(question)

    # Prepend the currently tracked steps of the conversation history separated
    # by EOS tokens. This allows for deeper dialog context to influence the
    # answer prediction.
    question_with_history = []
    for i in range(len(self.conversation_history)):
        question_with_history += self.conversation_history[i] + \
            [self.input_vocabulary.eos_int()]
    question_with_history += question

    # Get the answer prediction.
    batch = np.zeros((1, len(question_with_history)))
    batch[0] = question_with_history
    max_output_sequence_length = \
        chat_settings.inference_hparams.max_answer_words + 1  # + 1 since the EOS token is counted as a timestep
    predicted_answer_info = self.predict_batch(
        inputs=batch,
        input_sequence_length=np.array([len(question_with_history)]),
        max_output_sequence_length=max_output_sequence_length,
        beam_length_penalty_weight=chat_settings.inference_hparams.beam_length_penalty_weight,
        sampling_temperature=chat_settings.inference_hparams.sampling_temperature,
        log_summary=chat_settings.inference_hparams.log_summary)

    # Read the answer prediction.
    answer_beams = []
    if self.beam_width > 0:
        # For beam search decoding: if show_all_beams is enabled, output all
        # beams (sequences); otherwise take the first beam. The beams (in the
        # "predictions" matrix) are ordered with the highest ranked beams first.
        beam_count = 1 if not chat_settings.show_all_beams else \
            len(predicted_answer_info["predictions_seq_lengths"][0])
        for i in range(beam_count):
            predicted_answer_seq_length = \
                predicted_answer_info["predictions_seq_lengths"][0][i] - 1  # -1 to exclude the EOS token
            predicted_answer = predicted_answer_info["predictions"][0][
                :predicted_answer_seq_length, i].tolist()
            answer_beams.append(predicted_answer)
    else:
        # For greedy / sampling decoding: only one beam (sequence) is returned,
        # based on the argmax for greedy decoding or the sampling distribution
        # for sampling decoding. Return this beam.
        beam_count = 1
        predicted_answer_seq_length = \
            predicted_answer_info["predictions_seq_lengths"][0] - 1  # -1 to exclude the EOS token
        predicted_answer = predicted_answer_info["predictions"][0][
            :predicted_answer_seq_length].tolist()
        answer_beams.append(predicted_answer)

    # Add new conversation steps to the end of the history and trim from the
    # beginning if it is longer than conv_history_length.
    self.conversation_history.append(question)
    self.conversation_history.append(answer_beams[0])
    self.trim_conversation_history(
        chat_settings.inference_hparams.conv_history_length)

    # Convert the answer(s) to text and return.
    answers = []
    for i in range(beam_count):
        answer = self.output_vocabulary.ints2words(answer_beams[i])
        answers.append(answer)

    q_with_hist = None if not chat_settings.show_question_context else \
        self.output_vocabulary.ints2words(question_with_history)
    if chat_settings.show_all_beams:
        return q_with_hist, answers
    else:
        return q_with_hist, answers[0]
def train(trainFile, devFile, gramsNumber, smoothStrategy, BLaplace):
    # process data
    with open(trainFile, "r") as f:
        corpusTrain = f.readlines()
    with open(devFile, "r") as f:
        corpusDev = f.readlines()
    corpusTrainDev = corpusTrain + corpusDev

    if smoothStrategy == "laplace":
        vocab = Vocabulary(gramsNumber, corpusTrainDev)
        vocab.tune_with_Laplace_smoothing(BLaplace)
    elif smoothStrategy == "held_out":
        vocab = Vocabulary(gramsNumber, corpusTrain)
        vocab.tune_with_held_out_smoothing(corpusDev)
    elif smoothStrategy == "cross_valid":
        vocab = Vocabulary(gramsNumber, corpusTrain)
        vocab.tune_with_cross_val_smoothing(corpusDev)
    elif smoothStrategy == "good_turing":
        vocab = Vocabulary(gramsNumber, corpusTrain)
        vocab.tune_with_good_turing_smoothing()
    else:
        raise KeyError(smoothStrategy)
    return vocab
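# Hedged usage sketch for train() above; the corpus paths are hypothetical
# and the smoothing strategy is one of the four branches handled by train().
vocab = train('data/train.txt', 'data/dev.txt',
              gramsNumber=3, smoothStrategy='laplace', BLaplace=1.0)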
    return avi_data, targets, lengths


if __name__ == '__main__':
    import time
    from torch.autograd import Variable
    from torch.nn.utils.rnn import pack_padded_sequence
    from checkpoint import *

    json_file = 'data/testing_label.json'
    numpy_file = 'data/testing_data/feat'
    helper = Vocabulary(json_file, min_word_count=5)
    dataset = TrainingDataset(label_json_file=json_file,
                              training_data_path=numpy_file,
                              helper=helper,
                              load_into_ram=True)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True,
                            num_workers=8, collate_fn=collate_fn)

    ss = time.time()
    for epoch in range(1):
        s = time.time()
        print('epoch: {}'.format(epoch + 1))
        for batch_n, batch in enumerate(dataloader):
            pass  # timing body elided in the source
            # e = time.time()
class ChatBot:
    def __init__(self, layers=5, maxlen=10, embedding_size=128,
                 batch_size=32, is_train=True, lr=0.0001):
        self.layers = layers
        self.maxlen = maxlen
        self.embedding_size = embedding_size
        self.batch_size = batch_size
        self.learning_rate = lr
        # .npz is the NumPy archive format; the seq2seq weights are saved here.
        self.model_path = "model/chatbot/model.npz"

        ## Vocabulary
        self.vocab = Vocabulary(corpus=None, maxlen=maxlen)
        self.vocab_size = self.vocab.vocab_size

        ## Init Session
        sess_config = tf.ConfigProto(allow_soft_placement=True,
                                     log_device_placement=False)
        tf.reset_default_graph()
        self.sess = tf.Session(config=sess_config)

        ## Placeholders
        self.encoder_inputs = tf.placeholder(tf.int32, shape=[None, None])
        self.decoder_inputs = tf.placeholder(tf.int32, shape=[None, None])
        self.decoder_outputs = tf.placeholder(tf.int32, shape=[None, None])
        self.mask = tf.placeholder(tf.int32, shape=[None, None])

        ## Model
        self.net_out, _ = self.create_model(
            self.encoder_inputs, self.decoder_inputs, self.vocab_size,
            self.embedding_size, reuse=False)
        self.net_out.print_params(False)
        self.loss = tl.cost.cross_entropy_seq_with_mask(
            logits=self.net_out.outputs,
            target_seqs=self.decoder_outputs,
            input_mask=self.mask,
            return_details=False,
            name='cost')

        ## Optimizer
        self.train_op = tf.train.RMSPropOptimizer(
            learning_rate=self.learning_rate).minimize(self.loss)

    def train(self, X, Y, num_epochs=1):
        ## Init Vars
        self.sess.run(tf.global_variables_initializer())

        ## Load Model
        tl.files.load_and_assign_npz(sess=self.sess, name=self.model_path,
                                     network=self.net_out)

        n_step = len(X) // self.batch_size
        for epoch in range(num_epochs):
            X, Y = shuffle(X, Y, random_state=0)
            total_loss, n_iter = 0, 0
            for x, y in tqdm(tl.iterate.minibatches(inputs=X, targets=Y,
                                                    batch_size=self.batch_size,
                                                    shuffle=False),
                             total=n_step,
                             desc='Epoch[{}/{}]'.format(epoch + 1, num_epochs),
                             leave=False):
                x1, x2, y1, W = self.vocab.dataset(x, y)
                feed_data = {}
                feed_data[self.encoder_inputs] = x1
                feed_data[self.decoder_inputs] = x2
                feed_data[self.decoder_outputs] = y1
                feed_data[self.mask] = W
                _, loss_iter = self.sess.run([self.train_op, self.loss],
                                             feed_dict=feed_data)
                total_loss += loss_iter
                n_iter += 1

            ## print average loss after every epoch
            print('Epoch [{}/{}]: loss {:.4f}'.format(epoch + 1, num_epochs,
                                                      total_loss / n_iter))

            ## save the model
            tl.files.save_npz(self.net_out.all_params, name=self.model_path,
                              sess=self.sess)

        ## session cleanup
        self.sess.close()

    def create_model(self, encoder_inputs, decoder_inputs, vocab_size,
                     emb_dim, is_train=True, reuse=False):
        """Creates the LSTM model."""
        with tf.variable_scope("model", reuse=reuse):
            # For a chatbot, encoder and decoder can share one embedding layer;
            # for translation, you may want two separate embedding layers.
            with tf.variable_scope("embedding") as vs:
                net_encode = EmbeddingInputlayer(
                    inputs=encoder_inputs,
                    vocabulary_size=vocab_size,
                    embedding_size=emb_dim,
                    name='seq_embedding')
                vs.reuse_variables()
                net_decode = EmbeddingInputlayer(
                    inputs=decoder_inputs,
                    vocabulary_size=vocab_size,
                    embedding_size=emb_dim,
                    name='seq_embedding')

            net_rnn = Seq2Seq(
                net_encode, net_decode,
                cell_fn=tf.nn.rnn_cell.LSTMCell,
                n_hidden=emb_dim,
                initializer=tf.random_uniform_initializer(-0.1, 0.1),
                encode_sequence_length=retrieve_seq_length_op2(encoder_inputs),
                decode_sequence_length=retrieve_seq_length_op2(decoder_inputs),
                initial_state_encode=None,
                dropout=(0.5 if is_train else None),
                n_layer=self.layers,
                return_seq_2d=True,
                name='seq2seq')
            net_out = DenseLayer(net_rnn, n_units=vocab_size,
                                 act=tf.identity, name='output')
        return net_out, net_rnn

    def infer(self, query):
        unk_id = self.vocab.word_index["<unk>"]
        pad_id = self.vocab.word_index["<pad>"]
        start_id = self.vocab.word_index["<start>"]
        end_id = self.vocab.word_index["<end>"]

        ## Init Session
        sess_config = tf.ConfigProto(allow_soft_placement=True,
                                     log_device_placement=False)
        tf.reset_default_graph()
        sess = tf.Session(config=sess_config)

        ## Inference Data Placeholders
        encode_inputs = tf.placeholder(dtype=tf.int64, shape=[1, None],
                                       name="encode_inputs")
        decode_inputs = tf.placeholder(dtype=tf.int64, shape=[1, None],
                                       name="decode_inputs")

        net, net_rnn = self.create_model(
            encode_inputs, decode_inputs, self.vocab_size,
            self.embedding_size, is_train=False, reuse=False)
        y = tf.nn.softmax(net.outputs)

        ## Init Vars
        sess.run(tf.global_variables_initializer())

        ## Load Model
        tl.files.load_and_assign_npz(sess=sess, name=self.model_path,
                                     network=net)

        def inference(seed):
            """Inference using the pre-trained model."""
            seed_id = self.vocab.text_to_sequence(seed)

            ## Encode and get state
            state = sess.run(net_rnn.final_state_encode,
                             {encode_inputs: [seed_id]})
            ## Decode: feed start_id and get the first word
            ## [https://github.com/zsdonghao/tensorlayer/blob/master/example/tutorial_ptb_lstm_state_is_tuple.py]
            o, state = sess.run([y, net_rnn.final_state_decode],
                                {net_rnn.initial_state_decode: state,
                                 decode_inputs: [[start_id]]})
            w_id = tl.nlp.sample_top(o[0], top_k=3)
            # w = self.vocab.index_word[w_id]

            ## Decode, feeding the state back iteratively
            sentence = [w_id]
            for _ in range(self.maxlen):  # max sentence length
                o, state = sess.run([y, net_rnn.final_state_decode],
                                    {net_rnn.initial_state_decode: state,
                                     decode_inputs: [[w_id]]})
                w_id = tl.nlp.sample_top(o[0], top_k=2)
                # w = self.vocab.index_word[w_id]
                if w_id == end_id:
                    break
                sentence = sentence + [w_id]
            return sentence

        ## infer
        sentence = inference(query)
        response = self.vocab.seqs_to_text(sentence)
        response = " ".join(response.split(" "))
        return response
def prepare_data(config):
    print('Loading data for ' + config.phase)
    if config.phase == 'train':
        filetemp = os.path.join(config.train_dir, config.temp_train_file)
    elif config.phase == 'eval':
        filetemp = os.path.join(config.eval_dir, config.temp_eval_file)
    elif config.phase == 'test':
        filetemp = os.path.join(config.test_dir, config.temp_test_file)
    data = np.load(filetemp).item()
    src = data['src']
    dst = data['dst']

    print("Building the vocabulary...")
    vocabulary1 = Vocabulary(config.vocab1_size, save_file=config.vocab1_file)
    # vocabulary1.save(config.vocab1_file)
    print("Vocabulary built.")

    if config.phase == 'train':
        filetemp = os.path.join(config.train_dir, config.train_file)
    elif config.phase == 'eval':
        filetemp = os.path.join(config.eval_dir, config.eval_file)
    elif config.phase == 'test':
        filetemp = os.path.join(config.test_dir, config.test_file)

    if True:  # not os.path.exists(filetemp):
        word_idxs1, word_idxs2 = [], []
        masks1, masks2 = [], []
        len1, len2 = [], []
        for sent in src:  # tqdm(src):
            current_word_idxs_ = vocabulary1.process_sentence(sent)
            current_num_words = len(current_word_idxs_)
            len1.append(len(current_word_idxs_))
            # print('len(current_word_idxs_)', len(current_word_idxs_))
            current_word_idxs = np.zeros(config.max_input_length, dtype=np.int32)
            current_masks = np.zeros(config.max_input_length)
            current_word_idxs[:current_num_words] = np.array(current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs1.append(current_word_idxs)
            masks1.append(current_masks)
        print('src max length', max(len1))

        # import pdb; pdb.set_trace()
        for sent in dst:  # tqdm(dst):
            current_word_idxs_ = vocabulary1.process_sentence(sent + ' stop')
            current_num_words = len(current_word_idxs_)
            len2.append(len(current_word_idxs_))
            # print('len(current_word_idxs_)', len(current_word_idxs_))
            current_word_idxs = np.zeros(config.max_output_length, dtype=np.int32)
            current_masks = np.zeros(config.max_output_length)
            current_word_idxs[:current_num_words] = np.array(current_word_idxs_)
            current_masks[:current_num_words] = 1.0
            word_idxs2.append(current_word_idxs)
            masks2.append(current_masks)
        print('dst max length', max(len2))

        word_idxs1 = np.array(word_idxs1)
        masks1 = np.array(masks1)
        word_idxs2 = np.array(word_idxs2)
        masks2 = np.array(masks2)
        len1 = np.array(len1)
        len2 = np.array(len2)
        data = {'word_idxs1': word_idxs1, 'masks1': masks1,
                'word_idxs2': word_idxs2, 'masks2': masks2,
                'len1': len1, 'len2': len2}
        np.save(filetemp, data)
    else:
        data = np.load(filetemp).item()
        word_idxs1 = data['word_idxs1']
        masks1 = data['masks1']
        len1 = data['len1']
        word_idxs2 = data['word_idxs2']
        masks2 = data['masks2']
        len2 = data['len2']

    print("Building the dataset...")
    is_train = config.phase == 'train'
    dataset = DataSet(word_idxs1, masks1, len1, config.batch_size,
                      word_idxs2, masks2, len2,
                      is_train=is_train, shuffle=is_train)
    print("Dataset built.")
    print("prepare data for " + config.phase + " done!")
    return dataset, vocabulary1  # , vocabulary2
class ConstrainedContextSeq2SeqEmbeddings(SconeModel): """Model that predicts a sequence of actions (action and arguments). Attributes: Todo: * Consider refactoring. E.g., have a class for an encoder and a decoder. * Fewer parameters in the constructor. """ def __init__(self, in_vocab, output_vocabularies, state_encoder_builder, valid_action_fn, args): SconeModel.__init__(self, state_encoder_builder, in_vocab, args.embeddings_size, args.num_enc_layers, args.encoder_size, args.decoder_size, RNNBuilder) self.args = args self._dropout = 0. # Output vocabs and embeddings. self.output_action_vocabulary = Vocabulary(output_vocabularies[0], [EOS, BEG]) self.output_location_vocabulary.g = Vocabulary(output_vocabularies[1], [NO_ARG, BEG]) self.output_argument_vocabulary = Vocabulary(output_vocabularies[2], [NO_ARG, BEG]) # All outputs vocabulary. all_vocabulary_list = [] self._valid_action_indices = [] index = 0 for action in self.output_action_vocabulary: for location in self.output_location_vocabulary.g: for argument in self.output_argument_vocabulary: if action != BEG and location != BEG and argument != BEG: if valid_action_fn(action, location, argument): self._valid_action_indices.append(index) all_vocabulary_list.append((action, location, argument)) index += 1 self._all_output_vocabulary = Vocabulary(all_vocabulary_list, []) self._output_action_embeddings = self._pc.add_lookup_parameters( (len(self.output_action_vocabulary), args.embeddings_size), name="output-action-embeddings") self._output_location_embeddings = self._pc.add_lookup_parameters( (len(self.output_location_vocabulary.g), args.embeddings_size), name="output-location-embeddings") self._output_argument_embeddings = self._pc.add_lookup_parameters( (len(self.output_argument_vocabulary), args.embeddings_size), name="output-argument-embeddings") # Action decoder RNN. self._dec_input_size = args.encoder_size * 2 \ + args.encoder_size * 2 \ + self._state_encoder.item_size() * 2 \ + args.embeddings_size * 3 self._decoder = RNNBuilder(args.num_dec_layers, self._dec_input_size, args.decoder_size, self._pc) situated_in_size = self._dec_input_size if self.args.always_initial_state: self._state_attention_winitial = self._pc.add_parameters( (self.args.encoder_size * 2 + self.args.decoder_size, self._state_encoder.item_size()), name="state-attention-winitial") self._state_attention_winitial2 = self._pc.add_parameters( (self.args.encoder_size * 2 + self.args.decoder_size, self._state_encoder.item_size()), name="state-attention-winitial2") situated_in_size += 2 * self._state_encoder.item_size() # MLP parameters to mix the situated embedding. self._situated_w = self._pc.add_parameters( (self._dec_input_size, situated_in_size), name="situated-w") self._situated_b = self._pc.add_parameters((self._dec_input_size), name="situated-b") # Project the RNN output to a vector that is the length of the output # vocabulary. 
self._final_w = self._pc.add_parameters( (args.decoder_size, args.decoder_size), name="final-w") self._output_w_action = self._pc.add_parameters( (len(self.output_action_vocabulary) - 1, args.decoder_size), name="output-w-action") self._output_w_location = self._pc.add_parameters( (len(self.output_location_vocabulary.g) - 1, args.decoder_size), name="output-w-location") self._output_w_argument = self._pc.add_parameters( (len(self.output_argument_vocabulary) - 1, args.decoder_size), name="output-w-argument") def probability_of_token(self, token, probability_dist): return probability_dist[self._all_output_vocabulary.lookup_index(tuple(token))] def set_dropout(self, amount): """ Sets the dropout amount for the model, changes during various learning stages. Inputs: amount (float): Amount of dropout to apply. """ self._dropout = amount def compute_entropy(self, distribution): """ Gets the entropy of a probability distribution that may contain zeroes. Inputs: probability_distribution (dy.Expression): The probability distribution. Returns: dy.Expression representing the entropy. """ num_actions = len(self.output_action_vocabulary) - 1 num_locations = len(self.output_location_vocabulary.g) - 1 num_arguments = len(self.output_argument_vocabulary) - 1 valid_mask = numpy.zeros(num_actions * num_locations * num_arguments) for index in self._valid_action_indices: valid_mask[index] = 1. # This mask is one for all valid indices, and zero for all others. valid_mask = dy.inputTensor(valid_mask) # This basically replaces everything in the probability distribution # with the original value (if valid), or zero (if not valid). valid_probs = dy.cmult(valid_mask, distribution) # The inverse of valid mask, this gives a value of 1. if something is invalid. invalid_probs = 1.-valid_mask # The result of this operation is that everything that's valid gets its # original probability, and everything that's not gets a probability of 1. probs = valid_probs + invalid_probs # dy.log(probs) will give log(p(action)) if action is valid, and # log(1)=0 for invalid actions. # then entropies will be zero for everything that isn't valid, and the # actual p log(p) otherwise. entropies = dy.cmult(probs, dy.log(probs + 0.00000000001)) return -dy.sum_elems(entropies) def action_probabilities(self, distribution): num_actions = len(self.output_action_vocabulary) - 1 num_locations = len(self.output_location_vocabulary.g) - 1 num_arguments = len(self.output_argument_vocabulary) - 1 zeroes = numpy.zeros(num_locations * num_arguments) ones = numpy.ones(num_locations * num_arguments) actions_masks = [] probs = { } action_idx = 0 for action in self.output_action_vocabulary: if action != BEG: masks = numpy.concatenate( (numpy.repeat(zeroes, action_idx), ones, numpy.repeat(zeroes, num_actions - action_idx - 1))) actions_masks = dy.reshape(dy.inputTensor(masks), (num_actions * num_locations * num_arguments, 1)) action_prob = dy.sum_elems(dy.cmult(actions_masks, distribution)) probs[action] = action_prob action_idx += 1 return probs def group_tokens(self, string): """ Groups tokens from a flat list of strings into action sequence. Inputs: string (list of str): Flat action sequence. Returns: list of tuple, representing parameterized actions. 
""" seq = [] current_triple = [] for token in string: if token in self.output_action_vocabulary: if len(current_triple) == 3: # Push the current triple and add this one seq.append(current_triple) elif len(current_triple) < 3 and current_triple: # Means that there were no arguments current_triple.extend( [NO_ARG for _ in range(3 - len(current_triple))]) assert len(current_triple) == 3 seq.append(current_triple) current_triple = [token] elif token in self.output_location_vocabulary.g: assert len(current_triple) == 1, \ "Location " + str(token) + " must follow an action," \ + " but current triple was " + str(current_triple) current_triple.append(token) elif token in self.output_argument_vocabulary: assert len(current_triple) == 2, \ "Argument " + str(token) + " must follow an action and location," \ + " but current triple was " + str(current_triple) current_triple.append(token) if len(current_triple) < 3 and current_triple: current_triple.extend( [NO_ARG for _ in range(3 - len(current_triple))]) assert len(current_triple) == 3 or not current_triple if len(current_triple) == 3: seq.append(current_triple) return seq def _out_to_int(self, string, add_eos=False): if add_eos: string = list(string) + [EOS] else: string = list(string) return [(self.output_action_vocabulary.lookup_index(tok[0]), self.output_location_vocabulary.g.lookup_index(tok[1]), self.output_argument_vocabulary.lookup_index(tok[2])) \ for tok in self.group_tokens(string)] def _get_probs(self, rnn_output, restrict=None): final_w = dy.parameter(self._final_w) output_w_action = dy.parameter(self._output_w_action) output_w_location = dy.parameter(self._output_w_location) output_w_argument = dy.parameter(self._output_w_argument) intermediate_state = final_w * rnn_output if self.args.final_nonlinearity: intermediate_state = dy.tanh(intermediate_state) action_scores = output_w_action * intermediate_state location_scores = output_w_location * intermediate_state argument_scores = output_w_argument * intermediate_state flattened_scores = flatten_triple(action_scores, location_scores, argument_scores) if restrict or self.args.syntax_restricted: restrict_tokens = self._valid_action_indices if restrict: restrict_tokens = restrict return dy.exp(dy.log_softmax(flattened_scores, restrict=restrict_tokens)) else: probs = dy.softmax(flattened_scores) return probs def _predict(self, rnn_output, fsa_restricted=False, fsa=None): # Forces a forward pass to get value. probs = self._get_probs( rnn_output, restrict=fsa.valid_actions(self._all_output_vocabulary) if fsa_restricted else None).value() max_tuple = numpy.argmax(probs) predicted_token = self._all_output_vocabulary.lookup_token(max_tuple) return (predicted_token, probs[max_tuple]) def _init_decoder(self): return self._decoder.initial_state().add_input(dy.vecInput(self._dec_input_size)) def _embed_predicted_triple(self, triple): return dy.concatenate([self._output_action_embeddings[triple[0]], self._output_location_embeddings[triple[1]], self._output_argument_embeddings[triple[2]]]) def _decoder_input_embedding(self, rnn_state, previous_triple, encoded_string, enc_state, encoded_history, training=False, initial_state=None): attention_vecs = {} # Compute attention over encodded string. utterance_attn, utterance_dist = attend(encoded_string, rnn_state.h()[-1], dy.parameter(self._utterance_attention_w), self._dropout if training else 0.) attention_vecs['utterance'] = utterance_dist # Key for state and history attention. 
        attn_key = dy.concatenate([utterance_attn, rnn_state.h()[-1]])
        if training:
            attn_key = dy.dropout(attn_key, self._dropout)

        # Attend on history using the current state and utterance attention.
        history_attn, history_dist = attend(
            encoded_history, attn_key,
            dy.parameter(self._history_attention_w),
            self._dropout if training else 0.)
        attention_vecs['history'] = history_dist

        # Attend on state.
        state_attn, state_dist = attend(
            enc_state, attn_key,
            dy.parameter(self._state_attention_w),
            self._dropout if training else 0.)
        state_attn2, state_dist2 = attend(
            enc_state, attn_key,
            dy.parameter(self._state_attention_w2),
            self._dropout if training else 0.)
        attention_vecs['state_1'] = state_dist
        attention_vecs['state_2'] = state_dist2

        # Embed the previously predicted triple.
        prev_emb = self._embed_predicted_triple(previous_triple)

        # Concatenate with history and state, and mix with a feed-forward
        # layer.
        situated_embedding = dy.concatenate(
            [utterance_attn, history_attn, state_attn, state_attn2, prev_emb])

        # Attend on the initial state (if provided).
        if self.args.feed_updated_state and self.args.always_initial_state:
            if not initial_state:
                raise ValueError(
                    "Encoding the initial state, but it was not provided.")
            initial_attn, initial_dist = attend(
                initial_state, attn_key,
                dy.parameter(self._state_attention_winitial),
                self._dropout if training else 0.)
            initial_attn2, initial_dist2 = attend(
                initial_state, attn_key,
                dy.parameter(self._state_attention_winitial2),
                self._dropout if training else 0.)
            attention_vecs['initial_1'] = initial_dist
            attention_vecs['initial_2'] = initial_dist2
            situated_embedding = dy.concatenate(
                [situated_embedding, initial_attn, initial_attn2])

        # Situated embedding mixing parameters.
        weights = dy.parameter(self._situated_w)
        biases = dy.parameter(self._situated_b)
        situated_embedding = dy.tanh(weights * situated_embedding + biases)

        return situated_embedding, attention_vecs

    def get_losses(self, utterance, output_seq, state, history, fsa=None,
                   training=False):
        """Gets the losses of a gold sequence.

        Args:
            utterance (list of str): Represents the current utterance.
            output_seq (list of triple of str): Represents the gold output
                sequence.
            state (WorldState): Represents the state of the environment.
            history (list of list of str): Represents the previous utterances.
            fsa (ExecutableFSA, optional): An FSA builder object.
            training (bool, optional): Whether or not you are training right
                now.

        Returns:
            list of dy.Expression, where each corresponds to the loss at each
            gold output prediction.
        """
        enc_utterance, enc_history, enc_state = self._encode_inputs(
            utterance, state, history)
        initial_encoded_state = enc_state

        output_seq = self.group_tokens(output_seq + [EOS])

        # Run the decoder (forced decoding).
        rnn_state = self._init_decoder()
        losses = []
        prev_token_ints = (
            self.output_action_vocabulary.lookup_index(BEG),
            self.output_location_vocabulary.lookup_index(BEG),
            self.output_argument_vocabulary.lookup_index(BEG))
        for i, output_token in enumerate(output_seq):
            if self.args.feed_updated_state:
                if not fsa:
                    raise ValueError("Attempting to feed the updated state, "
                                     "but no FSA was provided.")
                enc_state = self._state_encoder.encode(fsa.state())

            # Compute the decoder input.
            situated_embedding, _ = self._decoder_input_embedding(
                rnn_state,
                prev_token_ints,
                enc_utterance,
                enc_state,
                enc_history,
                training,
                initial_state=initial_encoded_state
                if self.args.always_initial_state else None)
            if training:
                situated_embedding = dy.dropout(situated_embedding,
                                                self._dropout)

            # Weird choice -- the embedding of the previously generated token
            # is not added here. TODO: fix.
            rnn_state = rnn_state.add_input(situated_embedding)

            gold_index = self._all_output_vocabulary.lookup_index(
                tuple(output_token))
            log_prob_token = dy.log(
                self._get_probs(rnn_state.output())[gold_index])

            if self.args.feed_updated_state \
                    and output_token != (EOS, NO_ARG, NO_ARG) \
                    and output_token != [EOS, NO_ARG, NO_ARG]:
                fsa.feed_complete_action(*output_token)

            # Loss of the labeled token.
            losses.append(-log_prob_token)

            prev_token_ints = (
                self.output_action_vocabulary.lookup_index(output_token[0]),
                self.output_location_vocabulary.lookup_index(output_token[1]),
                self.output_argument_vocabulary.lookup_index(output_token[2]))

        return losses

    def _update_rnn_state(self, encoded_states, fsa, rnn_state,
                          previous_token, initial_state=None, training=False):
        """Generates a single token given a state."""
        # Generate only if at the beginning of the sequence or the previously
        # generated token was EOS.
        utterance = encoded_states[0]
        history = encoded_states[1]
        world_state = encoded_states[2]
        if self.args.feed_updated_state:
            if not fsa:
                raise ValueError("Attempting to feed the updated state, "
                                 "but no FSA was provided.")
            if not fsa.state():
                raise ValueError("Attempting to feed the updated state, "
                                 "but the FSA state was None.")
            world_state = self._state_encoder.encode(fsa.state())
        situated_embedding, attentions = self._decoder_input_embedding(
            rnn_state,
            previous_token,
            utterance,
            world_state,
            history,
            initial_state=initial_state,
            training=training)
        if training:
            situated_embedding = dy.dropout(situated_embedding, self._dropout)
        return rnn_state.add_input(situated_embedding), attentions

    def _policy_shape_probs(self, prob_dist):
        # TODO: this is specific to Alchemy.
        num_actions = len(self.output_action_vocabulary) - 1
        num_locations = len(self.output_location_vocabulary) - 1
        num_arguments = len(self.output_argument_vocabulary) - 1
        new_probdist = dy.zeros(prob_dist.dim()[0])
        zeroes = numpy.zeros(num_locations * num_arguments)
        ones = numpy.ones(num_locations * num_arguments)
        eos_prob = prob_dist[self._all_output_vocabulary.lookup_index(
            (EOS, NO_ARG, NO_ARG))]
        action_idx = 0
        for action in self.output_action_vocabulary:
            masks = numpy.concatenate(
                (numpy.repeat(zeroes, action_idx),
                 ones,
                 numpy.repeat(zeroes, num_actions - action_idx - 1)))
            actions_masks = dy.reshape(
                dy.inputTensor(masks),
                (num_actions * num_locations * num_arguments, 1))
            if action == EOS:
                new_probdist += dy.cmult(actions_masks, prob_dist) / 2.
            elif action == "push":
                # NOTE: 56. is a magic, domain-specific constant (see the
                # Alchemy TODO above).
                new_probdist += dy.cmult(actions_masks, prob_dist) \
                    + eos_prob / (2. * 56.)
            elif action == "pop":
                new_probdist += dy.cmult(actions_masks, prob_dist)
            action_idx += 1

        if self.args.syntax_restricted:
            return dy.exp(
                dy.log_softmax(dy.cmult(new_probdist, prob_dist),
                               restrict=self._valid_action_indices))
        else:
            return dy.softmax(dy.cmult(new_probdist, prob_dist))

    def sample_sequences(self, batch, length=LEN_LIMIT, training=False,
                         fsa_builder=None):
        """Rolls out using a policy (the probability distribution).

        Args:
            batch (list of examples): The batch that is being used to roll
                out.
            length (int, optional): The maximum length of the roll out.
            training (bool, optional): Whether or not training.
            fsa_builder (ExecutableFSA): An FSA that can be used to constrain.

        Returns:
            Todo:
                * Docstring.
                * No use of 'filter'.
                * Make returned value more clear.
                * Fewer branches.
                * Shorter (i.e., refactor).
        """
        sample_start = time.time()
        batch_states = []
        batch_initial_states = []
        batch_prob_sequences = [[] for example in batch]
        batch_sequences = [[] for example in batch]
        finished_seqs = [False for example in batch]
        batch_encoded_states = []
        for example in batch:
            encoded_inputs = self._encode_inputs(
                example.utterance, example.initial_state, example.history)
            batch_encoded_states.append(encoded_inputs)
            batch_initial_states.append(encoded_inputs[2])

            initial_state = None
            if self.args.feed_updated_state:
                if not fsa_builder:
                    raise ValueError("Need an FSA builder when feeding the "
                                     "updated state during sampling.")
                initial_state = fsa_builder(example.initial_state)
            batch_states.append(
                (initial_state,
                 self._init_decoder(),
                 (self.output_action_vocabulary.lookup_index(BEG),
                  self.output_location_vocabulary.lookup_index(BEG),
                  self.output_argument_vocabulary.lookup_index(BEG))))

        for _ in range(length):
            # Generate probabilities for this step.
            batch_probs = []
            batch_rnn_states = []
            assert len(batch) == len(batch_encoded_states)
            assert len(batch) == len(batch_states)
            for j, (example, encoded_states, state, initial_state) in \
                    enumerate(zip(batch, batch_encoded_states, batch_states,
                                  batch_initial_states)):
                if not finished_seqs[j]:
                    rnn_state, _ = self._update_rnn_state(
                        encoded_states, state[0], state[1], state[2],
                        initial_state, training=training)
                    probs = self._get_probs(rnn_state.output())
                else:
                    probs = None
                    rnn_state = None
                batch_probs.append(probs)
                batch_rnn_states.append(rnn_state)

            # Do a forward pass on the entire batch.
            if [prob for prob in batch_probs if prob]:
                dy.esum([dy.concatenate(list(prob))
                         for prob in batch_probs if prob]).value()

                # Update the batch states and keep track of the probability
                # distributions and generated sequences.
                new_states = []
                assert len(batch) == len(batch_states)
                assert len(batch) == len(batch_probs)
                assert len(batch) == len(batch_rnn_states)
                for j, (example, old_state, prob_dist, rnn_state) in enumerate(
                        zip(batch, batch_states, batch_probs,
                            batch_rnn_states)):
                    if not finished_seqs[j]:
                        # Get the predicted token by sampling.
                        sampling_policy = prob_dist
                        if self.args.policy_shaping:
                            sampling_policy = self._policy_shape_probs(
                                prob_dist)
                        predicted_token, token_prob = sample_any_tok(
                            sampling_policy, self._all_output_vocabulary)

                        # Update the FSA.
                        fsa = None
                        if self.args.feed_updated_state \
                                and predicted_token != (EOS, NO_ARG, NO_ARG):
                            fsa = old_state[0]
                            peek_state = fsa.peek_complete_action(
                                *predicted_token)
                            if peek_state \
                                    and predicted_token != (EOS, NO_ARG,
                                                            NO_ARG):
                                fsa.feed_complete_action(*predicted_token)

                        # Only update batch states if you don't predict EOS.
                        # Otherwise, there is no point in continuing to
                        # generate for this example.
                        if predicted_token == (EOS, NO_ARG, NO_ARG):
                            finished_seqs[j] = True
                            new_states.append((None, None, None))
                        else:
                            predicted_token_idxs = (
                                self.output_action_vocabulary.lookup_index(
                                    predicted_token[0]),
                                self.output_location_vocabulary.lookup_index(
                                    predicted_token[1]),
                                self.output_argument_vocabulary.lookup_index(
                                    predicted_token[2]))
                            new_states.append(
                                (fsa, rnn_state, predicted_token_idxs))

                        # Update probability expressions and samples.
                        batch_sequences[j].append(
                            (predicted_token, token_prob))
                        batch_prob_sequences[j].append(prob_dist)
                    else:
                        new_states.append((None, None, None))
                batch_states = new_states
            else:
                break
        return batch_prob_sequences, batch_sequences

    def generate_probs(self, utterance, state, history, fsa=None,
                       fsa_restricted=False):
        """Gets predictions (by argmax) and their probabilities.

        Args:
            utterance (list of str): The current utterance.
            state (WorldState): The world state.
            history (list of list of str): The previous utterances.
            fsa (ExecutableFSA, optional): The FSA builder object, if using
                constrained decoding.

        Returns:
            list of (str, float), representing the predicted sequence, where
            each string is the predicted token and the float is the
            probability of the token.
        """
        dy.renew_cg()

        encoded_states = self._encode_inputs(utterance, state, history)
        initial_state = encoded_states[2]

        # Run the decoder.
        rnn_state = self._init_decoder()
        output_seq_probs = []
        attentions = []
        predicted_token_ints = [
            self.output_action_vocabulary.lookup_index(BEG),
            self.output_location_vocabulary.lookup_index(BEG),
            self.output_argument_vocabulary.lookup_index(BEG)]

        while len(output_seq_probs) <= LEN_LIMIT:
            # Compute the decoder input.
            rnn_state, attention = self._update_rnn_state(
                encoded_states,
                fsa,
                rnn_state,
                predicted_token_ints,
                initial_state if self.args.always_initial_state else None)
            attentions.append(attention)

            if self.args.fsa_restricted:
                raise ValueError("FSA-restricted generation is not "
                                 "implemented when jointly predicting all "
                                 "three outputs.")
            else:
                predicted_token, prob = self._predict(
                    rnn_state.output(), fsa_restricted, fsa)

            output_seq_probs.append((predicted_token, prob))
            predicted_token_ints = [
                self.output_action_vocabulary.lookup_index(
                    predicted_token[0]),
                self.output_location_vocabulary.lookup_index(
                    predicted_token[1]),
                self.output_argument_vocabulary.lookup_index(
                    predicted_token[2])]

            if predicted_token == (EOS, NO_ARG, NO_ARG):
                return output_seq_probs, attentions

            if self.args.feed_updated_state:
                peek_state = fsa.peek_complete_action(*predicted_token)
                if peek_state:
                    fsa.feed_complete_action(*predicted_token)
        return output_seq_probs, attentions

    def generate(self, utterance, state, history, fsa, fsa_restricted=False):
        """Generates a sequence of predicted tokens for an input.

        Args:
            utterance (list of str): The current utterance.
            state (WorldState): The world state.
            history (list of list of str): The previous utterances.
            fsa (ExecutableFSA): The FSA, for constrained decoding.

        Returns:
            list of str, representing the predicted sequence.
        """
        preds_and_probs, attentions = self.generate_probs(
            utterance, state, history, fsa, fsa_restricted)

        # Get only the tokens and remove the EOS token at the end.
        preds = [p[0] for p in preds_and_probs]
        if list(preds[-1]) == [EOS, NO_ARG, NO_ARG]:
            preds = preds[:-1]
        return preds, attentions
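# The decoder above relies on an `attend` helper that is not defined in this
# snippet. A minimal sketch, assuming DyNet and bilinear scoring -- the
# signature matches the call sites above, but the original implementation may
# differ:
import dynet as dy

def attend(encoded, key, weights, dropout_amount=0.):
    """Attend over `encoded` (a list of column vectors) with query `key`."""
    context = dy.concatenate_cols(encoded)            # (enc_dim, num_items)
    scores = dy.transpose(context) * (weights * key)  # (num_items, 1)
    distribution = dy.softmax(scores)
    if dropout_amount > 0.:
        distribution = dy.dropout(distribution, dropout_amount)
    attended = context * distribution                 # weighted sum: (enc_dim, 1)
    return attended, distribution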
def main(args):
    assert FLAGS.validation_data_loader, "--validation_data_loader is required"
    assert FLAGS.vocab_file, "--vocab_file is required"
    assert FLAGS.model_path, "--model_path is required"
    assert FLAGS.selection_method in ['sampling', 'argmax', 'beam_search'], \
        "--selection_method can only be one of 'sampling', 'argmax' and 'beam_search'."

    model_config = configuration.ModelConfig()

    print('Loading vocabulary file...')
    vocab = Vocabulary(FLAGS.vocab_file)
    vocab_size = vocab.get_vocabulary_size()
    # Assign parameters to the model configuration.
    model_config.vocab_size = vocab_size

    # Build the TensorFlow graph.
    g = tf.Graph()
    with g.as_default():
        print('Building LSTM decoder model for inference...')
        if not FLAGS.repeated_feed_images:
            model = LSTMDecoder(model_config, mode="inference")
        else:
            model = LSTMDecoderRepeatedImageFeed(model_config,
                                                 mode="inference")
        model.build()

        print('Initializing variables...')
        init = tf.global_variables_initializer()
        sess = tf.Session()
        sess.run(init)

        print('Loading saved model...')
        saver = tf.train.Saver()
        saver.restore(sess, FLAGS.model_path)

        print('Initializing data loader for validation set...')
        start = time.time()
        data_loader_val = DataLoader()
        data_loader_val.load(FLAGS.validation_data_loader)
        end = time.time()
        time_elapsed = end - start
        print('Finished initializing data loader (time elapsed: %f)' %
              time_elapsed)

        print('Start inference...')
        initial_input_sequence = np.zeros(model_config.batch_size,
                                          dtype=np.int32)
        initial_input_sequence.fill(vocab.start_id)
        max_sentence_length = const_config.lstm_truncated_length + 1
        json_results = []
        for image_features, _, _, _, video_indices, video_segment_indices, valid_count in \
                data_loader_val.segmental_sampling_iter(
                    batch_size=model_config.batch_size,
                    num_segments=model_config.num_segments):
            current_input = initial_input_sequence.copy()
            if not FLAGS.repeated_feed_images:
                current_state = sess.run(
                    fetches="lstm/initial_state:0",
                    feed_dict={"input_features:0": image_features})
            else:
                current_state = sess.run(fetches="lstm/initial_state:0",
                                         feed_dict={})
            generated_sentences = np.zeros(
                (model_config.batch_size, max_sentence_length),
                dtype=np.int32)
            generated_sentences[:, 0] = current_input
            completed_masks = np.zeros(model_config.batch_size, dtype=np.bool)
            for i in range(const_config.lstm_truncated_length):
                if not FLAGS.repeated_feed_images:
                    softmax_output, next_state = sess.run(
                        fetches=["softmax:0", "lstm/state:0"],
                        feed_dict={
                            "input_feed:0": current_input,
                            "lstm/state_feed:0": current_state
                        })
                else:
                    softmax_output, next_state = sess.run(
                        fetches=["softmax:0", "lstm/state:0"],
                        feed_dict={
                            "input_feed:0": current_input,
                            "lstm/state_feed:0": current_state,
                            "input_features:0": image_features
                        })
                if FLAGS.selection_method == 'sampling':
                    # Sample the next word according to the probability.
                    next_input = []
                    for probs in softmax_output:
                        next_input.append(
                            np.random.choice(vocab_size, p=probs))
                    next_input = np.array(next_input)
                elif FLAGS.selection_method == 'argmax':
                    next_input = np.argmax(softmax_output, axis=1)
                else:
                    # TODO: implement beam search
                    next_input = None
                generated_sentences[:, i + 1] = next_input
                # Update input and state.
                current_input = next_input
                current_state = next_state
                # Early stop if we have generated the <END> token for all
                # sentences.
                for j, word_id in enumerate(next_input):
                    if word_id == vocab.end_id:
                        completed_masks[j] = True
                if sum(completed_masks) == model_config.batch_size:
                    break

            # Extract text sentences.
            sentences = []
            for word_id_array in generated_sentences:
                word_id_array = remove_start_end_word_ids(word_id_array,
                                                          vocab)
                text = vocab.id_array_to_sentence(word_id_array)
                sentences.append(text)
            sentences = sentences[:valid_count]
            for sentence in sentences:
                print(sentence)

            for i in range(valid_count):
                video_idx = video_indices[i]
                segment_idx = video_segment_indices[i]
                video = data_loader_val.videos[video_idx]
                video_segment = video.video_segments[segment_idx]
                caption_trimmed = remove_start_end_word_ids(
                    video_segment.caption, vocab)
                gt_caption = vocab.id_array_to_sentence(caption_trimmed)
                video_segment_name = video.name + str(segment_idx)
                json_results.append({
                    'name': video_segment_name,
                    'video_caption': sentences[i],
                    'gt_caption': gt_caption
                })

        print('Finished inference.')
        print('Dumping results...')
        with open(FLAGS.output_file, 'w') as fo:
            json.dump(json_results, fo, indent=4)
        print('Done.')
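# `remove_start_end_word_ids` is used above but not shown. A plausible sketch
# (hypothetical, not the original): strip the <START> id and truncate at the
# first <END> id.
import numpy as np

def remove_start_end_word_ids(word_id_array, vocab):
    ids = [int(i) for i in word_id_array]
    if ids and ids[0] == vocab.start_id:
        ids = ids[1:]
    if vocab.end_id in ids:
        ids = ids[:ids.index(vocab.end_id)]
    return np.array(ids, dtype=np.int32)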
                    type=int,
                    default=VAL_FREQ_DEFAULT,
                    help='Frequency of evaluation on the validation set')
parser.add_argument('--vocab_file',
                    type=str,
                    default=DEFAULT_VOCAB_FILE,
                    help='Path to the vocabulary file')
parser.add_argument('--one_hot',
                    type=str,
                    default=ONE_HOT_DEFAULT,
                    help='Apply one-hot encoding')
parser.add_argument('--check_freq',
                    type=int,
                    default=CHECKPOINT_FREQ_DEFAULT,
                    help='Frequency of checkpointing (test and save results)')
parser.add_argument('--name',
                    type=str,
                    default=MODEL_NAME_DEFAULT,
                    help='Model name')
# Note: argparse's type=bool treats any non-empty string as True;
# consider action='store_true' instead.
parser.add_argument('--append',
                    type=bool,
                    default=APPEND_DEFAULT,
                    help='Append start/end tokens')
FLAGS, unparsed = parser.parse_known_args()

vocabulary = Vocabulary(FLAGS.vocab_file, None, None, flag='load')
VOCAB_SIZE = len(vocabulary._vocab)
start_v = vocabulary.word_to_id("#START#")
end_v = vocabulary.word_to_id("#END#")
print(vocabulary.word_to_id('dressing'))

main(None)
import atislexicon from augmentation import Augmenter import domains from encoderdecoder import EncoderDecoderModel from attention import AttentionModel from example import Example import spec as specutil from vocabulary import Vocabulary MODELS = collections.OrderedDict([ ('encoderdecoder', EncoderDecoderModel), ('attention', AttentionModel), ]) VOCAB_TYPES = collections.OrderedDict([ ('raw', lambda s, e, **kwargs: Vocabulary.from_sentences(s, e, **kwargs)), ('glove', lambda s, e, **kwargs: Vocabulary.from_sentences( s, e, use_glove=True, **kwargs)) ]) # Global options OPTIONS = None # Global statistics STATS = {} def _parse_args(): global OPTIONS parser = argparse.ArgumentParser( description='A neural semantic parser.',
import pipeline_lstm
import pipeline_cnn

# pipeline_lstm.train()
# pipeline_lstm.test()
# pipeline_cnn.test()

import data_video
from vocabulary import Vocabulary

vocab_path = data_video.msvd_bilingual_vocab_char_path
vocab = Vocabulary.load(vocab_path)
dataset = data_video.MSVDDatasetBilingual(vocab=vocab,
                                          segment_method='char',
                                          caption_mode='text',
                                          split='train')
dataset.data.sort(key=lambda x: x.video_id)
with open('all_captions.txt', 'w') as f:
    for d in dataset.data:
        f.write('{:>12} {}\n'.format(d.video_id, d.caption))
import general_utils import chat_command_handler from chat_settings import ChatSettings from chatbot_model import ChatbotModel from vocabulary import Vocabulary #Read the hyperparameters and configure paths _, model_dir, hparams, checkpoint = general_utils.initialize_session("chat") #Load the vocabulary print() print("Loading vocabulary...") if hparams.model_hparams.share_embedding: shared_vocab_filepath = path.join(model_dir, Vocabulary.SHARED_VOCAB_FILENAME) input_vocabulary = Vocabulary.load(shared_vocab_filepath) output_vocabulary = input_vocabulary else: input_vocab_filepath = path.join(model_dir, Vocabulary.INPUT_VOCAB_FILENAME) input_vocabulary = Vocabulary.load(input_vocab_filepath) output_vocab_filepath = path.join(model_dir, Vocabulary.OUTPUT_VOCAB_FILENAME) output_vocabulary = Vocabulary.load(output_vocab_filepath) #Create the model print("Initializing model...") print() with ChatbotModel(mode="infer", model_hparams=hparams.model_hparams, input_vocabulary=input_vocabulary,
def read_instances_from_file(files, max_len=400, keep_case=False):
    '''Collect instances and construct vocab.'''
    vocab = Vocabulary()
    lb_vocab = Vocabulary(need_default=False)
    sets = []
    for path in files:
        sents, labels = [], []
        trimmed_sent = 0
        with open(path) as f:
            lines = f.readlines()
            for l in lines:
                l = l.strip().split('\t')
                if len(l) < 2:
                    continue
                label = l[0]
                sent = l[1]
                if not keep_case:
                    sent = sent.lower()
                word_lst = sent.split()
                if len(word_lst) > max_len:
                    word_lst = word_lst[:max_len]
                    trimmed_sent += 1
                if word_lst:
                    sents.append(word_lst)
                    labels.append(label)
                    vocab.add_word_lst(word_lst)
                    lb_vocab.add_word(label)
        assert len(sents) == len(labels)
        sets.append({'sents': sents, 'labels': labels})
        logger.info('Got {} instances from file {}'.format(len(sents), path))
        if trimmed_sent:
            logger.info(
                '{} sentences were trimmed. Max sentence length: {}.'.format(
                    trimmed_sent, max_len))

    logger.info('Building vocabulary...')
    # Add '<cls>' multiple times, presumably so it survives a min-frequency
    # cutoff.
    vocab.add_word_lst(['<cls>'] * 6)
    vocab.build_vocab()
    lb_vocab.build_vocab()
    logger.info('Finished. Size of vocab: {}. # Classes: {}.'.format(
        len(vocab), len(lb_vocab)))
    logger.info('<pad>: {}'.format(vocab.to_index('<pad>')))
    logger.info('<unk>: {}'.format(vocab.to_index('<unk>')))
    logger.info('<cls>: {}'.format(vocab.to_index('<cls>')))
    return sets, vocab, lb_vocab
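# Example call for read_instances_from_file (file names are placeholders):
# one word vocabulary is shared across splits, and labels get their own
# vocabulary.
sets, vocab, lb_vocab = read_instances_from_file(
    ['train.tsv', 'dev.tsv', 'test.tsv'], max_len=400, keep_case=False)
train_sents = sets[0]['sents']
train_labels = sets[0]['labels']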
def build_vocabs():
    tasks = [
        '.'.join([id, syn]) for id in ['autoid', 'goldid']
        for syn in ['autosyn', 'goldsyn']
    ]
    stypes = ['train', 'dev', 'test']
    loader = StreusleLoader()
    STREUSLE_BASE = os.environ.get(
        'STREUSLE_BASE'
    ) or '/cs/usr/aviramstern/nlp/datasets/streusle_v4/release'
    all_files = [
        STREUSLE_BASE + '/' + stype + '/streusle.ud_' + stype + '.' + task +
        '.json' for task in tasks for stype in stypes
    ]
    records = sum([loader.load(f, input_format='json') for f in all_files],
                  [])
    samples = [streusle_record_to_lstm_model_sample(r) for r in records]

    pp_vocab = Vocabulary('PREPS')
    pp_vocab.add_words(
        set([
            x.token for s in samples for x, y in zip(s.xs, s.ys)
            if any([y.supersense_role, y.supersense_func])
        ]))

    ner_vocab = Vocabulary('NERS')
    ner_vocab.add_words(
        set([x.ner for s in samples for x, y in zip(s.xs, s.ys)]))
    ner_vocab.add_word(None)

    lemmas_vocab = Vocabulary('LEMMAS')
    lemmas_vocab.add_words(
        set([x.lemma for s in samples for x, y in zip(s.xs, s.ys)]))

    ud_dep_vocab = Vocabulary('UD_DEPS')
    ud_dep_vocab.add_words(
        set([x.ud_dep for s in samples for x, y in zip(s.xs, s.ys)]))
    ud_dep_vocab.add_word(None)

    ud_xpos_vocab = Vocabulary('UD_XPOS')
    ud_xpos_vocab.add_words(
        set([x.ud_xpos for s in samples for x, y in zip(s.xs, s.ys)]))
    ud_xpos_vocab.add_word(None)

    token_vocab = Vocabulary('TOKENS')
    token_vocab.add_words(
        set([x.token for s in samples for x, y in zip(s.xs, s.ys)]))

    govobj_config_vocab = Vocabulary('GOVOBJ_CONFIGS')
    govobj_config_vocab.add_words(
        set([x.govobj_config for s in samples for x, y in zip(s.xs, s.ys)]))

    pss_vocab = Vocabulary('PSS')
    pss_vocab.add_words(supersense_repo.PREPOSITION_SUPERSENSES_SET)
    pss_vocab.add_word(None)

    lexcat_vocab = Vocabulary('LEXCAT')
    lexcat_vocab.add_words(
        set([x.lexcat for s in samples for x, y in zip(s.xs, s.ys)]))

    return [
        pp_vocab, ner_vocab, lemmas_vocab, ud_dep_vocab, ud_xpos_vocab,
        token_vocab, pss_vocab, govobj_config_vocab, lexcat_vocab
    ]
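# Hypothetical driver for build_vocabs, assuming the Vocabulary constructor
# stores its name argument on a `name` attribute:
vocabs_by_name = {v.name: v for v in build_vocabs()}
pp_vocab = vocabs_by_name['PREPS']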
def main(): vocabulary = Vocabulary() hangman = Hangman(vocabulary) hangman.startGame()
dev_e_path = '../data/validation/dev.e.gz' dev_f_path = '../data/validation/dev.f.gz' dev_wa = '../data/validation/dev.wa.nonullalign' test_e_path = '../data/test/test.e.gz' test_f_path = '../data/test/test.f.gz' test_wa = '../data/test/test.wa.nonullalign' # Using only 1000 words will result in many UNKs, but # it will make training a lot faster. # If you have a fast computer, a GPU, or a lot of time, # try with 10000 instead. max_tokens = 1000 corpus_e = smart_reader(train_e_path) vocabulary_e = Vocabulary(corpus=corpus_e, max_tokens=max_tokens) pickle.dump(vocabulary_e, open("vocabulary_e.pkl", mode="wb")) print("English vocabulary size: {}".format(len(vocabulary_e))) corpus_f = smart_reader(train_f_path) vocabulary_f = Vocabulary(corpus=corpus_f, max_tokens=max_tokens) pickle.dump(vocabulary_f, open("vocabulary_f.pkl", mode="wb")) print("French vocabulary size: {}".format(len(vocabulary_f))) # load test corpus test_corpus = list( bitext_reader(smart_reader(test_e_path), smart_reader(test_f_path))) # run tf.reset_default_graph()
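# `smart_reader` is used above but not shown. A plausible sketch (an
# assumption, not the original): yield tokenized lines, transparently
# handling .gz files.
import gzip

def smart_reader(path, encoding='utf-8'):
    opener = gzip.open if path.endswith('.gz') else open
    with opener(path, 'rt', encoding=encoding) as f:
        for line in f:
            yield line.strip().split()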
from vocabulary import Vocabulary NERS = Vocabulary('NERS', [ 'DATE', 'ORGANIZATION', 'O', 'ORDINAL', 'TIME', 'NUMBER', 'MONEY', 'PERCENT', 'MISC', 'PERSON', 'LOCATION', 'DURATION', 'SET', None ])
return options, pattern if __name__ == '__main__': import os import glob import pprint from vocabulary import Vocabulary import parallelize options, pattern = parse_args() olddir = os.getcwd() os.chdir(options.datadir) fnames = glob.glob(pattern) nprocesses = len(fnames) if options.parallel else None results = parallelize.run(process_file, fnames, nprocesses, options) full_counter = Counter() for counter in results: full_counter.update(counter) vocabulary = Vocabulary(full_counter, n_most_common=options.nwords) vocabulary.save('index') pprint.pprint(full_counter.most_common(200)) print(len(full_counter)) print(vocabulary) os.chdir(olddir)
def import_vocabulary(self, vocabulary_dir, normalize=True, import_mode=VocabularyImportMode.External, dataset_vocab=None): if dataset_vocab is None and import_mode != VocabularyImportMode.External: raise ValueError( "dataset_vocab must be provided if import_mode is not 'External'." ) import_stats = VocabularyImportStats() #Read the external vocabulary tokens and embeddings tokens_with_embeddings = self._read_vocabulary_and_embeddings( vocabulary_dir) #If normalize flag is true, normalize casing of the external vocabulary and average embeddings for any resulting duplicate tokens if normalize: tokens_with_embeddings = self._normalize_tokens_with_embeddings( tokens_with_embeddings) import_stats.external_vocabulary_size = len(tokens_with_embeddings) #Apply dataset filters if applicable if dataset_vocab is not None: import_stats.dataset_vocabulary_size = dataset_vocab.size() if import_mode == VocabularyImportMode.ExternalIntersectDataset or import_mode == VocabularyImportMode.Dataset: #Get rid of all tokens that exist in the external vocabulary but don't exist in the dataset for token in list(tokens_with_embeddings.keys()): if not dataset_vocab.word_exists(token): del tokens_with_embeddings[token] import_stats.intersection_size = len(tokens_with_embeddings) if import_mode == VocabularyImportMode.ExternalUnionDataset or import_mode == VocabularyImportMode.Dataset: #Add any tokens that exist in the dataset but don't exist in the external vocabulary. #These added tokens will get word vectors sampled from the gaussian distributions of their components: # where the mean of each component is the mean of that component in the external embedding matrix # and the standard deviation of each component is the standard deviation of that component in the external embedding matrix embeddings_matrix = np.array(list( tokens_with_embeddings.values()), dtype=np.float32) emb_size = embeddings_matrix.shape[1] emb_mean = np.mean(embeddings_matrix, axis=0) emb_stdev = np.std(embeddings_matrix, axis=0) for i in range(dataset_vocab.size()): dataset_token = dataset_vocab.int2word(i, capitalize_i=False) if dataset_token not in tokens_with_embeddings: tokens_with_embeddings[ dataset_token] = np.random.normal( emb_mean, emb_stdev, emb_size) if len(tokens_with_embeddings) == 0: raise ValueError( "Imported vocabulary size is 0. Try a different VocabularyImportMode (currently {0})" .format(VocabularyImportMode(import_mode).name)) tokens, embeddings_matrix = zip(*tokens_with_embeddings.items()) embeddings_matrix = np.array(embeddings_matrix, dtype=np.float32) #Create the vocabulary instance vocabulary = Vocabulary(external_embeddings=embeddings_matrix) for i in range(len(tokens)): vocabulary.load_word(tokens[i], i) vocabulary.compile(loading=True) return vocabulary, import_stats
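# Hypothetical usage of import_vocabulary above (the `importer` instance,
# directory, and `dataset_vocab` are stand-ins): intersect an external
# embedding vocabulary with the dataset's own.
vocabulary, stats = importer.import_vocabulary(
    'external_embeddings_dir',
    normalize=True,
    import_mode=VocabularyImportMode.ExternalIntersectDataset,
    dataset_vocab=dataset_vocab)
print(stats.external_vocabulary_size, stats.intersection_size)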
class Model(object):
    def __init__(self):
        self.vocab = Vocabulary()
        self.language_module = dataset.LSTMLanguageModule(
            message_flags.flattened_message_size(),
            self.vocab.get_vocab_size()).to(device)
        self.training_examples = []

        self.encoder = pretrain.load_saved_encoder().to(device)
        self.encoder.eval()
        self.decoder = pretrain.load_saved_decoder().to(device)
        self.decoder.eval()

        params_to_train = list(self.language_module.parameters())
        if FLAGS.model_train_decoder:
            params_to_train.extend(list(self.decoder.parameters()))
        self.optimizer = optim.Adam(params_to_train, weight_decay=1e-5)

    def predict(self, state, command):
        self.language_module.eval()
        self.decoder.eval()
        token_ids = self.vocab.token_ids(command)
        command_variable = torch.LongTensor(token_ids).unsqueeze(0).to(device)
        state_variable = dataset.state_to_variable(state).to(device)
        encoder_output = self.language_module.forward(command_variable)
        decoder_input = (encoder_output if FLAGS.continuous_message else
                         discrete_util.discrete_transformation(
                             encoder_output))
        prediction = self.decoder.forward(state_variable, decoder_input)
        return dataset.output_from_variable(prediction, state)

    def optimizer_step(self):
        self.language_module.train()
        self.decoder.train()
        random.shuffle(self.training_examples)
        for batch in util.batch_iterator(self.training_examples,
                                         FLAGS.model_batch_size):
            states = [s for s, c, t, m in batch]
            commands = [c for s, c, t, m in batch]
            targets = [t for s, c, t, m in batch]
            target_messages = [m for s, c, t, m in batch]

            self.optimizer.zero_grad()
            target_message = torch.from_numpy(
                np.concatenate(target_messages, 0)).to(device)
            state_variable = dataset.state_to_variable_batch(states).to(
                device)
            target_variable = dataset.output_to_variable_batch(
                targets, states).to(device)

            # Left-pad token ids so all commands in the batch share a length.
            max_command_len = max(len(c) for c in commands)
            token_ids = np.zeros((len(commands), max_command_len),
                                 dtype=np.int64)
            for i, command in enumerate(commands):
                ids = self.vocab.token_ids(command)
                token_ids[i, -len(ids):] = ids
            command_variable = torch.from_numpy(token_ids).to(device)

            encoder_output = self.language_module.forward(command_variable)
            decoder_input = (encoder_output if FLAGS.continuous_message else
                             discrete_util.discrete_transformation(
                                 encoder_output))
            prediction = self.decoder.forward(state_variable, decoder_input,
                                              target_variable)

            if FLAGS.continuous_message:
                error = encoder_output - target_message
                message_loss = (error * error).sum()
            else:
                log_message_probs = F.log_softmax(
                    encoder_output.view(-1, FLAGS.discrete_message_size,
                                        FLAGS.discrete_message_symbols), 2)
                target_message_reshaped = target_message.view(
                    -1, FLAGS.discrete_message_size,
                    FLAGS.discrete_message_symbols)
                message_loss = -(log_message_probs *
                                 target_message_reshaped).sum()

            loss = dataset.loss(
                prediction, target_variable
            ) + FLAGS.model_message_loss_weight * message_loss
            loss = loss / len(batch)  # Average instead of sum.
            loss.backward()
            self.optimizer.step()

    def training_accuracy(self):
        n_correct = 0
        for state, command, target, target_message in self.training_examples:
            prediction = self.predict(state, command)
            if prediction == target:
                n_correct += 1
        return n_correct / len(self.training_examples)

    def update(self, state, command, target_output, num_updates=None):
        if num_updates is None:
            num_updates = FLAGS.model_max_updates
        state_variable = dataset.state_to_variable(state).to(device)
        target_variable = dataset.output_to_variable(target_output,
                                                     state).to(device)
        encoder_output = self.encoder.forward(state_variable, target_variable)
        target_message = (encoder_output if FLAGS.continuous_message else
                          discrete_util.discrete_transformation(
                              encoder_output))
        target_message = target_message.cpu().detach().numpy()
        self.training_examples.append(
            (state, command, target_output, target_message))
        for _ in range(num_updates):
            self.optimizer_step()
    if title is not None:
        ax.set_title(title)
    ax.imshow(image)
    return ax


vocab_threshold = 5
vocab_file = './vocab.pkl'
start_word = "<start>"
end_word = "<end>"
unk_word = "<unk>"
annotations_file = os.path.join(
    '/home/george/', 'cocoapi/annotations/image_info_test2014.json')
vocab = Vocabulary(vocab_threshold, vocab_file, start_word, end_word,
                   unk_word, annotations_file, True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder_file = 'encoder-2.pkl'
decoder_file = 'decoder-2.pkl'

embed_size = 256
hidden_size = 512
vocab_size = len(vocab)

encoder = EncoderCNN(embed_size)
encoder.eval()
decoder = DecoderRNN(embed_size, hidden_size, vocab_size)
decoder.eval()
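# The snippet defines encoder_file and decoder_file but does not show the
# load step. A typical continuation (the './models' directory is an
# assumption, not from the original):
encoder.load_state_dict(torch.load(os.path.join('./models', encoder_file),
                                   map_location=device))
decoder.load_state_dict(torch.load(os.path.join('./models', decoder_file),
                                   map_location=device))
encoder.to(device)
decoder.to(device)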
from vocabulary import Vocabulary UD_DEPS = Vocabulary('UD_DEPS', [ 'ROOT', 'mark', 'obj', 'amod', 'dep', 'cop', 'appos', 'advmod', 'conj', 'cc', 'nsubjpass', 'compound', 'aux:pass', 'iobj', 'nsubj', 'root', 'nmod:tmod', 'ccomp', 'aux', 'cc:preconj', 'nsubj:pass', 'nmod', 'neg', 'acl', 'fixed', 'dobj', 'xcomp', 'auxpass', 'reparandum', 'det', 'discourse', 'vocative', 'flat', 'csubj:pass', 'obl', 'obl:tmod', 'punct', 'compound:prt', 'csubjpass', 'nummod', 'mwe', 'csubj', 'list', 'nmod:poss', 'advcl', 'obl:npmod', 'dislocated', 'orphan', 'expl', 'acl:relcl', 'nmod:npmod', 'goeswith', 'det:predet', 'case', 'parataxis', None ])
    def chat(self, question, chat_settings):
        if chat_settings.enable_auto_punctuation:
            question = Vocabulary.auto_punctuate(question)
        question = Vocabulary.clean_text(
            question,
            normalize_words=chat_settings.inference_hparams.normalize_words)
        question = self.input_vocabulary.words2ints(question)
        question_with_history = []
        for i in range(len(self.conversation_history)):
            question_with_history += self.conversation_history[i] + [
                self.input_vocabulary.eos_int()
            ]
        question_with_history += question

        # Get the answer prediction.
        batch = np.zeros((1, len(question_with_history)))
        batch[0] = question_with_history
        # + 1 since the EOS token is counted as a timestep.
        max_output_sequence_length = \
            chat_settings.inference_hparams.max_answer_words + 1
        predicted_answer_info = self.predict_batch(
            inputs=batch,
            input_sequence_length=np.array([len(question_with_history)]),
            max_output_sequence_length=max_output_sequence_length,
            beam_length_penalty_weight=chat_settings.inference_hparams.
            beam_length_penalty_weight,
            sampling_temperature=chat_settings.inference_hparams.
            sampling_temperature,
            log_summary=chat_settings.inference_hparams.log_summary)

        # Read the answer prediction.
        answer_beams = []
        if self.beam_width > 0:
            # For beam search decoding: if show_all_beams is enabled, output
            # all beams (sequences); otherwise take the first beam. The beams
            # (in the "predictions" matrix) are ordered with the highest
            # ranked beams first.
            beam_count = 1 if not chat_settings.show_all_beams else len(
                predicted_answer_info["predictions_seq_lengths"][0])
            for i in range(beam_count):
                # -1 to exclude the EOS token.
                predicted_answer_seq_length = predicted_answer_info[
                    "predictions_seq_lengths"][0][i] - 1
                predicted_answer = predicted_answer_info["predictions"][
                    0][:predicted_answer_seq_length, i].tolist()
                answer_beams.append(predicted_answer)
        else:
            # For greedy / sampling decoding: only one beam (sequence) is
            # returned, based on the argmax for greedy decoding or the
            # sampling distribution for sampling decoding. Return this beam.
            beam_count = 1
            # -1 to exclude the EOS token.
            predicted_answer_seq_length = predicted_answer_info[
                "predictions_seq_lengths"][0] - 1
            predicted_answer = predicted_answer_info["predictions"][
                0][:predicted_answer_seq_length].tolist()
            answer_beams.append(predicted_answer)

        # Add new conversation steps to the end of the history and trim from
        # the beginning if it is longer than conv_history_length. Answers
        # need to be converted from output_vocabulary ints to
        # input_vocabulary ints (since they will be fed back in to the
        # encoder).
        self.conversation_history.append(question)
        answer_for_history = self.output_vocabulary.ints2words(
            answer_beams[0], is_punct_discrete_word=True, capitalize_i=False)
        answer_for_history = self.input_vocabulary.words2ints(
            answer_for_history)
        self.conversation_history.append(answer_for_history)
        self.trim_conversation_history(
            chat_settings.inference_hparams.conv_history_length)

        # Convert the answer(s) to text and return.
        answers = []
        for i in range(beam_count):
            answer = self.output_vocabulary.ints2words(answer_beams[i])
            answers.append(answer)

        q_with_hist = None if not chat_settings.show_question_context else \
            self.input_vocabulary.ints2words(question_with_history)
        if chat_settings.show_all_beams:
            return q_with_hist, answers
        else:
            return q_with_hist, answers[0]
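# Hypothetical driver for chat() above (the `chatbot` and `chat_settings`
# objects come from the model/settings snippets elsewhere in this file):
question_context, answer = chatbot.chat("How are you today?", chat_settings)
print(answer)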
class Model(object): def __init__(self): self.vocab = Vocabulary() self.language_module = dataset.LSTMLanguageModule( message_flags.flattened_message_size(), self.vocab.get_vocab_size()).to(device) self.decoder = dataset.Decoder( message_flags.flattened_message_size()).to(device) all_params = list(self.language_module.parameters()) + list( self.decoder.parameters()) self.optimizer = optim.Adam(all_params, weight_decay=1e-5) self.training_examples = [] def predict(self, state, command): self.language_module.eval() self.decoder.eval() token_ids = self.vocab.token_ids(command) command_variable = torch.LongTensor(token_ids).unsqueeze(0).to(device) state_variable = dataset.state_to_variable(state).to(device) encoder_output = self.language_module.forward(command_variable) decoder_input = encoder_output if FLAGS.continuous_message else discrete_util.discrete_transformation( encoder_output) prediction = self.decoder.forward(state_variable, decoder_input) return dataset.output_from_variable(prediction, state) def optimizer_step(self): self.language_module.train() self.decoder.train() random.shuffle(self.training_examples) for state, command, target in self.training_examples: self.optimizer.zero_grad() state_variable = dataset.state_to_variable(state).to(device) target_variable = dataset.output_to_variable(target, state).to(device) token_ids = self.vocab.token_ids(command) command_variable = torch.LongTensor(token_ids).unsqueeze(0).to( device) encoder_output = self.language_module.forward(command_variable) decoder_input = encoder_output if FLAGS.continuous_message else discrete_util.discrete_transformation( encoder_output) prediction = self.decoder.forward(state_variable, decoder_input, target_variable) loss = dataset.loss(prediction, target_variable) loss.backward() self.optimizer.step() def training_accuracy(self): n_correct = 0 for state, command, target in self.training_examples: prediction = self.predict(state, command) if prediction == target: n_correct += 1 return n_correct / len(self.training_examples) def update(self, state, command, target_output, num_updates=None): if num_updates is None: num_updates = FLAGS.baseline_max_updates self.training_examples.append((state, command, target_output)) for _ in range(num_updates): self.optimizer_step()
    def rewrite(self):
        """Rewrite the flight according to self.vocabulary (a Vocabulary)."""
        rw = []
        for part in self.vocabulary.getPartitions():
            for partelt in part.getModalities():
                val = self.getValue(part.getAttName())
                mu = partelt.getMu(val)
                rw.append(mu)
        return rw

    def satisfaisant(self, conditions):
        """Return True if the flight satisfies every (attribute, modality,
        threshold) condition."""
        for condition in conditions:
            part = self.vocabulary.getPartition(condition[0])
            partelt = part.getModality(condition[1])
            val = self.getValue(part.getAttName())
            mu = partelt.getMu(val)
            if mu < condition[2]:
                return False
        return True


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print("Usage: python flight.py <vocfile.csv>")
    else:
        if os.path.isfile(sys.argv[1]):
            voc = Vocabulary(sys.argv[1])
            line = "2008,1,3,4,1103,1955,2211,2225,WN,335,N712SW,128,150,116,-14,8,IAD,TPA,810,4,8,0,,0,NA,NA,NA,NA,NA"
            f = Flight(line, voc)
            print(f.rewrite())
if __name__ == '__main__':
    arguments = parse_args()

    logger.info('Loading config')
    with open(arguments.config) as config_file:
        # safe_load avoids executing arbitrary tags in the YAML config.
        config = yaml.safe_load(config_file)

    logger.info('Initializing input stream')
    input_stream = LineSentence(
        arguments.corpus,
        max_sentence_length=config['sliding_window']['change_every_words'])

    min_word_freq = config['vocabulary']['min_freq']
    logger.info('Building vocabulary with min_freq={}'.format(min_word_freq))
    vocab = Vocabulary.from_documents(input_stream, min_word_freq)
    vocabulary_size = len(vocab)
    logger.info('Vocabulary size: {}'.format(vocabulary_size))

    logger.info('Building negative sampling distribution')
    negative_sampler = HierarchicalSampler(
        vocab=vocab,
        alpha=config['negative_sampling']['alpha'],
        chunks_num=config['negative_sampling']['vocab_chunks_num'])

    logger.info('Building model computation graph')
    optimizer = tf.train.AdagradOptimizer(
        learning_rate=config['training_params']['initial_learning_rate'])
    negative_samples_num = config['sliding_window']['max_size'] * \