Code example #1
def gen_disassociated_press(file=KJBIBLE, order=3, len=100):
    """Generate some autocorrelated text."""
    tokens = [k for k in tokenize(file) if k.isalpha()]
    model = NgramModel(order, tokens, MLEProbDist)
    ret = ['',] * (order-1)
    for i in range(len):
        tail = ret[-(order-1):]
        ret.append(model.generate_one(tail))
    return ' '.join(ret[(order-1):])
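
Example #1 targets the legacy NLTK-style NgramModel(order, tokens, estimator) API (plus project-specific tokenize and KJBIBLE helpers); that NgramModel class was dropped in NLTK 3. As a hedged sketch only, roughly equivalent autocorrelated-text generation with the current nltk.lm API, assuming sentences is a list of token lists, could look like this:

from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline

def gen_disassociated_press_nltk3(sentences, order=3, length=100):
    """Generate some autocorrelated text with an unsmoothed (MLE) n-gram model."""
    # Build padded n-gram training data plus a vocabulary from the tokenized sentences.
    train_data, vocab = padded_everygram_pipeline(order, sentences)
    model = MLE(order)
    model.fit(train_data, vocab)
    # Sample `length` tokens; the output may still contain <s>/</s> padding symbols.
    return ' '.join(model.generate(length, random_seed=42))
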
Code example #2
 def train_ngram(self, target_data_path, days, kakao_data_path):
     results = []
     ngramModel = NgramModel(self.dataLoader)
     self.dataLoader.ngram_data_loader(days)
     self.dataLoader.kakao_data_loader(kakao_data_path)
     target_datas = self.dataLoader.target_data_loader(target_data_path)
     target_data_len = len(target_datas)
     for i, target_data in enumerate(target_datas):
         if i % max(1, target_data_len // 100) == 0:
             print("{}% complete".format((i / target_data_len) * 100))
         re = ngramModel.detect_rule_recommend(target_data)
         results.append(target_data + " " + " ".join(re))
     self.dataLoader.write_result(self.write_file_path, results)
Code example #3
def train(train_file):
    """Return the required language models trained from a file."""
    unigram = NgramModel(1)
    bigram_left = NgramModel(2)
    bigram_right = NgramModel(2)
    for line in train_file:
        tokens = line.rstrip().split()
        unigram.update(tokens)
        bigram_left.update(tokens)
        bigram_right.update(reversed(tokens))

    return (unigram, bigram_left, bigram_right)
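
The NgramModel in example #3 is project-specific: it is built from an order alone and trained incrementally through update(), unlike the NLTK-style models elsewhere on this page (examples #11 and #19 use yet another variant, built from an order plus a token sequence and queried with seqprob() and allngrams(), which is not sketched here). A minimal maximum-likelihood sketch of the update()/prob() part of that interface, assuming the real class behaves roughly this way, might be:

from collections import defaultdict

class SimpleNgramModel:
    """Minimal MLE n-gram model with the update()/prob() interface assumed above."""

    def __init__(self, n):
        self.n = n
        self.context_counts = defaultdict(int)   # counts of (n-1)-gram contexts
        self.ngram_counts = defaultdict(int)     # counts of (context, word) pairs

    def update(self, tokens):
        # Pad each sequence so sentence-initial and sentence-final n-grams are counted.
        padded = ['<s>'] * (self.n - 1) + list(tokens) + ['</s>']
        for i in range(len(padded) - self.n + 1):
            context = tuple(padded[i:i + self.n - 1])
            word = padded[i + self.n - 1]
            self.ngram_counts[(context, word)] += 1
            self.context_counts[context] += 1

    def prob(self, word, context=()):
        # Relative frequency of the word given the context; 0.0 for unseen contexts.
        context = tuple(context)
        total = self.context_counts[context]
        return self.ngram_counts[(context, word)] / total if total else 0.0
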
Code example #4
File: generator.py Project: tocubed/imitare
    def _make_models(self, tuples):
        self._word_ids = WordIdDictionary()

        # Extract sequence of words, lemmas, and tags
        words, lemmas, tags = tuple(map(lambda tokens: list(
            self._word_ids.add_words_transform(tokens)), zip(*tuples)))
        self._tags = tags

        # Create models for words, lemmas, and tags
        self._words_ngram = NgramModel(words, self._n)
        self._lemmas_ngram = NgramModel(lemmas, self._n)
        self._tags_ngram = NgramModel(tags, 2 * self._n) # Can afford to use 2 * n-gram size for grammar

        # Map tag and (tag, lemma) to valid lemmas and vocabulary, respectively
        # It's faster to use a list than predicate on unigrams during backoff search
        self._tag_lemmas = ConditionalFreqDist(zip(tags, lemmas))
        self._tag_lemma_words = ConditionalFreqDist(
            zip(zip(tags, lemmas), words))
Code example #5
def main():
    
    start = time.clock()

    #sys.argv[1] is path to training data
    #sys.argv[2] is length of n-grams

    ngram_model = NgramModel(int(sys.argv[2]), sys.argv[1], pad_right=True)

    end = time.clock()
    print 'Done computing ngram model, ' + str(end - start) + ' seconds running time'

    fileName = sys.argv[2] + 'gramModel' + '_' + sys.argv[1][3:].split('.')[0] + '.p'
    pickle.dump(ngram_model, open(fileName, 'wb'), protocol=pickle.HIGHEST_PROTOCOL)

    end = time.clock()
    print 'Done pickling, ' + str(end - start) + ' seconds running time'
    
    '''
    #for unpickling
    fileName = sys.argv[2] + 'gramModel' + '_' + sys.argv[1][3:].split('.')[0] + '.p'
    restored_model = pickle.load(open(fileName, 'rb'))
    #end = time.clock()
    #print 'Done unpickling, ' + str(end - start) + ' seconds running time'
    '''

    print "Generated examples: "
    for i in range (200):
        #print ' '.join(ngram_model.generate(20, ('', '')))

        review = []
        context = ['', '']
        nextToken = ngram_model._generate_one(context)
        while nextToken != '.' and nextToken != '...EOR...' and len(review) < 500:
            review.append(nextToken)
            context[0] = context[1]
            context[1] = nextToken
            nextToken = ngram_model._generate_one(context)

        print ' '.join(review) + '   len: ' + str(len(review))
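
Example #5 is Python 2 (print statements and time.clock, which was removed in Python 3.8). A hedged Python 3 sketch of the same timing and pickle round trip, with the NgramModel constructor taken on faith from the example above (its import depends on the project):

import pickle
import sys
import time

start = time.perf_counter()
# sys.argv[1] is the path to the training data, sys.argv[2] the n-gram length,
# and NgramModel is assumed to have the same signature as in example #5.
ngram_model = NgramModel(int(sys.argv[2]), sys.argv[1], pad_right=True)
print('Done computing ngram model, %.1f seconds running time' % (time.perf_counter() - start))

file_name = sys.argv[2] + 'gramModel_' + sys.argv[1][3:].split('.')[0] + '.p'
with open(file_name, 'wb') as fp:
    pickle.dump(ngram_model, fp, protocol=pickle.HIGHEST_PROTOCOL)
with open(file_name, 'rb') as fp:
    restored_model = pickle.load(fp)
print('Done pickling and unpickling, %.1f seconds running time' % (time.perf_counter() - start))
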
Code example #6
File: chatbot.py Project: mikabr/chatbot-puzzle
def initialize_bot(chars, nicks):
    n = 3
#    intros = ["So", "Hi", "In fact", "For what it's worth", "Think about it",
#              "Conversely", "On the other hand", "Debatably", "Especially", "Not to mention", "Although", "Moreover", "Equally",
#              "But", "Yes",
#              "See here", "Ultimately", "Rather", "Nevertheless", "As you said", "Mind you", "Even so"]
    char_corps = load_corpora(chars)
    est = lambda fdist, bins: MLEProbDist(fdist)
#    est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
#    est = lambda fdist, bins: WittenBellProbDist(fdist)
#    est = lambda fdist, bins: KneserNeyProbDist(fdist)
    models = {character: NgramModel(n, corp, estimator=est)
              for character, corp in char_corps.iteritems()}
#    return ChatBot(chars, nicks, intros, models, ngram=n, debug=False)
    return ChatBot(chars, nicks, models, ngram=n, debug=False)
Code example #7
File: hmm.py Project: iiisthu/Keystroke-Recognition
 def viterbi(self,sentence, order):
     tokenizer = RegexpTokenizer(r'[\w\']+')
     self.token_words = tokenizer.tokenize(sentence)
     #self.token_words = word_tokenize(sentence)
     self.nngram =  NgramModel(order, [ word.lower() for word in brown.words()], estimator) 
     self.N = len(self.token_words)
     MAX_OFFSET = 10
     if order == 2:
         viterbiM = np.zeros((self.N + MAX_OFFSET, self.M + 2), dtype = 'double')
         words = np.zeros((self.N + MAX_OFFSET, self.M + 2), dtype = object)
         backpointer = np.zeros( ( self.N + MAX_OFFSET, self.M + 2), dtype = 'int32')
         offset = np.zeros( self.N, dtype = 'int32' )
         return self.viterbi_first_order(viterbiM, words, backpointer, offset)
     if order == 3:
         viterbiM = np.zeros((self.N + MAX_OFFSET, self.M + 2, self.M + 2), dtype = 'double')
         words = np.zeros((self.N + MAX_OFFSET, self.M + 2), dtype = object)
         backpointer = np.zeros( ( self.N + MAX_OFFSET, self.M + 2, self.M + 2), dtype = 'int32')
         offset = np.zeros( self.N, dtype = 'int32' )
         return self.viterbi_second_order(viterbiM, words, backpointer, offset)
Code example #8
    def _make_models(self, tuples):
        self._word_ids = WordIdDictionary()

        # Extract sequence of words, lemmas, and tags
        words, lemmas, tags = tuple(
            map(
                lambda tokens: list(self._word_ids.add_words_transform(tokens)
                                    ), zip(*tuples)))
        self._tags = tags

        # Create models for words, lemmas, and tags
        self._words_ngram = NgramModel(words, self._n)
        self._lemmas_ngram = NgramModel(lemmas, self._n)
        self._tags_ngram = NgramModel(
            tags, 2 * self._n)  # Can afford to use 2 * n-gram size for grammar

        # Map tag and (tag, lemma) to valid lemmas and vocabulary, respectively
        # It's faster to use a list than predicate on unigrams during backoff search
        self._tag_lemmas = ConditionalFreqDist(zip(tags, lemmas))
        self._tag_lemma_words = ConditionalFreqDist(
            zip(zip(tags, lemmas), words))
Code example #9
if __name__ == "__main__":
    if len(sys.argv) < 2:
        print "Usage: %s <corpus-root> <tweets-file>" % (sys.argv[0])
        sys.exit(1)

    corpus_root = sys.argv[1]
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    ignored_words = nltk.corpus.stopwords.words('english')

    pos_movie_reviews = PlaintextCorpusReader(corpus_root + "/pos", ".*\.txt")
    neg_movie_reviews = PlaintextCorpusReader(corpus_root + "/neg", ".*\.txt")

    print "Corpora built."

    pos_unigram_lm = NgramModel(1, pos_movie_reviews.words(), estimator)
    print "Positive unigram model complete."
    pos_bigram_lm = NgramModel(2, pos_movie_reviews.words(), estimator)
    print "Positive bigram model complete."
    #pos_trigram_lm = NgramModel(3, pos_movie_reviews.words(), estimator)

    neg_unigram_lm = NgramModel(1, neg_movie_reviews.words(), estimator)
    print "Negative unigram model complete."
    neg_bigram_lm = NgramModel(2, neg_movie_reviews.words(), estimator)
    print "Negative bigram model complete."
    #neg_trigram_lm = NgramModel(3, neg_movie_reviews.words(), estimator)

    #read in the tweets
    tweets = []
    tokenizer = utils.Tokenizer()
Code example #10
File: hmm.py Project: iiisthu/Keystroke-Recognition
class HMM(object):
    def __init__(self, matrixE):
        self.matrixE = matrixE
        self.M = 20
        self.nngram = None
        self.word_dict_path  = '../data/word_by_len'
        self.word_dict = {}
        self.load_word_dict()
        self.ppservers=("localhost", )
        self.threshold  = 0.00000001
        self.job_server = pp.Server(ppservers= self.ppservers) 
        print "active nodes: ", self.job_server.get_active_nodes()
        #self.trigram =  NgramModel(3, brown.words(), estimator)

    def load_word_dict(self):
        for i in xrange(1,24):
            with open("%s/%d.txt"%(self.word_dict_path, i), 'r') as fd:
                self.word_dict[i] = [line.strip().lower() for line in fd.readlines()]

    def segment(self, word):
        prob_max = 0.0
        first, second = [],[] 
        special_char = [',','.','\'', ' ', '\n']
        for c in xrange(1,len(word)-1):
            first_cur = self.most_similar_words(word[:c])
            second_cur  = self.most_similar_words(word[(c+1):])
            for (key1, val1) in first_cur:
                for (key2, val2) in second_cur:
                    prob = val1 * val2 * max( [ self.matrixE[charToNum(sc)][charToNum(word[c])] for sc in special_char ])
                    if prob > prob_max:
                        prob_max = prob
                        LOG( log_file, '(%s, %d,  %f, %s, %s)\n' %(word, c, prob_max, key1, key2)) 
                        first = first_cur
                        second = second_cur
        return first, second, prob_max 
            
    def viterbi(self,sentence, order):
        tokenizer = RegexpTokenizer(r'[\w\']+')
        self.token_words = tokenizer.tokenize(sentence)
        #self.token_words = word_tokenize(sentence)
        self.nngram =  NgramModel(order, [ word.lower() for word in brown.words()], estimator) 
        self.N = len(self.token_words)
        MAX_OFFSET = 10
        if order == 2:
            viterbiM = np.zeros((self.N + MAX_OFFSET, self.M + 2), dtype = 'double')
            words = np.zeros((self.N + MAX_OFFSET, self.M + 2), dtype = object)
            backpointer = np.zeros( ( self.N + MAX_OFFSET, self.M + 2), dtype = 'int32')
            offset = np.zeros( self.N, dtype = 'int32' )
            return self.viterbi_first_order(viterbiM, words, backpointer, offset)
        if order == 3:
            viterbiM = np.zeros((self.N + MAX_OFFSET, self.M + 2, self.M + 2), dtype = 'double')
            words = np.zeros((self.N + MAX_OFFSET, self.M + 2), dtype = object)
            backpointer = np.zeros( ( self.N + MAX_OFFSET, self.M + 2, self.M + 2), dtype = 'int32')
            offset = np.zeros( self.N, dtype = 'int32' )
            return self.viterbi_second_order(viterbiM, words, backpointer, offset)
            
    def most_similar_words_with_split(self,word):
        states = self.most_similar_words(word)
        # If the probability is too small, it's possible that a space was recognized as a character
        states_tmp = []
        if states[0][1] < self.threshold:
            LOG(log_file, "Beyond bottom threshold, try to split %s\n"%word)
            states1, states2, prob  = self.segment(word)
            if prob > states[0][1]:
                states_tmp.append(states1)
                states_tmp.append(states2)
                splited = True
            else:
                states_tmp.append(states)
        else:
            states_tmp.append(states)
        return states_tmp

    def viterbi_first_order(self,viterbiM, words, backpointer, offset):
        print 'call_bigram'
        for i, word in enumerate(self.token_words):
            # Find matched word by probability
            splited = False
            starttime = int(time())
            print 'finding similar words(%s)'%word
            states_tmp = self.most_similar_words_with_split(word)
            print states_tmp
            find_time = int(time()) - starttime
            # initial current offset
            cur_offset = 0
            if i != 0:
                cur_offset =  offset[i-1]
            # recursion step
            print 'iterate states'
            for index, st in enumerate(states_tmp): 
                id_with_offset = cur_offset + index + i
                for j, (state, prob) in enumerate(st):
                    # print something out for debugging
                    if j < 10:
                        LOG(log_file, '%s, %f\n'%( state, prob ))
                        print state,prob
                    words[i][j+1] = state
                    if i == 0:
                        pref = [u' ']
                        viterbiM[id_with_offset][j+1] = self.nngram.prob(state, pref)*prob
                        backpointer[id_with_offset][j+1] = 0
                    else:
                        l_tmp =  max(enumerate([
                                                viterbiM[id_with_offset - 1 ][k+1]
                                                *self.nngram.prob(state, [ str(words[id_with_offset - 1 ][k+1]) ])
                                                * prob 
                                                for k in xrange(self.M)
                                            ]), 
                                                key = operator.itemgetter(1)
                                            )

                        backpointer[id_with_offset][j+1] , viterbiM[id_with_offset][j+1] = l_tmp 
            if splited:
                offset[i] = cur_offset + 1
            LOG( log_file, "Eclapse %d s matching most possible word (%s)," 
                            "eclapse %d s for viterbi...\n"
                            %(find_time, word , int(time()) - starttime - find_time))
        
        final_offset = offset[-1] 
        print 'final offset is %d'%final_offset
        # termination step
        l = [
                viterbiM[self.N + final_offset - 1][k+1]
                *self.endOfSentence(self.nngram, [ words[self.N + final_offset - 1][k+1] ]) 
                for k in xrange(self.M)
            ]
        backpointer[self.N + final_offset - 1][self.M+1], viterbiM[self.N + final_offset - 1][self.M + 1] = max(enumerate(l), key = operator.itemgetter(1))
        # backtrace
        path=[]
        end = backpointer[self.N + final_offset - 1][self.M+1]
        for i in xrange(self.N + final_offset - 1, 0, -1):
            path.append(end)
            end = backpointer[i][end+1]
        path.append(end)
        word_vector = []
        for i in xrange(self.N + final_offset -1, -1, -1):
            word_vector.append(words[self.N + final_offset -1 - i][path[i]+1])
        return word_vector 

    def viterbi_second_order(self,viterbiM, words, backpointer, offset):
        for i, word in enumerate(self.token_words):
            # Find matched word by probability
            starttime = int(time())
            splited = False
            states_tmp = self.most_similar_words_with_split(word)
            find_time = int(time()) - starttime
            # initial current offset
            cur_offset = 0
            if i != 0:
                cur_offset =  offset[i-1]
            # recursion step
            for index, st in enumerate(states_tmp): 
                id_with_offset = cur_offset + index + i
                for j, (state, prob) in enumerate(st):
                    # print something out for debugging
                    if j < 20:
                        LOG(log_file, '%s, %f\n'%( state, prob ))
                        print state,prob
                    words[i][j+1] = state
                    for l in xrange(self.M):
                        if i == 0:
                            pref = [u' ']
                            viterbiM[id_with_offset][l+1][j+1] = self.nngram.prob(state, pref)*prob
                            backpointer[id_with_offset][l+1][j+1] = 0
                        elif i == 1:
                            backpointer[id_with_offset][l+1][j+1] , viterbiM[id_with_offset][l+1][j+1] = max(enumerate([
                                                                        viterbiM[id_with_offset - 1 ][k+1][l+1]
                                                                        *self.nngram.prob(state, [
                                                                        ' ',
                                                                        str(words[id_with_offset - 1][l+1]) ])
                                                                        * prob 
                                                                        for k in xrange(self.M)
                                                                    ]), 
                                                                    key = operator.itemgetter(1)
                                                            )
                        else:
                            backpointer[id_with_offset][l+1][j+1] , viterbiM[id_with_offset][l+1][j+1] = max(enumerate([
                                                                        viterbiM[id_with_offset - 1 ][k+1][l+1]
                                                                        *self.nngram.prob(state, [
                                                                        str(words[id_with_offset - 2 ][k+1]), 
                                                                        str(words[id_with_offset - 1][l+1]) ])
                                                                        * prob 
                                                                        for k in xrange(self.M)
                                                                    ]), 
                                                                    key = operator.itemgetter(1)
                                                            )
            if splited:
                offset[i] = cur_offset + 1
            LOG( log_file, "Eclapse %d s matching most possible word (%s)," 
                            "eclapse %d s for viterbi...\n"
                            %(find_time, word , int(time()) - starttime - find_time))
        
        final_offset = offset[-1] 
        print 'final offset is %d'%final_offset
        # termination step
        for l in xrange(self.M):
            backpointer[self.N + final_offset - 1][l+1][self.M+1],viterbiM[self.N + final_offset - 1][l + 1][self.M+1] = max(enumerate([
                            viterbiM[self.N + final_offset - 1][k+1][l+1]
                            *self.endOfSentence(self.nngram, [ str(words[self.N + final_offset - 2][k+1]) 
                                                                , str(words[self.N + final_offset - 1][l+1]) ]) 
                            for k in xrange(self.M)
                            ]),
                            key = operator.itemgetter(1))
        backpointer[self.N + final_offset - 1][self.M+1][self.M+1],viterbiM[self.N + final_offset - 1][self.M+1][self.M+1] = max(enumerate([
                            viterbiM[self.N + final_offset - 1][k+1][self.M+1]
                            *self.endOfSentence(self.nngram, [ str(words[self.N + final_offset - 1][k+1]) ])
                            for k in xrange(self.M)
                            ]),
                            key = operator.itemgetter(1))
        # backtrace
        import pdb
        pdb.set_trace()
        path=[]
        end = backpointer[self.N + final_offset - 1][self.M+1][self.M+1]
        path.append(end)
        last_end = backpointer[self.N + final_offset - 1][end][self.M+1]
        path.append(last_end)
        for i in xrange(self.N + final_offset - 1, 1, -2):
            end = backpointer[i][last_end+1][end + 1]
            last_end = backpointer[i-1][end + 1][last_end + 1]
            path.append(end)
            path.append(last_end)

        path.append(end)
        print path
        word_vector = []
        for i in xrange(self.N + final_offset -1, -1, -1):
            word_vector.append(words[self.N + final_offset -1 - i][path[i]+1])
        print word_vector
        return word_vector 


    def endOfSentence(self, lm, word):
        prob = 0
        for separator in [',', '.']:
            prob += lm.prob(separator, word) 
        return prob

       
    def most_similar_words(self, word):
        prob_list = {}
        jobs = self.paralize(word)
        prob_list.update(jobs)
        sorted_prob = sorted(
                                prob_list.items(),
                                key = operator.itemgetter(1), 
                                reverse=True
                            )
        return sorted_prob[:self.M]

    def paralize(self, word):
        parts = 17
        jobs = []
        for index in xrange(parts):
            for i in xrange(3):
                if i == 1:
                    punish = 1
                else:
                    punish = 0.05
                length =  len(word) + i - 1
                if length <= 0:
                    _list = []
                else:
                    _list = split_dict(self.word_dict[length], length, parts - 1, index) 
                jobs.append(
                            self.job_server.submit(
                                                    populate_prob, 
                                                    (_list, self.matrixE, unigram, length, word, {}, substitue, punish,), 
                                                    (substitue, insert, delete, probaWord, weightedPopularity, charToNum,),
                                                    ("nltk",) 
                                                )
                            )
        self.job_server.wait()
        stats = {}
        for job in jobs:
            stats.update(job())
        #self.job_server.print_stats()
        return stats
Code example #11
    def train(self, train_path):
        """Train based on co-occurences in the provided data."""
        term_symbols = []

        # Process input
        with open(train_path, "Ur") as train_file:
            for line in train_file:
                line_symbols = line.split()

                # Learn co-occurrences and precedes/follows
                for idx, sym1 in enumerate(line_symbols):
                    # Count the symbol
                    self.counts[sym1] += 1

                    # Get the sets of the other items
                    preceding_symbols = set(line_symbols[:idx])
                    following_symbols = set(line_symbols[idx + 1:])
                    other_symbols = preceding_symbols | following_symbols

                    # Count co-occurrences
                    for sym2 in other_symbols:
                        self.cooccurs[sym1][sym2] += 1

                    # Mark before/after
                    for sym2 in preceding_symbols:
                        self.before[sym1].add(sym2)
                    for sym2 in following_symbols:
                        self.after[sym1].add(sym2)

                    # Remove if one of the always relationships does
                    # not hold up
                    for sym2 in SYMBOLS:
                        if sym2 not in preceding_symbols:
                            try:
                                self.mustprecede[sym1].remove(sym2)
                            except KeyError:
                                pass
                        if sym2 not in following_symbols:
                            try:
                                self.mustfollow[sym1].remove(sym2)
                            except KeyError:
                                pass

                # Add beginning and end terminators for n-grams
                term_symbols.extend([START_SYM] + line_symbols + [END_SYM])

        # Learn
        # Requires/excludes and precedes/follows
        for sym1 in SYMBOLS:
            for sym2 in SYMBOLS:
                # Co-occurrence counts imply requires/excludes
                count1 = self.counts[sym1]
                if self.cooccurs[sym1][sym2] == count1:
                    self.requires[sym1].add(sym2)
                elif self.cooccurs[sym1][sym2] == 0:
                    self.excludes[sym1].add(sym2)

                # Figure out what cannot precede/follow
                if sym2 not in self.before[sym1]:
                    self.noprecede[sym1].add(sym2)
                if sym2 not in self.after[sym1]:
                    self.nofollow[sym1].add(sym2)

        # N-gram model
        self.ngram = NgramModel(2, term_symbols)
Code example #12
File: generator.py Project: tocubed/imitare
class LVGNgramGenerator:
    """
    Lemmatized vocabulary and grammar ngram-based generator
    """

    def __init__(self, tuples, n):
        """
        Parameters
        ----------
        tuples : Iterable[(str, str, str)]
            A list of (word, lemma, tag) tuples from which to learn a model
        n : int
            Maximum size of n-grams to use in NgramModel
        """
        self._n = n
        print("Creating models... (this may take some time)");
        self._make_models(tuples)
        print("Done!");

    def _make_models(self, tuples):
        self._word_ids = WordIdDictionary()

        # Extract sequence of words, lemmas, and tags
        words, lemmas, tags = tuple(map(lambda tokens: list(
            self._word_ids.add_words_transform(tokens)), zip(*tuples)))
        self._tags = tags

        # Create models for words, lemmas, and tags
        self._words_ngram = NgramModel(words, self._n)
        self._lemmas_ngram = NgramModel(lemmas, self._n)
        self._tags_ngram = NgramModel(tags, 2 * self._n) # Can afford to use 2 * n-gram size for grammar

        # Map tag and (tag, lemma) to valid lemmas and vocabulary, respectively
        # It's faster to use a list than predicate on unigrams during backoff search
        self._tag_lemmas = ConditionalFreqDist(zip(tags, lemmas))
        self._tag_lemma_words = ConditionalFreqDist(
            zip(zip(tags, lemmas), words))

    def generate_without_pos(self, n):
        """
        Generate n words without using any special POS information
        """
        # Just use words NgramModel generate function
        generated_words = self._words_ngram.generate(n)
        return list(self._word_ids.transform_ids(generated_words))

    def generate(self, n):
        """
        Generate n words using copied grammar, generated lemmas, and words based on lemmas
        """
        start = random.randint(n, len(self._tags) - n)
        generated_tags = self._tags[start : start + n] # Copy a random section of POS tags for grammar

        # Generate sequence of lemmas based off of grammar
        generated_lemmas = []
        for tag in generated_tags:
            # Search for and choose a lemma with correct tag
            choice = self._lemmas_ngram.choose_word(
                generated_lemmas, backoff_limit=2, predicate=lambda lemma: lemma in self._tag_lemmas[tag])
            if choice is None: 
                # Could not find a good lemma for current POS tag, choose from list
                choice = MLEProbDist(self._tag_lemmas[tag]).generate()
            generated_lemmas.append(choice)

        # Generate sequence of words based off of lemmas and grammar
        generated_words = []
        for (tag, lemma) in zip(generated_tags, generated_lemmas):
            # Search for and choose word with correct lemma/tag
            choices = self._words_ngram.backoff_search(
                generated_words, backoff_limit=2, predicate=lambda word: word in self._tag_lemma_words[(tag, lemma)])
            if choices is None:
                # Could not find a good word, choose from list
                choices = self._tag_lemma_words[(tag, lemma)]
            generated_words.append(MLEProbDist(choices).generate())

        return list(self._word_ids.transform_ids(generated_words))

    def generate_alternative(self, n):
        """
        Generate n words using a more complicated algorithm
        """
        generated_tags = []
        generated_lemmas = []
        generated_words = []

        # Incrementally generate (tag, lemma) pairs
        for i in range(n):
            tag_choice = None # Start with nothing

            # Loop through n-grams of grammar
            size = 2 * self._n
            while size > 2:
                tag_choices = self._tags_ngram.backoff_search(
                    generated_tags, backoff_limit=2, predicate=lambda tag: True, start_n=size)

                # Determine valid lemmas in context with these tag choices
                tag_to_lemma = {}
                if tag_choices is not None:
                    for tag, _ in tag_choices.items():
                        # For each tag, find valid lemmas in context with that tag
                        lemma = self._lemmas_ngram.choose_word(
                            generated_lemmas, backoff_limit=2, predicate=lambda lemma: lemma in self._tag_lemmas[tag])
                        if lemma is not None:
                            tag_to_lemma[tag] = lemma

                    if len(tag_to_lemma) > 1:
                        # We have found valid (tag, lemma) pairs
                        tag_probdist = MLEProbDist(FreqDist(
                            {tag: freq for tag, freq in tag_choices.items() if tag in tag_to_lemma}))
                        tag_choice = tag_probdist.generate() # Randomly select the tag
                        lemma_choice = tag_to_lemma[tag_choice] # Set the lemma
                        break
                size -= 1 # Lower to smaller n-gram for more tag choices

            if tag_choice is None:
                # We still didn't find a valid (tag, lemma) pair, fallback
                tag_choice = MLEProbDist(tag_choices).generate()
                lemma_choice = MLEProbDist(
                    self._tag_lemmas[tag_choice]).generate()

            generated_tags.append(tag_choice)
            generated_lemmas.append(lemma_choice)

        # Generate all words based on (tag, lemma) pairs
        for (tag, lemma) in zip(generated_tags, generated_lemmas):
            # Search for and choose word with correct lemma/tag
            choices = self._words_ngram.backoff_search(
                generated_words, backoff_limit=2, predicate=lambda word: word in self._tag_lemma_words[(tag, lemma)])
            if choices is None:
                # Could not find a good word, choose from list
                choices = self._tag_lemma_words[(tag, lemma)]
            generated_words.append(MLEProbDist(choices).generate())

        return list(self._word_ids.transform_ids(generated_words))
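
LVGNgramGenerator expects (word, lemma, tag) tuples, so it needs a tagged and lemmatized corpus before it can be constructed. One hypothetical way to build such tuples with stock NLTK (the tagger and lemmatizer choice is an assumption, and a real model needs far more text than this toy sentence):

import nltk
from nltk.stem import WordNetLemmatizer

# Requires the usual NLTK data packages (punkt tokenizer, POS tagger, wordnet).
lemmatizer = WordNetLemmatizer()
tokens = nltk.word_tokenize("The quick brown fox jumps over the lazy dog.")
tagged = nltk.pos_tag(tokens)  # [(word, Penn Treebank tag), ...]
tuples = [(word, lemmatizer.lemmatize(word.lower()), tag) for word, tag in tagged]

# With a real corpus behind `tuples`, the generator above would then be used as:
# generator = LVGNgramGenerator(tuples, n=3)
# print(generator.generate(20))
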
Code example #13
File: ngram.py Project: luketurner/cs-assignments
def test_model(n, inits):
	ngram = NgramModel(n, brown.words(), selector)

	with open("out" + str(n), 'w') as outf:
		for i in inits:
			print(' '.join(ngram.generate_sentence(i)), file=outf)
Code example #14
def main():
    """
    Trains and evaluates neural
    language models on the Microsoft Sentence Completion
    Challenge dataset.

    Allowed cmd-line flags:
        -s TS_FILES : Uses the reduced trainset (TS_FILES trainset files)
        -o MIN_OCCUR : Only uses terms that occur MIN_OCCUR or more times
            in the trainset. Other terms are replaced with a special token.
        -f MIN_FILES : Only uses terms that occur in MIN_FILES or more files
            in the trainset. Other terms are replaced with a special token.
        -n : n-gram length (default 4)
        -t : Use tree-grams (default does not use tree-grams)
        -u FTRS : Features to use. FTRS must be a string composed of zeros
            and ones, of length 5. Ones indicate usage of following features:
            (word, lemma, google_pos, penn_pos, dependency_type), respectively.

    Neural-net specific cmd-line flags:
        -ep EPOCHS : Number of training epochs, defaults to 20.
        -eps EPS : Learning rate, defaults to 0.005.
        -mnb MBN_SIZE : Size of the minibatch, defaults to 2000.

    """
    logging.basicConfig(level=logging.INFO)
    log.info("Evaluating model")

    #   get the data handling parameters
    ts_reduction = util.argv('-s', None, int)
    min_occ = util.argv('-o', 5, int)
    min_files = util.argv('-f', 2, int)
    n = util.argv('-n', 4, int)
    use_tree = '-t' in sys.argv
    bool_format = lambda s: s.lower() in ["1", "true", "yes", "t", "y"]
    ft_format = lambda s: map(bool_format, s)
    ftr_use = np.array(util.argv('-u', ft_format("001000"), ft_format))
    val_per_epoch = util.argv('-v', 10, int)

    #   nnets only support one-feature ngrams
    assert ftr_use.sum() == 1

    #   get nnet training parameters
    use_lbl = '-l' in sys.argv
    epochs = util.argv('-ep', 20, int)
    eps = util.argv('-eps', 0.002, float)
    mnb_size = util.argv('-mnb', 2000, int)
    n_hid = util.argv('-h', 1000, int)
    d = util.argv('-d', 100, int)

    #   load data
    ngrams, q_groups, answers, feature_sizes = data.load_ngrams(
        n, ftr_use, use_tree, subset=ts_reduction,
        min_occ=min_occ, min_files=min_files)
    used_ftr_sizes = feature_sizes[ftr_use]
    #   remember, we only use one feature
    vocab_size = used_ftr_sizes[0]
    log.info("Data loaded, %d ngrams", ngrams.shape[0])

    #   split data into sets
    x_train, x_valid, x_test = util.dataset_split(ngrams, 0.05, 0.05, rng=456)

    #   generate a version of the validation set that has
    #   the first term (the conditioned one) randomized
    #   w.r.t. unigram distribution
    #   so first create the unigram distribution, no smoothing
    unigrams_data = data.load_ngrams(1, ftr_use, False, subset=ts_reduction,
                                     min_occ=min_occ, min_files=min_files)[0]
    unigrams_data = NgramModel(1, False, ftr_use, feature_sizes, ts_reduction,
                               min_occ, min_files, 0.0, 0.0, unigrams_data)
    unigrams_dist = unigrams_data.probability_additive(
        np.arange(vocab_size).reshape(vocab_size, 1))
    unigrams_dist /= unigrams_dist.sum()
    #   finally, generate validation sets with randomized term
    x_valid_r = random_ngrams(x_valid, vocab_size, False, unigrams_dist)

    #   the directory for this model
    dir = "%s_%s_%d-gram_features-%s_data-subset_%r-min_occ_%r-min_files_%r"\
        % ("llbl" if use_lbl else "lmlp",
            "tree" if use_tree else "linear", n,
            "".join([str(int(b)) for b in ftr_use]),
            ts_reduction, min_occ, min_files)
    dir = os.path.join(_DIR, dir)
    if not os.path.exists(dir):
        os.makedirs(dir)

    #   filename base for this model
    file = "nhid-%d_d-%d_train_mnb-%d_epochs-%d_eps-%.5f" % (
        n_hid, d, mnb_size, epochs, eps)

    #   store the logs
    if False:
        log_file_handler = logging.FileHandler(
            os.path.join(dir, file + ".log"))
        log_file_handler.setLevel(logging.INFO)
        logging.root.addHandler(log_file_handler)

    #   we will plot log-lik ratios for every _VALIDATE_MNB minibatches
    #   we will also plot true mean log-lik
    valid_on = {"x_valid": x_valid[:_LL_SIZE], "x_valid_r": x_valid_r[
        :_LL_SIZE], "x_train": x_train[:_LL_SIZE]}
    valid_ll = {k: [] for k in valid_on.keys()}
    valid_p_mean = {k: [] for k in valid_on.keys()}

    #   how often we validate
    mnb_count = (x_train.shape[0] - 1) / mnb_size + 1
    _VALIDATE_MNB = mnb_count / val_per_epoch

    def mnb_callback(net, epoch, mnb):
        """
        Callback function called after every minibatch.
        """
        if (mnb + 1) % _VALIDATE_MNB:
            return

        #   calculate log likelihood using the exact probability
        probability_f = theano.function([net.input], net.probability)
        for name, valid_set in valid_on.iteritems():
            p = probability_f(valid_set)
            valid_ll[name].append(np.log(p).mean())
            valid_p_mean[name].append(p.mean())

        log.info('Epoch %d, mnb: %d, x_valid mean-log-lik: %.5f'
                 ' , x_valid p-mean: %.5f'
                 ' , ln(p(x_valid) / p(x_valid_r).mean(): %.5f',
                 epoch, mnb, valid_ll["x_valid"][-1],
                 valid_p_mean["x_valid"][-1],
                 valid_ll["x_valid"][-1] - valid_ll["x_valid_r"][-1])

    #   track if the model progresses on the sentence completion challenge
    # sent_challenge = []

    def epoch_callback(net, epoch):

        #   log some info about the parameters, just so we know
        param_mean_std = [(k, v.mean(), v.std())
                          for k, v in net.params().iteritems()]
        log.info("Epoch %d: %s", epoch, "".join(
            ["\n\t%s: %.5f +- %.5f" % pms for pms in param_mean_std]))

        #   evaluate model on the sentence completion challenge
        # probability_f = theano.function([net.input], net.probability)
        # qg_log_lik = [[np.log(probability_f(q)).sum() for q in q_g]
        #               for q_g in q_groups]
        # predictions = map(lambda q_g: np.argmax(q_g), qg_log_lik)
        # sent_challenge.append((np.array(predictions) == answers).mean())
        # log.info('Epoch %d sentence completion eval score: %.4f',
        #          epoch, sent_challenge[-1])

    log.info("Creating model")
    if use_lbl:
        net = LLBL(n, vocab_size, d, 12345)
    else:
        net = LMLP(n, vocab_size, d, 12345)
    net.mnb_callback = mnb_callback
    net.epoch_callback = epoch_callback
    train_cost, valid_cost, _ = net.train(
        x_train, x_valid, mnb_size, epochs, eps)

    #   plot training progress info
    #   first we need values for the x-axis (minibatch count)
    mnb_count = (x_train.shape[0] - 1) / mnb_size + 1
    mnb_valid_ep = mnb_count / _VALIDATE_MNB
    x_axis_mnb = np.tile((np.arange(mnb_valid_ep) + 1) * _VALIDATE_MNB, epochs)
    x_axis_mnb += np.repeat(np.arange(epochs) * mnb_count, mnb_valid_ep)
    x_axis_mnb = np.hstack(([0], x_axis_mnb))

    plt.figure(figsize=(16, 12))
    plt.subplot(221)
    plt.plot(mnb_count * (np.arange(epochs) + 1), train_cost, 'b-',
             label='train')
    plt.plot(mnb_count * (np.arange(epochs) + 1), valid_cost, 'g-',
             label='valid')
    plt.axhline(min(valid_cost), linestyle='--', color='g')
    plt.yticks(list(plt.yticks()[0]) + [min(valid_cost)])
    plt.title('cost')
    plt.grid()
    plt.legend(loc=1)

    plt.subplot(222)
    for name, valid_set in valid_ll.items():
        plt.plot(x_axis_mnb, valid_set, label=name)
    plt.ylim((np.log(0.5 / vocab_size),
              max([max(v) for v in valid_ll.values()]) + 0.5))
    plt.axhline(max(valid_ll["x_valid"]), linestyle='--', color='g')
    plt.yticks(list(plt.yticks()[0]) + [max(valid_ll["x_valid"])])
    plt.title('log-likelihood(x)')
    plt.grid()
    plt.legend(loc=4)

    plt.subplot(224)
    for name, valid_set in valid_p_mean.items():
        plt.plot(x_axis_mnb, valid_set, label=name)
    plt.title('p(x).mean()')
    plt.grid()
    plt.legend(loc=4)

    # plt.subplot(224)
    # plt.plot(mnb_count * np.arange(epochs + 1), sent_challenge, 'g-')
    # plt.title('sent_challenge')
    # plt.grid()

    plt.savefig(os.path.join(dir, file + ".pdf"))
Code example #15
            test_text.extend(sentences)
        else:
            test_text.append(txt)

    #print test_files
    print len(test_files)

    total_train_files = []
    TOTAL = INCREMENT
    UPPER_LIMIT = 500

    while len(total_train_files) < UPPER_LIMIT:
        total_train_files = train_files[:TOTAL]
        data_set_corpus = PlaintextCorpusReader(sys.argv[1], total_train_files)
        estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        lm = NgramModel(3, data_set_corpus.words(), estimator)
        #lm = NgramModel(2, data_set_corpus.words(), estimator)

        P = []
        for s in test_text:
            s_tokens = nltk.word_tokenize(s)
            if SENTENCE:
                #if len(s_tokens) > 3:
                if len(s_tokens) > 10:
                    p = lm.perplexity(s_tokens)
                    P.append(p)
            else:
                p = lm.perplexity(s_tokens)
                P.append(p)
        TOTAL += INCREMENT
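
The fragment in example #15 scores held-out sentences by perplexity under a Lidstone-smoothed trigram model from the legacy NLTK API. A hedged sketch of the same kind of evaluation with the current nltk.lm API, using Laplace smoothing as a stand-in for Lidstone:

import nltk
from nltk.lm import Laplace
from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends

train_sentences = [nltk.word_tokenize(s) for s in
                   ["the cat sat on the mat", "the dog sat on the log"]]
train_data, vocab = padded_everygram_pipeline(3, train_sentences)
lm = Laplace(3)
lm.fit(train_data, vocab)

test_tokens = nltk.word_tokenize("the cat sat on the log")
test_trigrams = list(nltk.ngrams(pad_both_ends(test_tokens, n=3), 3))
print(lm.perplexity(test_trigrams))  # lower is better
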
Code example #16
        with open(file=path, mode="wb") as fp:
            fp.write(pickle.dumps(obj=params_dict))

    @classmethod
    def load(cls, path):
        """
        Load a saved model.
        :param path: path to the saved file
        :return:
        """
        params_dict = pickle.load(open(file=path, mode="rb"))
        lookup_table = params_dict['_lookup_table']
        ngram_model = pickle.loads(params_dict['_ngram_model_pickle'],
                                   fix_imports=True)
        return cls(ngram_model=ngram_model, lookup_table=lookup_table)


if __name__ == '__main__':
    from nltk.text import Text
    from nltk.corpus import gutenberg

    text1 = Text(gutenberg.words('melville-moby_dick.txt'))
    #
    ngramCounter = NgramCounter(order=2, train=text1)
    ngramModel = NgramModel(ngram_counter=ngramCounter)

    corrector = NgramCorrector(ngram_model=ngramModel)
    print(corrector.correct(['I', 'dooo', 'think', 'you', 'rre', 'goooood']))
    corrector2 = NgramCorrector.load("123")
    print(corrector2.correct(['I', 'don', 'think', 'you', 'rre', 'goooood']))
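
The load() classmethod above unpickles a dict holding '_lookup_table' and a nested '_ngram_model_pickle'; the matching save side is cut off at the top of the snippet. A hedged reconstruction of what that save method plausibly looks like (the attribute names on self are assumptions, not from the source):

    def save(self, path):
        """Persist the corrector as a pickled parameter dict (reconstructed sketch)."""
        params_dict = {
            '_lookup_table': self._lookup_table,                      # assumed attribute
            '_ngram_model_pickle': pickle.dumps(self._ngram_model),   # assumed attribute
        }
        with open(file=path, mode="wb") as fp:
            fp.write(pickle.dumps(obj=params_dict))
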
Code example #17
class LVGNgramGenerator:
    """
    Lemmatized vocabulary and grammar ngram-based generator
    """
    def __init__(self, tuples, n):
        """
        Parameters
        ----------
        tuples : Iterable[(str, str, str)]
            A list of (word, lemma, tag) tuples from which to learn a model
        n : int
            Maximum size of n-grams to use in NgramModel
        """
        self._n = n
        print("Creating models... (this may take some time)")
        self._make_models(tuples)
        print("Done!")

    def _make_models(self, tuples):
        self._word_ids = WordIdDictionary()

        # Extract sequence of words, lemmas, and tags
        words, lemmas, tags = tuple(
            map(
                lambda tokens: list(self._word_ids.add_words_transform(tokens)
                                    ), zip(*tuples)))
        self._tags = tags

        # Create models for words, lemmas, and tags
        self._words_ngram = NgramModel(words, self._n)
        self._lemmas_ngram = NgramModel(lemmas, self._n)
        self._tags_ngram = NgramModel(
            tags, 2 * self._n)  # Can afford to use 2 * n-gram size for grammar

        # Map tag and (tag, lemma) to valid lemmas and vocabulary, respectively
        # It's faster to use a list than predicate on unigrams during backoff search
        self._tag_lemmas = ConditionalFreqDist(zip(tags, lemmas))
        self._tag_lemma_words = ConditionalFreqDist(
            zip(zip(tags, lemmas), words))

    def generate_without_pos(self, n):
        """
        Generate n words without using any special POS information
        """
        # Just use words NgramModel generate function
        generated_words = self._words_ngram.generate(n)
        return list(self._word_ids.transform_ids(generated_words))

    def generate(self, n):
        """
        Generate n words using copied grammar, generated lemmas, and words based on lemmas
        """
        start = random.randint(n, len(self._tags) - n)
        generated_tags = self._tags[
            start:start + n]  # Copy a random section of POS tags for grammar

        # Generate sequence of lemmas based off of grammar
        generated_lemmas = []
        for tag in generated_tags:
            # Search for and choose a lemma with correct tag
            choice = self._lemmas_ngram.choose_word(
                generated_lemmas,
                backoff_limit=2,
                predicate=lambda lemma: lemma in self._tag_lemmas[tag])
            if choice is None:
                # Could not find a good lemma for current POS tag, choose from list
                choice = MLEProbDist(self._tag_lemmas[tag]).generate()
            generated_lemmas.append(choice)

        # Generate sequence of words based off of lemmas and grammar
        generated_words = []
        for (tag, lemma) in zip(generated_tags, generated_lemmas):
            # Search for and choose word with correct lemma/tag
            choices = self._words_ngram.backoff_search(
                generated_words,
                backoff_limit=2,
                predicate=lambda word: word in self._tag_lemma_words[
                    (tag, lemma)])
            if choices is None:
                # Could not find a good word, choose from list
                choices = self._tag_lemma_words[(tag, lemma)]
            generated_words.append(MLEProbDist(choices).generate())

        return list(self._word_ids.transform_ids(generated_words))

    def generate_alternative(self, n):
        """
        Generate n words using a more complicated algorithm
        """
        generated_tags = []
        generated_lemmas = []
        generated_words = []

        # Incrementally generate (tag, lemma) pairs
        for i in range(n):
            tag_choice = None  # Start with nothing

            # Loop through n-grams of grammar
            size = 2 * self._n
            while size > 2:
                tag_choices = self._tags_ngram.backoff_search(
                    generated_tags,
                    backoff_limit=2,
                    predicate=lambda tag: True,
                    start_n=size)

                # Determine valid lemmas in context with these tag choices
                tag_to_lemma = {}
                if tag_choices is not None:
                    for tag, _ in tag_choices.items():
                        # For each tag, find valid lemmas in context with that tag
                        lemma = self._lemmas_ngram.choose_word(
                            generated_lemmas,
                            backoff_limit=2,
                            predicate=lambda lemma: lemma in self._tag_lemmas[
                                tag])
                        if lemma is not None:
                            tag_to_lemma[tag] = lemma

                    if len(tag_to_lemma) > 1:
                        # We have found valid (tag, lemma) pairs
                        tag_probdist = MLEProbDist(
                            FreqDist({
                                tag: freq
                                for tag, freq in tag_choices.items()
                                if tag in tag_to_lemma
                            }))
                        tag_choice = tag_probdist.generate(
                        )  # Randomly select the tag
                        lemma_choice = tag_to_lemma[
                            tag_choice]  # Set the lemma
                        break
                size -= 1  # Lower to smaller n-gram for more tag choices

            if tag_choice is None:
                # We still didn't find a valid (tag, lemma) pair, fallback
                tag_choice = MLEProbDist(tag_choices).generate()
                lemma_choice = MLEProbDist(
                    self._tag_lemmas[tag_choice]).generate()

            generated_tags.append(tag_choice)
            generated_lemmas.append(lemma_choice)

        # Generate all words based on (tag, lemma) pairs
        for (tag, lemma) in zip(generated_tags, generated_lemmas):
            # Search for and choose word with correct lemma/tag
            choices = self._words_ngram.backoff_search(
                generated_words,
                backoff_limit=2,
                predicate=lambda word: word in self._tag_lemma_words[
                    (tag, lemma)])
            if choices is None:
                # Could not find a good word, choose from list
                choices = self._tag_lemma_words[(tag, lemma)]
            generated_words.append(MLEProbDist(choices).generate())

        return list(self._word_ids.transform_ids(generated_words))
Code example #19
class AGLearner:
    """A simple artificial language grammar learner."""

    def __init__(self):
        # Input counts
        # Number of times each symbol is seen
        self.counts = defaultdict(int)
        # Number of times symbols co-occur
        self.cooccurs = defaultdict(lambda: defaultdict(int))
        # Whether a symbol is observed before or after another symbol
        self.before = {sym: set() for sym in SYMBOLS}
        self.after = {sym: set() for sym in SYMBOLS}

        # Learning structures
        self.requires = defaultdict(set)
        self.excludes = defaultdict(set)
        self.noprecede = defaultdict(set)
        self.nofollow = defaultdict(set)
        self.mustprecede = {sym: set(SYMBOLS) for sym in SYMBOLS}
        self.mustfollow = {sym: set(SYMBOLS) for sym in SYMBOLS}
        self.ngram = None

    def train(self, train_path):
        """Train based on co-occurences in the provided data."""
        term_symbols = []

        # Process input
        with open(train_path, "Ur") as train_file:
            for line in train_file:
                line_symbols = line.split()

                # Learn co-occurrences and precedes/follows
                for idx, sym1 in enumerate(line_symbols):
                    # Count the symbol
                    self.counts[sym1] += 1

                    # Get the sets of the other items
                    preceding_symbols = set(line_symbols[:idx])
                    following_symbols = set(line_symbols[idx + 1:])
                    other_symbols = preceding_symbols | following_symbols

                    # Count co-occurrences
                    for sym2 in other_symbols:
                        self.cooccurs[sym1][sym2] += 1

                    # Mark before/after
                    for sym2 in preceding_symbols:
                        self.before[sym1].add(sym2)
                    for sym2 in following_symbols:
                        self.after[sym1].add(sym2)

                    # Remove if one of the always relationships does
                    # not hold up
                    for sym2 in SYMBOLS:
                        if sym2 not in preceding_symbols:
                            try:
                                self.mustprecede[sym1].remove(sym2)
                            except KeyError:
                                pass
                        if sym2 not in following_symbols:
                            try:
                                self.mustfollow[sym1].remove(sym2)
                            except KeyError:
                                pass

                # Add beginning and end terminators for n-grams
                term_symbols.extend([START_SYM] + line_symbols + [END_SYM])

        # Learn
        # Requires/excludes and precedes/follows
        for sym1 in SYMBOLS:
            for sym2 in SYMBOLS:
                # Co-occurrence counts imply requires/excludes
                count1 = self.counts[sym1]
                if self.cooccurs[sym1][sym2] == count1:
                    self.requires[sym1].add(sym2)
                elif self.cooccurs[sym1][sym2] == 0:
                    self.excludes[sym1].add(sym2)

                # Figure out what cannot precede/follow
                if sym2 not in self.before[sym1]:
                    self.noprecede[sym1].add(sym2)
                if sym2 not in self.after[sym1]:
                    self.nofollow[sym1].add(sym2)

        # N-gram model
        self.ngram = NgramModel(2, term_symbols)

    def report(self):
        """Report the rules learned."""
        print "Co-occurence rules:"
        for sym in SYMBOLS:
            print sym, "requires", ', '.join(sorted(self.requires[sym]))
            print sym, "excludes", ','.join(sorted(self.excludes[sym]))

        print
        print "Linear precedence rules:"
        for sym in SYMBOLS:
            print sym, "cannot be preceded by", ', '.join(sorted(self.noprecede[sym]))
            print sym, "cannot be followed by", ', '.join(sorted(self.nofollow[sym]))

        print
        print "N-grams:"
        for event, context, prob in self.ngram.allngrams():
            print "{0} -> {1}: {2}".format(' '.join(context), event, prob)

    def test(self, test_path, out_path):
        """Test on a file"""
        test_file = open(test_path, "Ur")
        out_file = open(out_path, "w")

        header = ["Sentence", "Gold response", "Co-occur response", "Co-occur reason",
                  "Linear response", "Linear reason", "N-gram prob."]
        print >> out_file, "\t".join(header)

        for line in test_file:
            sent, gold = line.strip().split(',')
            gold = (gold.strip() == "True")
            line_symbols = sent.split()
            line_symbols_term = [START_SYM] + line_symbols + [END_SYM]

            # Decode violations
            cooccur_ok = True
            cooccur_reasons = set()
            linear_ok = True
            linear_reasons = set()

            for idx, sym1 in enumerate(line_symbols):
                # Get the sets of the other items
                preceding_symbols = set(line_symbols[:idx])
                following_symbols = set(line_symbols[idx + 1:])
                other_symbols = preceding_symbols | following_symbols

                # Check requirements and exclusions for each pair
                # Excluded symbols that are present
                for sym2 in self.excludes[sym1] & other_symbols:
                    cooccur_ok = False
                    cooccur_reasons.add("{0} excludes {1}".format(sym1, sym2))
                # Required symbols that are missing
                for sym2 in self.requires[sym1] & (set(SYMBOLS) - other_symbols):
                    cooccur_ok = False
                    cooccur_reasons.add("{0} requires {1}".format(sym1, sym2))

                # Check that preceding/following symbols are okay
                for prec_sym in preceding_symbols:
                    if prec_sym in self.noprecede[sym1]:
                        linear_ok = False
                        linear_reasons.add("{1} cannot precede {0}".format(sym1, prec_sym))
                for fol_sym in following_symbols:
                    if fol_sym in self.nofollow[sym1]:
                        linear_ok = False
                        linear_reasons.add("{1} cannot follow {0}".format(sym1, fol_sym))

                # Check for missing preceding/following symbols
                for prec_sym in self.mustprecede[sym1] - preceding_symbols:
                    linear_ok = False
                    linear_reasons.add("{1} must precede {0}".format(sym1, prec_sym))
                for fol_sym in self.mustfollow[sym1] - following_symbols:
                    linear_ok = False
                    linear_reasons.add("{1} must follow {0}".format(sym1, fol_sym))

            # N-gram statistics
            prob = self.ngram.seqprob(line_symbols_term)

            # Output
            print >> out_file, "\t".join([" ".join(line_symbols), str(gold),
                                          str(cooccur_ok), ", ".join(cooccur_reasons),
                                          str(linear_ok), ", ".join(linear_reasons),
                                          str(prob)])

        # Clean up
        test_file.close()
        out_file.close()
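
AGLearner's public surface is train(), report(), and test(); the module-level SYMBOLS, START_SYM, and END_SYM constants and the NgramModel with seqprob() and allngrams() come from elsewhere in the project. A hedged usage sketch, assuming those are importable and with placeholder file paths:

# Hypothetical driver for the class above; paths and module layout are assumptions.
learner = AGLearner()
learner.train("train_sentences.txt")
learner.report()
learner.test("test_sentences.csv", "results.tsv")
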