Example #1
File: lm.py Project: rayruu/inf1820
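Note: these lm.py snippets are shown without their import headers. A minimal set that should make them run under NLTK 3 (an assumption; the original files may differ):

import nltk
from math import log
from nltk import bigrams
from nltk.probability import (ConditionalFreqDist, ConditionalProbDist,
                              FreqDist, LaplaceProbDist)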
    def __init__(self):
        self.bigrams = ConditionalFreqDist()
        self.unigrams = FreqDist()
        sentences = nltk.corpus.brown.sents(
            categories=nltk.corpus.brown.categories()[1:])

        for sent in sentences:
            # We pad the sentence with a None in front, to mark the start
            # of the sentence, and a None at the end, to mark its end.
            sent = [None] + sent + [None]
            for prev, word in bigrams(sent):
                self.bigrams[prev][word] += 1
                self.unigrams[word] += 1

        self.bigrams = ConditionalProbDist(self.bigrams, LaplaceProbDist)
        self.unigrams = LaplaceProbDist(self.unigrams)

        ############################# modified lm ####################################
        # regular expression:
        self.patterns = [
            (r'.*ing$', 'VBG'),  # gerunds
            (r'.*ed$', 'VBD'),  # simple past
            (r'.*es$', 'VBZ'),  # 3rd singular present
            (r'.*ould$', 'MD'),  # modals
            (r'.*\'s$', 'NN$'),  # possessive nouns
            (r'.*s$', 'NNS'),  # plural nouns
            (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'.*', 'NN')  # nouns (default)
        ]
        # regular expression (modified):
        self.patternsModified = [
            (r'(.*able|.*ish|.*ible)$', 'JJ'),  # adjectives              # 1
            (r'(The|the|A|a|An|an)$', 'AT'),  # articles                # 2
            (r'(a|an|my|some|the)$', 'DT'),  # determiners             # 3
            (r'(our|its|his|their|my|your|her|out|thy|mine|thine)$',
             'PP$'),  # possessive pronouns     # 4
            (r'(.*ily|.*ly)$', 'ADV'),  # adverbs                 # 5
            (r'(at|in|of|over|with)$', 'PP'),  # prepositions            # 6
            (r'(and|because|but|if|or)$',
             'CNJ'),  # conjunctions            # 7
            (r'([\.?!;:]+)$', '.'),  # sentence terminators    # 8
            (r'(\,)$', ','),  # comma                   # 9
            (r'(\-)$', '-'),  # dash                    # 10
            (r'.*ing$', 'VBG'),  # gerunds
            (r'.*ed$', 'VBD'),  # simple past
            (r'.*es$', 'VBZ'),  # 3rd singular present
            (r'.*ould$', 'MD'),  # modals
            (r'.*\'s$', 'NN$'),  # possessive nouns
            (r'.*s$', 'NNS'),  # plural nouns
            (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'.*', 'NN')  # nouns (default)
        ]
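Both lists are in the (regex, tag) format expected by nltk.RegexpTagger, which assigns the tag of the first pattern that matches. A small hedged sketch, assuming the list is bound to a module-level name `patterns` (the sentence is made up):

from nltk import RegexpTagger

tagger = RegexpTagger(patterns)
print(tagger.tag('the dog was running'.split()))
# 'running' matches .*ing$ -> VBG, 'was' matches .*s$ -> NNS,
# 'the' and 'dog' fall through to the NN default.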
Example #2
    def __init__(self):
        self.bigrams = ConditionalFreqDist()
        self.unigrams = FreqDist()
        sentences = nltk.corpus.brown.sents(
            categories=nltk.corpus.brown.categories()[1:])

        for sent in sentences:
            sent = [None] + sent + [None]
            for prev, word in bigrams(sent):
                # Legacy NLTK 2 API: FreqDist.inc() was removed in NLTK 3,
                # where the equivalent is `self.bigrams[prev][word] += 1`.
                self.bigrams[prev].inc(word)
                self.unigrams.inc(word)

        self.bigrams = ConditionalProbDist(self.bigrams, LaplaceProbDist)
        self.unigrams = LaplaceProbDist(self.unigrams)
Example #3
File: lm.py Project: emmestl/UiO
    def __init__(self):
        self.bigrams = ConditionalFreqDist()
        self.unigrams = FreqDist()
        sentences = nltk.corpus.brown.sents(categories=nltk.corpus.brown.categories()[1:])

        for sent in sentences:
            # We pad the sentence with a None in front, to mark the start
            # of the sentence, and a None at the end, to mark its end.
            sent = [None] + sent + [None]
            for prev, word in bigrams(sent):
                self.bigrams[prev][word] += 1
                self.unigrams[word] += 1

        self.bigrams = ConditionalProbDist(self.bigrams, LaplaceProbDist)
        self.unigrams = LaplaceProbDist(self.unigrams)
Example #4
def build_LM(in_file):
    """
    build language models for each label
    each line in in_file contains a label and a string separated by a space
    """
    print('building language models...')

    lines = open(in_file, "r")
    observedFourGrams = set()
    LM = {}

    for line in lines:  # This loop builds the raw count model for each label (no smoothing yet).
        word_list = line.split()
        language = word_list[0]
        word_string = ' '.join(word_list[1:]).lower()
        fourGrams = ngrams(list(word_string), 4)

        if language not in LM:
            LM[language] = FreqDist()

        for gram in fourGrams:
            strGram = ''.join(gram)
            LM[language][strGram] += 1
            observedFourGrams.add(strGram)

    for language in LM:  # This loop converts each count model to a probability distribution with add-one smoothing.
        LM[language] = LaplaceProbDist(LM[language], len(observedFourGrams))

    return LM
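The example does not show how the returned models are queried; a hedged sketch of a label-scoring helper built on the same LaplaceProbDist.prob() API (score_label is an invented name):

import math
from nltk.util import ngrams

def score_label(LM, text):
    # Sum of smoothed log-probabilities of the string's character 4-grams.
    grams = [''.join(g) for g in ngrams(list(text.lower()), 4)]
    return {lang: sum(math.log(pd.prob(g)) for g in grams)
            for lang, pd in LM.items()}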
Example #5
    def _load_corpus_counts(cls):
        input_file = os.path.join(settings.DATA_DIR,
                'corpus', 'jp_char_corpus_counts.gz')
        freq_dist = FreqDist()
        # Binary mode is required when wrapping the stream in GzipFile.
        with open(input_file, 'rb') as istream:
            istream = gzip.GzipFile(fileobj=istream)
            istream = codecs.getreader('utf8')(istream)
            for line in istream:
                kanji, count = line.split()
                # Legacy NLTK 2 API; NLTK 3 uses freq_dist[kanji] += int(count)
                # (see Example #8 for a Python 3 rewrite).
                freq_dist.inc(kanji, count=int(count))

        return LaplaceProbDist(freq_dist)
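The point of the Laplace wrapper: characters absent from the corpus still get a nonzero probability. A minimal illustration with toy counts:

fd = FreqDist({'日': 10, '本': 5})
pd = LaplaceProbDist(fd)
pd.prob('日')  # smoothed estimate, slightly below the raw 10/15
pd.prob('語')  # unseen, but strictly positive under add-one smoothing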
Example #6
class LM:
    def __init__(self):
        self.bigrams = ConditionalFreqDist()
        self.unigrams = FreqDist()
        sentences = nltk.corpus.brown.sents(
            categories=nltk.corpus.brown.categories()[1:])

        for sent in sentences:
            sent = [None] + sent + [None]
            for prev, word in bigrams(sent):
                self.bigrams[prev].inc(word)  # legacy NLTK 2 API; NLTK 3: += 1
                self.unigrams.inc(word)

        self.bigrams = ConditionalProbDist(self.bigrams, LaplaceProbDist)
        self.unigrams = LaplaceProbDist(self.unigrams)

    def p(self, w, prev):
        p = 0.5 * self.unigrams.prob(w)
        if prev in self.bigrams:
            p += self.bigrams[prev].prob(w)
        return p

    def logprob(self, s):
        # Fixed: the original called the undefined self.bigram() and passed
        # (prev, word) to p() in the wrong order.
        p = 0
        for prev, word in bigrams(s):
            p += log(self.p(word, prev), 2)
        return p

    def perplexity(self, sents):
        l = 0
        N = 0

        for line in sents:
            l += self.logprob(line)
            N += len(line)

        # Computed once, after the totals over all sentences are accumulated.
        return pow(2, -l / N)
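A hedged usage sketch: since the constructor trains on every Brown category except the first, the held-out first category (alphabetically, 'adventure') is a natural test set:

lm = LM()
held_out = nltk.corpus.brown.sents(
    categories=nltk.corpus.brown.categories()[:1])
print(lm.perplexity(held_out))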
Example #7
class LM:
    def __init__(self):
        self.bigrams = ConditionalFreqDist()
        self.unigrams = FreqDist()
        sentences = nltk.corpus.brown.sents(
            categories=nltk.corpus.brown.categories()[1:])

        for sent in sentences:
            # We pad the sentence with a None in front, to mark the start
            # of the sentence, and a None at the end, to mark its end.
            sent = [None] + sent + [None]
            for prev, word in bigrams(sent):
                self.bigrams[prev].inc(word)  # legacy NLTK 2 API; NLTK 3: += 1
                self.unigrams.inc(word)

        self.bigrams = ConditionalProbDist(self.bigrams, LaplaceProbDist)
        self.unigrams = LaplaceProbDist(self.unigrams)

    def p(self, w, prev):
        p = 0.5 * self.unigrams.prob(w)
        if prev in self.bigrams:
            p += self.bigrams[prev].prob(w)
        return p

    def logprob(self, s):
        s = [None] + s + [None]
        logprob = 0.0
        for prev, word in bigrams(s):
            logprob += log(self.p(word, prev), 2)
        return logprob

    def perplexity(self, sents):
        logprob = 0.0
        words = 0
        for s in sents:
            words += len(s)
            logprob += self.logprob(s)
        return pow(2, -logprob / words)
Example #8
    def _load_corpus_counts(cls):
        input_file = os.path.join(settings.DATA_DIR,
                                  'jp_char_corpus_counts.gz')
        freq_dist = FreqDist()
        with gzip.open(input_file, 'rb') as istream:
            for line in istream:
                kanji, count = line.split()

                # gzip yields bytes; decode both fields to str
                kanji = kanji.decode()
                count = count.decode()

                freq_dist[kanji] += int(count)

        return LaplaceProbDist(freq_dist)
Example #9
File: lm.py Project: emmestl/UiO
class LM:
    def __init__(self):
        self.bigrams = ConditionalFreqDist()
        self.unigrams = FreqDist()
        sentences = nltk.corpus.brown.sents(categories=nltk.corpus.brown.categories()[1:])

        for sent in sentences:
            # We pad the sentence with a None in front, to mark the start
            # of the sentence, and a None at the end, to mark its end.
            sent = [None] + sent + [None]
            for prev, word in bigrams(sent):
                self.bigrams[prev][word] += 1
                self.unigrams[word] += 1

        self.bigrams = ConditionalProbDist(self.bigrams, LaplaceProbDist)
        self.unigrams = LaplaceProbDist(self.unigrams)

    def p(self, w, prev):
        p = 0.5 * self.unigrams.prob(w)
        if prev in self.bigrams:
            p += self.bigrams[prev].prob(w)
        return p

    def logprob(self, s):
        prob = 0

        # Accept either a pre-tokenized sentence or a raw string.
        # (The original used Python 2's basestring; str is the
        # Python 3 equivalent.)
        if isinstance(s, str):
            s = s.split()

        for prev, w in bigrams(s):
            prob += log(self.p(w, prev), 2)

        return prob


    def perplexity(self, sents):
        l = 0
        N = 0
        for s in sents:
            l += self.logprob(s)
            N += len(s)

        return pow(2, -l/N)
Example #10
 def __init__(self, vecs_path, data_path, min_count):
   '''
   Initialize class to track aligned geographic vector spaces
   
   Arguments:
    * vecs_path - Path to directory containing word2vec-style text files
    * data_path - Path to directory of text files of whitespace-separated
      tokens the vectors are derived from, with matching geography file names
    * min_count - The probability of a word with this frequency in the
      geography with the least training data becomes a lower bound on a
      word's maximum p(word|geo) for inclusion. MUST be at least word2vec's
      minimum frequency.
   '''
   self.vector_spaces = []
   self.name2id = dict()
   self.id2name = []
   
   freq_dists = []
   
   for f_name in os.listdir(vecs_path):
     self.name2id[f_name] = len(self.id2name)
     self.id2name.append(f_name)
     
     # Add vector space, frequency distribution for geography
     self.vector_spaces.append(KeyedVectors.load_word2vec_format(os.path.join(vecs_path, f_name)))
     print("Loaded " + f_name + " vectors")
     # freq_dists.append(FreqDist(chain(*[l.split() for l in open(os.path.join(data_path, f_name))])))
     fd = FreqDist()
     
     for l in open(os.path.join(data_path, f_name)):
       for w in l.split():
         fd[w] += 1
         
     print("Built " + f_name + " frequency distribution")
     
     freq_dists.append(fd)
     
   self.num_geos = len(self.id2name)
   
   # p(word|geography) distributions for each geography, with Laplace smoothing
   prob_dists = [LaplaceProbDist(fd) for fd in freq_dists]
   print("Built probability distributions")
   
   # Find the probability of a word with frequency min_count in the geographic region
   # with the least data
   min_prob = min(prob_dists, key=lambda pd: pd.freqdist().N()).prob(None) * (min_count + 1)
   
   # Build vocab from items whose probs in most overrepresented vocabularies
   # exceed threshold
   # Allows exclusion of low-probability items that is not biased against geographies
   # with less associated data
   self.vocab = list({w for pd in prob_dists for w in pd.samples() if pd.prob(w) >= min_prob})
   print("Loaded vocabulary")
   
   self.probs = BaseKeyedVectors(self.num_geos)
   self.probs.add(self.vocab, [np.array([pd.prob(w) for pd in prob_dists]) for w in self.vocab])
   print("Built probability vectors")
   
   # pmi = log(p(word|geo)/p(word)) = log(p(word|geo)) - log(p(word))
   # Let p(word) = avg p(word|geo) over all geographies
   # allows equal weighting of each geographic vector space regardless of token count
   pmi = np.log(self.probs.vectors) - np.log(self.probs.vectors.mean(axis=1).reshape(-1, 1))
   self.pmi = KeyedVectors(self.num_geos)
   self.pmi.add(self.vocab, pmi)
   print("Built PMI vectors")
Example #11
from __future__ import division
from nltk.corpus.reader import ConllCorpusReader
from nltk.probability import FreqDist, DictionaryProbDist, LaplaceProbDist, SimpleGoodTuringProbDist, MLEProbDist

conllreader = ConllCorpusReader(".", "de-train.tt", ('words', 'pos'))  # getting a train corpus from file
states = ('VERB', 'NOUN', 'PRON', 'ADJ', 'ADV', 'ADP', 'CONJ', 'DET', 'NUM', 'PRT', 'X', '.')  # list of 12 POS tags
sentslen = len(conllreader.tagged_sents())  # getting number of sentences

tagfdist = FreqDist(pair[1] for pair in conllreader.tagged_words())   # tag frequencies

firsttagfdist = FreqDist(pair[0][1] for pair in conllreader.tagged_sents())  # frequency of sentence-initial tags
# Python 3: dict comprehensions replace the Python 2 tuple-parameter lambdas
A0j = DictionaryProbDist({k: x / sentslen for k, x in firsttagfdist.items()})
A0jLap = LaplaceProbDist(firsttagfdist)
A0jGT = SimpleGoodTuringProbDist(firsttagfdist)
A0jMLE = MLEProbDist(firsttagfdist)

TagPair = []
words = conllreader.tagged_words()
for i in range(0, len(words)-1):
    TagPair.append((words[i][1], words[i+1][1]))

TagPairfdist = FreqDist(TagPair)
Aij = DictionaryProbDist({k: x / tagfdist.get(k[0]) for k, x in TagPairfdist.items()})
AijLap = LaplaceProbDist(TagPairfdist)
AijGT = SimpleGoodTuringProbDist(TagPairfdist)
AijMLE = MLEProbDist(TagPairfdist)

TagWordfdist = FreqDist(conllreader.tagged_words())
Biw = DictionaryProbDist({k: x / tagfdist.get(k[1]) for k, x in TagWordfdist.items()})
BiwLap = LaplaceProbDist(TagWordfdist)
BiwGT = SimpleGoodTuringProbDist(TagWordfdist)
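All of these estimator classes share the .prob() interface, so the smoothing schemes are easy to compare on a given event; a hedged sketch using one of the 12 states above:

for name, dist in (('MLE', A0jMLE), ('Laplace', A0jLap), ('Good-Turing', A0jGT)):
    print(name, dist.prob('NOUN'))
# MLE assigns zero to any tag never seen sentence-initially;
# the smoothed estimators do not.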
Example #12
fr = FreqDist(tokens)
print('X     |       P(X)')
print('___________________')
for word, count in fr.items():
    if word in vocab:
        print(word, '    |       ', round(count / len(tokens), 2))
# Probability mass of every token type outside the vocabulary is pooled as UNK.
UNK = round(sum(c for w, c in fr.items() if w not in vocab) / len(tokens), 2)
print('UNK   |       ', UNK)
print('== UNIGRAMS AFTER LAPLACE SMOOTHING ==')
lpt = LaplaceProbDist(fr)
print('X     |       P(X)')
print('___________________')
for d in vocab:
    print(d, '    |       ', round(lpt.prob(d), 2))
# Smoothed mass of observed types outside the vocabulary.
UNK = sum(lpt.prob(w) for w in lpt.samples() if w not in vocab)
print('UNK   |       ', round(UNK, 2))
print('=========== BIGRAMS ===========')
file = open('sampledata.txt', 'r')
filetext = file.read()
filetext = filetext.replace('</s>', '')
Example #13
File: lm.py Project: rayruu/inf1820
class LM:
    def __init__(self):
        self.bigrams = ConditionalFreqDist()
        self.unigrams = FreqDist()
        sentences = nltk.corpus.brown.sents(
            categories=nltk.corpus.brown.categories()[1:])

        for sent in sentences:
            # We pad the sentence with a None in front, to mark the start
            # of the sentence, and a None at the end, to mark its end.
            sent = [None] + sent + [None]
            for prev, word in bigrams(sent):
                self.bigrams[prev][word] += 1
                self.unigrams[word] += 1

        self.bigrams = ConditionalProbDist(self.bigrams, LaplaceProbDist)
        self.unigrams = LaplaceProbDist(self.unigrams)

        ############################# modified lm ####################################
        # regular expression:
        self.patterns = [
            (r'.*ing$', 'VBG'),  # gerunds
            (r'.*ed$', 'VBD'),  # simple past
            (r'.*es$', 'VBZ'),  # 3rd singular present
            (r'.*ould$', 'MD'),  # modals
            (r'.*\'s$', 'NN$'),  # possessive nouns
            (r'.*s$', 'NNS'),  # plural nouns
            (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'.*', 'NN')  # nouns (default)
        ]
        # regular expression (modified):
        self.patternsModified = [
            (r'(.*able|.*ish|.*ible)$', 'JJ'),  # adjectives              # 1
            (r'(The|the|A|a|An|an)$', 'AT'),  # articles                # 2
            (r'(a|an|my|some|the)$', 'DT'),  # determiners             # 3
            (r'(our|its|his|their|my|your|her|out|thy|mine|thine)$',
             'PP$'),  # possessive pronouns     # 4
            (r'(.*ily|.*ly)$', 'ADV'),  # adverbs                 # 5
            (r'(at|in|of|over|with)$', 'PP'),  # prepositions            # 6
            (r'(and|because|but|if|or)$',
             'CNJ'),  # conjunctions            # 7
            (r'([\.?!;:]+)$', '.'),  # sentence terminators    # 8
            (r'(\,)$', ','),  # comma                   # 9
            (r'(\-)$', '-'),  # dash                    # 10
            (r'.*ing$', 'VBG'),  # gerunds
            (r'.*ed$', 'VBD'),  # simple past
            (r'.*es$', 'VBZ'),  # 3rd singular present
            (r'.*ould$', 'MD'),  # modals
            (r'.*\'s$', 'NN$'),  # possessive nouns
            (r'.*s$', 'NNS'),  # plural nouns
            (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),  # cardinal numbers
            (r'.*', 'NN')  # nouns (default)
        ]

    def p(self, w, prev):
        p = 0.5 * self.unigrams.prob(w)
        if prev in self.bigrams:
            p += self.bigrams[prev].prob(w)
        return p

    def logprob(self, s):
        P = 0.0
        s = [None] + s + [None]
        for prevWord, word in bigrams(s):
            P += log(self.p(word, prevWord), 2)

        return P

    def perplexity(self, sents):
        N = 0.0
        l = 0.0

        for sent in sents:
            N += len(sent)
            l += self.logprob(sent)

        return 2**(-l / N)

    def zipfity(self, lst):
        frekvensOrd = FreqDist()
        frekvens = []

        # Pool lowercased word frequencies over all sentences.
        for setning in lst:
            frekvensOrd += FreqDist(word.lower() for word in setning)

        # For each of the 10 most common words: word, count, count / rank.
        r = 1
        for word, antall in frekvensOrd.most_common(10):
            frekvens.append([word, antall, antall / r])
            r += 1

        return frekvens

    def regularTagger(self, lst, regex='patterns'):
        regexp_tagger = self.setRegexPatterns(regex)
        tagger = []

        for sentence in lst:
            tagger.append(regexp_tagger.tag(sentence))

        return tagger

    def analyseRegularTagger(self, name, regex='patterns'):
        regexp_tagger = self.setRegexPatterns(regex)

        return regexp_tagger.evaluate(
            nltk.corpus.brown.tagged_sents(categories=name))

    def setRegexPatterns(self, regex):
        if regex == "patterns":
            return RegexpTagger(self.patterns)
        elif regex == "modified":
            return RegexpTagger(self.patternsModified)
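A hedged usage sketch, assuming the class above is instantiated as-is ('news' is a real Brown category):

lm = LM()
print(lm.analyseRegularTagger('news'))              # baseline pattern set
print(lm.analyseRegularTagger('news', 'modified'))  # extended pattern set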
Example #14
    plain_train = get_text(plain_path)
    # format the training data
    train_data = format_data(cipher_train, plain_train)

    # test data
    testc_path = cipher_folder + '/test_cipher.txt'
    testp_path = cipher_folder + '/test_plain.txt'
    testc = get_text(testc_path)
    testp = get_text(testp_path)
    # format the test data
    test_data = format_data(testc, testp)

    trainer = hmm.HiddenMarkovModelTrainer()

    # Laplace (add-one) estimator passed to the HMM trainer
    my_estimator = lambda fdist, bins: LaplaceProbDist(fdist, bins)

    if args.laplace_smoothing:
        if args.supplement:
            tagger = train_supervised2(trainer,
                                       train_data,
                                       extra_text(),
                                       estimator=my_estimator)
        else:
            tagger = trainer.train_supervised(train_data,
                                              estimator=my_estimator)
    else:
        if args.supplement:
            tagger = train_supervised2(trainer, train_data, extra_text())
        else:
            tagger = trainer.train_supervised(train_data)
Example #15
def laplace_estimator(fdist, bins):
    return LaplaceProbDist(fdist, bins=bins)
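As in Example #14, such an estimator plugs straight into NLTK's HMM trainer (train_data as defined there):

trainer = hmm.HiddenMarkovModelTrainer()
tagger = trainer.train_supervised(train_data, estimator=laplace_estimator)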