Example #1
from corpus import loadTextFromFile, getNGrams, getTokenCounts, prettyPrintFrequencyProfile, relativizeTokenCounts, tokenize
from math import log



def getMIScore(bigramprob, unigramprobA, unigramprobB):
    # MI contribution of a bigram: its probability times the log of the ratio
    # between the observed probability and the one expected under independence
    return bigramprob * log(bigramprob / (unigramprobA * unigramprobB))


def getMIScoreFromFQP(bigrams, unigrams, bigram):
    # same score, computed directly from the two frequency profiles
    tokenA, tokenB = bigram.split()
    return bigrams[bigram] * log(bigrams[bigram] /
                                 (unigrams[tokenA] * unigrams[tokenB]))



tokens = tokenize(loadTextFromFile("pg873.txt"))

# relative unigram frequencies
unigrams = getNGrams(tokens, 1)
relativizeTokenCounts(unigrams)

# relative bigram frequencies
bigrams = getNGrams(tokens, 2)
#prettyPrintFrequencyProfile(bigrams, myreverse=False)
relativizeTokenCounts(bigrams)


# "young Fisherman": likelihood ratio of the observed bigram probability
# to the probability expected if the two tokens were independent
lhr = bigrams["young Fisherman"] / (unigrams["young"] * unigrams["Fisherman"])

# "young Fisherman": MI score (bigram probability times the log of that ratio)
pmi = bigrams["young Fisherman"] * log(lhr)
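
The same value can also be obtained through the helper functions defined above; a minimal usage sketch (only the print calls are new, the functions and frequency profiles come from the example itself):

# both calls reproduce the pmi value computed above
print(getMIScore(bigrams["young Fisherman"], unigrams["young"], unigrams["Fisherman"]))
print(getMIScoreFromFQP(bigrams, unigrams, "young Fisherman"))
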
Example #2
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

import sys, codecs
sys.stdout = codecs.getwriter('utf-8')(sys.stdout.detach())

from corpus import loadTextFromFile, tokenize, getNGrams

# request bigrams explicitly; the loop below assumes exactly two tokens per n-gram
myngrams = getNGrams(tokenize(loadTextFromFile("bbc-1.txt")), 2)

try:
    # write a Graphviz DOT graph with one directed edge per bigram
    outfile = open("test2.dot", mode="w", encoding="utf-8")
    print("digraph g {", file=outfile)
    for bigram in myngrams:
        lefttoken, righttoken = bigram.split()
        print('"' + lefttoken + '" -> "' + righttoken + '";', file=outfile)
    print("}", file=outfile)
    outfile.close()
except IOError:
    print("Cannot open file...", file=sys.stderr)
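
The resulting file can be rendered with Graphviz, for example with dot -Tpng test2.dot -o test2.png. Tokens that contain double quotes or backslashes would break the quoted DOT labels above; a small escaping helper (hypothetical, not part of the corpus module) keeps the output valid:

def escapeDotToken(token):
    # escape backslashes and double quotes so the token is a valid quoted DOT label
    return token.replace("\\", "\\\\").replace('"', '\\"')

# usage inside the loop above:
#     print('"' + escapeDotToken(lefttoken) + '" -> "' + escapeDotToken(righttoken) + '";', file=outfile)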

Example #3
#!/usr/bin/env python3
# -*- coding: UTF-8 -*-

import sys, codecs
sys.stdout = codecs.getwriter('utf-8')(sys.stdout.detach())

from corpus import loadTextFromFile, tokenize, getNGrams

mytext = loadTextFromFile("cnn-1.txt")
mytokens = tokenize(mytext)
myngrams = getNGrams(mytokens, 2)

try:
    outputfile = open("sample-a1.xml", mode='w', encoding='utf-8')

    # one <bigram> element per bigram, holding both tokens and the frequency
    print("<bigrams>", file=outputfile)
    for bigram in myngrams:
        lefttoken, righttoken = bigram.split()
        print("<bigram>", file=outputfile)
        print("<lefttoken>" + lefttoken + "</lefttoken>", file=outputfile)
        print("<righttoken>" + righttoken + "</righttoken>", file=outputfile)
        print("<frequency>" + str(myngrams[bigram]) + "</frequency>", file=outputfile)
        print("</bigram>", file=outputfile)
    print("</bigrams>", file=outputfile)

    outputfile.close()
except IOError:
    print("Cannot open file...")

Example #4
# -*- coding: UTF-8 -*-

import sys, codecs

sys.stdout = codecs.getwriter('utf-8')(sys.stdout.detach())

# use this and then prefix all function names with corpus.
#import corpus
# or use this
from corpus import loadTextFromFile, tokenize, getTokenCounts, prettyPrintFrequencyProfile, relativizeTokenCounts

from math import log


mytext = loadTextFromFile("pg873.txt")

# tokenize mytext and return list of tokens
tokens = tokenize(mytext)

# count tokens
mydict = getTokenCounts(tokens)
relativizeTokenCounts(mydict)

# pretty-print tokens and frequencies
#prettyPrintFrequencyProfile(mydict, sortbyfrq=True, myreverse=True)

mytext = loadTextFromFile("sports-bbc.txt")
mysportsdict = getTokenCounts(tokenize(mytext))
relativizeTokenCounts(mysportsdict)
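
The example stops after building the two relative frequency profiles; how they are meant to be compared is not shown. A minimal sketch of one possible comparison, assuming the log import above was intended for this: rank the tokens that occur in both texts by the log ratio of their relative frequencies, so that tokens far more typical of the sports text come out on top.

# tokens present in both frequency profiles
shared = set(mydict) & set(mysportsdict)
# log ratio > 0: token is relatively more frequent in the sports text
ranked = sorted(shared, key=lambda t: log(mysportsdict[t] / mydict[t]), reverse=True)
for token in ranked[:10]:
    print(token, mydict[token], mysportsdict[token])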