Ejemplo n.º 1
0
	pxy = float(bigrams[bigram])/float(bigramcount)
	px  = float(tokens[token])/float(tokencount)
	py  = float(types[type])/float(tokencount)
	return pxy * math.log(pxy/(px * py), 2)


if __name__ == "__main__":
	# Script entry point: every command-line argument is treated as a glob
	# pattern; each matching file is read line by line and the module-level
	# counters (bigrams, tokens, types, left/right bigrams) are updated
	# through the getT* helpers. A tab-separated table for the first
	# PRINTWORDS entries of sortNgrams(tokens) is printed afterwards.
	for pattern in sys.argv[1:]:
		for path in glob.glob(os.path.normcase(pattern)):
			# Bind the handle before the try block so the cleanup code can
			# tell whether open() succeeded. The previous handler called
			# close() on a name that was unbound (or stale) when open()
			# itself raised IOError.
			infile = None
			try:
				infile = open(path, "r")
				# Stream the file instead of loading it with readlines().
				for line in infile:
					line = line.strip().lower()
					if line == "":
						continue
					wordlist = getTWordList(line)
					bigrams, bigramcount = getTBigrams(wordlist, bigrams, bigramcount, TOKEN, TYPE)
					tokens, tokencount = getTTokens(wordlist, tokens, tokencount, TOKEN)
					types, typecount = getTTokens(wordlist, types, typecount, TYPE)
					bigramsleft, bigramsright = getTLRBigrams(wordlist, bigramsleft, bigramsright, TOKEN, TYPE)
			except IOError:
				# Best effort, as before: an unreadable file is skipped and
				# processing continues with the next match.
				pass
			finally:
				if infile is not None:
					infile.close()

	myTokens = sortNgrams(tokens)

	# Single parenthesised argument: prints the same under Python 2 and 3.
	print("Left MI\tToken\tRight MI\tFrequency\tRelative Frequency")
	for rank in range(min(len(myTokens), PRINTWORDS)):
		entry = myTokens[rank]
		# PMI returns a pair; the first value is printed in the "Right MI"
		# column and the second in the "Left MI" column.
		rmi, lmi = PMI(entry[0])
		relfreq = float(entry[1])/float(tokencount)
		print(str(lmi) + "\t" + entry[0] + "\t" + str(rmi) + "\t" + str(entry[1]) + "\t" + str(relfreq))
Ejemplo n.º 2
0
	global bigrams, tokens, bigramcount, tokencount
	pxy = float(bigrams[bigram])/float(bigramcount)
	px = float(tokens[token2])/float(tokencount)
	py  = float(tokens[token1])/float(tokencount)
	return py * math.log(py/(pxy/px), 2)


if __name__ == "__main__":
	# Script entry point: every command-line argument is treated as a glob
	# pattern; each matching file is read line by line and the module-level
	# counters (bigrams, tokens, left/right bigrams) are updated through the
	# getT* helpers. A tab-separated table for the first PRINTWORDS entries
	# of sortNgrams(tokens) is printed afterwards.
	for pattern in sys.argv[1:]:
		for path in glob.glob(os.path.normcase(pattern)):
			# Bind the handle before the try block so the cleanup code can
			# tell whether open() succeeded. The previous handler called
			# close() on a name that was unbound (or stale) when open()
			# itself raised IOError.
			infile = None
			try:
				infile = open(path, "r")
				# Stream the file instead of loading it with readlines().
				for line in infile:
					line = line.strip().lower()
					if line == "":
						continue
					wordlist = getTWordList(line)
					bigrams, bigramcount = getTBigrams(wordlist, bigrams, bigramcount, TOKEN, TOKEN)
					tokens, tokencount = getTTokens(wordlist, tokens, tokencount, TOKEN)
					bigramsleft, bigramsright = getTLRBigrams(wordlist, bigramsleft, bigramsright, TOKEN, TOKEN)
			except IOError:
				# Best effort, as before: an unreadable file is skipped and
				# processing continues with the next match.
				pass
			finally:
				if infile is not None:
					infile.close()

	myTokens = sortNgrams(tokens)

	# Single parenthesised argument: prints the same under Python 2 and 3.
	print("Left RE\tToken\tRight RE\tFrequency\tRelative Frequency")
	for rank in range(min(len(myTokens), PRINTWORDS)):
		entry = myTokens[rank]
		# PRE returns a pair; the first value is printed in the "Right RE"
		# column and the second in the "Left RE" column.
		rre, lre = PRE(entry[0])
		relfreq = float(entry[1])/float(tokencount)
		print(str(lre) + "\t" + entry[0] + "\t" + str(rre) + "\t" + str(entry[1]) + "\t" + str(relfreq))
Ejemplo n.º 3
0
	global bigrams, tokens, bigramcount, tokencount
	pxy = float(bigrams[bigram])/float(bigramcount)
	px  = float(tokens[token1])/float(tokencount)
	py  = float(tokens[token2])/float(tokencount)
	return pxy * math.log(pxy/(px * py), 2)


if __name__ == "__main__":
	# Script entry point: every command-line argument is treated as a glob
	# pattern; each matching file is read line by line and the module-level
	# counters (bigrams, tokens, left/right bigrams) are updated through the
	# getT* helpers, all called with the TYPE selector. A tab-separated
	# table for the first PRINTWORDS entries of sortNgrams(tokens) is
	# printed afterwards.
	for pattern in sys.argv[1:]:
		for path in glob.glob(os.path.normcase(pattern)):
			# Bind the handle before the try block so the cleanup code can
			# tell whether open() succeeded. The previous handler called
			# close() on a name that was unbound (or stale) when open()
			# itself raised IOError.
			infile = None
			try:
				infile = open(path, "r")
				# Stream the file instead of loading it with readlines().
				for line in infile:
					line = line.strip().lower()
					if line == "":
						continue
					wordlist = getTWordList(line)
					bigrams, bigramcount = getTBigrams(wordlist, bigrams, bigramcount, TYPE, TYPE)
					tokens, tokencount = getTTokens(wordlist, tokens, tokencount, TYPE)
					bigramsleft, bigramsright = getTLRBigrams(wordlist, bigramsleft, bigramsright, TYPE, TYPE)
			except IOError:
				# Best effort, as before: an unreadable file is skipped and
				# processing continues with the next match.
				pass
			finally:
				if infile is not None:
					infile.close()

	myTokens = sortNgrams(tokens)

	# Single parenthesised argument: prints the same under Python 2 and 3.
	print("Left MI\tToken\tRight MI\tFrequency\tRelative Frequency")
	for rank in range(min(len(myTokens), PRINTWORDS)):
		entry = myTokens[rank]
		# PMI returns a pair; the first value is printed in the "Right MI"
		# column and the second in the "Left MI" column.
		rmi, lmi = PMI(entry[0])
		relfreq = float(entry[1])/float(tokencount)
		print(str(lmi) + "\t" + entry[0] + "\t" + str(rmi) + "\t" + str(entry[1]) + "\t" + str(relfreq))