from ngrampy.LineFile import *
import os
GOOGLE_ENGLISH_DIR = "/home/piantado/Desktop/mit/Corpora/GoogleNGrams/3/"
VOCAB_FILE = "Vocabulary/EnglishVocabulary.txt"

# Read the vocabulary file
vocabulary = [ l.strip() for l in open(VOCAB_FILE, "r") ]

#rawG = LineFile(["test3.txt"], header=["w1", "w2", "w3", "cnt123"]) # for debugging
rawG = LineFile([GOOGLE_ENGLISH_DIR+x for x in os.listdir(GOOGLE_ENGLISH_DIR)], header=["w1", "w2", "w3", "cnt123"]) 

rawG.clean() # collapse case, etc. (may be skippable if the files were already cleaned)
rawG.restrict_vocabulary("w1 w2 w3", vocabulary) # restrict w1, w2, and w3 to our vocabulary
rawG.sort(keys="w1 w2 w3") # needed since we collapsed case, etc. This could also be written rawG.sort(keys=["w1","w2","w3"]).
rawG.resum_equal("w1 w2 w3", "cnt123") # rows made identical by cleaning get their counts summed
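
# (illustration) sort + resum_equal collapse rows whose key columns became identical
# after cleaning, summing their counts. A minimal in-memory sketch of that idea --
# the library itself presumably streams sorted files rather than holding them in memory:
from itertools import groupby
from operator import itemgetter

_rows = [("the", "cat", "sat", 5), ("the", "cat", "sat", 2), ("the", "dog", "ran", 1)]
_rows.sort(key=itemgetter(0, 1, 2))
_resummed = [key + (sum(r[3] for r in grp),) for key, grp in groupby(_rows, key=itemgetter(0, 1, 2))]
assert _resummed == [("the", "cat", "sat", 7), ("the", "dog", "ran", 1)]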

# G holds all lines; the marginal counts computed below get merged into it
G = rawG.copy()

# Now go through and compute what we want
G1 = rawG.copy() # start with a copy
G1.delete_columns( "w2 w3" ) # delete the columns we don't want
G1.sort("w1" ) # sort this by the one we do want 
G1.resum_equal( "w1", "cnt123" ) # resum equal
G1.rename_column("cnt123", "cnt1") # rename the column, since it is now the marginal count of w1
G.sort("w1") # sort our target by w1
G.merge(G1, keys1="w1", tocopy="cnt1") # merge the cnt1 column in
G1.delete() # and delete this temporary
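
# (illustration) the copy/delete/sort/resum/merge sequence above is a disk-based way
# of computing a marginal count and joining it back on. A hypothetical in-memory
# equivalent of what G now contains:
from collections import defaultdict

_trigrams = {("the", "cat", "sat"): 7, ("the", "dog", "ran"): 1}
_cnt1 = defaultdict(int)
for (_w1, _w2, _w3), _c in _trigrams.items():
    _cnt1[_w1] += _c # marginalize the trigram count over w2 and w3
_with_marginals = dict((k, (c, _cnt1[k[0]])) for k, c in _trigrams.items())
assert _with_marginals[("the", "cat", "sat")] == (7, 8) # cnt123=7, cnt1=8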

G2 = rawG.copy() # presumably followed by the same delete/sort/resum/merge steps for w2

# Example 2
import argparse
import glob

from ngrampy.LineFile import *

ASSERT_SORTED = True # if you want an extra check on sorting

parser = argparse.ArgumentParser(description='Compute average surprisal from google style data')
parser.add_argument('--in', dest='in', type=str, default="/home/piantado/Desktop/mit/Corpora/GoogleNGrams/2/*", nargs="?", help='The directory with google files (e.g. Google/3gms/)')
parser.add_argument('--path', dest='path', type=str, default="/tmp/GoogleSurprisal", nargs="?", help='Where the database file lives')
args = vars(parser.parse_args())

print "# Loading files"
G = LineFile( glob.glob(args['in']), header=["w1", "w2", "cnt12"], path=args['path']) 
print "# Cleaning"
G.clean(columns=3)

# Since we collapsed case, go through and re-sum the bigram counts
print "# Resumming for case collapsing"
G.sort(keys="w1 w2") 
G.resum_equal("w1 w2", "cnt12", assert_sorted=ASSERT_SORTED ) # in collapsing case, etc., we need to re-sum

# Now go through and make the marginal (w1) counts
print "# Making marginal counts"
G.make_marginal_column("cnt1", "w1", "cnt12") 

# and compute surprisal
print "# Sorting by word"
G.sort("w2")

print "# Computing surprisal"
G.print_average_surprisal("w2", "cnt12", "cnt1", assert_sorted=ASSERT_SORTED)
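
# (illustration) for each w2, the average surprisal is presumably the count-weighted
# mean of -log2( cnt12 / cnt1 ) over its contexts w1; sketched for one word:
from math import log

_contexts = [(100, 1000), (50, 200)] # hypothetical (cnt12, cnt1) pairs for one w2
_total = sum(c12 for c12, _ in _contexts)
_avg = sum(c12 * -log(float(c12) / c1, 2) for c12, c1 in _contexts) / _total
# _avg: average surprisal of this w2, in bits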

# And remove my temporary file:
print "# Removing my temporary file"
G.delete()

# Example 4
import argparse
import glob

from ngrampy.LineFile import *

ASSERT_SORTED = True # if you want an extra check on sorting

parser = argparse.ArgumentParser(description='Compute average surprisal from google style data')
parser.add_argument('--in', dest='in', type=str, default="/home/piantado/Desktop/mit/Corpora/GoogleNGrams/3/*", nargs="?", help='The directory with google files (e.g. Google/3gms/)')
parser.add_argument('--path', dest='path', type=str, default="/tmp/GoogleSurprisal", nargs="?", help='Where the database file lives')
args = vars(parser.parse_args())

print "# Loading files"
G = LineFile( glob.glob(args['in']), header=["w1", "w2", "cnt12"], path=args['path']) 
print "# Cleaning"
G.clean(columns=3)

# Since we collapsed case, go through and re-sum the bigram counts
print "# Resumming for case collapsing"
G.sort(keys="w1 w2") 
G.resum_equal("w1 w2", "cnt12", assert_sorted=ASSERT_SORTED ) # in collapsing case, etc., we need to re-sum

# Now go through and compute the context (w1) sums by hand
Gcontext = G.copy()
#print "# Sorting by context"
#Gcontext.sort("w1 w2") # sort this by the one we do want 
print "# Computing context sum"
Gcontext.resum_equal( "w1", "cnt12", assert_sorted=ASSERT_SORTED ) # sum cnt12 within each w1 (already sorted by w1 w2 above)
Gcontext.rename_column("cnt12", "cnt1") # rename the column, since it is now the marginal count of w1
print "# Sorting by context"
Gcontext.sort("w1") # sort our target by w
print "# Merging"
G.merge(Gcontext, keys1="w1", tocopy="cnt1", assert_sorted=ASSERT_SORTED) # merge in
print "# Deleting Gcontext"
Gcontext.delete() # and delete this temporary
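
# Note that this copy/resum/merge sequence builds the same cnt1 column that
# G.make_marginal_column("cnt1", "w1", "cnt12") produced in one step in the
# script above.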

# Example 6
	"""
	return abs(x-y) / ((x+y)/2.) < tolerance

# This will copy the file, make a new one, and then print out possible lines
G = LineFile(files=["/ssd/trigram-stats"], path="/ssd/subsampled-stimuli", header="w1 w2 w3 c123 c1 c2 c3 c12 c23 unigram bigram trigram")

# Now throw out the porno words
#porno_vocabulary = [ l.strip() for l in open(BAD_WORD_FILE, "r") ]
#G.restrict_vocabulary("w1 w2 w3", porno_vocabulary, invert=True)

# draw a subsample
#if SUBSAMPLE_N is not None:
	#G.subsample_lines(N=SUBSAMPLE_N)

# we need to re-sort so that lines with equal w1 and w3 are adjacent, with the n-gram stats ordered for matching
G.sort("w1 w3 unigram bigram trigram", lines=1000000)
G.head()

item_number = 0
line_stack = []
for l in G.lines(tmp=False, parts=False):
	# extract the columns from line
	w1, w3, unigram, bigram, trigram = G.extract_columns(l, keys="w1 w3 unigram bigram trigram", dtype=[str, str, float, float, float])

	# now remove things which cannot possibly match anymore
	while len(line_stack) > 0:
		w1_, w3_, unigram_, bigram_, trigram_ = G.extract_columns(line_stack[0], keys="w1 w3 unigram bigram trigram", dtype=[str, str, float, float, float])

		if not (w1_ == w1 and w3_ == w3 and check_tolerance(unigram, unigram_)):
			del line_stack[0]
		else:
			break # the front of the stack still matches this line, so stop pruning
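	# (note) because G is sorted by "w1 w3 unigram ...", an entry that fails the
	# match here can never match any later line, so it is safe to drop it for good.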
			

# Example 7
from ngrampy.LineFile import *

BAD_WORD_FILE = "Vocabulary/BadWords.txt" # assumed path, by analogy with VOCAB_FILE above
SUBSAMPLE_N = 10000 # assumed value for the subsample size

# NOTE: the exact signature and the 10% default tolerance are assumptions
def check_tolerance(x, y, tolerance=0.1):
	"""
	A handy function to check if some variables are within tolerance percent of each other
	"""
	return abs(x-y) / ((x+y)/2.) < tolerance
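
# (illustration) check_tolerance measures the absolute difference relative to the
# mean of the two values; with the assumed 10% default:
assert check_tolerance(100., 105.)     # |100-105| / 102.5 ~= 0.049, within tolerance
assert not check_tolerance(100., 150.) # |100-150| / 125.0 == 0.4, not within tolerance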

# This will copy the file, make a new one, and then print out possible lines
G = LineFile(files=["/ssd/trigram-stats"], path="/ssd/subsampled-stimuli", header="w1 w2 w3 c123 c1 c2 c3 c12 c23 unigram bigram trigram")

# Now throw out the porno words
porno_vocabulary = [ l.strip() for l in open(BAD_WORD_FILE, "r") ]
G.restrict_vocabulary("w1 w2 w3", porno_vocabulary, invert=True)

# and then subsample
G.subsample_lines(N=SUBSAMPLE_N)

# and make sure we are sorted for the below
G.sort("unigram bigram trigram", dtype=float)
G.head() # just a peek

item_number = 0
line_stack = []
for l in G.lines(tmp=False, parts=False):
	# extract the columns from line
	unigram, bigram, trigram = G.extract_columns(l, keys="unigram bigram trigram", dtype=float)
	
	# now remove things which cannot possibly match anymore
	while len(line_stack) > 0 and not check_tolerance(unigram, G.extract_columns(line_stack[0], keys="unigram", dtype=float)[0]):
		del line_stack[0]
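	# (note) the stack holds earlier lines still within unigram tolerance of the
	# current line; since lines come sorted by unigram, once the front entry drifts
	# out of tolerance it can never match again and is discarded.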
	
	# now go through the line_stack and try out each candidate
	# it must already be within tolerance on unigram, or it would have been removed
	for x in line_stack: