# Compute average surprisal from Google-style n-gram count files.
#
# Stages: load the files, clean them, re-sum rows that became identical,
# add the marginal count column for w1, sort by w2, and print the
# average surprisal.  The temporary database file is removed at the end,
# even when one of the stages fails.

ASSERT_SORTED = True  # if you want an extra check on sorting

parser = argparse.ArgumentParser(
    description='Compute average surprisal from google style data')
parser.add_argument(
    '--in', dest='in', type=str,
    default="/home/piantado/Desktop/mit/Corpora/GoogleNGrams/2/*",
    nargs="?",
    help='The directory with google files (e.g. Google/3gms/)')
parser.add_argument(
    '--path', dest='path', type=str,
    default="/tmp/GoogleSurprisal",
    nargs="?",
    help='Where the database file lives')

# "in" is a Python keyword, so `args.in` would be a syntax error;
# read the parsed options through a dict instead.
args = vars(parser.parse_args())

# NOTE: print("...") with a single string argument prints the same text
# under the Python 2 print statement and the Python 3 print function.
print("# Loading files")
G = LineFile(glob.glob(args['in']),
             header=["w1", "w2", "cnt12"],
             path=args['path'])

try:
    print("# Cleaning")
    G.clean(columns=3)

    # Since we collapsed case, go through and re-sum the counts of rows
    # that now share the same (w1, w2).
    print("# Resumming for case collapsing")
    G.sort(keys="w1 w2")
    G.resum_equal("w1 w2", "cnt12", assert_sorted=ASSERT_SORTED)

    # Add the marginal count column.
    # NOTE(review): presumably cnt1 is cnt12 summed over each w1 --
    # confirm against LineFile.make_marginal_column.
    print("# Making marginal counts")
    G.make_marginal_column("cnt1", "w1", "cnt12")

    # and compute surprisal
    print("# Sorting by word")
    G.sort("w2")

    print("# Computing surprisal")
    G.print_average_surprisal("w2", "cnt12", "cnt1",
                              assert_sorted=ASSERT_SORTED)
finally:
    # And remove my temporary file, whether or not the pipeline succeeded.
    print("# Removing my temporary file")
    G.delete_tmp()
# Read the vocabulary file (one entry per line); the `with` block makes
# sure the file handle is closed once the list has been built.
with open(VOCAB_FILE, "r") as vocab_file:
    vocabulary = [line.strip() for line in vocab_file]

#rawG = LineFile(["test3.txt"], header=["w1", "w2", "w3", "cnt123"]) # for debugging
# os.path.join gives the same paths as plain concatenation when the
# directory ends in a separator, and still works when it does not.
rawG = LineFile(
    [os.path.join(GOOGLE_ENGLISH_DIR, x) for x in os.listdir(GOOGLE_ENGLISH_DIR)],
    header=["w1", "w2", "w3", "cnt123"])

rawG.clean()  # already done!
rawG.restrict_vocabulary("w1 w2 w3", vocabulary)  # in fields w1, w2 and w3, restrict our vocabulary
# Since we collapsed case, etc.  This could also be
# rawG.sort(keys=["w1","w2","w3"]) in the other format.
rawG.sort(keys="w1 w2 w3")
rawG.resum_equal("w1 w2 w3", "cnt123")

# Where we store all lines
G = rawG.copy()

# Now go through and compute what we want
G1 = rawG.copy()  # start with a copy
try:
    G1.delete_columns("w2 w3")  # delete the columns we don't want
    G1.sort("w1")  # sort this by the one we do want
    G1.resum_equal("w1", "cnt123")  # resum equal
    G1.rename_column("cnt123", "cnt1")  # rename the column since it is now the summed count for w1
    G.sort("w1")  # sort our target by w1
    G.merge(G1, keys1="w1", tocopy="cnt1")  # merge in
finally:
    G1.delete()  # and delete this temporary, even if a step above failed
import os

from ngrampy.LineFile import *

GOOGLE_ENGLISH_DIR = "/home/piantado/Desktop/mit/Corpora/GoogleNGrams/3/"
VOCAB_FILE = "Vocabulary/EnglishVocabulary.txt"

# Read the vocabulary file (one entry per line); the `with` block makes
# sure the file handle is closed once the list has been built.
with open(VOCAB_FILE, "r") as vocab_file:
    vocabulary = [line.strip() for line in vocab_file]

#rawG = LineFile(["test3.txt"], header=["w1", "w2", "w3", "cnt123"]) # for debugging
# os.path.join gives the same paths as plain concatenation here (the
# directory ends in "/") and still works if that slash is ever dropped.
rawG = LineFile(
    [os.path.join(GOOGLE_ENGLISH_DIR, x) for x in os.listdir(GOOGLE_ENGLISH_DIR)],
    header=["w1", "w2", "w3", "cnt123"])

rawG.clean()  # already done!
rawG.restrict_vocabulary("w1 w2 w3", vocabulary)  # in fields w1, w2 and w3, restrict our vocabulary
# Since we collapsed case, etc.  This could also be
# rawG.sort(keys=["w1","w2","w3"]) in the other format.
rawG.sort(keys="w1 w2 w3")
rawG.resum_equal("w1 w2 w3", "cnt123")

# Where we store all lines
G = rawG.copy()

# Now go through and compute what we want
G1 = rawG.copy()  # start with a copy
try:
    G1.delete_columns("w2 w3")  # delete the columns we don't want
    G1.sort("w1")  # sort this by the one we do want
    G1.resum_equal("w1", "cnt123")  # resum equal
    G1.rename_column("cnt123", "cnt1")  # rename the column since it is now the summed count for w1
    G.sort("w1")  # sort our target by w1
    G.merge(G1, keys1="w1", tocopy="cnt1")  # merge in
finally:
    G1.delete()  # and delete this temporary, even if a step above failed

G2 = rawG.copy()
# Compute average surprisal from Google-style n-gram count files
# (variant that builds the context count through a temporary copy and a
# merge).

ASSERT_SORTED = True  # if you want an extra check on sorting

parser = argparse.ArgumentParser(
    description='Compute average surprisal from google style data')
# NOTE(review): the default below points at the "3" directory, while the
# header used further down has only three columns (w1, w2, cnt12) --
# confirm which n-gram order this script is meant for.
parser.add_argument(
    '--in', dest='in', type=str,
    default="/home/piantado/Desktop/mit/Corpora/GoogleNGrams/3/*",
    nargs="?",
    help='The directory with google files (e.g. Google/3gms/)')
parser.add_argument(
    '--path', dest='path', type=str,
    default="/tmp/GoogleSurprisal",
    nargs="?",
    help='Where the database file lives')

# "in" is a Python keyword, so `args.in` would be a syntax error;
# read the parsed options through a dict instead.
args = vars(parser.parse_args())

# NOTE: print("...") with a single string argument prints the same text
# under the Python 2 print statement and the Python 3 print function.
print("# Loading files")
G = LineFile(glob.glob(args['in']),
             header=["w1", "w2", "cnt12"],
             path=args['path'])

print("# Cleaning")
G.clean(columns=3)

# Since we collapsed case, go through and re-sum the counts of rows that
# now share the same (w1, w2).
print("# Resumming for case collapsing")
G.sort(keys="w1 w2")
G.resum_equal("w1 w2", "cnt12", assert_sorted=ASSERT_SORTED)

# Now go through and build the per-context (w1) counts in a temporary copy.
Gcontext = G.copy()
try:
    # No sort is issued here: the copy is taken right after G was sorted
    # by "w1 w2", so it is expected to be ordered by w1 already
    # (assert_sorted double-checks that).
    print("# Computing context sum")
    Gcontext.resum_equal("w1", "cnt12", assert_sorted=ASSERT_SORTED)  # resum equal
    Gcontext.rename_column("cnt12", "cnt1")  # rename the column since it is now the summed count for w1

    print("# Sorting by context")
    Gcontext.sort("w1")  # sort our target by w1

    print("# Merging")
    G.merge(Gcontext, keys1="w1", tocopy="cnt1", assert_sorted=ASSERT_SORTED)  # merge in
finally:
    # and delete this temporary, even if a step above failed
    print("# Deleting Gcontext")
    Gcontext.delete()