from ngrampy.LineFile import *
import os

GOOGLE_ENGLISH_DIR = "/home/piantado/Desktop/mit/Corpora/GoogleNGrams/3/"
VOCAB_FILE = "Vocabulary/EnglishVocabulary.txt"

# Read the vocabulary file, one stripped entry per line. The context manager
# closes the handle as soon as the list is built instead of leaking it.
with open(VOCAB_FILE, "r") as vocab_file:
    vocabulary = [line.strip() for line in vocab_file]

# rawG = LineFile(["test3.txt"], header=["w1", "w2", "w3", "cnt123"])  # for debugging
rawG = LineFile(
    [os.path.join(GOOGLE_ENGLISH_DIR, x) for x in os.listdir(GOOGLE_ENGLISH_DIR)],
    header=["w1", "w2", "w3", "cnt123"],
)

rawG.clean()  # already done!
rawG.restrict_vocabulary("w1 w2 w3", vocabulary)  # restrict fields w1, w2 and w3 to our vocabulary
# Since we collapsed case, etc. This could also be
# rawG.sort(keys=["w1", "w2", "w3"]) in the other format.
rawG.sort(keys="w1 w2 w3")
rawG.resum_equal("w1 w2 w3", "cnt123")

# Where we store all lines
G = rawG.copy()

# Now go through and compute what we want
G1 = rawG.copy()  # start with a copy
G1.delete_columns("w2 w3")  # delete the columns we don't want
G1.sort("w1")  # sort this by the one we do want
G1.resum_equal("w1", "cnt123")  # resum equal
G1.rename_column("cnt123", "cnt1")  # rename the column since it's now a sum of 1
G.sort("w1")  # sort our target by w1
G.merge(G1, keys1="w1", tocopy="cnt1")  # merge in
G1.delete()  # and delete this temporary

G2 = rawG.copy()
def check_tolerance(x, y):
    """
    Check whether x and y are within a relative tolerance of each other.

    The absolute difference |x - y| is divided by the mean of the two values
    and compared against the global `tolerance`, which is therefore a fraction
    of the mean (not a percentage).

    Returns True when the relative difference is strictly below `tolerance`.
    When the mean is zero the ratio is undefined; in that case only two zero
    values count as matching, instead of raising ZeroDivisionError.
    """
    diff = abs(x - y)
    mean = (x + y) / 2.
    if mean == 0:
        # diff == 0 here means x == y == 0; "0 < tolerance" mirrors what the
        # formula below yields for any other pair of identical values.
        return diff == 0 and 0 < tolerance
    return diff / mean < tolerance


# This will copy the file, make a new one, and then print out possible lines
G = LineFile(
    files=["/ssd/trigram-stats"],
    path="/ssd/subsampled-stimuli",
    header="w1 w2 w3 c123 c1 c2 c3 c12 c23 unigram bigram trigram",
)

# Now throw out the porno words. The context manager closes the word list
# file as soon as it has been read.
with open(BAD_WORD_FILE, "r") as bad_word_file:
    porno_vocabulary = [line.strip() for line in bad_word_file]
G.restrict_vocabulary("w1 w2 w3", porno_vocabulary, invert=True)

# and then subsample
G.subsample_lines(N=SUBSAMPLE_N)

# and make sure we are sorted for the below
G.sort("unigram bigram trigram", dtype=float)

G.head()  # just a peek

item_number = 0
line_stack = []
for l in G.lines(tmp=False, parts=False):
    # extract the columns from line
    unigram, bigram, trigram = G.extract_columns(l, keys="unigram bigram trigram", dtype=float)
from ngrampy.LineFile import *
import os

GOOGLE_ENGLISH_DIR = "/home/piantado/Desktop/mit/Corpora/GoogleNGrams/3/"
VOCAB_FILE = "Vocabulary/EnglishVocabulary.txt"

# Read the vocabulary file, one stripped entry per line. The context manager
# closes the handle as soon as the list is built instead of leaking it.
with open(VOCAB_FILE, "r") as vocab_file:
    vocabulary = [line.strip() for line in vocab_file]

# rawG = LineFile(["test3.txt"], header=["w1", "w2", "w3", "cnt123"])  # for debugging
rawG = LineFile(
    [os.path.join(GOOGLE_ENGLISH_DIR, x) for x in os.listdir(GOOGLE_ENGLISH_DIR)],
    header=["w1", "w2", "w3", "cnt123"],
)

rawG.clean()  # already done!
rawG.restrict_vocabulary("w1 w2 w3", vocabulary)  # restrict fields w1, w2 and w3 to our vocabulary
# Since we collapsed case, etc. This could also be
# rawG.sort(keys=["w1", "w2", "w3"]) in the other format.
rawG.sort(keys="w1 w2 w3")
rawG.resum_equal("w1 w2 w3", "cnt123")

# Where we store all lines
G = rawG.copy()

# Now go through and compute what we want
G1 = rawG.copy()  # start with a copy
G1.delete_columns("w2 w3")  # delete the columns we don't want
G1.sort("w1")  # sort this by the one we do want
G1.resum_equal("w1", "cnt123")  # resum equal
G1.rename_column("cnt123", "cnt1")  # rename the column since it's now a sum of 1
SUBSAMPLE_N = 15000
tolerance = 0.001
BAD_WORD_FILE = "badwords.txt"


def check_tolerance(x, y):
    """
    Check whether x and y are within a relative tolerance of each other.

    The absolute difference |x - y| is divided by the mean of the two values
    and compared against the global `tolerance`, which is therefore a fraction
    of the mean (not a percentage).

    Returns True when the relative difference is strictly below `tolerance`.
    When the mean is zero the ratio is undefined; in that case only two zero
    values count as matching, instead of raising ZeroDivisionError.
    """
    diff = abs(x - y)
    mean = (x + y) / 2.
    if mean == 0:
        # diff == 0 here means x == y == 0; "0 < tolerance" mirrors what the
        # formula below yields for any other pair of identical values.
        return diff == 0 and 0 < tolerance
    return diff / mean < tolerance


# This will copy the file, make a new one, and then print out possible lines
G = LineFile(
    files=["/ssd/trigram-stats"],
    path="/ssd/subsampled-stimuli",
    header="w1 w2 w3 c123 c1 c2 c3 c12 c23 unigram bigram trigram",
)

# Now throw out the porno words. The context manager closes the word list
# file as soon as it has been read.
with open(BAD_WORD_FILE, "r") as bad_word_file:
    porno_vocabulary = [line.strip() for line in bad_word_file]
G.restrict_vocabulary("w1 w2 w3", porno_vocabulary, invert=True)

# and then subsample
G.subsample_lines(N=SUBSAMPLE_N)

# and make sure we are sorted for the below
G.sort("unigram bigram trigram", dtype=float)

G.head()  # just a peek

item_number = 0
line_stack = []
for l in G.lines(tmp=False, parts=False):
    # extract the columns from line
    unigram, bigram, trigram = G.extract_columns(l, keys="unigram bigram trigram", dtype=float)

    # now remove things which cannot possibly match anymore