import sys

# Assumed external context (not shown in this excerpt): log(), crude_tokenizer(),
# Windower, FrequencyList, Classer and consecutivegaps() helpers, networkx's
# DiGraph/write_gpickle, the corpus file handle f, the index dict, and the
# configuration globals (DOTOKENIZE, DOCLASSER, DOSKIPGRAMS, DOINDEX,
# DOCOMPOSITIONALITY, MINTOKENS, MINLENGTH, MINSKIPTYPES, MINSKIPGRAMTOKENS,
# MINSKIPTOKENS, corpusfile, outputprefix).

def prunengrams(n, freqlist, simpleskipgrams):
    global DOTOKENIZE, DOCLASSER, DOSKIPGRAMS, MINTOKENS
    log("Pruning " + str(n) + "-grams...", stream=sys.stderr)
    # iterate over a snapshot so entries can be deleted while looping
    for ngram, count in list(freqlist[n]):
        if count < MINTOKENS:
            del freqlist[n][ngram]
            if DOINDEX:
                del index[ngram]
            if DOSKIPGRAMS:
                skipgram = ((ngram[0],), (ngram[-1],))
                if skipgram in simpleskipgrams[n] and simpleskipgrams[n][skipgram][None] <= count:
                    # note: if skip-grams are not found on the same n-level,
                    # they are pruned because of this early-pruning
                    del simpleskipgrams[n][skipgram]
    log("Retained " + str(len(freqlist[n])) + " " + str(n) + "-grams after pruning", stream=sys.stderr)
def countngrams(classer, n, freqlist, simpleskipgrams, skips, index, linecount=0):
    global DOTOKENIZE, DOCLASSER, DOSKIPGRAMS, DOINDEX, MINLENGTH
    log("Counting " + str(n) + "-grams ...", stream=sys.stderr)
    f.seek(0)
    gaps = list(consecutivegaps(n))
    for i, line in enumerate(f):
        if i % 10000 == 0:
            if linecount == 0:
                log("\tLine " + str(i+1) + " - (" + str(n) + "-grams)", stream=sys.stderr)
            else:
                log("\tLine " + str(i+1) + " of " + str(linecount) + " - " + str(round(((i+1) / float(linecount)) * 100)) + "% (" + str(n) + "-grams)", stream=sys.stderr)
        if DOTOKENIZE:
            line = crude_tokenizer(line.strip())
        else:
            line = [x for x in line.strip().split(' ') if x]
        for ngram in Windower(line, n):
            if DOCLASSER:
                ngram = tuple(classer.encodeseq(ngram))
            if n - 1 in freqlist:
                # count only if both (n-1)-gram halves survived earlier pruning
                count = (ngram[1:] in freqlist[n-1] and ngram[:-1] in freqlist[n-1])
            else:
                count = True
            if count:
                freqlist[n].count(ngram)
                if DOINDEX:
                    try:
                        index[ngram].add(i)
                    except KeyError:
                        index[ngram] = set((i,))
                if DOSKIPGRAMS and n >= 2 and ngram[0] != '<begin>' and ngram[-1] != '<end>':
                    for beginindex, length in gaps:
                        preskip = ngram[:beginindex]
                        postskip = ngram[beginindex+length:]
                        if len(preskip) >= MINLENGTH and preskip not in freqlist[len(preskip)]:
                            continue  # this skip-gram isn't going to make it over the min threshold
                        if len(postskip) >= MINLENGTH and postskip not in freqlist[len(postskip)]:
                            continue  # this skip-gram isn't going to make it over the min threshold
                        skipgram = (preskip, postskip)
                        body = ngram[beginindex:beginindex+length]
                        if skipgram not in simpleskipgrams[n]:
                            # using None key for overall count to save computation time later
                            simpleskipgrams[n][skipgram] = {None: 1}
                        else:
                            simpleskipgrams[n][skipgram][None] += 1
                        if body in simpleskipgrams[n][skipgram]:
                            if DOINDEX:
                                simpleskipgrams[n][skipgram][body].add(i)
                            else:
                                simpleskipgrams[n][skipgram][body] += 1
                        else:
                            if DOINDEX:
                                simpleskipgrams[n][skipgram][body] = set((i,))
                            else:
                                simpleskipgrams[n][skipgram][body] = 1
    log("Found " + str(len(freqlist[n])) + " " + str(n) + "-grams and " + str(len(simpleskipgrams[n])) + " skip-grams", stream=sys.stderr)
    return i + 1
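# Illustration of the structure built above (hypothetical values; the exact
# output of consecutivegaps() is an assumption, but it must yield
# (beginindex, length) pairs for gaps strictly inside the n-gram, e.g. for n=4:
# [(1, 1), (1, 2), (2, 1)]). Counting the lines "the big fat cat" and
# "the old fat cat" as 4-grams with the (1, 2)-gap would then produce:
#
#   simpleskipgrams[4][(('the',), ('cat',))] == \
#       {None: 2, ('big', 'fat'): 1, ('old', 'fat'): 1}
#
# i.e. skipgram -> {None: total occurrence count, gap content -> count, or,
# with DOINDEX, a set of line numbers}.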
def countngrams(classer, n, freqlist, simpleskipgrams, skips, index, linecount=0):
    # NOTE: this second, simpler definition (one fixed gap between the first and
    # last token) shadows the gap-enumerating version above, since Python binds
    # the name to the last def executed.
    global DOTOKENIZE, DOCLASSER, DOSKIPGRAMS, DOINDEX
    log("Counting " + str(n) + "-grams ...", stream=sys.stderr)
    f.seek(0)
    for i, line in enumerate(f):
        if i % 10000 == 0:
            if linecount == 0:
                log("\tLine " + str(i+1) + " - (" + str(n) + "-grams)", stream=sys.stderr)
            else:
                log("\tLine " + str(i+1) + " of " + str(linecount) + " - " + str(round(((i+1) / float(linecount)) * 100)) + "% (" + str(n) + "-grams)", stream=sys.stderr)
        if DOTOKENIZE:
            line = crude_tokenizer(line.strip())
        else:
            line = [x for x in line.strip().split(' ') if x]
        for ngram in Windower(line, n):
            if DOCLASSER:
                ngram = tuple(classer.encodeseq(ngram))
            if n - 1 in freqlist:
                # count only if both (n-1)-gram halves survived earlier pruning
                count = (ngram[1:] in freqlist[n-1] and ngram[:-1] in freqlist[n-1])
            else:
                count = True
            if count:
                freqlist[n].count(ngram)
                if DOINDEX:
                    try:
                        index[ngram].add(i)
                    except KeyError:
                        index[ngram] = set((i,))
                if DOSKIPGRAMS and n >= 3 and ngram[0] != '<begin>' and ngram[-1] != '<end>':
                    skipgram = ((ngram[0],), (ngram[-1],))
                    body = tuple(ngram[1:-1])
                    if skipgram not in simpleskipgrams[n]:
                        # using None key for overall count to save computation time later
                        simpleskipgrams[n][skipgram] = {None: 1}
                    else:
                        simpleskipgrams[n][skipgram][None] += 1
                    if body in simpleskipgrams[n][skipgram]:
                        if DOINDEX:
                            simpleskipgrams[n][skipgram][body].add(i)
                        else:
                            simpleskipgrams[n][skipgram][body] += 1
                    else:
                        if DOINDEX:
                            simpleskipgrams[n][skipgram][body] = set((i,))
                        else:
                            simpleskipgrams[n][skipgram][body] = 1
                    #simpleskipgrams[n].count( skipgram )
                    #try:
                    #    skips[skipgram].append( ngram[1:-1] )
                    #except:
                    #    skips[skipgram] = [ ngram[1:-1] ]
    log("Found " + str(len(freqlist[n])) + " " + str(n) + "-grams", stream=sys.stderr)
    return i + 1
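# Example for the simpler variant above (hypothetical input): the 3-gram
# ('the', 'fat', 'cat') yields skipgram (('the',), ('cat',)) with body ('fat',),
# i.e. only the single gap spanning everything between the first and last token
# is considered, in contrast to the gap-enumerating version.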
def pruneskipgrams(n, simpleskipgrams, skips):
    global MINSKIPTYPES, MINSKIPGRAMTOKENS, MINSKIPTOKENS
    l = len(simpleskipgrams[n])
    log("Pruning skip-" + str(n) + "-grams... (" + str(l) + ")", stream=sys.stderr)
    # iterate over a snapshot so skip-grams can be deleted while looping
    for i, (skipgram, data) in enumerate(list(simpleskipgrams[n].items())):
        if i % 100000 == 0:
            log('\t\t@' + str(i), stream=sys.stderr)
        typecount = len(data) - 1  # minus the meta None/count entry
        prune = False
        if typecount < MINSKIPTYPES or data[None] < MINSKIPGRAMTOKENS:
            prune = True
        else:
            modified = False
            for skip, data2 in list(data.items()):
                if skip:
                    if DOINDEX:
                        count = len(data2)
                    else:
                        count = data2
                    if count < MINSKIPTOKENS:
                        # prune this skip-content only
                        modified = True
                        simpleskipgrams[n][skipgram][None] -= count
                        del simpleskipgrams[n][skipgram][skip]
            if modified:
                # recompute, things have changed
                typecount = len(simpleskipgrams[n][skipgram]) - 1  # minus the meta None/count entry
                if typecount < MINSKIPTYPES or simpleskipgrams[n][skipgram][None] < MINSKIPGRAMTOKENS:
                    prune = True
        if prune:
            del simpleskipgrams[n][skipgram]
    log("\t" + str(len(simpleskipgrams[n])) + " left after pruning", stream=sys.stderr)
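# Worked example of the two-stage pruning above (hypothetical thresholds): with
# MINSKIPTOKENS=2, MINSKIPTYPES=2 and MINSKIPGRAMTOKENS=4, the entry
# {None: 5, ('a',): 1, ('b',): 4} first loses the low-frequency body ('a',)
# (count 1 < 2), leaving {None: 4, ('b',): 4}; the recomputed typecount is then
# 1 < MINSKIPTYPES, so the whole skip-gram is pruned.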
def buildclasser():
    global DOTOKENIZE, ENCODING, outputprefix
    log("Counting unigrams (for classer) ...", stream=sys.stderr)
    freqlist = FrequencyList()
    f = open(corpusfile)
    for i, line in enumerate(f):
        if i % 10000 == 0:
            log("\tLine " + str(i+1) + " - (classer construction)", stream=sys.stderr)
        # mirror the tokenisation logic of countngrams(); crude_tokenizer()
        # already returns a token list, so splitting only applies otherwise
        if DOTOKENIZE:
            line = crude_tokenizer(line.strip())
        else:
            line = line.strip().split(' ')
        freqlist.append(['<begin>'] + line + ['<end>'])
    f.close()
    log("Building classer ...", stream=sys.stderr)
    classer = Classer(freqlist)
    classer.save(outputprefix + '.cls')
    log("\t" + str(len(classer)) + " classes found", stream=sys.stderr)
    return classer
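# The classer maps tokens to integer classes and back; a minimal usage sketch
# (hypothetical class numbers), based on the encodeseq/decodeseq calls this
# script itself makes:
#
#   classer = buildclasser()
#   encoded = tuple(classer.encodeseq(['the', 'cat']))   # e.g. (12, 873)
#   classer.decodeseq(encoded)                           # back to ['the', 'cat']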
def buildcompgraph(freqlist):
    compgraph = DiGraph()
    for n in freqlist:
        log("Computing compositionality graph (processing " + str(n) + "-grams)", stream=sys.stderr)
        l = len(freqlist[n])
        for i, (ngram, count) in enumerate(freqlist[n]):
            if i % 10000 == 0:
                log('\t' + str(round((i / float(l)) * 100, 2)) + '%', stream=sys.stderr)
            for n2 in range(MINLENGTH, n):
                for subngram in Windower(ngram, n2):
                    if subngram in freqlist[n2]:
                        # edges point from sub-ngram to the larger ngram containing it
                        compgraph.add_edge(subngram, ngram)
    log("Writing compositionality graph to file", stream=sys.stderr)
    write_gpickle(compgraph, outputprefix + '.compgraph')
    return compgraph
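# Sketch of the resulting graph (hypothetical data, assuming MINLENGTH == 1):
# if ('the', 'cat') and its parts ('the',) and ('cat',) all survived pruning,
# the graph contains the edges ('the',) -> ('the', 'cat') and
# ('cat',) -> ('the', 'cat'). Consequently compgraph.in_edges(ngram) lists the
# sub-patterns of ngram, while compgraph.out_edges(ngram) lists the larger
# patterns in which ngram occurs.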
def expandskipgrams(n, simpleskipgrams, skips):  # OLD: OBSOLETE
    log("Expanding skip-" + str(n) + "-grams...", stream=sys.stderr)
    cacheitems = list(simpleskipgrams[n].items())
    expansionsize = 0
    for p, (skipgram, data) in enumerate(cacheitems):
        if p % 1000 == 0:
            log('\t\t@' + str(p) + ' - ' + str(expansionsize) + ' new skip-grams thus-far', stream=sys.stderr)
        if len(data) ** 2 >= 1000000:
            log('\t\t\t@' + str(p) + ' -- ' + str(len(data) ** 2) + ' comparisons', stream=sys.stderr)
        processed = {}
        skipdata = list(data.items())
        for skip, skipindex in skipdata:
            if skip:
                for skip2, skipindex2 in skipdata:
                    if skip != skip2 and skip2 and (skip2, skip) not in processed:
                        processed[(skip, skip2)] = True
                        left = []
                        right = []
                        position = 0
                        consecutive = True
                        prev = None
                        gapbegin = 0
                        gapsize = 1
                        for i in range(0, len(skip)):
                            w = skip[i]
                            if w == skip2[i]:
                                if position == 0:
                                    left.append(w)
                                elif position == 1:
                                    right.append(w)
                            else:
                                if position == 0:
                                    gapbegin = i
                                    position = 1
                                elif position == 1 and prev:
                                    # multiple gaps
                                    consecutive = False
                                    break
                                else:
                                    gapsize += 1
                            prev = w
                        if not consecutive:
                            continue
                        # content of new gap
                        newskip = skip2[gapbegin:gapbegin+gapsize]
                        newskipgram = (skipgram[0] + tuple(left), tuple(right) + skipgram[-1])
                        if DOINDEX:
                            newskipindex = skipindex | skipindex2  # a union set
                        else:
                            newskipindex = skipindex + skipindex2  # a count (int)
                        try:
                            simpleskipgrams[n][newskipgram][None] += newskipindex
                        except KeyError:
                            simpleskipgrams[n][newskipgram] = {None: newskipindex}
                            expansionsize += 1
                        try:
                            if DOINDEX:
                                simpleskipgrams[n][newskipgram][newskip].update(newskipindex)
                            else:
                                simpleskipgrams[n][newskipgram][newskip] += newskipindex
                        except KeyError:
                            simpleskipgrams[n][newskipgram][newskip] = newskipindex
        if len(data) ** 2 >= 1000000:
            log('\t\t\t(next)', stream=sys.stderr)
    log("Found " + str(len(simpleskipgrams[n])) + " skip-" + str(n) + "-grams, of which " + str(expansionsize) + " from expansion step)", stream=sys.stderr)
#        pruneskipgrams(n, simpleskipgrams, skips)

if DOCOMPOSITIONALITY:
    compgraph = buildcompgraph(freqlist)

totalcount = 0
for n in freqlist:
    totalcount += sum(freqlist[n].values())

log("Writing n-grams to file", stream=sys.stderr)
f = open(outputprefix + '.phraselist', 'w')
f.write('#N\tN-GRAM\tOCCURRENCE-COUNT\tNORMALISED-IN-NGRAM-CLASS\tNORMALISED-OVER-ALL\tSUBCOUNT\tSUPERCOUNT\n')
for n in freqlist:
    for ngram, count in freqlist[n]:
        if DOCLASSER:
            ngram_s = " ".join(classer.decodeseq(ngram))
        else:
            ngram_s = " ".join(ngram)
        if DOCOMPOSITIONALITY:
            # note: edges run sub-ngram -> super-ngram, so out_edges(ngram)
            # leads to the larger patterns containing ngram and in_edges(ngram)
            # comes from its parts
            subcount = str(len(compgraph.out_edges(ngram)))
            supercount = str(len(compgraph.in_edges(ngram)))
        else:
            subcount = '-'
            supercount = '-'