def includepath(self, feat):
    """Decide whether the path type of ``feat`` passes the SimEngine filters.

    ``SimEngine.paths_to_include`` takes precedence: when it is non-empty,
    only path types contained in it are accepted.  Otherwise, when
    ``SimEngine.blacklist`` is non-empty, every path type not contained in
    it is accepted.  With both empty, everything is accepted.

    :param feat: feature whose path type (via ``getpathtype``) is checked
    :return: True if the feature should be included, False otherwise
    """
    whitelist = SimEngine.paths_to_include
    if len(whitelist) > 0:
        return getpathtype(feat) in whitelist

    # The blacklist is only consulted when no whitelist is configured.
    excluded = SimEngine.blacklist
    if len(excluded) > 0:
        return getpathtype(feat) not in excluded

    return True
def computepathtotals(self):
    """Rebuild ``self.pathtotals``: the summed feature values per path type.

    Every value in ``self.features`` is converted with ``float`` and added
    to the running total stored under ``getpathtype(feature)``.  Any
    previous content of ``self.pathtotals`` is discarded.
    """
    # Same dict object is published on self up front and filled in place.
    totals = self.pathtotals = {}
    for feature, value in self.features.items():
        pathtype = getpathtype(feature)
        totals[pathtype] = totals.get(pathtype, 0.0) + float(value)
def showsuffix(self, path, minorder=1, maxorder=1):
    """Print the features whose path type ends with ``path``.

    Only features whose order (via ``getorder``) lies in the inclusive
    range ``[minorder, maxorder]`` are considered.  The selection is printed
    as a list of ``(feature, value)`` pairs sorted by value, largest first.

    :param path: suffix that the feature's path type must end with
    :param minorder: smallest order to show (inclusive)
    :param maxorder: largest order to show (inclusive)
    """
    selected = {}
    for feat, value in self.features.items():
        pathtype = getpathtype(feat)
        order = getorder(feat)
        if not pathtype.endswith(path):
            continue
        if minorder <= order <= maxorder:
            selected[feat] = value

    ranked = sorted(selected.items(), key=itemgetter(1), reverse=True)
    # Single parenthesised argument: same output as the print statement.
    print(ranked)
def compute_typetotals(self, type, cds=False):
    """Rebuild ``self.typetots``: totals per path type over all entries.

    The totals are accumulated from the column totals in ``self.feattots``.
    If ``self.coltots_loaded[type]`` is falsy, ``self.load_coltotals(type,
    cds)`` is called first (presumably this populates ``self.feattots`` --
    confirm against ``load_coltotals``).

    :param type: key into ``self.coltots_loaded``; forwarded to
        ``load_coltotals``
    :param cds: forwarded unchanged to ``load_coltotals``
    """
    already_loaded = self.coltots_loaded[type]
    if not already_loaded:
        self.load_coltotals(type, cds)

    print("Computing path totals C<*,t,*>")

    # Same dict object is published on self up front and filled in place.
    totals = self.typetots = {}
    for feature, value in self.feattots.items():
        pathtype = getpathtype(feature)
        totals[pathtype] = totals.get(pathtype, 0.0) + float(value)
def reweight(self, weighting, feattots, typetots, grandtot=0, ppmithreshold=0, saliency=0):
    """Recompute ``self.featureweights`` from the raw counts in ``self.features``.

    Notation used in the comments below: ``C<w1,p,w2>`` is the count of this
    entry (w1) with a feature (path p, word w2); ``*`` marks a summed-out
    position.

    :param weighting: string or collection tested with ``in`` for the flags
        ``"ttest"``, ``"gof_ppmi"``, ``"pnppmi"`` and ``"plmi"``
    :param feattots: mapping feature -> C<*,p,w2>; must contain every
        feature of this entry (a missing key raises ``KeyError``)
    :param typetots: mapping path type -> C<*,p,*>; must contain every path
        type of this entry (a missing key raises ``KeyError``)
    :param grandtot: grand total, only used by the ``"gof_ppmi"`` variant
    :param ppmithreshold: shift subtracted from the PMI (only strictly
        positive shifted values are kept); for ``"ttest"`` the score must
        exceed this value to be kept
    :param saliency: forwarded to ``self.reducesaliency`` once all weights
        have been computed

    Side effects: replaces ``self.featureweights`` and resets ``self.lgth``
    and ``self.wdth`` to -1 (presumably invalidating cached values --
    confirm against their users).
    """
    self.featureweights = {}
    self.lgth = -1
    self.wdth = -1

    for feature in self.features.keys():
        freq = float(self.features[feature])  # C<w1,p,w2>
        pathtype = getpathtype(feature)  # computed once, reused below

        try:
            total = float(self.pathtotals[pathtype])  # C<w1,p,*>
        except (KeyError, AttributeError, TypeError, ValueError):
            # Best effort: a missing or unusable path total is replaced by a
            # tiny positive value so the divisions below stay defined.
            total = 0.0001
            print("Warning: no path total for %s: %s" % (feature, pathtype))

        feattot = float(feattots[feature])  # C<*,p,w2>
        typetot = float(typetots[pathtype])  # C<*,p,*>
        entrytotal = float(self.total)  # C<w1,*,*>

        if "ttest" in weighting:
            # NOTE(review): flagged as incorrect by the original author --
            # "this should be the type total for the entry not the total".
            # Formula deliberately left unchanged.
            expected = (total * feattot) / (typetot * typetot)
            obs = freq / typetot
            score = (obs - expected) / math.pow(expected, 0.5)
            if score > ppmithreshold:
                self.featureweights[feature] = score
            continue

        try:
            if "gof_ppmi" in weighting:
                pmi = math.log10((freq * grandtot) / (feattot * entrytotal))
            else:
                pmi = math.log10((freq * typetot) / (feattot * total))
        except (ArithmeticError, ValueError):
            # Zero denominator or log of a non-positive ratio: treat the
            # association as absent instead of failing the whole entry.
            pmi = 0

        shifted_pmi = pmi - ppmithreshold
        if shifted_pmi > 0:
            if "pnppmi" in weighting:
                shifted_pmi = shifted_pmi * total / entrytotal
            if "plmi" in weighting:
                shifted_pmi = shifted_pmi * freq / typetot
            self.featureweights[feature] = shifted_pmi

    self.reducesaliency(saliency)
def reducesaliency(self, saliency, saliencyperpath=False):
    """Keep only the highest-weighted entries of ``self.featureweights``.

    Features are visited in order of decreasing weight.  With
    ``saliencyperpath`` false, the first ``saliency`` features overall are
    kept; with it true, up to ``saliency`` features are kept for each path
    type (via ``getpathtype``).  ``saliency == 0`` disables the reduction
    and leaves ``self.featureweights`` untouched.

    :param saliency: number of features to keep (0 means keep everything)
    :param saliencyperpath: apply the limit per path type instead of overall
    """
    if saliency == 0:
        return

    ranked = sorted(self.featureweights.items(), key=itemgetter(1), reverse=True)
    # The attribute is reset first and then refilled in place.
    kept = self.featureweights = {}
    kept_per_path = {}
    kept_overall = 0

    for feature, weight in ranked:
        pathtype = getpathtype(feature)
        kept_here = kept_per_path.get(pathtype, 0)

        if saliencyperpath:
            has_room = kept_here < saliency
        else:
            has_room = kept_overall < saliency

        if has_room:
            kept[feature] = weight
            kept_per_path[pathtype] = kept_here + 1
            kept_overall += 1
def profile(self, minorder=0, maxorder=10):
    """Print a per-path-type weight summary of ``self.features``.

    Three lines are printed: the total weight of all features, the total
    weight of the features whose order (via ``getorder``) lies in the
    inclusive range ``[minorder, maxorder]``, and a list of
    ``(path type, summed weight)`` pairs for those features, sorted by
    weight, largest first.

    :param minorder: smallest order to include in the profile (inclusive)
    :param maxorder: largest order to include in the profile (inclusive)
    """
    # NOTE(review): unlike the sibling methods, the values are summed without
    # float() conversion -- confirm that self.features holds numbers here.
    weight_by_path = {}
    totalweight = 0
    thisorderweight = 0

    for feat, weight in self.features.items():
        path = getpathtype(feat)
        order = getorder(feat)
        if minorder <= order <= maxorder:
            weight_by_path[path] = weight_by_path.get(path, 0) + weight
            thisorderweight += weight
        totalweight += weight

    # Single-string prints: same output as the former print statements, and
    # valid on both Python 2 and Python 3.
    print("total weight of features %s" % (totalweight,))
    print("total weight of required order features %s" % (thisorderweight,))
    ranking = sorted(weight_by_path.items(), key=itemgetter(1), reverse=True)
    print(ranking)