def get_class(interaction):
    if opts.training_csv is not None:
        intid = jpath('interaction.id', interaction)
        if intid in intid2class:  # otherwise None is returned
            return intid2class[intid]
    else:
        c = jpath(opts.classpath, interaction)
        if isinstance(c, list):  # e.g. if the class is set inside interaction.tags
            return c[0]
        else:
            return c
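# jpath() is used throughout but not defined in this excerpt. A minimal sketch of
# the assumed behaviour: walk a dotted path such as 'interaction.id' through the
# nested interaction dict, returning None when any step is missing.
def jpath(path, obj):
    for key in path.split('.'):
        if not isinstance(obj, dict) or key not in obj:
            return None
        obj = obj[key]
    return obj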
def is_satisfied_by(self, interaction):
    text = jpath(self.path, interaction)
    if text is None:
        return False
    else:
        words = WordComboFeature.split(text.lower())
        return has_subsequence(words, self.combo)
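# has_subsequence() is not shown here. A minimal sketch under the assumption that
# a word combo matches when its words occur in order in the token list, though
# not necessarily in adjacent positions.
def has_subsequence(words, combo):
    it = iter(words)
    return all(any(w == c for w in it) for c in combo)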
def report_confusion(interactions, targets, fvectors, title):
    logging.info("Confusion report for: %s", title)
    expectedvsactuals = zip(map(get_classname, targets),
                            map(get_classname, clf.predict(fvectors)))
    confusion_matrix(expectedvsactuals)
    for i, (exp, act) in enumerate(expectedvsactuals):
        if exp != act:
            logging.info("exp:act (%s,%s): %s |%s", exp, act,
                         nvl(jpath(featurepath, interactions[i])),
                         '|'.join([selected_features[idx].string()
                                   for idx, satisfied in enumerate(fvectors[i])
                                   if satisfied]).encode('utf-8', 'ignore'))
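# report_confusion() assumes each row of fvectors is index-aligned with
# selected_features and holds a truthy value where the feature fired. A sketch of
# how such a row could be built; build_fvector is a hypothetical helper, not part
# of the original code.
def build_fvector(interaction, selected_features):
    return [1 if f.is_satisfied_by(interaction) else 0 for f in selected_features]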
def is_satisfied_by(self, interaction):
    text = jpath(self.path, interaction)
    if text is None:
        return False
    else:
        words = wordsplitter.split(text.lower())
        (w, x) = self.pair
        return w in words and x in words
def is_satisfied_by(self, interaction):
    global chunk_cache
    text = jpath(self.path, interaction)
    if text is None:
        return False
    else:
        text = text.lower()
        if text not in chunk_cache:  # parse each distinct text only once
            chunk_cache[text] = chunk(text)
        return contains(chunk_cache[text], (self.word[:-1], self.word[-1]), 2)
def is_satisfied_by(self, interaction):
    text = jpath(self.path, interaction)
    if text is None:
        return False
    if not isinstance(text, list):
        # handle the case when the path returns a list, e.g. for hashtags, so standardise on that
        text = [text]
    for t in text:
        t = t.lower()
        if self.term == t:
            return True
    return False
def is_satisfied_by(self, interaction):
    global chunk_cache
    text = jpath(self.path, interaction)
    if text is None:
        return False
    if not isinstance(text, list):
        # handle the case when the path returns a list, e.g. for shareurls, so standardise on that
        text = [text]
    for t in text:
        t = t.lower()
        if t not in chunk_cache:
            chunk_cache[t] = chunk(t)
        if contains(chunk_cache[t], self.chunks, 2):
            return True
    return False
try:
    if not ctr % 500 and ctr > 0:
        print >>sys.stderr, "Looking for candidate features in interaction %i" % ctr
    j = json.loads(i)
    C = get_class(j)
    if C is not None:
        if ctr % opts.test_period == 0:
            test_interactions.append(j)
            test_targets.append(get_classid(C))
        else:
            interactions.append(j)
            targets.append(get_classid(C))
            content = jpath(featurepath, j)
            if content is not None:
                content = content.lower()
                # strip shortened URLs, e.g. http://t.co/rerJLq49gl
                normalised_content = re.sub(r'https?://t\.co/\w+', '', content)
                # need to detect @mentions before chunking for unigrams!
                normalised_content = re.sub(r'@\w+', '', normalised_content)
                listofngrams = list()
                words = [w for w in wordsplitter.split(normalised_content)
                         if w not in stopwords and len(w) > 0]
                if config.useunigrams:
                    listofngrams += [(w,) for w in words]
                if config.usebigrams:
                    listofngrams += ngrams(2, content)
                if config.usetrigrams:
                    listofngrams += ngrams(3, content)
                for ngram in listofngrams:
                    #if len(ngram)==1 and ngram[0] in stopwords:  # ignore unigrams consisting of a stopword
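# ngrams() is not shown in this excerpt. A minimal sketch of the assumed helper:
# return every run of n consecutive word tokens as a tuple, matching the (w,)
# tuples used for unigrams above. The real code presumably tokenises with the same
# wordsplitter used elsewhere; plain split() keeps the sketch self-contained.
def ngrams(n, text):
    tokens = text.split()
    return [tuple(tokens[k:k + n]) for k in range(len(tokens) - n + 1)]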
def is_satisfied_by(self, interaction):
    text = jpath(self.path, interaction)
    if text is None:
        return False
    else:
        return self.chars in text  # NB CASE SENSITIVE
def is_satisfied_by(self, interaction):
    x = jpath(self.path, interaction)
    if x is None:
        return False
    return x == self.val
def is_satisfied_by(self, interaction):
    text = jpath(self.path, interaction)
    if text is None:
        return False
    else:
        return re.search(self.regex, text) is not None
def is_satisfied_by(self, interaction):
    return jpath(self.path, interaction) is not None
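# A purely illustrative end-to-end probe of the exact-value check above. The class
# name ExactValueFeature, its constructor and the sample dict are invented for this
# example; only the is_satisfied_by body comes from the code above, and the lookup
# relies on the jpath sketch given earlier.
class ExactValueFeature(object):
    def __init__(self, path, val):
        self.path = path
        self.val = val

    def is_satisfied_by(self, interaction):
        x = jpath(self.path, interaction)
        if x is None:
            return False
        return x == self.val

sample = {'interaction': {'id': 'abc123', 'type': 'twitter'}}
f = ExactValueFeature('interaction.type', 'twitter')
assert f.is_satisfied_by(sample)        # value matches
assert not f.is_satisfied_by({})        # missing path -> False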