def get_class(interaction):
  """Return the class label of an interaction, or None when it has none.

  When opts.training_csv is set, the value found at 'interaction.id' is looked
  up in intid2class; ids missing from that mapping yield None. Otherwise the
  label is read from the interaction at opts.classpath; if that lookup yields
  a list, its first element is the label.
  """
  if opts.training_csv is not None:
    intid=jpath('interaction.id',interaction)
    if intid in intid2class:
      return intid2class[intid]
    return None # id is not part of the training set
  c=jpath(opts.classpath,interaction)
  if isinstance(c,list): # eg if the class is set inside interaction.tags
    # An empty list carries no label: report "no class" instead of raising IndexError.
    return c[0] if c else None
  return c
def get_class(interaction):
    """Return the class label of an interaction, or None when it has none.

    When opts.training_csv is set, the value found at 'interaction.id' is
    looked up in intid2class; ids missing from that mapping yield None.
    Otherwise the label is read from the interaction at opts.classpath; if
    that lookup yields a list, its first element is the label.
    """
    if opts.training_csv is not None:
        intid = jpath('interaction.id', interaction)
        if intid in intid2class:
            return intid2class[intid]
        return None  # id is not part of the training set
    c = jpath(opts.classpath, interaction)
    if isinstance(c, list):  # eg if the class is set inside interaction.tags
        # An empty list carries no label: report "no class" instead of
        # raising IndexError.
        return c[0] if c else None
    return c
Ejemplo n.º 3
0
 def is_satisfied_by(self,interaction):
   """Check the text found at self.path against self.combo.

   Returns False when the path yields None. Otherwise the text is lower-cased,
   split with WordComboFeature.split, and the result of
   has_subsequence(words, self.combo) is returned.
   """
   raw=jpath(self.path,interaction)
   if raw is None:
     return False
   tokens=WordComboFeature.split(raw.lower())
   return has_subsequence(tokens,self.combo)
def report_confusion(interactions,targets,fvectors,title):
  """Log how the classes predicted by clf compare with the expected ones.

  Pairs get_classname of each target with get_classname of each prediction
  from clf.predict(fvectors), hands the pairs to confusion_matrix, and then
  logs one line per mismatching pair: the expected and actual names, the
  nvl-wrapped value at featurepath of the interaction, and the '|'-joined
  string() of every selected feature whose entry in the feature vector is truthy.
  """
  logging.info("Confusion report for: %s",title)
  expected_names=map(get_classname,targets)
  predicted_names=map(get_classname,clf.predict(fvectors))
  expectedvsactuals=zip(expected_names,predicted_names)
  confusion_matrix(expectedvsactuals)
  for row,(want,got) in enumerate(expectedvsactuals):
    if want!=got:
      content=nvl(jpath(featurepath,interactions[row]))
      active=[selected_features[pos].string() for pos,flag in enumerate(fvectors[row]) if flag]
      joined='|'.join(active).encode('utf-8','ignore')
      logging.info("exp:act (%s,%s): %s |%s",want,got,content,joined)
Ejemplo n.º 5
0
 def is_satisfied_by(self,interaction):
   """Check that both members of self.pair occur in the text at self.path.

   Returns False when the path yields None. Otherwise the text is lower-cased
   and split with wordsplitter, and each member of the pair is tested for
   membership in the resulting words.
   """
   raw=jpath(self.path,interaction)
   if raw is None:
     return False
   tokens=wordsplitter.split(raw.lower())
   first,second=self.pair
   return first in tokens and second in tokens
def report_confusion(interactions, targets, fvectors, title):
    """Log how the classes predicted by clf compare with the expected ones.

    Pairs get_classname of each target with get_classname of each prediction
    from clf.predict(fvectors), hands the pairs to confusion_matrix, and then
    logs one line per mismatching pair: the expected and actual names, the
    nvl-wrapped value at featurepath of the interaction, and the '|'-joined
    string() of every selected feature whose entry in the feature vector is
    truthy.
    """
    logging.info("Confusion report for: %s", title)
    wanted = map(get_classname, targets)
    predicted = map(get_classname, clf.predict(fvectors))
    expectedvsactuals = zip(wanted, predicted)
    confusion_matrix(expectedvsactuals)
    for position, (exp, act) in enumerate(expectedvsactuals):
        if exp != act:
            feature_text = nvl(jpath(featurepath, interactions[position]))
            fired = '|'.join(
                selected_features[idx].string()
                for idx, satisfied in enumerate(fvectors[position])
                if satisfied
            )
            logging.info("exp:act (%s,%s): %s |%s", exp, act, feature_text,
                         fired.encode('utf-8', 'ignore'))
Ejemplo n.º 7
0
 def is_satisfied_by(self,interaction):
   """Check the chunked text at self.path against self.word.

   Returns False when the path yields None. Otherwise the text is lower-cased
   and chunk(text) is computed once per distinct text, being stored in the
   global chunk_cache. The result is
   contains(chunks, (self.word[:-1], self.word[-1]), 2).
   """
   global chunk_cache
   raw=jpath(self.path,interaction)
   if raw is None:
     return False
   lowered=raw.lower()
   if lowered not in chunk_cache:
     chunk_cache[lowered]=chunk(lowered)
   chunks=chunk_cache[lowered]
   target=(self.word[:-1],self.word[-1])
   return contains(chunks,target,2)
Ejemplo n.º 8
0
 def is_satisfied_by(self,interaction):
   """Check whether self.term equals any lower-cased value found at self.path.

   Returns False when the path yields None. A single value is treated as a
   one-element list, so list-valued paths (eg hashtags) and scalar paths are
   handled the same way.
   """
   found=jpath(self.path,interaction)
   if found is None:
     return False
   values=found if isinstance(found,list) else [found]
   return any(self.term==value.lower() for value in values)
Ejemplo n.º 9
0
  def is_satisfied_by(self,interaction):
    """Check whether any value at self.path, once chunked, satisfies self.chunks.

    Returns False when the path yields None. A single value is treated as a
    one-element list (the path may return a list, e.g. for shareurls). Each
    value is lower-cased, chunk(value) is computed once per distinct value and
    stored in the global chunk_cache, and the first value for which
    contains(chunks, self.chunks, 2) is truthy makes the method return True.
    """
    global chunk_cache
    found=jpath(self.path,interaction)
    if found is None:
      return False

    candidates=found if isinstance(found,list) else [found]
    for raw in candidates:
      lowered=raw.lower()
      if lowered not in chunk_cache:
        chunk_cache[lowered]=chunk(lowered)
      if contains(chunk_cache[lowered],self.chunks,2):
        return True
    return False
    try:
      if not ctr % 500 and ctr>0:
        print >>sys.stderr, "Looking for candidate features in interaction %i" %ctr
      j=json.loads(i)
      C=get_class(j)
      if C is not None:

        if ctr % opts.test_period == 0:
          test_interactions.append(j)
          test_targets.append(get_classid(C))

        else:
          interactions.append(j)
          targets.append(get_classid(C))

          content=jpath(featurepath,j)
          if content is not None:
            content=content.lower()
            normalised_content=re.sub(r'(https?)://t.co/\w+','',content) #http://t.co/rerJLq49gl
            normalised_content=re.sub(r'@\w+','',normalised_content) # need to detect @mentions before chunking for unigrams!
            listofngrams=list()
            words=[w for w in wordsplitter.split(normalised_content) if w not in stopwords and len(w)>0]

            if config.useunigrams:
              listofngrams+=[(w,) for w in words]
            if config.usebigrams:
              listofngrams+=ngrams(2,content)
            if config.usetrigrams:
              listofngrams+=ngrams(3,content)
            for ngram in listofngrams:
              #if len(ngram)==1 and ngram[0] in stopwords: # ignore unigrams consisting of a stopword
        try:
            if not ctr % 500 and ctr > 0:
                print >> sys.stderr, "Looking for candidate features in interaction %i" % ctr
            j = json.loads(i)
            C = get_class(j)
            if C is not None:

                if ctr % opts.test_period == 0:
                    test_interactions.append(j)
                    test_targets.append(get_classid(C))

                else:
                    interactions.append(j)
                    targets.append(get_classid(C))

                    content = jpath(featurepath, j)
                    if content is not None:
                        content = content.lower()
                        normalised_content = re.sub(
                            r'(https?)://t.co/\w+', '',
                            content)  #http://t.co/rerJLq49gl
                        normalised_content = re.sub(
                            r'@\w+', '', normalised_content
                        )  # need to detect @mentions before chunking for unigrams!
                        listofngrams = list()
                        words = [
                            w for w in wordsplitter.split(normalised_content)
                            if w not in stopwords and len(w) > 0
                        ]

                        if config.useunigrams:
Ejemplo n.º 12
0
 def is_satisfied_by(self,interaction):
   """Check whether self.chars occurs in the value at self.path.

   Returns False when the path yields None. The containment test is applied to
   the value as-is, so it is case sensitive.
   """
   haystack=jpath(self.path,interaction)
   if haystack is None:
     return False
   return self.chars in haystack # NB CASE SENSITIVE
Ejemplo n.º 13
0
 def is_satisfied_by(self,interaction):
   """Check whether the value at self.path equals self.val.

   Returns False when the path yields None; otherwise the result of comparing
   the value with self.val.
   """
   found=jpath(self.path,interaction)
   return False if found is None else found==self.val
Ejemplo n.º 14
0
 def is_satisfied_by(self,interaction):
   """Check whether self.regex matches anywhere in the text at self.path.

   Returns False when the path yields None; otherwise True exactly when
   re.search finds a match.
   """
   subject=jpath(self.path,interaction)
   if subject is None:
     return False
   match=re.search(self.regex,subject)
   return match is not None
Ejemplo n.º 15
0
 def is_satisfied_by(self,interaction):
   """Return True when jpath finds a non-None value at self.path in the interaction."""
   found=jpath(self.path,interaction)
   return found is not None