Example #1
0
 def __init__(self, s):
   """Build a predicate from the query string s.

   The string is passed through zen2han() and the result is stored in
   self.q, including any negation prefix.  A leading '-' or '!' is then
   stripped and marks the predicate as negated (self.neg = True); the
   remainder is handed to self.setup_feats().
   """
   Predicate.__init__(self)
   s = zen2han(s)
   self.q = s
   # NOTE(review): self.neg is assigned here only for negated queries; the
   # non-negated default is presumably set by Predicate.__init__ -- confirm.
   if s.startswith(('-', '!')):
     s = s[1:]
     self.neg = True
   # Give each attribute its own list: a chained "a = b = c = []" binds all
   # three names to a single shared list object.
   self.r0 = []
   self.r1 = []
   self.r2 = []
   self.sentid = None
   # NOTE(review): setup_feats() presumably fills in checkpat, matchpat and
   # r0/r1/r2 (those were the fields shown by the old debug print) -- confirm.
   self.setup_feats(s)
Example #2
0
 def __init__(self, s):
     """Create a predicate for the query string s.

     s is first run through zen2han(); that converted form, prefix and
     all, is remembered as self.q.  When it begins with '-' or '!', the
     prefix character is removed and self.neg is set to True.  The
     remaining text is then passed to self.setup_feats().
     """
     Predicate.__init__(self)
     s = zen2han(s)
     self.q = s
     # NOTE(review): only the negated case assigns self.neg here; the
     # default value presumably comes from Predicate.__init__ -- confirm.
     if s.startswith(('-', '!')):
         s = s[1:]
         self.neg = True
     # Three independent lists.  The previous chained assignment
     # (r0 = r1 = r2 = []) made all three attributes alias one list.
     self.r0 = []
     self.r1 = []
     self.r2 = []
     self.sentid = None
     # NOTE(review): setup_feats() presumably populates checkpat, matchpat
     # and r0/r1/r2 (the fields the old debug print displayed) -- confirm.
     self.setup_feats(s)
Example #3
0
 def index_doc(self, doc, maxsents=100000):
   """Index one document and, recursively, its subdocuments.

   Steps, in order:
     1. Call self.create_new_idx() if self.maker is None.
     2. Assign docid = len(self.docinfo)+1 and append (docid, doc) to
        self.docinfo.
     3. Report progress on stderr according to self.verbose
        (>= 2: a "Reading" line, >= 1: a single dot).
     4. Pass document-level features (location labels prefixed with
        PROP_LABEL, plus doc.get_feats()) to add_features() under
        sentence id 0.
     5. Store the title (if any) and then each non-empty sentence via
        self.maker.add(), keyed by pack('>cii', PROP_SENT, docid, sentid)
        with the UTF-8 encoded text as value, and pass the set of terms
        from doc.splitterms() to add_features().
     6. Call self.flush() when either configured threshold
        (max_docs_threshold / max_terms_threshold) is reached.
     7. Recurse into every truthy entry of doc.get_subdocs().

   Args:
     doc: document object providing loc, get_feats(), get_title(),
       get_sents(), splitterms() and get_subdocs().
     maxsents: maximum number of sentence entries (the title counts as
       one) stored for this document; also applied to each subdocument.

   Returns:
     Always True.
   """
   if self.maker is None:
     self.create_new_idx()
   docid = len(self.docinfo)+1
   self.docinfo.append((docid, doc))
   if 2 <= self.verbose:
     sys.stderr.write('Reading: %r\n' % doc)
   elif 1 <= self.verbose:
     sys.stderr.write('.'); sys.stderr.flush()
   terms = self.terms
   # Document-level features are registered under sentence id 0.
   add_features(terms, docid, 0,
                ( PROP_LABEL+x for x in self.corpus.loc_labels(doc.loc) ))
   add_features(terms, docid, 0, doc.get_feats())
   # Sentences: the title, when present, takes sentence id 0.
   sentid = 0
   title = doc.get_title()
   if title and sentid < maxsents:
     title = zen2han(rmsp(title))
     self.maker.add(pack('>cii', PROP_SENT, docid, sentid), title.encode('utf-8'))
     add_features(terms, docid, sentid, set(doc.splitterms(title)))
     sentid += 1
   # Only enter the loop while there is room left: the in-loop check runs
   # after a sentence has been stored, so without this guard one extra
   # sentence slipped through whenever the limit was already reached
   # (e.g. maxsents=0, or maxsents=1 with a title).
   if sentid < maxsents:
     for sent in doc.get_sents():
       sent = zen2han(rmsp(sent))
       if not sent: continue
       self.maker.add(pack('>cii', PROP_SENT, docid, sentid), sent.encode('utf-8'))
       add_features(terms, docid, sentid, set(doc.splitterms(sent)))
       sentid += 1
       if maxsents <= sentid: break
   # A threshold that is falsy (0/None) is treated as disabled.
   if ((self.max_docs_threshold and self.max_docs_threshold <= len(self.docinfo)) or
       (self.max_terms_threshold and self.max_terms_threshold <= len(terms))):
     self.flush()
   for subdoc in doc.get_subdocs():
     if subdoc:
       self.index_doc(subdoc, maxsents=maxsents)
   return True