def __init__(self, s):
    """Build the predicate from the query string `s`.

    The string is first passed through zen2han() and the result is kept
    in self.q, including any leading negation marker.  A leading '-' or
    '!' is then stripped and sets self.neg to True; whatever remains is
    handed to setup_feats().
    """
    Predicate.__init__(self)
    s = zen2han(s)
    self.q = s
    if s.startswith(('-', '!')):
        s = s[1:]
        self.neg = True
    # Each attribute gets its own list.  A chained `a = b = c = []` binds
    # all three names to one list object, so an in-place change to one
    # would show up in the other two.
    self.r0 = []
    self.r1 = []
    self.r2 = []
    self.sentid = None
    self.setup_feats(s)
def index_doc(self, doc, maxsents=100000):
    """Add `doc` and, recursively, its sub-documents to the index.

    A new index is created first if self.maker is None.  The document is
    appended to self.docinfo and receives a 1-based document id.  Label
    features (PROP_LABEL + label, for each label returned by
    self.corpus.loc_labels(doc.loc)) and doc.get_feats() are registered
    with sentence id 0.  The title, if any, and then each non-empty
    sentence are stored under consecutive sentence ids, and their terms
    are registered via add_features().

    At most `maxsents` sentence ids (title included) are used per
    document.  After the sentences, flush() is called when the number of
    documents or terms has reached its threshold; a falsy threshold
    disables that check.  Sub-documents are indexed afterwards with the
    same `maxsents`.

    Always returns True.
    """
    if self.maker is None:
        self.create_new_idx()
    docid = len(self.docinfo) + 1
    self.docinfo.append((docid, doc))
    if 2 <= self.verbose:
        sys.stderr.write('Reading: %r\n' % doc)
    elif 1 <= self.verbose:
        sys.stderr.write('.')
        sys.stderr.flush()
    terms = self.terms

    def add_sent(sentid, text):
        # Store the UTF-8 text under (PROP_SENT, docid, sentid) and
        # register the distinct terms of the sentence.
        self.maker.add(pack('>cii', PROP_SENT, docid, sentid),
                       text.encode('utf-8'))
        add_features(terms, docid, sentid, set(doc.splitterms(text)))

    # Document-wide features are registered with sentence id 0.
    labels = (PROP_LABEL + x for x in self.corpus.loc_labels(doc.loc))
    add_features(terms, docid, 0, labels)
    add_features(terms, docid, 0, doc.get_feats())

    # NOTE(review): the increment is assumed to belong to the title
    # branch, i.e. a document without a title starts its sentences at 0.
    sentid = 0
    title = doc.get_title()
    if title and sentid < maxsents:
        add_sent(sentid, zen2han(rmsp(title)))
        sentid += 1

    # Skip the loop when the quota is already used up (maxsents == 0, or
    # maxsents == 1 with a title); the in-loop check only fires after a
    # sentence has been stored and would let one extra sentence through.
    if sentid < maxsents:
        for sent in doc.get_sents():
            sent = zen2han(rmsp(sent))
            if not sent:
                continue
            add_sent(sentid, sent)
            sentid += 1
            if maxsents <= sentid:
                break

    too_many_docs = (self.max_docs_threshold and
                     self.max_docs_threshold <= len(self.docinfo))
    too_many_terms = (self.max_terms_threshold and
                      self.max_terms_threshold <= len(terms))
    if too_many_docs or too_many_terms:
        self.flush()

    for subdoc in doc.get_subdocs():
        if subdoc:
            self.index_doc(subdoc, maxsents=maxsents)
    return True