def preprocess(self):
    """Preprocess the suspicious and the source document.

    For each document this reads the raw text from disk, tokenizes it
    into a sentence-level bag-of-words representation, and applies the
    short-sentence treatment step (presumably merging/removing sentences
    shorter than ``self.min_sentlen`` controlled by ``self.rssent`` —
    confirm in ``Preprocessing.ss_treat``).

    Side effects:
        Sets ``self.susp_text``, ``self.susp_bow``, ``self.src_text`` and
        ``self.src_bow``; the offset/sentence containers passed to
        ``Preprocessing.tokenize`` are filled in place.
    """
    # Use context managers so the files are closed even if read() or
    # tokenization raises; the original open/close pairs leaked the
    # handle on any exception in between. codecs.open is kept (rather
    # than builtin open) so decoding behavior is byte-identical.
    with codecs.open(self.susp, 'r', 'utf-8') as susp_fp:
        self.susp_text = susp_fp.read()
    self.susp_bow = Preprocessing.tokenize(self.susp_text, self.susp_offsets, self.susp_sents)
    Preprocessing.ss_treat(self.susp_bow, self.susp_offsets, self.min_sentlen, self.rssent)

    with codecs.open(self.src, 'r', 'utf-8') as src_fp:
        self.src_text = src_fp.read()
    self.src_bow = Preprocessing.tokenize(self.src_text, self.src_offsets, self.src_sents)
    Preprocessing.ss_treat(self.src_bow, self.src_offsets, self.min_sentlen, self.rssent)