def test_with_container(self):
    """Check filter() output when the container settings are supplied.

    With CONTAINER_NAME set to u'sentence' and CONTAINER_VALUE_REGEX set to
    u'length', the filter must yield exactly the second-copy lemmas.
    """
    settings = self.basic_kwargs()
    settings[CONTAINER_NAME] = u'sentence'
    settings[CONTAINER_VALUE_REGEX] = u'length'
    container_filter = Filter(**settings)

    segments = self.segmentstorage()
    documents = self.documentstorage()
    produced = set(container_filter.filter(segments, documents))

    expected = set(self.second_copy_lemmas())
    self.assertEqual(produced, expected)
def test_second_apply_removes_previous_segments(self):
    """Apply the same filter twice to one storage and check the result.

    After both applications, the segments stored under u'lemma:copy' must
    be exactly the union of the first-copy and second-copy lemmas.
    """
    copy_filter = Filter(**self.basic_kwargs())
    storage = self.segmentstorage()

    # Two consecutive applications against the same segment storage; a
    # fresh document storage is requested for each one.
    for _ in range(2):
        copy_filter.apply(storage, self.documentstorage())

    stored = set(storage.load(name=u'lemma:copy'))
    expected = set(self.first_copy_lemmas()).union(self.second_copy_lemmas())
    self.assertEqual(stored, expected)
def test_with_document_regex(self):
    """Check filter() output when document regex settings are supplied.

    With DOCUMENT_REGEX set to u's' and DOCUMENT_NEG_REGEX set to u'sic',
    the filter must yield exactly the second-copy lemmas.
    """
    settings = self.basic_kwargs()
    settings[DOCUMENT_REGEX] = u's'
    settings[DOCUMENT_NEG_REGEX] = u'sic'
    regex_filter = Filter(**settings)

    segments = self.segmentstorage()
    documents = self.documentstorage()
    produced = set(regex_filter.filter(segments, documents))

    expected = set(self.second_copy_lemmas())
    self.assertEqual(produced, expected)
def apply_filter(self, name):
    """Load the settings stored under *name*, build a Filter and apply it.

    The settings are loaded from ``self._setstorage`` using the encoded
    name; the resulting filter is applied to ``self._segstorage`` and
    ``self._docstorage``.

    Returns:
        A JSON string: ``{"result": "OK"}`` when the filter was applied, or
        ``{"result": "FAIL", "error": <message>}`` when any step raised.
    """
    try:
        settings = self._setstorage.load(encode_name(name))
        filt = Filter(**settings)
        filt.apply(self._segstorage, self._docstorage)
    except Exception as e:
        # Boundary handler: report the failure as JSON instead of raising.
        return json.dumps({"result": "FAIL", "error": str(e)})
    # Report success explicitly; falling through here would return None and
    # leave the caller without any status to inspect.
    return json.dumps({"result": "OK"})
def test_with_mixin(self):
    """Check filter() output when MIXIN_NAME is supplied.

    With MIXIN_NAME set to u'mixin' and mixin1() saved into the segment
    storage, the filter must yield the first-copy lemmas, the second-copy
    lemmas and mixin_copy1().
    """
    settings = self.basic_kwargs()
    settings[MIXIN_NAME] = u'mixin'
    mixin_filter = Filter(**settings)

    storage = self.segmentstorage()
    storage.save([self.mixin1()])
    produced = set(mixin_filter.filter(storage, self.documentstorage()))

    expected = set(self.first_copy_lemmas())
    expected.update(self.second_copy_lemmas())
    expected.add(self.mixin_copy1())
    self.assertEqual(produced, expected)
def test_basic_regex(self):
    """Check filter() output with value and negative segment regexes.

    With SEGMENT_VALUE_REGEX u'was|sick|\\d+', SEGMENT_NEG_REGEX u'was',
    OUTPUT_NAME u'lemma' and CREATES_SEGMENT False, the filter must yield
    exactly lemma3() and lemma7().
    """
    kwargs = self.basic_kwargs()
    # The backslash is doubled so the pattern still contains the regex
    # token \d without relying on Python passing an unrecognised string
    # escape through unchanged; the runtime value is the same as before.
    kwargs[SEGMENT_VALUE_REGEX] = u'was|sick|\\d+'
    kwargs[SEGMENT_NEG_REGEX] = u'was'
    kwargs[OUTPUT_NAME] = u'lemma'
    kwargs[CREATES_SEGMENT] = False
    filt = Filter(**kwargs)

    outs = filt.filter(self.segmentstorage(), self.documentstorage())
    self.assertEqual(set(outs), set([self.lemma3(), self.lemma7()]))
def test_basic_creation(self):
    """Check that apply() stores the filtered segments in a new storage.

    With SEGMENT_VALUE_REGEX u'was|sick|\\d+', SEGMENT_NEG_REGEX u'was',
    OUTPUT_NAME u'lemma' and CREATES_SEGMENT True, applying the filter to a
    freshly constructed SegmentStorage must leave exactly lemma3() and
    lemma7() loadable under the name u'lemma'.
    """
    kwargs = self.basic_kwargs()
    # The backslash is doubled so the pattern still contains the regex
    # token \d without relying on Python passing an unrecognised string
    # escape through unchanged; the runtime value is the same as before.
    kwargs[SEGMENT_VALUE_REGEX] = u'was|sick|\\d+'
    kwargs[SEGMENT_NEG_REGEX] = u'was'
    kwargs[OUTPUT_NAME] = u'lemma'
    kwargs[CREATES_SEGMENT] = True

    segmentstorage = SegmentStorage()
    filt = Filter(**kwargs)
    filt.apply(segmentstorage, self.documentstorage())

    segs = segmentstorage.load(name=u'lemma')
    self.assertEqual(set(segs), set([self.lemma3(), self.lemma7()]))
def test_splitter_full(self):
    """Check filter() output when every splitter setting is supplied.

    The splitter left/regex/right/negative-regex settings are set together
    with SEGMENT_NAME u'sentence' and OUTPUT_NAME u'fragment'. After
    sentence1() and sentence2() are saved into the storage, the filter must
    yield exactly fragments().
    """
    settings = self.basic_kwargs()
    overrides = (
        (SPLITTER_LEFT, u'e'),
        (SPLITTER_REGEX, u' '),
        (SPLITTER_RIGHT, u'...'),
        (SPLITTER_NEG_REGEX, u'was'),
        (SEGMENT_NAME, u'sentence'),
        (OUTPUT_NAME, u'fragment'),
    )
    for key, value in overrides:
        settings[key] = value
    splitter_filter = Filter(**settings)

    storage = self.segmentstorage()
    sentences = [self.sentence1(), self.sentence2()]
    storage.save(sentences)

    produced = set(splitter_filter.filter(storage, self.documentstorage()))
    expected = set(self.fragments())
    self.assertEqual(produced, expected)
def preview_sample(self, name):
    """Build HTML previews of each filtering stage for the filter *name*.

    The settings are loaded from ``self._setstorage`` using the encoded
    name and used to construct a Filter. The basic, container, splitter and
    mixin stages are then run in that order, each consuming the previous
    stage's segments, and each stage's segments are rendered with
    ``segments_html``. The final "output" preview is computed separately
    with ``filt.filter`` rather than derived from the staged results.

    Returns:
        A JSON string: ``{"result": "OK", "data": {...}}`` with the keys
        "basic", "container", "mixin", "splitter" and "output", or
        ``{"result": "FAIL", "error": <message>}`` when any step raised.
    """
    try:
        settings = self._setstorage.load(encode_name(name))
        filt = Filter(**settings)

        # Second argument given to head() for every stage -- presumably
        # the number of segments kept per preview; confirm against head().
        limit = 300
        # Passed as ``limit=`` to filter_basic only.
        query_limit = 10000
        # Third argument given to segments_html() -- presumably the amount
        # of surrounding context rendered; confirm against segments_html().
        context_size = 60

        # preview basic
        basic_segs = head(
            filt.filter_basic(self._segstorage, self._docstorage,
                              limit=query_limit),
            limit)
        basic = segments_html(basic_segs, self._docstorage, context_size)

        # preview container
        container_segs = head(
            filt.filter_container(basic_segs, self._segstorage), limit)
        container = segments_html(container_segs, self._docstorage,
                                  context_size)

        # preview splitter
        splitter_segs = head(filt.filter_splitter(container_segs), limit)
        splitter = segments_html(splitter_segs, self._docstorage,
                                 context_size)

        # preview mixin
        mixin_segs = head(
            filt.filter_mixin(splitter_segs, self._segstorage), limit)
        mixin = segments_html(mixin_segs, self._docstorage, context_size)

        # preview final output
        output_segs = head(
            filt.filter(self._segstorage, self._docstorage), limit)
        output = segments_html(output_segs, self._docstorage, context_size)

        data = {"basic": basic,
                "container": container,
                "mixin": mixin,
                "splitter": splitter,
                "output": output}
        return json.dumps({"result": "OK", "data": data})
    except Exception as e:
        # Boundary handler: report the failure as JSON instead of raising.
        return json.dumps({"result": "FAIL", "error": str(e)})
def test_with_document_prefix(self):
    """Check filter() output when DOCUMENT_PREFIX is supplied.

    With DOCUMENT_PREFIX set to u'DOCUMENT B', the filter must yield
    exactly the second-copy lemmas.
    """
    settings = self.basic_kwargs()
    settings[DOCUMENT_PREFIX] = u'DOCUMENT B'
    prefix_filter = Filter(**settings)

    segments = self.segmentstorage()
    documents = self.documentstorage()
    produced = set(prefix_filter.filter(segments, documents))

    expected = set(self.second_copy_lemmas())
    self.assertEqual(produced, expected)
def test_basic_copy(self):
    """Check filter() output for the unmodified basic settings.

    A Filter built from basic_kwargs() alone must yield exactly the union
    of the first-copy and second-copy lemmas.
    """
    copy_filter = Filter(**self.basic_kwargs())

    segments = self.segmentstorage()
    documents = self.documentstorage()
    produced = set(copy_filter.filter(segments, documents))

    expected = set(self.first_copy_lemmas()).union(self.second_copy_lemmas())
    self.assertEqual(produced, expected)
def test_update(self):
    """Check that update() overrides the filter's settings.

    After a Filter built from basic_kwargs() is updated with
    second_keywords(), its FILTER_NAME, SEGMENT_NAME and OUTPUT_NAME
    entries must hold the expected "second" values.
    """
    updated = Filter(**self.basic_kwargs())
    updated.update(self.second_keywords())

    expectations = (
        (FILTER_NAME, u'second_filter'),
        (SEGMENT_NAME, u'second:lemma'),
        (OUTPUT_NAME, u'second:lemma:copy'),
    )
    for key, wanted in expectations:
        self.assertEqual(updated[key], wanted)