def test_with_container(self):
     """Check filter output when the container settings are supplied.

     Starts from ``basic_kwargs()``, sets ``CONTAINER_NAME`` to
     ``u'sentence'`` and ``CONTAINER_VALUE_REGEX`` to ``u'length'``, and
     expects the filtered segments (compared as a set) to equal
     ``second_copy_lemmas()``.
     """
     settings = self.basic_kwargs()
     settings[CONTAINER_NAME] = u'sentence'
     settings[CONTAINER_VALUE_REGEX] = u'length'
     container_filter = Filter(**settings)
     produced = set(
         container_filter.filter(self.segmentstorage(), self.documentstorage())
     )
     expected = set(self.second_copy_lemmas())
     self.assertEqual(produced, expected)
 def test_second_apply_removes_previous_segments(self):
     """Apply the same filter twice to one storage and inspect the result.

     After both runs, the segments loaded under the name ``u'lemma:copy'``
     must equal, as a set, the union of ``first_copy_lemmas()`` and
     ``second_copy_lemmas()``.
     """
     copy_filter = Filter(**self.basic_kwargs())
     storage = self.segmentstorage()
     # Two passes over the *same* storage object; the second pass is the one
     # under test.
     for _ in range(2):
         copy_filter.apply(storage, self.documentstorage())
     stored = set(storage.load(name=u'lemma:copy'))
     # NOTE(review): comparing sets would hide duplicated segments if
     # ``load`` can return duplicates -- confirm the storage deduplicates.
     expected = set(self.first_copy_lemmas()).union(self.second_copy_lemmas())
     self.assertEqual(stored, expected)
 def test_with_document_regex(self):
     """Check filter output when document regex settings are supplied.

     Sets ``DOCUMENT_REGEX`` to ``u's'`` and ``DOCUMENT_NEG_REGEX`` to
     ``u'sic'`` on top of ``basic_kwargs()`` and expects the filtered
     segments (as a set) to equal ``second_copy_lemmas()``.
     """
     settings = self.basic_kwargs()
     settings[DOCUMENT_REGEX] = u's'
     settings[DOCUMENT_NEG_REGEX] = u'sic'
     document_filter = Filter(**settings)
     produced = set(
         document_filter.filter(self.segmentstorage(), self.documentstorage())
     )
     expected = set(self.second_copy_lemmas())
     self.assertEqual(produced, expected)
 def apply_filter(self, name):
     """Load the stored filter settings for ``name`` and apply the filter.

     The settings are read from ``self._setstorage`` under
     ``encode_name(name)``, turned into a ``Filter`` and applied to
     ``self._segstorage`` / ``self._docstorage``.

     Returns a JSON string: ``{"result": "OK"}`` on success, or
     ``{"result": "FAIL", "error": <message>}`` when any exception is
     raised while loading, constructing or applying the filter.
     """
     try:
         settings = self._setstorage.load(encode_name(name))
         filt = Filter(**settings)
         filt.apply(self._segstorage, self._docstorage)
     except Exception as e:
         # Errors are reported to the caller as a payload, not propagated.
         return json.dumps({"result": "FAIL", "error": str(e)})
     # Report success explicitly, using the same result envelope as the
     # failure path, instead of falling through and returning None.
     return json.dumps({"result": "OK"})
 def test_with_mixin(self):
     """Check filter output when a mixin segment name is configured.

     Sets ``MIXIN_NAME`` to ``u'mixin'``, saves ``mixin1()`` into the
     segment storage before filtering, and expects the output (as a set)
     to consist of ``first_copy_lemmas()``, ``second_copy_lemmas()`` and
     ``mixin_copy1()``.
     """
     settings = self.basic_kwargs()
     settings[MIXIN_NAME] = u'mixin'
     mixin_filter = Filter(**settings)

     storage = self.segmentstorage()
     storage.save([self.mixin1()])

     produced = set(mixin_filter.filter(storage, self.documentstorage()))

     expected = set(self.first_copy_lemmas())
     expected.update(self.second_copy_lemmas())
     expected.add(self.mixin_copy1())
     self.assertEqual(produced, expected)
 def test_basic_regex(self):
     """Check value-regex filtering without creating a new segment.

     Sets ``SEGMENT_VALUE_REGEX`` to ``was|sick|\\d+``, ``SEGMENT_NEG_REGEX``
     to ``was``, ``OUTPUT_NAME`` to ``u'lemma'`` and ``CREATES_SEGMENT`` to
     ``False``; the filtered segments (as a set) must equal
     ``lemma3()`` and ``lemma7()``.
     """
     kwargs = self.basic_kwargs()
     # The backslash is escaped explicitly: a bare ``\d`` in a non-raw
     # literal is an invalid escape sequence. The runtime string is unchanged.
     kwargs[SEGMENT_VALUE_REGEX] = u'was|sick|\\d+'
     kwargs[SEGMENT_NEG_REGEX] = u'was'
     kwargs[OUTPUT_NAME] = u'lemma'
     kwargs[CREATES_SEGMENT] = False
     filt = Filter(**kwargs)
     outs = filt.filter(self.segmentstorage(), self.documentstorage())
     self.assertEqual(set(outs), set([self.lemma3(), self.lemma7()]))
 def test_basic_creation(self):
     """Check that applying a segment-creating filter stores its output.

     Uses the same regex settings as the basic regex test but with
     ``CREATES_SEGMENT`` set to ``True``, applies the filter to a fresh
     ``SegmentStorage()``, and expects the segments loaded under the name
     ``u'lemma'`` (as a set) to equal ``lemma3()`` and ``lemma7()``.
     """
     kwargs = self.basic_kwargs()
     # The backslash is escaped explicitly: a bare ``\d`` in a non-raw
     # literal is an invalid escape sequence. The runtime string is unchanged.
     kwargs[SEGMENT_VALUE_REGEX] = u'was|sick|\\d+'
     kwargs[SEGMENT_NEG_REGEX] = u'was'
     kwargs[OUTPUT_NAME] = u'lemma'
     kwargs[CREATES_SEGMENT] = True
     # Start from an empty storage so that only segments written by
     # ``apply`` can be loaded back.
     segmentstorage = SegmentStorage()
     filt = Filter(**kwargs)
     filt.apply(segmentstorage, self.documentstorage())
     segs = segmentstorage.load(name=u'lemma')
     self.assertEqual(set(segs), set([self.lemma3(), self.lemma7()]))
 def test_splitter_full(self):
     """Check filter output when every splitter setting is supplied.

     Configures ``SPLITTER_LEFT``, ``SPLITTER_REGEX``, ``SPLITTER_RIGHT``
     and ``SPLITTER_NEG_REGEX``, reads from the ``u'sentence'`` segment and
     writes to ``u'fragment'``. Both sentences are saved into the storage
     first; the output (as a set) must equal ``fragments()``.
     """
     settings = self.basic_kwargs()
     # Splitter configuration.
     settings[SPLITTER_LEFT] = u'e'
     settings[SPLITTER_REGEX] = u' '
     settings[SPLITTER_RIGHT] = u'...'
     settings[SPLITTER_NEG_REGEX] = u'was'
     # Input and output segment names.
     settings[SEGMENT_NAME] = u'sentence'
     settings[OUTPUT_NAME] = u'fragment'
     splitter_filter = Filter(**settings)

     storage = self.segmentstorage()
     sentences = [self.sentence1(), self.sentence2()]
     storage.save(sentences)

     produced = set(splitter_filter.filter(storage, self.documentstorage()))
     expected = set(self.fragments())
     self.assertEqual(produced, expected)
    def preview_sample(self, name):
        """Render an HTML preview of each stage of the filter stored as ``name``.

        The settings are loaded from ``self._setstorage`` under
        ``encode_name(name)`` and used to build a ``Filter``. The basic,
        container, splitter and mixin stages are then run in sequence, each
        fed with the (truncated) output of the previous one, and the complete
        filter is run once more from scratch for the final output.

        Returns a JSON string. On success it is
        ``{"result": "OK", "data": {...}}`` where ``data`` has the keys
        ``basic``, ``container``, ``mixin``, ``splitter`` and ``output``,
        each holding the value returned by ``segments_html`` for that stage.
        If any exception is raised, ``{"result": "FAIL", "error": <message>}``
        is returned instead.
        """
        try:
            settings = self._setstorage.load(encode_name(name))
            filt = Filter(**settings)
            limit = 300          # cap passed to head() for every stage
            query_limit = 10000  # passed as ``limit`` to filter_basic
            context_size = 60    # passed to segments_html; presumably the amount of surrounding text -- confirm

            # NOTE(review): each *_segs value is consumed twice (rendered and
            # fed to the next stage); this presumably relies on head()
            # returning a re-iterable sequence -- confirm.

            # preview basic
            basic_segs = head(filt.filter_basic(self._segstorage, self._docstorage, limit=query_limit), limit)
            basic = segments_html(basic_segs, self._docstorage, context_size)

            # preview container
            container_segs = head(filt.filter_container(basic_segs, self._segstorage), limit)
            container = segments_html(container_segs, self._docstorage, context_size)

            # preview splitter
            splitter_segs = head(filt.filter_splitter(container_segs), limit)
            splitter = segments_html(splitter_segs, self._docstorage, context_size)

            # preview mixin
            mixin_segs = head(filt.filter_mixin(splitter_segs, self._segstorage), limit)
            mixin = segments_html(mixin_segs, self._docstorage, context_size)

            # preview final output (full pipeline, not derived from the
            # truncated intermediate results above)
            output_segs = head(filt.filter(self._segstorage, self._docstorage), limit)
            output = segments_html(output_segs, self._docstorage, context_size)

            data = {"basic": basic, "container": container, "mixin": mixin, "splitter": splitter, "output": output}

            return json.dumps({"result": "OK", "data": data})
        except Exception as e:
            # Errors are reported to the caller as a payload, not propagated.
            return json.dumps({"result": "FAIL", "error": str(e)})
 def test_with_document_prefix(self):
     """Check filter output when a document prefix is configured.

     Sets ``DOCUMENT_PREFIX`` to ``u'DOCUMENT B'`` on top of
     ``basic_kwargs()`` and expects the filtered segments (as a set) to
     equal ``second_copy_lemmas()``.
     """
     settings = self.basic_kwargs()
     settings[DOCUMENT_PREFIX] = u'DOCUMENT B'
     prefix_filter = Filter(**settings)
     produced = set(
         prefix_filter.filter(self.segmentstorage(), self.documentstorage())
     )
     expected = set(self.second_copy_lemmas())
     self.assertEqual(produced, expected)
 def test_basic_copy(self):
     """Check filter output with the unmodified ``basic_kwargs()`` settings.

     The filtered segments (as a set) must equal the union of
     ``first_copy_lemmas()`` and ``second_copy_lemmas()``.
     """
     copy_filter = Filter(**self.basic_kwargs())
     produced = set(
         copy_filter.filter(self.segmentstorage(), self.documentstorage())
     )
     expected = set(self.first_copy_lemmas()).union(self.second_copy_lemmas())
     self.assertEqual(produced, expected)
 def test_update(self):
     """Check that ``Filter.update`` replaces the stored settings.

     A filter built from ``basic_kwargs()`` is updated with
     ``second_keywords()``; the filter name, segment name and output name
     read back through item access must then hold the second filter's values.
     """
     updated = Filter(**self.basic_kwargs())
     updated.update(self.second_keywords())
     expectations = (
         (FILTER_NAME, u'second_filter'),
         (SEGMENT_NAME, u'second:lemma'),
         (OUTPUT_NAME, u'second:lemma:copy'),
     )
     # Checked in the listed order; the first mismatch fails the test.
     for key, wanted in expectations:
         self.assertEqual(updated[key], wanted)