Ejemplo n.º 1
0
    def iter_text(self, file, encoding=None):
        """Yield ``(text, is_match)`` pairs for the fragments of *file*.

        *file* is used as-is when it is an ``io.TextIOBase`` instance;
        otherwise it is wrapped in a ``codecs`` stream reader for
        *encoding*, falling back to ``latin1`` when no encoding is given.

        The fragments come from ``RegexStream(...).stream()`` run with
        ``self.URL_REGEX``; the second element of each pair is
        ``bool(match)`` for that fragment.
        """
        text_stream = (
            file
            if isinstance(file, io.TextIOBase)
            else codecs.getreader(encoding or "latin1")(file)
        )
        fragments = RegexStream(text_stream, self.URL_REGEX).stream()

        for found, fragment in fragments:
            yield (fragment, bool(found))
Ejemplo n.º 2
0
    def iter_text(self, file, encoding=None):
        """Yield ``(text, kind)`` pairs for the fragments of *file*.

        *file* is used as-is when it is an ``io.TextIOBase`` instance;
        otherwise it is wrapped in a ``codecs`` stream reader for
        *encoding*, falling back to ``latin1`` when no encoding is given.

        The fragments come from ``RegexStream(...).stream()`` run with
        ``self.URL_REGEX``. ``kind`` is ``False`` for a fragment without
        a match, ``'import'`` when group 3 of the match is truthy, and
        ``'url'`` for every other match.
        """
        text_stream = (
            file
            if isinstance(file, io.TextIOBase)
            else codecs.getreader(encoding or 'latin1')(file)
        )
        fragments = RegexStream(text_stream, self.URL_REGEX).stream()

        for found, fragment in fragments:
            if not found:
                yield (fragment, False)
            elif found.group(3):
                yield (fragment, 'import')
            else:
                yield (fragment, 'url')
Ejemplo n.º 3
0
    def test_stream(self):
        """Check the (matched, text) fragments RegexStream yields."""
        source = io.StringIO('fish dog   horse bat dolphin')
        animal_re = re.compile(r'(horse|dog|bat)')
        # NOTE(review): presumably the small read_size/overlap_size make the
        # input arrive in several reads -- the expected fragments below are
        # split mid-text (e.g. ' dolp' / 'hin'); confirm against RegexStream.
        regex_stream = RegexStream(
            source, animal_re, read_size=5, overlap_size=2
        )

        expected = [
            (False, 'fish '),
            (True, 'dog'),
            (False, '  '),
            (False, ' '),
            (True, 'horse'),
            (False, ' '),
            (True, 'bat'),
            (False, ' dolp'),
            (False, 'hin'),
        ]
        actual = [
            (bool(found), chunk) for found, chunk in regex_stream.stream()
        ]

        self.assertEqual(expected, actual)
Ejemplo n.º 4
0
    def test_stream(self):
        """Verify RegexStream output as a list of (is_match, text) pairs."""
        text_file = io.StringIO('fish dog   horse bat dolphin')
        word_pattern = re.compile(r'(horse|dog|bat)')
        stream = RegexStream(
            text_file, word_pattern, read_size=5, overlap_size=2
        ).stream()

        # Reduce each match object to a plain bool so the comparison below
        # only involves built-in values.
        observed = [(bool(hit), piece) for hit, piece in stream]

        wanted = [
            (False, 'fish '),
            (True, 'dog'),
            (False, '  '),
            (False, ' '),
            (True, 'horse'),
            (False, ' '),
            (True, 'bat'),
            (False, ' dolp'),
            (False, 'hin'),
        ]
        self.assertEqual(wanted, observed)