Exemple #1
0
    def Scan(self, path, matcher):
        """Scans given file searching for occurrences of given pattern.

    Args:
      path: A path to the file that needs to be searched.
      matcher: A matcher object specifying a pattern to search for.

    Yields:
      `BufferReference` objects pointing to file parts with matching content.
    """
        streamer = streaming.FileStreamer(chunk_size=self.CHUNK_SIZE,
                                          overlap_size=self.OVERLAP_SIZE)

        offset = self.params.start_offset
        amount = self.params.length
        for chunk in streamer.StreamFilePath(path,
                                             offset=offset,
                                             amount=amount):
            for span in chunk.Scan(matcher):
                ctx_begin = max(span.begin - self.params.bytes_before, 0)
                ctx_end = min(span.end + self.params.bytes_after,
                              len(chunk.data))
                ctx_data = chunk.data[ctx_begin:ctx_end]

                yield rdf_client.BufferReference(offset=chunk.offset +
                                                 ctx_begin,
                                                 length=len(ctx_data),
                                                 data=ctx_data)

                if self.params.mode == self.params.Mode.FIRST_HIT:
                    return
Exemple #2
0
    def testSingleChunk(self):
        with open(self.temp_filepath, "wb") as fd:
            fd.write("abcdef")

        streamer = streaming.FileStreamer(chunk_size=8, overlap_size=2)
        chunks = list(streamer.StreamFilePath(self.temp_filepath, amount=7))

        self.assertEqual(len(chunks), 1)
        self.assertEqual(chunks[0].data, "abcdef")
        self.assertEqual(chunks[0].offset, 0)
        self.assertEqual(chunks[0].overlap, 0)
Exemple #3
0
    def testSmallAmount(self):
        with open(self.temp_filepath, "wb") as fd:
            fd.write("abc")

        streamer = streaming.FileStreamer(chunk_size=1, overlap_size=0)
        chunks = list(streamer.StreamFilePath(self.temp_filepath, amount=2))

        self.assertEqual(len(chunks), 2)
        self.assertEqual(chunks[0].data, "a")
        self.assertEqual(chunks[1].data, "b")
        self.assertEqual(chunks[0].offset, 0)
        self.assertEqual(chunks[1].offset, 1)
        self.assertEqual(chunks[0].overlap, 0)
        self.assertEqual(chunks[1].overlap, 0)
Exemple #4
0
    def testOneByteOverlap(self):
        with open(self.temp_filepath, "wb") as fd:
            fd.write("abcdef")

        streamer = streaming.FileStreamer(chunk_size=3, overlap_size=1)
        chunks = list(streamer.StreamFilePath(self.temp_filepath, amount=8))

        self.assertEqual(len(chunks), 3)
        self.assertEqual(chunks[0].data, "abc")
        self.assertEqual(chunks[1].data, "cde")
        self.assertEqual(chunks[2].data, "ef")
        self.assertEqual(chunks[0].offset, 0)
        self.assertEqual(chunks[1].offset, 2)
        self.assertEqual(chunks[2].offset, 4)
        self.assertEqual(chunks[0].overlap, 0)
        self.assertEqual(chunks[1].overlap, 1)
        self.assertEqual(chunks[2].overlap, 1)
Exemple #5
0
    def testShorterOverlap(self):
        with open(self.temp_filepath, "wb") as fd:
            fd.write("abcdefg")

        streamer = streaming.FileStreamer(chunk_size=4, overlap_size=2)
        chunks = list(streamer.StreamFilePath(self.temp_filepath, amount=1024))

        self.assertEqual(len(chunks), 3)
        self.assertEqual(chunks[0].data, "abcd")
        self.assertEqual(chunks[1].data, "cdef")
        self.assertEqual(chunks[2].data, "efg")
        self.assertEqual(chunks[0].offset, 0)
        self.assertEqual(chunks[1].offset, 2)
        self.assertEqual(chunks[2].offset, 4)
        self.assertEqual(chunks[0].overlap, 0)
        self.assertEqual(chunks[1].overlap, 2)
        self.assertEqual(chunks[2].overlap, 2)
Exemple #6
0
    def testOffset(self):
        with open(self.temp_filepath, "wb") as fd:
            fd.write("abcdefghi")

        streamer = streaming.FileStreamer(chunk_size=3, overlap_size=2)
        stream = streamer.StreamFilePath(self.temp_filepath,
                                         offset=4,
                                         amount=108)
        chunks = list(stream)

        self.assertEqual(len(chunks), 3)
        self.assertEqual(chunks[0].data, "efg")
        self.assertEqual(chunks[1].data, "fgh")
        self.assertEqual(chunks[2].data, "ghi")
        self.assertEqual(chunks[0].offset, 4)
        self.assertEqual(chunks[1].offset, 5)
        self.assertEqual(chunks[2].offset, 6)
        self.assertEqual(chunks[0].overlap, 0)
        self.assertEqual(chunks[1].overlap, 2)
        self.assertEqual(chunks[2].overlap, 2)
Exemple #7
0
    def testNoData(self):
        streamer = streaming.FileStreamer(chunk_size=3, overlap_size=1)
        chunks = list(streamer.StreamFilePath(self.temp_filepath, amount=5))

        self.assertEqual(len(chunks), 0)