def Scan(self, path, matcher):
    """Searches a file for a pattern and yields every hit with its context.

    The file is read in chunks of `self.CHUNK_SIZE` bytes with an overlap of
    `self.OVERLAP_SIZE` bytes, starting at `self.params.start_offset` and
    covering `self.params.length` bytes.

    Args:
      path: A path to the file that needs to be searched.
      matcher: A matcher object specifying a pattern to search for.

    Yields:
      `BufferReference` objects pointing to file parts with matching content.
      Each reference covers the match plus up to `bytes_before` / `bytes_after`
      bytes of context, clipped to the chunk the match was found in.
    """
    file_streamer = streaming.FileStreamer(
        chunk_size=self.CHUNK_SIZE,
        overlap_size=self.OVERLAP_SIZE)

    chunks = file_streamer.StreamFilePath(
        path,
        offset=self.params.start_offset,
        amount=self.params.length)

    for chunk in chunks:
        for match in chunk.Scan(matcher):
            # Widen the match by the requested context, never reaching
            # outside of the current chunk's data.
            start = max(match.begin - self.params.bytes_before, 0)
            stop = min(match.end + self.params.bytes_after, len(chunk.data))
            context = chunk.data[start:stop]

            # `start` is relative to the chunk, so shift it by the chunk
            # offset before reporting it.
            reference = rdf_client.BufferReference(
                offset=chunk.offset + start,
                length=len(context),
                data=context)
            yield reference

            # In FIRST_HIT mode a single result is all the caller wants.
            if self.params.mode == self.params.Mode.FIRST_HIT:
                return
def testSingleChunk(self):
    """A file smaller than the chunk size is streamed as a single chunk."""
    # The file is opened in binary mode, so the payload (and everything it is
    # compared against) must be bytes to work on both Python 2 and Python 3.
    with open(self.temp_filepath, "wb") as fd:
        fd.write(b"abcdef")

    streamer = streaming.FileStreamer(chunk_size=8, overlap_size=2)
    chunks = list(streamer.StreamFilePath(self.temp_filepath, amount=7))

    self.assertEqual(len(chunks), 1)
    self.assertEqual(chunks[0].data, b"abcdef")
    self.assertEqual(chunks[0].offset, 0)
    self.assertEqual(chunks[0].overlap, 0)
def testSmallAmount(self):
    """Streaming stops after `amount` bytes even if the file is longer."""
    # The file is opened in binary mode, so the payload (and everything it is
    # compared against) must be bytes to work on both Python 2 and Python 3.
    with open(self.temp_filepath, "wb") as fd:
        fd.write(b"abc")

    streamer = streaming.FileStreamer(chunk_size=1, overlap_size=0)
    chunks = list(streamer.StreamFilePath(self.temp_filepath, amount=2))

    # Only the first two of the three bytes are expected.
    self.assertEqual(len(chunks), 2)

    self.assertEqual(chunks[0].data, b"a")
    self.assertEqual(chunks[1].data, b"b")

    self.assertEqual(chunks[0].offset, 0)
    self.assertEqual(chunks[1].offset, 1)

    self.assertEqual(chunks[0].overlap, 0)
    self.assertEqual(chunks[1].overlap, 0)
def testOneByteOverlap(self):
    """Consecutive chunks share exactly one byte when `overlap_size` is 1."""
    # The file is opened in binary mode, so the payload (and everything it is
    # compared against) must be bytes to work on both Python 2 and Python 3.
    with open(self.temp_filepath, "wb") as fd:
        fd.write(b"abcdef")

    streamer = streaming.FileStreamer(chunk_size=3, overlap_size=1)
    chunks = list(streamer.StreamFilePath(self.temp_filepath, amount=8))

    self.assertEqual(len(chunks), 3)

    # Every chunk after the first starts with the last byte of its
    # predecessor; the final chunk is short because the file ran out.
    self.assertEqual(chunks[0].data, b"abc")
    self.assertEqual(chunks[1].data, b"cde")
    self.assertEqual(chunks[2].data, b"ef")

    self.assertEqual(chunks[0].offset, 0)
    self.assertEqual(chunks[1].offset, 2)
    self.assertEqual(chunks[2].offset, 4)

    self.assertEqual(chunks[0].overlap, 0)
    self.assertEqual(chunks[1].overlap, 1)
    self.assertEqual(chunks[2].overlap, 1)
def testShorterOverlap(self):
    """Checks chunking with a two-byte overlap and a short final chunk."""
    # The file is opened in binary mode, so the payload (and everything it is
    # compared against) must be bytes to work on both Python 2 and Python 3.
    with open(self.temp_filepath, "wb") as fd:
        fd.write(b"abcdefg")

    streamer = streaming.FileStreamer(chunk_size=4, overlap_size=2)
    chunks = list(streamer.StreamFilePath(self.temp_filepath, amount=1024))

    self.assertEqual(len(chunks), 3)

    # The last chunk carries the full two-byte overlap but only one new byte.
    self.assertEqual(chunks[0].data, b"abcd")
    self.assertEqual(chunks[1].data, b"cdef")
    self.assertEqual(chunks[2].data, b"efg")

    self.assertEqual(chunks[0].offset, 0)
    self.assertEqual(chunks[1].offset, 2)
    self.assertEqual(chunks[2].offset, 4)

    self.assertEqual(chunks[0].overlap, 0)
    self.assertEqual(chunks[1].overlap, 2)
    self.assertEqual(chunks[2].overlap, 2)
def testOffset(self):
    """Streaming from a non-zero offset skips the leading bytes of the file."""
    # The file is opened in binary mode, so the payload (and everything it is
    # compared against) must be bytes to work on both Python 2 and Python 3.
    with open(self.temp_filepath, "wb") as fd:
        fd.write(b"abcdefghi")

    streamer = streaming.FileStreamer(chunk_size=3, overlap_size=2)
    stream = streamer.StreamFilePath(self.temp_filepath, offset=4, amount=108)
    chunks = list(stream)

    self.assertEqual(len(chunks), 3)

    # The first chunk starts at byte 4 ("e"); chunk offsets are reported
    # relative to the beginning of the file, not to the requested offset.
    self.assertEqual(chunks[0].data, b"efg")
    self.assertEqual(chunks[1].data, b"fgh")
    self.assertEqual(chunks[2].data, b"ghi")

    self.assertEqual(chunks[0].offset, 4)
    self.assertEqual(chunks[1].offset, 5)
    self.assertEqual(chunks[2].offset, 6)

    self.assertEqual(chunks[0].overlap, 0)
    self.assertEqual(chunks[1].overlap, 2)
    self.assertEqual(chunks[2].overlap, 2)
def testNoData(self):
    """Streaming the untouched temporary file yields no chunks at all.

    NOTE(review): presumably the fixture creates `self.temp_filepath` as an
    empty file; confirm against the test class' `setUp`.
    """
    file_streamer = streaming.FileStreamer(chunk_size=3, overlap_size=1)
    stream = file_streamer.StreamFilePath(self.temp_filepath, amount=5)
    produced = list(stream)

    self.assertEqual(len(produced), 0)