Beispiel #1
0
	def __init__(self, files, macros=None, numproc=None, ignorecase=False,
			inmemory=False):
		"""Set up a regex searcher over a set of plain-text files.

		:param files: filenames of the corpus files to search.
		:param macros: optional filename with lines of the form
			``name=regex``; loaded into ``self.macros`` for substitution
			into queries.
		:param numproc: number of worker processes (passed to superclass).
		:param ignorecase: if True, compile queries with ``re.IGNORECASE``.
		:param inmemory: if True, mmap each corpus file up front and store
			``(fd, mmap)`` pairs in ``self.files``."""
		super().__init__(files, macros, numproc)
		self.macros = None
		self.flags = re.MULTILINE
		if ignorecase:
			self.flags |= re.IGNORECASE
		if macros:
			with openread(macros) as tmp:
				# each 'name=regex' line becomes one dict entry
				self.macros = dict(line.strip().split('=', 1) for line in tmp)
		# stable numbering of files; used to address the shared line index
		self.fileno = {filename: n for n, filename in enumerate(sorted(files))}
		maxmtime = max(os.stat(a).st_mtime for a in files)
		# the line index is cached on disk next to the corpus files
		path = os.path.dirname(next(iter(sorted(files))))
		self.lineidxpath = os.path.join(path, 'treesearchline.idx')
		if os.path.exists(self.lineidxpath):
			mtime = os.stat(self.lineidxpath).st_mtime
			tmp = MultiRoaringBitmap.fromfile(self.lineidxpath)
		else:
			mtime, tmp = 0, []
		# reuse the cached index only if it covers all files and is newer
		# than every corpus file; otherwise rebuild and re-serialize it
		if len(tmp) == len(files) and mtime > maxmtime:
			self.lineindex = tmp
		else:
			tmp = [_indexfile(name) for name in sorted(files)]
			self.lineindex = MultiRoaringBitmap(tmp, filename=self.lineidxpath)
		if inmemory:
			for filename in self.files:
				fileno = os.open(filename, os.O_RDONLY)
				buf = mmap.mmap(fileno, 0, access=mmap.ACCESS_READ)
				self.files[filename] = (fileno, buf)
		self.pool = concurrent.futures.ProcessPoolExecutor(self.numproc)
Beispiel #2
0
 def test_clamp(self, multi):
     """Intersection restricted to the half-open range [lo, hi)."""
     lo, hi = sorted(sample(multi[0], 2))
     expected = set(multi[0])
     for seq in multi[1:]:
         expected &= set(seq)
     expected &= set(range(lo, hi))
     mrb = MultiRoaringBitmap([RoaringBitmap(seq) for seq in multi])
     clamped = mrb.intersection(list(range(len(mrb))), start=lo, stop=hi)
     assert lo <= clamped.min()
     assert clamped.max() < hi
     assert expected == clamped
Beispiel #3
0
	def test_none(self, multi):
		"""A None element is kept as-is and makes intersections over it None."""
		bitmaps = [RoaringBitmap(a) for a in multi]
		bitmaps.insert(4, None)
		mrb = MultiRoaringBitmap(bitmaps)
		assert len(bitmaps) == len(mrb)
		for expected, got in zip(bitmaps, mrb):
			assert expected == got
		assert mrb.intersection([4, 5]) is None
Beispiel #4
0
 def test_none(self, multi):
     """An empty bitmap member yields None from intersections over it."""
     bitmaps = [RoaringBitmap(a) for a in multi]
     bitmaps.insert(4, RoaringBitmap())
     mrb = MultiRoaringBitmap(bitmaps)
     assert len(mrb) == len(bitmaps)
     for expected, got in zip(bitmaps, mrb):
         assert got == expected
     assert mrb.intersection([4, 5]) is None
Beispiel #5
0
	def test_jaccard(self, multi):
		"""Batch jaccard_dist agrees with elementwise computation."""
		mrb = MultiRoaringBitmap([ImmutableRoaringBitmap(a) for a in multi])
		typecode = b'L' if PY2 else 'L'
		left = array.array(typecode, [0, 6, 8])
		right = array.array(typecode, [1, 7, 6])
		got = mrb.jaccard_dist(left, right)
		expected = array.array(b'd' if PY2 else 'd')
		for i, j in zip(left, right):
			expected.append(mrb[i].jaccard_dist(mrb[j]))
		assert got == expected
Beispiel #6
0
	def test_clamp(self, multi):
		"""Intersection with start/stop yields only values in [lo, hi)."""
		lo, hi = sorted(random.sample(multi[0], 2))
		expected = set(multi[0])
		for seq in multi[1:]:
			expected &= set(seq)
		expected &= set(range(lo, hi))
		mrb = MultiRoaringBitmap([RoaringBitmap(seq) for seq in multi])
		clamped = mrb.intersection(list(range(len(mrb))), start=lo, stop=hi)
		assert lo <= clamped.min()
		assert clamped.max() < hi
		assert expected == clamped
Beispiel #7
0
 def test_eq(self, multi):
     """Equality compares contents; differing lengths compare unequal."""
     bitmaps = [RoaringBitmap(a) for a in multi]
     mrb = MultiRoaringBitmap(bitmaps)
     same = MultiRoaringBitmap(bitmaps)
     shorter = MultiRoaringBitmap(bitmaps[1:])
     assert mrb == bitmaps
     assert mrb == same
     assert mrb != bitmaps[1:]
     assert mrb != shorter
Beispiel #8
0
 def test_jaccard(self, multi):
     """Vectorized jaccard distances equal the per-pair results."""
     bitmaps = [ImmutableRoaringBitmap(a) for a in multi]
     mrb = MultiRoaringBitmap(bitmaps)
     idx_a = array.array(b'L' if PY2 else 'L', [0, 6, 8])
     idx_b = array.array(b'L' if PY2 else 'L', [1, 7, 6])
     got = mrb.jaccard_dist(idx_a, idx_b)
     pairwise = [mrb[i].jaccard_dist(mrb[j]) for i, j in zip(idx_a, idx_b)]
     expected = array.array(b'd' if PY2 else 'd', pairwise)
     assert got == expected
Beispiel #9
0
 def test_serialize(self, multi):
     """Round-trip a MultiRoaringBitmap through a file.

     Bug fix: the temporary file was created with ``delete=False`` but
     never removed, leaking one file per test run; it is now unlinked
     in a ``finally`` block."""
     import os
     orig = [RoaringBitmap(a) for a in multi]
     mrb = MultiRoaringBitmap(orig)
     # delete=False so the file can be reopened by name (Windows-safe);
     # close the handle first, then write/read via the path.
     with tempfile.NamedTemporaryFile(delete=False) as tmp:
         path = tmp.name
     try:
         mrb2 = MultiRoaringBitmap(orig, filename=path)
         del mrb2  # ensure the writer is closed before reading back
         mrb_deserialized = MultiRoaringBitmap.fromfile(path)
         assert len(orig) == len(mrb)
         assert len(orig) == len(mrb_deserialized)
         for rb1, rb2, rb3 in zip(orig, mrb, mrb_deserialized):
             assert rb1 == rb2
             assert rb1 == rb3
             rb3._checkconsistency()
             assert type(rb3) == ImmutableRoaringBitmap
         # release the mmap before unlinking (required on Windows)
         del mrb_deserialized
     finally:
         os.unlink(path)
Beispiel #10
0
 def test_andor_len_pairwise(self, multi):
     """Batch and/or cardinalities agree with per-pair computation."""
     mrb = MultiRoaringBitmap([ImmutableRoaringBitmap(a) for a in multi])
     typecode = b'L' if PY2 else 'L'
     left = array.array(typecode, [0, 6, 8])
     right = array.array(typecode, [1, 7, 6])
     got_and = array.array(typecode, [0] * len(left))
     got_or = array.array(typecode, [0] * len(left))
     mrb.andor_len_pairwise(left, right, got_and, got_or)
     exp_and = array.array(typecode)
     exp_or = array.array(typecode)
     for i, j in zip(left, right):
         exp_and.append(len(mrb[i] & mrb[j]))
         exp_or.append(len(mrb[i] | mrb[j]))
     assert got_and == exp_and
     assert got_or == exp_or
Beispiel #11
0
    def test_multi2(self):
        """Bitmaps taken from a MultiRoaringBitmap outlive the container."""
        collected = []
        for _ in range(3):
            batch = []
            for _ in range(5):
                batch += [RoaringBitmap(sample(range(99999), 200))]
            mrb = MultiRoaringBitmap(batch)
            collected += [mrb[0], mrb[1]]

        assert type(collected) is list
        # accessing the kept elements must not crash even though the
        # backing MultiRoaringBitmap objects went out of scope
        collected[-1]
        list(collected)
Beispiel #12
0
 def test_multi1(self):
     """Indexing, negative indexing, iteration, and bounds checking."""
     bitmaps = [RoaringBitmap(sample(range(99999), 200)) for _ in range(5)]
     mrb = MultiRoaringBitmap(bitmaps)
     assert len(mrb) == 5
     assert mrb[4] == bitmaps[4]
     with pytest.raises(IndexError):
         mrb[5]
     assert mrb[-1] == bitmaps[-1]
     list(mrb)  # plain iteration must not raise
     for n, rb in enumerate(mrb):
         assert rb == bitmaps[n], n
Beispiel #13
0
	def test_serialize(self, multi):
		"""Round-trip serialization through a temporary file.

		Bug fix: the temporary file was created with ``delete=False`` but
		never removed afterwards, leaking one file per run; it is now
		unlinked in a ``finally`` block."""
		import os
		orig = [RoaringBitmap(a) for a in multi]
		mrb = MultiRoaringBitmap(orig)
		# delete=False so the file can be reopened by name (Windows-safe);
		# close the handle first, then write/read via the path.
		with tempfile.NamedTemporaryFile(delete=False) as tmp:
			path = tmp.name
		try:
			mrb2 = MultiRoaringBitmap(orig, filename=path)
			del mrb2  # ensure the writer is closed before reading back
			mrb_deserialized = MultiRoaringBitmap.fromfile(path)
			assert len(orig) == len(mrb)
			assert len(orig) == len(mrb_deserialized)
			for rb1, rb2, rb3 in zip(orig, mrb, mrb_deserialized):
				assert rb1 == rb2
				assert rb1 == rb3
				rb3._checkconsistency()
				assert type(rb3) == ImmutableRoaringBitmap
			# release the mmap before unlinking (required on Windows)
			del mrb_deserialized
		finally:
			os.unlink(path)
Beispiel #14
0
def _regex_run_batch(patterns, filename, fileno, lineidxpath,
		start=None, end=None, maxresults=None, sents=False):
	"""Run a batch of queries on a single file.

	:param patterns: compiled regex patterns, applied in turn to the raw
		bytes of the file.
	:param filename: path of the corpus file to search.
	:param fileno: index of this file in the serialized line index.
	:param lineidxpath: path of the serialized MultiRoaringBitmap with the
		line offsets of each corpus file.
	:param start: 1-based number of the first sentence to search, if given.
	:param end: number of the last sentence to search, if given.
	:param maxresults: cap on matches collected per pattern (sents mode).
	:param sents: if True, return a list of
		``(sentno, sent, highlight1, highlight2)`` tuples; otherwise return
		an ``array('I')`` with one match count per pattern."""
	mrb = MultiRoaringBitmap.fromfile(lineidxpath)
	lineindex = mrb.get(fileno)
	# result container depends on mode: sentence tuples vs. counts
	if sents:
		result = []
	else:
		result = array.array('I')
	# requested range starts beyond the file: nothing to search
	if start and start >= len(lineindex):
		return result
	with open(filename, 'rb') as tmp:
		data = mmap.mmap(tmp.fileno(), 0, access=mmap.ACCESS_READ)
		try:
			# byte offsets delimiting the requested sentence range
			startidx = lineindex.select(start - 1 if start else 0)
			endidx = (lineindex.select(end) if end is not None
					and end < len(lineindex) else len(data))
			if sents:
				for pattern in patterns:
					for match in islice(
							pattern.finditer(data, startidx, endidx),
							maxresults):
						mstart = match.start()
						mend = match.end()
						# sentence number for the match's byte offset
						lineno = lineindex.rank(mstart)
						offset, nextoffset = _getoffsets(
								lineno, lineindex, data)
						sent = data[offset:nextoffset].decode('utf8')
						# convert byte offsets to character offsets
						# within the decoded sentence
						mstart = len(data[offset:mstart].decode('utf8'))
						mend = len(data[offset:mend].decode('utf8'))
						# sentno, sent, high1, high2
						result.append((lineno, sent, range(mstart, mend), ()))
			else:
				for pattern in patterns:
					try:
						# re2 patterns expose a fast count() method;
						# fall back to findall() for stdlib re
						result.append(pattern.count(data, startidx, endidx))
					except AttributeError:
						result.append(len(pattern.findall(
								data, startidx, endidx)))
		finally:
			# release the mmap and the shared index in all cases
			data.close()
			if hasattr(mrb, 'close'):
				mrb.close()
			del mrb
	return result
Beispiel #15
0
class RegexSearcher(CorpusSearcher):
	"""Search a plain text file in UTF-8 with regular expressions.

	Assumes that non-empty lines correspond to sentences; empty lines
	do not count towards line numbers (e.g., when used as paragraph breaks).

	:param macros: a file containing lines of the form ``'name=regex'``;
		an occurrence of ``'{name}'`` will be replaced with ``regex`` when it
		appears in a query.
	:param ignorecase: ignore case in all queries."""

	def __init__(self, files, macros=None, numproc=None, ignorecase=False,
			inmemory=False):
		super().__init__(files, macros, numproc)
		self.macros = None
		self.flags = re.MULTILINE
		if ignorecase:
			self.flags |= re.IGNORECASE
		if macros:
			with openread(macros) as tmp:
				# each 'name=regex' line becomes one dict entry
				self.macros = dict(line.strip().split('=', 1) for line in tmp)
		# stable numbering of files; workers use it to address the line index
		self.fileno = {filename: n for n, filename in enumerate(sorted(files))}
		maxmtime = max(os.stat(a).st_mtime for a in files)
		path = os.path.dirname(next(iter(sorted(files))))
		self.lineidxpath = os.path.join(path, 'treesearchline.idx')
		if os.path.exists(self.lineidxpath):
			mtime = os.stat(self.lineidxpath).st_mtime
			tmp = MultiRoaringBitmap.fromfile(self.lineidxpath)
		else:
			mtime, tmp = 0, []
		# reuse the cached line index only when it covers every file and is
		# newer than all of them; otherwise rebuild and re-serialize it.
		if len(tmp) == len(files) and mtime > maxmtime:
			self.lineindex = tmp
		else:
			tmp = [_indexfile(name) for name in sorted(files)]
			self.lineindex = MultiRoaringBitmap(tmp, filename=self.lineidxpath)
		if inmemory:
			# keep an mmap of each file open for fast extraction
			for filename in self.files:
				fileno = os.open(filename, os.O_RDONLY)
				buf = mmap.mmap(fileno, 0, access=mmap.ACCESS_READ)
				self.files[filename] = (fileno, buf)
		self.pool = concurrent.futures.ProcessPoolExecutor(self.numproc)

	def close(self):
		"""Release mmaps, file descriptors, and the line index; idempotent."""
		if self.files is None:
			return
		for val in self.files.values():
			if val is not None:
				fileno, buf = val
				buf.close()
				os.close(fileno)
		if hasattr(self.lineindex, 'close'):
			self.lineindex.close()
		self.files = None

	def counts(self, query, subset=None, start=None, end=None, indices=False,
			breakdown=False):
		"""Run ``query`` on each file; return an OrderedDict mapping
		filenames to match counts (or indices/breakdowns); results are
		cached per (query, file, range, flags) key."""
		if breakdown and indices:
			raise NotImplementedError
		subset = subset or self.files
		if self.macros is not None:
			query = query.format(**self.macros)
		result = OrderedDict()
		jobs = {}
		pattern = _regex_parse_query(query, self.flags)
		for filename in subset:
			try:
				result[filename] = self.cache[
						'counts', query, filename, start, end, indices, False,
						breakdown]
			except KeyError:
				jobs[self._submit(_regex_run_query, pattern, filename,
						self.fileno[filename], self.lineidxpath, start, end,
						None, indices, False, breakdown)] = filename
		for future in self._as_completed(jobs):
			filename = jobs[future]
			self.cache['counts', query, filename, start, end, indices, False,
					breakdown] = result[filename] = future.result()
		return result

	def sents(self, query, subset=None, start=None, end=None, maxresults=100,
			brackets=False):
		"""Run ``query`` on each file; return a list of
		``(filename, sentno, sent, highlight, ())`` tuples."""
		if brackets:
			raise ValueError('not applicable with plain text corpus.')
		subset = subset or self.files
		if self.macros is not None:
			query = query.format(**self.macros)
		result = []
		jobs = {}
		for filename in subset:
			try:
				x, maxresults2 = self.cache['sents', query, filename,
						start, end, True, True]
			except KeyError:
				maxresults2 = 0
			# re-query when the cached entry has fewer results than asked for
			if not maxresults or maxresults > maxresults2:
				jobs[self._submit(
						_regex_query if self.numproc == 1 else _regex_query_mp,
						query, filename, self.fileno[filename],
						self.lineidxpath, self.flags, start, end, maxresults,
						True, True)] = filename
			else:
				result.extend(x[:maxresults])
		for future in self._as_completed(jobs):
			filename = jobs[future]
			x = []
			# BUG FIX: this loop previously unpacked into ``start, end``,
			# clobbering the parameters used in the cache key below, so
			# cached entries could never be found again on later calls.
			for sentno, sent, mstart, mend in future.result():
				highlight = range(mstart, mend)
				x.append((filename, sentno, sent, highlight, ()))
			self.cache['sents', query, filename, start, end, True, True
					] = x, maxresults
			result.extend(x)
		return result

	def trees(self, query, subset=None, start=None, end=None, maxresults=10,
			nofunc=False, nomorph=False):
		"""Not supported for a plain text corpus; always raises."""
		raise ValueError('not applicable with plain text corpus.')

	def batchcounts(self, queries, subset=None, start=None, end=None):
		"""Variant of counts() to run a batch of queries.

		:yields: ``(filename, counts)`` pairs, with one count per query
			in an ``array('I')``."""
		if self.macros is None:
			patterns = [_regex_parse_query(query, self.flags)
					for query in queries]
		else:
			patterns = [_regex_parse_query(query.format(
					**self.macros), self.flags) for query in queries]
		# distribute the patterns over the workers in chunks
		chunksize = max(int(len(patterns) / (self.numproc * 4)), 1)
		chunkedpatterns = [patterns[n:n + chunksize]
				for n in range(0, len(patterns), chunksize)]
		# NOTE: removed a dead ``result = OrderedDict(...)`` assignment
		# that was unconditionally overwritten in the loop below.
		for filename in subset or self.files:
			result = array.array('I')
			for tmp in self._map(_regex_run_batch, chunkedpatterns,
					filename=filename, fileno=self.fileno[filename],
					lineidxpath=self.lineidxpath, start=start, end=end):
				result.extend(tmp)
			yield filename, result

	def batchsents(self, queries, subset=None, start=None, end=None,
			maxresults=100, brackets=False):
		"""Variant of sents() to run a batch of queries.

		:yields: ``(filename, sentence tuples)`` pairs."""
		if brackets:
			raise ValueError('not applicable with plain text corpus.')
		if self.macros is None:
			patterns = [_regex_parse_query(query, self.flags)
					for query in queries]
		else:
			patterns = [_regex_parse_query(query.format(
					**self.macros), self.flags) for query in queries]
		# distribute the patterns over the workers in chunks
		chunksize = max(int(len(patterns) / (self.numproc * 4)), 1)
		chunkedpatterns = [patterns[n:n + chunksize]
				for n in range(0, len(patterns), chunksize)]
		# NOTE: removed a dead ``result = OrderedDict(...)`` assignment
		# that was unconditionally overwritten in the loop below.
		for filename in subset or self.files:
			result = []
			for tmp in self._map(_regex_run_batch, chunkedpatterns,
					filename=filename, fileno=self.fileno[filename],
					lineidxpath=self.lineidxpath, start=start, end=end,
					maxresults=maxresults, sents=True):
				result.extend(tmp)
			yield filename, result

	def extract(self, filename, indices,
			nofunc=False, nomorph=False, sents=True):
		"""Return the sentences with the given (1-based) line numbers."""
		if not sents:
			raise ValueError('not applicable with plain text corpus.')
		lineindex = self.lineindex[self.fileno[filename]]
		result = []
		if self.files[filename] is not None:
			# file is mmapped in memory; slice it directly
			_, data = self.files[filename]
			for lineno in indices:
				offset, nextoffset = _getoffsets(lineno, lineindex, data)
				result.append(data[offset:nextoffset].decode('utf8'))
		else:
			with open(filename, 'rb') as tmp:
				for lineno in indices:
					offset, nextoffset = _getoffsets(lineno, lineindex, None)
					tmp.seek(offset)
					result.append(tmp.read(nextoffset - offset
							).rstrip(b'\n').decode('utf8'))
		return result

	def getinfo(self, filename):
		"""Return a CorpusInfo with sentence count and approximate word
		count (one word per space plus one per line)."""
		numlines = len(self.lineindex[self.fileno[filename]])
		if self.files[filename] is None:
			with openread(filename) as inp:
				data = inp.read()
		else:
			# NOTE(review): with inmemory=True, ``data`` is an mmap over
			# bytes; ``count(' ')`` with a str argument may need b' ' on
			# Python 3 — verify against the inmemory code path.
			_, data = self.files[filename]
		numwords = data.count(' ') + numlines
		return CorpusInfo(
				len=numlines, numwords=numwords,
				numnodes=0, maxnodes=None)
Beispiel #16
0
 def test_aggregateand(self, multi):
     """Intersection over all indices equals plain set intersection."""
     expected = set(multi[0]).intersection(*map(set, multi[1:]))
     mrb = MultiRoaringBitmap([ImmutableRoaringBitmap(a) for a in multi])
     got = mrb.intersection(list(range(len(mrb))))
     assert expected == got
Beispiel #17
0
	def test_aggregateand(self, multi):
		"""mrb.intersection over every index matches set intersection."""
		expected = set(multi[0])
		for other in multi[1:]:
			expected &= set(other)
		mrb = MultiRoaringBitmap([ImmutableRoaringBitmap(a) for a in multi])
		got = mrb.intersection(list(range(len(mrb))))
		assert expected == got
Beispiel #18
0
 def test_init(self, multi):
     """Construction preserves both length and contents."""
     bitmaps = [RoaringBitmap(a) for a in multi]
     mrb = MultiRoaringBitmap(bitmaps)
     assert len(mrb) == len(bitmaps)
     for expected, got in zip(bitmaps, mrb):
         assert got == expected
Beispiel #19
0
def _regex_run_query(pattern, filename, fileno, lineidxpath,
		start=None, end=None, maxresults=None, indices=False, sents=False,
		breakdown=False):
	"""Run a prepared query on a single file.

	:param pattern: a compiled regex applied to the raw bytes of the file.
	:param filename: path of the corpus file to search.
	:param fileno: index of this file in the serialized line index.
	:param lineidxpath: path of the serialized MultiRoaringBitmap with the
		line offsets of each corpus file.
	:param start: 1-based number of the first sentence to search, if given.
	:param end: number of the last sentence to search, if given.
	:param maxresults: cap on the number of matches returned/counted.
	:param indices: collect sentence numbers of matches.
	:param sents: collect ``(lineno, sent, startspan, endspan)`` tuples.
	:param breakdown: collect a Counter of matched strings.
	:returns: container type depends on the flags; a plain count by
		default."""
	mrb = MultiRoaringBitmap.fromfile(lineidxpath)
	lineindex = mrb.get(fileno)
	# result container depends on the requested mode
	if indices and sents:
		result = []
	elif indices:
		result = array.array('I')
	elif breakdown:
		result = Counter()
	else:
		result = 0
	lastline = end is None or end > len(lineindex) - 1
	if lastline:
		end = len(lineindex) - 1
	# requested range starts beyond the file: nothing to search
	if start and start > len(lineindex):
		return result
	startidx = lineindex.select(start - 1 if start else 0)
	endidx = lineindex.select(end)
	with open(filename, 'rb') as tmp:
		if startidx == 0 and lastline:
			# whole file requested: search the mmap directly, no copy
			chunkoffset = 0
			data = mmap.mmap(tmp.fileno(), 0, access=mmap.ACCESS_READ)
		else:
			# read only the requested chunk; offsets into ``data`` are
			# now relative to ``chunkoffset``.
			chunkoffset = startidx
			tmp.seek(chunkoffset)
			startidx, endidx = 0, endidx - chunkoffset
			data = tmp.read(endidx)
		try:
			if (start or 0) >= len(lineindex):
				return result
			if indices or sents:
				for match in islice(
						pattern.finditer(data, startidx, endidx), maxresults):
					mstart = match.start()
					mend = match.end()
					# map the match's byte offset back to a sentence number
					lineno = lineindex.rank(mstart + chunkoffset)
					if not sents:
						result.append(lineno)
						continue
					offset, nextoffset = _getoffsets(lineno, lineindex, None)
					offset -= chunkoffset
					nextoffset -= chunkoffset
					sent = data[offset:nextoffset].rstrip(b'\n').decode('utf8')
					# convert byte offsets to character offsets
					mstart = len(data[offset:mstart].decode('utf8'))
					mend = len(data[offset:mend].decode('utf8'))
					# (lineno, sent, startspan, endspan)
					result.append((lineno, sent, mstart, mend))
			else:
				if breakdown:
					matches = pattern.findall(
							data, startidx, endidx)[:maxresults]
					result.update(a.decode('utf8') for a in matches)
				else:
					try:
						# re2 exposes a fast count(); fall back to findall
						result = pattern.count(data, startidx, endidx)
					except AttributeError:
						result = len(pattern.findall(data, startidx, endidx))
					# BUG FIX: cap the count at maxresults; the previous
					# ``max(result, maxresults or 0)`` inflated small
					# counts up to maxresults instead of limiting them.
					if maxresults:
						result = min(result, maxresults)
		finally:
			# release the mmap (if used) and the shared index in all cases
			if isinstance(data, mmap.mmap):
				data.close()
			if hasattr(mrb, 'close'):
				mrb.close()
			del mrb
	return result