def test_clamp(self, multi):
    a, b = sorted(sample(multi[0], 2))
    ref = set.intersection(*[set(x) for x in multi]) & set(range(a, b))
    mrb = MultiRoaringBitmap([RoaringBitmap(x) for x in multi])
    rb = mrb.intersection(list(range(len(mrb))), start=a, stop=b)
    assert a <= rb.min() and rb.max() < b
    assert ref == rb
def test_none(self, multi):
    orig = [RoaringBitmap(a) for a in multi]
    orig.insert(4, None)
    mrb = MultiRoaringBitmap(orig)
    assert len(orig) == len(mrb)
    for rb1, rb2 in zip(orig, mrb):
        assert rb1 == rb2
    assert mrb.intersection([4, 5]) is None
def test_empty(self, multi):
    # Variant of test_none with an empty bitmap instead of None;
    # intersecting with an empty bitmap must also yield None.
    orig = [RoaringBitmap(a) for a in multi]
    orig.insert(4, RoaringBitmap())
    mrb = MultiRoaringBitmap(orig)
    assert len(orig) == len(mrb)
    for rb1, rb2 in zip(orig, mrb):
        assert rb1 == rb2
    assert mrb.intersection([4, 5]) is None
def test_jaccard(self, multi):
    mrb = MultiRoaringBitmap([ImmutableRoaringBitmap(a) for a in multi])
    indices1 = array.array(b'L' if PY2 else 'L', [0, 6, 8])
    indices2 = array.array(b'L' if PY2 else 'L', [1, 7, 6])
    res = mrb.jaccard_dist(indices1, indices2)
    ref = array.array(b'd' if PY2 else 'd', [
            mrb[i].jaccard_dist(mrb[j])
            for i, j in zip(indices1, indices2)])
    assert res == ref
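# For reference, a pure-Python sketch of the quantity that jaccard_dist()
# is expected to compute pairwise: one minus the Jaccard similarity.
# This is an illustration of the definition, not the library's
# implementation; the empty-set convention below is an assumption.
def jaccard_dist_reference(a, b):
    """Return 1 - |a & b| / |a | b| for two bitmaps/sets."""
    union = len(a | b)
    if union == 0:
        return 1.0  # convention for two empty sets (assumption)
    return 1.0 - len(a & b) / union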
def test_eq(self, multi):
    orig = [RoaringBitmap(a) for a in multi]
    mrb = MultiRoaringBitmap(orig)
    mrb2 = MultiRoaringBitmap(orig)
    mrb3 = MultiRoaringBitmap(orig[1:])
    assert mrb == orig
    assert mrb == mrb2
    assert mrb != orig[1:]
    assert mrb != mrb3
def test_serialize(self, multi):
    orig = [RoaringBitmap(a) for a in multi]
    mrb = MultiRoaringBitmap(orig)
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        mrb2 = MultiRoaringBitmap(orig, filename=tmp.name)
        del mrb2
        mrb_deserialized = MultiRoaringBitmap.fromfile(tmp.name)
        assert len(orig) == len(mrb)
        assert len(orig) == len(mrb_deserialized)
        for rb1, rb2, rb3 in zip(orig, mrb, mrb_deserialized):
            assert rb1 == rb2
            assert rb1 == rb3
            rb3._checkconsistency()
            assert type(rb3) == ImmutableRoaringBitmap
def test_andor_len_pairwise(self, multi):
    mrb = MultiRoaringBitmap([ImmutableRoaringBitmap(a) for a in multi])
    indices1 = array.array(b'L' if PY2 else 'L', [0, 6, 8])
    indices2 = array.array(b'L' if PY2 else 'L', [1, 7, 6])
    res1 = array.array(b'L' if PY2 else 'L', [0] * len(indices1))
    res2 = array.array(b'L' if PY2 else 'L', [0] * len(indices1))
    mrb.andor_len_pairwise(indices1, indices2, res1, res2)
    ref1 = array.array(b'L' if PY2 else 'L')
    ref2 = array.array(b'L' if PY2 else 'L')
    for i, j in zip(indices1, indices2):
        ref1.append(len(mrb[i] & mrb[j]))
        ref2.append(len(mrb[i] | mrb[j]))
    assert res1 == ref1
    assert res2 == ref2
def test_multi2(self):
    for_multi_pre = []
    for x in range(3):
        for_multi = []
        for i in range(5):
            for_multi += [RoaringBitmap(sample(range(99999), 200))]
        mrb = MultiRoaringBitmap(for_multi)
        for_multi_pre += [mrb[0], mrb[1]]
    assert type(for_multi_pre) is list
    # The bitmaps extracted above must remain usable after the
    # MultiRoaringBitmap objects they came from have gone out of scope;
    # these statements should not crash.
    for_multi_pre[-1]
    list(for_multi_pre)
def test_multi1(self):
    for_multi = []
    for i in range(5):
        for_multi += [RoaringBitmap(sample(range(99999), 200))]
    mrb = MultiRoaringBitmap(for_multi)
    assert len(mrb) == 5
    assert mrb[4] == for_multi[4]
    with pytest.raises(IndexError):
        mrb[5]
    assert mrb[-1] == for_multi[-1]
    list(mrb)  # iteration should not crash
    for n, rb in enumerate(mrb):
        assert rb == for_multi[n], n
def _regex_run_batch(patterns, filename, fileno, lineidxpath,
        start=None, end=None, maxresults=None, sents=False):
    """Run a batch of queries on a single file."""
    mrb = MultiRoaringBitmap.fromfile(lineidxpath)
    lineindex = mrb.get(fileno)
    if sents:
        result = []
    else:
        result = array.array('I')
    if start and start >= len(lineindex):
        return result
    with open(filename, 'rb') as tmp:
        data = mmap.mmap(tmp.fileno(), 0, access=mmap.ACCESS_READ)
    try:
        startidx = lineindex.select(start - 1 if start else 0)
        endidx = (lineindex.select(end)
                if end is not None and end < len(lineindex)
                else len(data))
        if sents:
            for pattern in patterns:
                for match in islice(
                        pattern.finditer(data, startidx, endidx),
                        maxresults):
                    mstart = match.start()
                    mend = match.end()
                    lineno = lineindex.rank(mstart)
                    offset, nextoffset = _getoffsets(
                            lineno, lineindex, data)
                    sent = data[offset:nextoffset].decode('utf8')
                    # Convert byte offsets to character offsets.
                    mstart = len(data[offset:mstart].decode('utf8'))
                    mend = len(data[offset:mend].decode('utf8'))
                    # (sentno, sent, high1, high2)
                    result.append((lineno, sent, range(mstart, mend), ()))
        else:
            for pattern in patterns:
                try:
                    result.append(pattern.count(data, startidx, endidx))
                except AttributeError:  # pattern object without .count()
                    result.append(len(pattern.findall(
                            data, startidx, endidx)))
    finally:
        data.close()
    if hasattr(mrb, 'close'):
        mrb.close()
    del mrb
    return result
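# The helpers _indexfile() and _getoffsets() used above and below are not
# shown in this excerpt. The sketches that follow are plausible
# reconstructions under stated assumptions, not the actual implementations:
# they assume the line index stores the byte offset of the start of each
# non-empty line, plus the file length as a sentinel, so that select(n)
# yields the offset of line n + 1 and rank(offset) yields a 1-based line
# number. The _sketch names are illustrative stand-ins.
def _indexfile_sketch(filename):
    """Return a bitmap of byte offsets of non-empty lines (sketch)."""
    result = RoaringBitmap()
    offset = 0
    with open(filename, 'rb') as inp:
        for line in inp:
            if line.strip():
                result.add(offset)
            offset += len(line)
    result.add(offset)  # sentinel: end of file
    return result


def _getoffsets_sketch(lineno, lineindex, data):
    """Return (start, end) byte offsets of 1-based line lineno (sketch)."""
    offset = lineindex.select(lineno - 1)
    nextoffset = lineindex.select(lineno)
    # When the data is available, exclude the trailing newline here;
    # otherwise callers strip it themselves.
    if data is not None and data[nextoffset - 1:nextoffset] == b'\n':
        nextoffset -= 1
    return offset, nextoffset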
class RegexSearcher(CorpusSearcher):
    """Search a plain text file in UTF-8 with regular expressions.

    Assumes that non-empty lines correspond to sentences; empty lines
    do not count towards line numbers (e.g., when used as paragraph
    breaks).

    :param macros: a file containing lines of the form ``'name=regex'``;
        an occurrence of ``'{name}'`` will be replaced with ``regex``
        when it appears in a query.
    :param ignorecase: ignore case in all queries."""

    def __init__(self, files, macros=None, numproc=None, ignorecase=False,
            inmemory=False):
        super().__init__(files, macros, numproc)
        self.macros = None
        self.flags = re.MULTILINE
        if ignorecase:
            self.flags |= re.IGNORECASE
        if macros:
            with openread(macros) as tmp:
                self.macros = dict(line.strip().split('=', 1)
                        for line in tmp)
        self.fileno = {filename: n
                for n, filename in enumerate(sorted(files))}
        maxmtime = max(os.stat(a).st_mtime for a in files)
        path = os.path.dirname(next(iter(sorted(files))))
        self.lineidxpath = os.path.join(path, 'treesearchline.idx')
        if os.path.exists(self.lineidxpath):
            mtime = os.stat(self.lineidxpath).st_mtime
            tmp = MultiRoaringBitmap.fromfile(self.lineidxpath)
        else:
            mtime, tmp = 0, []
        if len(tmp) == len(files) and mtime > maxmtime:
            self.lineindex = tmp
        else:  # missing or stale index; rebuild it
            tmp = [_indexfile(name) for name in sorted(files)]
            self.lineindex = MultiRoaringBitmap(
                    tmp, filename=self.lineidxpath)
        if inmemory:
            for filename in self.files:
                fileno = os.open(filename, os.O_RDONLY)
                buf = mmap.mmap(fileno, 0, access=mmap.ACCESS_READ)
                self.files[filename] = (fileno, buf)
        self.pool = concurrent.futures.ProcessPoolExecutor(self.numproc)

    def close(self):
        if self.files is None:
            return
        for val in self.files.values():
            if val is not None:
                fileno, buf = val
                buf.close()
                os.close(fileno)
        if hasattr(self.lineindex, 'close'):
            self.lineindex.close()
        self.files = None

    def counts(self, query, subset=None, start=None, end=None,
            indices=False, breakdown=False):
        if breakdown and indices:
            raise NotImplementedError
        subset = subset or self.files
        if self.macros is not None:
            query = query.format(**self.macros)
        result = OrderedDict()
        jobs = {}
        pattern = _regex_parse_query(query, self.flags)
        for filename in subset:
            try:
                result[filename] = self.cache[
                        'counts', query, filename, start, end,
                        indices, False, breakdown]
            except KeyError:
                jobs[self._submit(_regex_run_query, pattern, filename,
                        self.fileno[filename], self.lineidxpath, start,
                        end, None, indices, False, breakdown)] = filename
        for future in self._as_completed(jobs):
            filename = jobs[future]
            self.cache['counts', query, filename, start, end, indices,
                    False, breakdown] = result[filename] = future.result()
        return result

    def sents(self, query, subset=None, start=None, end=None,
            maxresults=100, brackets=False):
        if brackets:
            raise ValueError('not applicable with plain text corpus.')
        subset = subset or self.files
        if self.macros is not None:
            query = query.format(**self.macros)
        result = []
        jobs = {}
        for filename in subset:
            try:
                x, maxresults2 = self.cache['sents', query, filename,
                        start, end, True, True]
            except KeyError:
                maxresults2 = 0
            if not maxresults or maxresults > maxresults2:
                jobs[self._submit(
                        _regex_query if self.numproc == 1
                        else _regex_query_mp,
                        query, filename, self.fileno[filename],
                        self.lineidxpath, self.flags, start, end,
                        maxresults, True, True)] = filename
            else:
                result.extend(x[:maxresults])
        for future in self._as_completed(jobs):
            filename = jobs[future]
            x = []
            # Use distinct names for the match offsets so that the
            # start/end query range used in the cache key below is
            # not shadowed.
            for sentno, sent, mstart, mend in future.result():
                highlight = range(mstart, mend)
                x.append((filename, sentno, sent, highlight, ()))
            self.cache['sents', query, filename, start, end, True, True
                    ] = x, maxresults
            result.extend(x)
        return result

    def trees(self, query, subset=None, start=None, end=None,
            maxresults=10, nofunc=False, nomorph=False):
        raise ValueError('not applicable with plain text corpus.')

    def batchcounts(self, queries, subset=None, start=None, end=None):
        if self.macros is None:
            patterns = [_regex_parse_query(query, self.flags)
                    for query in queries]
        else:
            patterns = [_regex_parse_query(
                    query.format(**self.macros), self.flags)
                    for query in queries]
        chunksize = max(int(len(patterns) / (self.numproc * 4)), 1)
        chunkedpatterns = [patterns[n:n + chunksize]
                for n in range(0, len(patterns), chunksize)]
        for filename in subset or self.files:
            result = array.array('I')
            for tmp in self._map(_regex_run_batch, chunkedpatterns,
                    filename=filename, fileno=self.fileno[filename],
                    lineidxpath=self.lineidxpath, start=start, end=end):
                result.extend(tmp)
            yield filename, result

    def batchsents(self, queries, subset=None, start=None, end=None,
            maxresults=100, brackets=False):
        """Variant of sents() to run a batch of queries."""
        if brackets:
            raise ValueError('not applicable with plain text corpus.')
        if self.macros is None:
            patterns = [_regex_parse_query(query, self.flags)
                    for query in queries]
        else:
            patterns = [_regex_parse_query(
                    query.format(**self.macros), self.flags)
                    for query in queries]
        chunksize = max(int(len(patterns) / (self.numproc * 4)), 1)
        chunkedpatterns = [patterns[n:n + chunksize]
                for n in range(0, len(patterns), chunksize)]
        for filename in subset or self.files:
            result = []
            for tmp in self._map(_regex_run_batch, chunkedpatterns,
                    filename=filename, fileno=self.fileno[filename],
                    lineidxpath=self.lineidxpath, start=start, end=end,
                    maxresults=maxresults, sents=True):
                result.extend(tmp)
            yield filename, result

    def extract(self, filename, indices, nofunc=False, nomorph=False,
            sents=True):
        if not sents:
            raise ValueError('not applicable with plain text corpus.')
        lineindex = self.lineindex[self.fileno[filename]]
        result = []
        if self.files[filename] is not None:
            _, data = self.files[filename]
            for lineno in indices:
                offset, nextoffset = _getoffsets(lineno, lineindex, data)
                result.append(data[offset:nextoffset].decode('utf8'))
        else:
            with open(filename, 'rb') as tmp:
                for lineno in indices:
                    offset, nextoffset = _getoffsets(
                            lineno, lineindex, None)
                    tmp.seek(offset)
                    result.append(tmp.read(nextoffset - offset
                            ).rstrip(b'\n').decode('utf8'))
        return result

    def getinfo(self, filename):
        numlines = len(self.lineindex[self.fileno[filename]])
        if self.files[filename] is None:
            with openread(filename) as inp:
                data = inp.read()
            numwords = data.count(' ') + numlines
        else:  # mmap objects have no .count() method; copy to bytes
            _, buf = self.files[filename]
            numwords = buf[:].count(b' ') + numlines
        return CorpusInfo(
                len=numlines, numwords=numwords,
                numnodes=0, maxnodes=None)
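# A minimal usage sketch for RegexSearcher. The corpus and macro file
# names are hypothetical; the 'name=regex' macro format and the method
# signatures are as documented in the class above.
if __name__ == '__main__':
    with open('macros.txt', 'w', encoding='utf8') as out:
        out.write('num=[0-9]+\n')  # '{num}' in a query expands to [0-9]+
    searcher = RegexSearcher(
            ['corpus1.txt', 'corpus2.txt'],  # hypothetical corpus files
            macros='macros.txt', ignorecase=True)
    # counts() returns an OrderedDict mapping filenames to match counts.
    print(searcher.counts('{num} years'))
    # sents() yields (filename, sentno, sent, highlight, _) tuples.
    for filename, sentno, sent, highlight, _ in searcher.sents(
            '{num} years', maxresults=10):
        print(filename, sentno, sent)
    searcher.close()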
def test_aggregateand(self, multi):
    ref = set(multi[0])
    res1 = ref.intersection(*[set(a) for a in multi[1:]])
    mrb = MultiRoaringBitmap([ImmutableRoaringBitmap(a) for a in multi])
    res2 = mrb.intersection(list(range(len(mrb))))
    assert res1 == res2
def test_init(self, multi):
    orig = [RoaringBitmap(a) for a in multi]
    mrb = MultiRoaringBitmap(orig)
    assert len(orig) == len(mrb)
    for rb1, rb2 in zip(orig, mrb):
        assert rb1 == rb2
def _regex_run_query(pattern, filename, fileno, lineidxpath,
        start=None, end=None, maxresults=None, indices=False,
        sents=False, breakdown=False):
    """Run a prepared query on a single file."""
    mrb = MultiRoaringBitmap.fromfile(lineidxpath)
    lineindex = mrb.get(fileno)
    if indices and sents:
        result = []
    elif indices:
        result = array.array('I')
    elif breakdown:
        result = Counter()
    else:
        result = 0
    lastline = end is None or end > len(lineindex) - 1
    if lastline:
        end = len(lineindex) - 1
    if start and start > len(lineindex):
        return result
    startidx = lineindex.select(start - 1 if start else 0)
    endidx = lineindex.select(end)
    with open(filename, 'rb') as tmp:
        if startidx == 0 and lastline:
            # Query covers the whole file; use a zero-copy memory map.
            chunkoffset = 0
            data = mmap.mmap(tmp.fileno(), 0, access=mmap.ACCESS_READ)
        else:
            # Query covers a range of lines; read only the relevant
            # chunk and make offsets relative to it.
            chunkoffset = startidx
            tmp.seek(chunkoffset)
            startidx, endidx = 0, endidx - chunkoffset
            data = tmp.read(endidx)
    try:
        if (start or 0) >= len(lineindex):
            return result
        if indices or sents:
            for match in islice(
                    pattern.finditer(data, startidx, endidx), maxresults):
                mstart = match.start()
                mend = match.end()
                lineno = lineindex.rank(mstart + chunkoffset)
                if not sents:
                    result.append(lineno)
                    continue
                offset, nextoffset = _getoffsets(lineno, lineindex, None)
                offset -= chunkoffset
                nextoffset -= chunkoffset
                sent = data[offset:nextoffset].rstrip(b'\n').decode('utf8')
                # Convert byte offsets to character offsets.
                mstart = len(data[offset:mstart].decode('utf8'))
                mend = len(data[offset:mend].decode('utf8'))
                # (lineno, sent, startspan, endspan)
                result.append((lineno, sent, mstart, mend))
        elif breakdown:
            matches = pattern.findall(data, startidx, endidx)[:maxresults]
            result.update(a.decode('utf8') for a in matches)
        else:
            try:
                result = pattern.count(data, startidx, endidx)
            except AttributeError:  # pattern object without .count()
                result = len(pattern.findall(data, startidx, endidx))
            if maxresults is not None:
                result = min(result, maxresults)  # cap at maxresults
    finally:
        if isinstance(data, mmap.mmap):
            data.close()
    if hasattr(mrb, 'close'):
        mrb.close()
    del mrb
    return result