def get_detail_context(corp, pos, hitlen=1, detail_left_ctx=40, detail_right_ctx=40,
                       attrs=None, structs='', detail_ctx_incr=60):
    """
    Build the data for a "detail" view of one concordance hit.

    The hit occupies positions ``pos`` .. ``pos + hitlen``; ``detail_left_ctx``
    positions before it and ``detail_right_ctx`` positions after it are fetched
    as context. Both context sizes are clamped by the corpus settings
    MAXDETAIL (or MAXCONTEXT when MAXDETAIL is 0), and the left context is
    additionally clamped so that it does not start before position 0.

    Args:
        corp: corpus object; must provide ``get_conf()`` and be accepted by
            ``manatee.CorpRegion``.
        pos: position where the hit starts.
        hitlen: length of the hit in positions.
        detail_left_ctx: requested size of the left context.
        detail_right_ctx: requested size of the right context.
        attrs: iterable of attribute names to fetch; ``None`` means ``'word'``.
        structs: comma-separated structure names passed to ``CorpRegion``.
        detail_ctx_incr: amount by which the "expand" arguments grow a context.

    Returns:
        dict with the keys ``wrapdetail``, ``content``, ``expand_left_args``,
        ``expand_right_args``, ``righttoleft``, ``pos``, ``maxdetail`` and,
        only when WRAPDETAIL is set but not listed in ``structs``,
        ``deletewrap``.
    """
    data = {}
    corpus_encoding = corp.get_conf('ENCODING')

    wrapdetail = corp.get_conf('WRAPDETAIL')
    if wrapdetail:
        data['wrapdetail'] = '<%s>' % wrapdetail
        if wrapdetail not in structs.split(','):
            data['deletewrap'] = True
        structs = wrapdetail + ',' + structs
    else:
        data['wrapdetail'] = ''

    try:
        maxdetail = int(corp.get_conf('MAXDETAIL'))
        if maxdetail == 0:
            maxdetail = int(corp.get_conf('MAXCONTEXT'))
            if maxdetail == 0:
                # sys.maxint was removed in Python 3; referencing it there
                # raised AttributeError, which the handler below turned into
                # maxdetail = 0. Fall back to sys.maxsize when it is missing.
                maxdetail = getattr(sys, 'maxint', sys.maxsize)
    except Exception:
        # Best effort: a missing or non-numeric setting means "no limit known".
        # (Not a bare except, so KeyboardInterrupt/SystemExit still propagate.)
        maxdetail = 0

    if maxdetail:
        detail_left_ctx = min(detail_left_ctx, maxdetail)
        detail_right_ctx = min(detail_right_ctx, maxdetail)
    if detail_left_ctx > pos:
        # never reach before the first position
        detail_left_ctx = pos

    query_attrs = 'word' if attrs is None else ','.join(attrs)
    cr = manatee.CorpRegion(corp, query_attrs, structs)
    region_left = tokens2strclass(cr.region(pos - detail_left_ctx, pos))
    region_kwic = tokens2strclass(cr.region(pos, pos + hitlen))
    region_right = tokens2strclass(
        cr.region(pos + hitlen, pos + hitlen + detail_right_ctx))

    # Segments are dicts with (at least) 'str' and 'class' items; they are
    # updated in place, so the lists concatenated below see the changes.
    for seg in region_left + region_kwic + region_right:
        seg['str'] = import_string(seg['str'].replace('===NONE===', ''),
                                   from_encoding=corpus_encoding)
    for seg in region_kwic:
        if not seg['class']:
            seg['class'] = 'coll'
    data['content'] = region_left + region_kwic + region_right

    refbase = [('pos', pos)]
    if hitlen != 1:
        refbase.append(('hitlen', hitlen))
    data['expand_left_args'] = dict(
        refbase + [('detail_left_ctx', detail_left_ctx + detail_ctx_incr),
                   ('detail_right_ctx', detail_right_ctx)])
    data['expand_right_args'] = dict(
        refbase + [('detail_left_ctx', detail_left_ctx),
                   ('detail_right_ctx', detail_right_ctx + detail_ctx_incr)])
    data['righttoleft'] = corp.get_conf('RIGHTTOLEFT')
    data['pos'] = pos
    data['maxdetail'] = maxdetail
    return data
def run(self):
    """
    Validate the query set-up, evaluate the query and process the hits.

    Unless the processor is a ``Nonprocessor``, every hit is cut out together
    with ``context_left``/``context_right`` neighbouring containers, optionally
    de-duplicated through ``self.bloom`` and handed to
    ``self.processor.process()``. With a ``Nonprocessor`` only the hit count
    reported by the query result is stored.

    Side effects on ``self``: sets ``hits``, ``querytime``, ``duplicates`` and
    ``elapsed``; a bare subcorpus name in ``subcorpus`` is replaced by the
    full path of its ``.subc`` file.

    Raises:
        QueryError: if a required property is missing, the processor has the
            wrong type, or the requested subcorpus file does not exist.
    """
    # Check whether the query is prepared.
    if self.corpus is None:
        raise QueryError('You must specify the corpus to do a search.')
    if self.attributes is None:
        raise QueryError('You must specify at least one attribute to do a search.')
    if self.structures is None:
        raise QueryError('You must specify at least one structure to do a search.')
    if self.references is None:
        raise QueryError('You must specify at least one reference to do a search.')
    if self.container is None and not isinstance(self.processor, Nonprocessor):
        raise QueryError('You must specify the container to do a search.')
    # Equality, not identity: "is ''" depends on string interning.
    if self.string is None or self.string == '':
        raise QueryError('You must set the string property to a search string.')

    # Check whether the processor is of the proper type.
    if self.processor and not isinstance(self.processor, Processor):
        raise QueryError('The processor class must inherit from SeaCOW.Processor.')

    # Emit a heuristic warning that the container might end up being too small.
    # A Nonprocessor may legitimately run without a container; in that case
    # there is nothing to check (and concatenating None would raise TypeError).
    if self.container is not None:
        q_pattern = r'.* within *<' + re.escape(self.container) + r'(| [^>]+)/>.*'
        q_string = r'within <' + self.container + r'/>'
        if not re.match(q_pattern, self.string):
            print("WARNING! Your query should probably end in '" + q_string + "' or your match might exceed the exported container.")
            if self.context_left == 0 or self.context_right == 0:
                print(" ... especially because at least one of your contexts is 0!")
            print(" ... Watch out for 'Index anomaly' warnings.")
            # A bare "print" is a no-op expression on Python 3.
            print()

    # Allow the processor to engage in preparatory action/check whether everything is fine.
    if self.processor:
        self.processor.prepare(self)

    # Set up and run the query.
    h_corpus = manatee.Corpus(self.corpus)
    if self.subcorpus is not None:
        # If a subcorpus name is given (instead of a path), figure out the
        # full path to the subcorpus .subc file.
        if "/" not in self.subcorpus:
            self.subcorpus = (h_corpus.get_conf("PATH") + "subcorp/"
                              + re.sub(r"\.subc$", "", self.subcorpus.strip(" /"))
                              + ".subc")
        if os.path.exists(self.subcorpus):
            h_corpus = manatee.SubCorpus(h_corpus, self.subcorpus)
        else:
            raise QueryError('The requested subcorpus cannot be found.')

    if not isinstance(self.processor, Nonprocessor):
        h_region = manatee.CorpRegion(h_corpus, ','.join(self.attributes), ','.join(self.structures))
        h_cont = h_corpus.get_struct(self.container)
        h_refs = [h_corpus.get_attr(r) for r in self.references]

    start_time = time.time()
    results = h_corpus.eval_query(self.string)

    # Process results.
    counter = 0
    dup_no = 0

    if isinstance(self.processor, Nonprocessor):
        # The stream is not processed; store the hit count as reported.
        self.hits = results.count_rest()
    else:
        # Every (1 + number of attributes)-th region item is used for deduping.
        dedup_step = 1 + len(self.attributes)

        while not results.end() and (self.max_hits < 0 or counter < self.max_hits):

            # Skip randomly if a random subset is desired.
            if self.random_subset > 0 and random.random() > self.random_subset:
                results.next()
                continue

            kwic_beg = results.peek_beg()  # Match begin.
            kwic_end = results.peek_end()  # Match end.

            # NOTE(review): both container numbers are derived from the match
            # *begin*; confirm this is intended for matches spanning containers.
            cont_num = h_cont.num_at_pos(kwic_beg)
            cont_beg_num = cont_num - self.context_left
            cont_end_num = cont_num + self.context_right

            # If the hit is not in the desired region, drop it.
            if cont_beg_num < 0 or cont_end_num < 0:
                results.next()
                continue

            cont_beg_pos = h_cont.beg(cont_beg_num)  # Pos at container begin.
            cont_end_pos = h_cont.end(cont_end_num)  # Pos at container end.

            refs = [h_ref.pos2str(kwic_beg) for h_ref in h_refs]
            region = h_region.region(cont_beg_pos, cont_end_pos, '\t', '\t')

            # Deduping.
            if type(self.bloom) is pybloom_live.ScalableBloomFilter:
                dd_region = ''.join([region[i].strip().lower()
                                     for i in range(0, len(region), dedup_step)])
                if {dd_region: 0} in self.bloom:
                    dup_no += 1
                    results.next()
                    continue
                else:
                    self.bloom.add({dd_region: 0})

            # Call the processor.
            if self.processor:
                self.processor.process(self, region, refs, kwic_beg - cont_beg_pos, kwic_end - kwic_beg)

            # Advance stream/loop.
            results.next()
            counter = counter + 1

        # Only hits that were actually processed are counted here.
        self.hits = counter

    self.querytime = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    self.duplicates = dup_no
    self.elapsed = time.time() - start_time

    # Allow the processor to finalise its job.
    if self.processor:
        self.processor.finalise(self)
def get_detail_context(corp, pos, hitlen=1, detail_left_ctx=40, detail_right_ctx=40,
                       addattrs=None, structs='', detail_ctx_incr=60):
    """
    Build the data for a "detail" view of one concordance hit.

    The hit occupies positions ``pos`` .. ``pos + hitlen``; ``detail_left_ctx``
    positions before it and ``detail_right_ctx`` positions after it are fetched
    as context. Both context sizes are clamped by the corpus settings
    MAXDETAIL (or MAXCONTEXT when MAXDETAIL is 0), and the left context is
    additionally clamped so that it does not start before position 0.

    Args:
        corp: corpus object; must provide ``get_conf()`` and be accepted by
            ``manatee.CorpRegion``.
        pos: position where the hit starts.
        hitlen: length of the hit in positions.
        detail_left_ctx: requested size of the left context.
        detail_right_ctx: requested size of the right context.
        addattrs: attribute names fetched in addition to ``'word'``.
        structs: comma-separated structure names passed to ``CorpRegion``.
        detail_ctx_incr: amount by which the expand links grow a context.

    Returns:
        dict with the keys ``wrapdetail``, ``content``, ``leftlink``,
        ``rightlink``, ``righttoleft``, ``pos``, ``maxdetail`` and, only when
        WRAPDETAIL is set but not listed in ``structs``, ``deletewrap``.
        ``leftlink``/``rightlink`` are URL query strings.
    """
    data = {}
    if addattrs is None:
        addattrs = []
    corpus_encoding = corp.get_conf('ENCODING')

    wrapdetail = corp.get_conf('WRAPDETAIL')
    if wrapdetail:
        data['wrapdetail'] = '<%s>' % wrapdetail
        if wrapdetail not in structs.split(','):
            data['deletewrap'] = True
        structs = wrapdetail + ',' + structs
    else:
        data['wrapdetail'] = ''

    try:
        maxdetail = int(corp.get_conf('MAXDETAIL'))
        if maxdetail == 0:
            maxdetail = int(corp.get_conf('MAXCONTEXT'))
            if maxdetail == 0:
                # sys.maxint was removed in Python 3; referencing it there
                # raised AttributeError, which the handler below turned into
                # maxdetail = 0. Fall back to sys.maxsize when it is missing.
                maxdetail = getattr(sys, 'maxint', sys.maxsize)
    except Exception:
        # Best effort: a missing or non-numeric setting means "no limit known".
        # (Not a bare except, so KeyboardInterrupt/SystemExit still propagate.)
        maxdetail = 0

    if maxdetail:
        detail_left_ctx = min(detail_left_ctx, maxdetail)
        detail_right_ctx = min(detail_right_ctx, maxdetail)
    if detail_left_ctx > pos:
        # never reach before the first position
        detail_left_ctx = pos

    # list() lets callers pass any iterable (e.g. a tuple) of attribute names.
    attrs = ','.join(['word'] + list(addattrs))
    cr = manatee.CorpRegion(corp, attrs, structs)
    region_left = tokens2strclass(cr.region(pos - detail_left_ctx, pos))
    region_kwic = tokens2strclass(cr.region(pos, pos + hitlen))
    region_right = tokens2strclass(
        cr.region(pos + hitlen, pos + hitlen + detail_right_ctx))

    # Segments are dicts with (at least) 'str' and 'class' items; they are
    # updated in place, so the lists concatenated below see the changes.
    for seg in region_left + region_kwic + region_right:
        seg['str'] = import_string(seg['str'].replace('===NONE===', ''),
                                   from_encoding=corpus_encoding)
    for seg in region_kwic:
        if not seg['class']:
            seg['class'] = 'coll'
    data['content'] = region_left + region_kwic + region_right

    refbase = 'pos=%i&' % pos
    if hitlen != 1:
        refbase += 'hitlen=%i&' % hitlen
    data['leftlink'] = refbase + (
        'detail_left_ctx=%i&detail_right_ctx=%i'
        % (detail_left_ctx + detail_ctx_incr, detail_right_ctx))
    data['rightlink'] = refbase + (
        'detail_left_ctx=%i&detail_right_ctx=%i'
        % (detail_left_ctx, detail_right_ctx + detail_ctx_incr))
    data['righttoleft'] = corp.get_conf('RIGHTTOLEFT')
    data['pos'] = pos
    data['maxdetail'] = maxdetail
    return data
def get_detail_context(corp: KCorpus, pos, hitlen=1, detail_left_ctx=40, detail_right_ctx=40,
                       attrs=None, structs='', detail_ctx_incr=60):
    """
    Build the data for a "detail" view of one concordance hit.

    The hit occupies positions ``pos`` .. ``pos + hitlen``. The left and right
    context regions are fetched so that each overlaps the hit by one position
    and the overlapping token text is then cut off again; the original inline
    comment states this is done to also obtain structures lying between the
    regions.

    Context sizes are clamped by the corpus settings MAXDETAIL (or MAXCONTEXT
    when MAXDETAIL is 0), and the left context never starts before position 0.

    Args:
        corp: corpus wrapper; must provide ``get_conf()`` and ``unwrap()``.
        pos: position where the hit starts.
        hitlen: length of the hit in positions.
        detail_left_ctx: requested size of the left context.
        detail_right_ctx: requested size of the right context.
        attrs: iterable of attribute names to fetch; ``None`` means ``'word'``.
        structs: comma-separated structure names passed to ``CorpRegion``.
        detail_ctx_incr: amount by which the "expand" arguments grow a context.

    Returns:
        dict with the keys ``wrapdetail``, ``content``, ``expand_left_args``,
        ``expand_right_args``, ``righttoleft``, ``pos``, ``maxdetail`` and,
        only when WRAPDETAIL is set but not listed in ``structs``,
        ``deletewrap``.
    """
    data = {}

    wrapdetail = corp.get_conf('WRAPDETAIL')
    if wrapdetail:
        data['wrapdetail'] = '<%s>' % wrapdetail
        if wrapdetail not in structs.split(','):
            data['deletewrap'] = True
        structs = wrapdetail + ',' + structs
    else:
        data['wrapdetail'] = ''

    try:
        maxdetail = int(corp.get_conf('MAXDETAIL'))
        if maxdetail == 0:
            maxdetail = int(corp.get_conf('MAXCONTEXT'))
            if maxdetail == 0:
                maxdetail = sys.maxsize
    except Exception:
        # Best effort: a missing or non-numeric setting means "no limit known".
        # (Not a bare except, so KeyboardInterrupt/SystemExit still propagate.)
        maxdetail = 0

    if maxdetail:
        detail_left_ctx = min(detail_left_ctx, maxdetail)
        detail_right_ctx = min(detail_right_ctx, maxdetail)
    if detail_left_ctx > pos:
        # never reach before the first position
        detail_left_ctx = pos

    query_attrs = 'word' if attrs is None else ','.join(attrs)

    # Left and right regions each overlap the kwic region by one position.
    cr = manatee.CorpRegion(corp.unwrap(), query_attrs, structs)
    region_left = tokens2strclass(cr.region(pos - detail_left_ctx, pos + 1))
    region_kwic = tokens2strclass(cr.region(pos, pos + hitlen))
    region_right = tokens2strclass(
        cr.region(pos + hitlen - 1, pos + hitlen + detail_right_ctx))
    for seg in region_left + region_kwic + region_right:
        seg['str'] = seg['str'].replace('===NONE===', '')

    # Subtract the overlapping kwic token from the left and right regions.
    # The boundary strings get the same '===NONE===' stripping as the regions,
    # otherwise they could never match a stripped region string. An empty
    # boundary string is skipped: endswith('')/startswith('') are always true
    # and rsplit('')/split('') raise "ValueError: empty separator".
    left_kwic_part = tokens2strclass(
        cr.region(pos, pos + 1))[0]['str'].replace('===NONE===', '')
    if region_left and left_kwic_part and region_left[-1]['str'].endswith(left_kwic_part):
        region_left[-1]['str'] = region_left[-1]['str'].rsplit(left_kwic_part, 1)[0]

    right_kwic_part = tokens2strclass(
        cr.region(pos + hitlen - 1, pos + hitlen))[0]['str'].replace('===NONE===', '')
    if region_right and right_kwic_part and region_right[0]['str'].startswith(right_kwic_part):
        region_right[0]['str'] = region_right[0]['str'].split(right_kwic_part, 1)[1]

    # Drop segments whose text became (or already was) empty.
    region_left = [v for v in region_left if v['str']]
    region_right = [v for v in region_right if v['str']]

    for seg in region_kwic:
        if not seg['class']:
            seg['class'] = 'coll'
    data['content'] = region_left + region_kwic + region_right

    refbase = [('pos', pos)]
    if hitlen != 1:
        refbase.append(('hitlen', hitlen))
    data['expand_left_args'] = dict(
        refbase + [('detail_left_ctx', detail_left_ctx + detail_ctx_incr),
                   ('detail_right_ctx', detail_right_ctx)])
    data['expand_right_args'] = dict(
        refbase + [('detail_left_ctx', detail_left_ctx),
                   ('detail_right_ctx', detail_right_ctx + detail_ctx_incr)])
    data['righttoleft'] = corp.get_conf('RIGHTTOLEFT')
    data['pos'] = pos
    data['maxdetail'] = maxdetail
    return data
def cow_query(query, corpus=DEFAULT_CORPUS, container='s', max_hits=-1, random_subset=-1,
              deduping=False, context_left=0, context_right=0, attributes=DEFAULT_ATTRS,
              structures=DEFAULT_STRUCTURES, references=DEFAULT_REFS):
    """
    Evaluate ``query`` on ``corpus`` and return the hits as a concordance.

    Each hit is exported together with the container it starts in, widened by
    ``context_left`` containers before and ``context_right`` containers after.
    Hits for which either resulting container number is negative are dropped.

    Args:
        query: query string passed to ``eval_query``.
        corpus: corpus identifier passed to ``manatee.Corpus``.
        container: name of the structure used as the export unit.
        max_hits: stop after this many exported hits; negative means no limit.
        random_subset: when > 0, each hit is skipped whenever
            ``random.random()`` exceeds this value.
        deduping: when true, drop hits whose lower-cased, stripped region text
            has been seen before.
        context_left: number of containers added before the hit's container.
        context_right: number of containers added after the hit's container.
        attributes: positional attribute names to export.
        structures: structure names to export.
        references: attribute names looked up at the match begin as metadata.

    Returns:
        dict describing the query (parameters, timing, hit and duplicate
        counts) with the list of hits under ``'concordance'``; each hit is a
        dict with ``match_offset``, ``match_length``, ``meta`` and ``region``.
    """
    concordance = []

    # Set up and run the query.
    h_corpus = manatee.Corpus(corpus)
    h_region = manatee.CorpRegion(h_corpus, ','.join(attributes), ','.join(structures))
    h_cont = h_corpus.get_struct(container)
    ref_attrs = [h_corpus.get_attr(name) for name in references]
    started = time.time()
    results = h_corpus.eval_query(query)

    hit_count = 0
    duplicate_count = 0
    # Region texts already exported; only consulted when deduping.
    seen_regions = set() if deduping else None
    # Every (1 + number of attributes)-th region item goes into the dedup key.
    dedup_step = 1 + len(attributes)

    while not results.end():
        if 0 <= max_hits <= hit_count:
            break

        # Skip randomly if a random subset is desired.
        if random_subset > 0 and random.random() > random_subset:
            results.next()
            continue

        match_beg = results.peek_beg()
        match_end = results.peek_end()

        # Both container numbers are taken relative to the match begin.
        first_cont = h_cont.num_at_pos(match_beg) - context_left
        last_cont = h_cont.num_at_pos(match_beg) + context_right

        # Drop hits outside the desired region.
        if first_cont < 0 or last_cont < 0:
            results.next()
            continue

        region_beg = h_cont.beg(first_cont)
        region_end = h_cont.end(last_cont)

        # TODO RS Memory and time (likely malloc, CPU load actually *lower*) lost in next 2 statements!
        meta = [ref_attr.pos2str(match_beg) for ref_attr in ref_attrs]
        region = h_region.region(region_beg, region_end, '\t', '\t')

        if deduping:
            pieces = []
            for idx in range(0, len(region), dedup_step):
                pieces.append(region[idx].strip().lower())
            dedup_key = ''.join(pieces)
            if dedup_key in seen_regions:
                duplicate_count += 1
                results.next()
                continue
            seen_regions.add(dedup_key)

        hit = {
            'match_offset': match_beg - region_beg,
            'match_length': match_end - match_beg,
            'meta': meta,
            'region': region,
        }
        concordance.append(hit)

        # Advance stream/loop.
        results.next()
        hit_count += 1

    finished = time.time()

    return {
        'query': query,
        'corpus': corpus,
        'container': '<' + container + '/>',
        'hits': hit_count,
        'max_hits': max_hits,
        'random_subset': random_subset,
        'context_left': context_left,
        'context_right': context_right,
        'attributes': attributes,
        'structures': structures,
        'references': references,
        'datetime': strftime("%Y-%m-%d %H:%M:%S", gmtime()),
        'elapsed': finished - started,
        'deduping': str(deduping),
        'duplicates': duplicate_count,
        'concordance': concordance,
    }