Exemple #1
0
def get_detail_context(corp,
                       pos,
                       hitlen=1,
                       detail_left_ctx=40,
                       detail_right_ctx=40,
                       attrs=None,
                       structs='',
                       detail_ctx_incr=60):
    """Collect the data for a "detail" view around a concordance hit.

    Args:
        corp: corpus object; must provide ``get_conf(name)`` and be accepted
            by ``manatee.CorpRegion``.
        pos: corpus position where the hit (KWIC) starts.
        hitlen: number of positions covered by the hit.
        detail_left_ctx: requested number of positions left of the hit.
        detail_right_ctx: requested number of positions right of the hit.
        attrs: iterable of attribute names to fetch; ``None`` means ``'word'``.
        structs: comma-separated structure names passed to the region reader.
        detail_ctx_incr: how much one "expand" step widens the context.

    Returns:
        A dict with the keys ``wrapdetail``, ``content``,
        ``expand_left_args``, ``expand_right_args``, ``righttoleft``, ``pos``
        and ``maxdetail``; ``deletewrap`` is added only when the corpus
        defines WRAPDETAIL and that structure was not requested in
        ``structs``.
    """
    data = {}
    corpus_encoding = corp.get_conf('ENCODING')

    # The WRAPDETAIL structure is always fetched; 'deletewrap' records that
    # the caller did not ask for it explicitly.
    wrapdetail = corp.get_conf('WRAPDETAIL')
    if wrapdetail:
        data['wrapdetail'] = '<%s>' % wrapdetail
        if wrapdetail not in structs.split(','):
            data['deletewrap'] = True
        structs = wrapdetail + ',' + structs
    else:
        data['wrapdetail'] = ''

    # MAXDETAIL falls back to MAXCONTEXT; 0 in both means "no limit".
    # Any failure to read/parse the settings deliberately disables the limit
    # (best effort), but KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        maxdetail = int(corp.get_conf('MAXDETAIL'))
        if maxdetail == 0:
            maxdetail = int(corp.get_conf('MAXCONTEXT'))
            if maxdetail == 0:
                # sys.maxsize exists on Python 2.6+ and 3.x (sys.maxint is
                # Python 2 only and would raise AttributeError on Python 3).
                maxdetail = sys.maxsize
    except Exception:
        maxdetail = 0

    if maxdetail:
        detail_left_ctx = min(detail_left_ctx, maxdetail)
        detail_right_ctx = min(detail_right_ctx, maxdetail)
    # Never reach before the first corpus position.
    if detail_left_ctx > pos:
        detail_left_ctx = pos

    query_attrs = 'word' if attrs is None else ','.join(attrs)
    cr = manatee.CorpRegion(corp, query_attrs, structs)
    region_left = tokens2strclass(cr.region(pos - detail_left_ctx, pos))
    region_kwic = tokens2strclass(cr.region(pos, pos + hitlen))
    region_right = tokens2strclass(
        cr.region(pos + hitlen, pos + hitlen + detail_right_ctx))

    # Drop the '===NONE===' placeholder and convert from the corpus encoding.
    for seg in region_left + region_kwic + region_right:
        seg['str'] = import_string(seg['str'].replace('===NONE===', ''),
                                   from_encoding=corpus_encoding)
    # Segments of the hit that carry no class are marked as 'coll'.
    for seg in region_kwic:
        if not seg['class']:
            seg['class'] = 'coll'
    data['content'] = region_left + region_kwic + region_right

    # Arguments for re-requesting the detail with a wider left/right context.
    refbase = [('pos', pos)]
    if hitlen != 1:
        refbase.append(('hitlen', hitlen))
    data['expand_left_args'] = dict(refbase + [
        ('detail_left_ctx', detail_left_ctx + detail_ctx_incr),
        ('detail_right_ctx', detail_right_ctx)])
    data['expand_right_args'] = dict(refbase + [
        ('detail_left_ctx', detail_left_ctx),
        ('detail_right_ctx', detail_right_ctx + detail_ctx_incr)])

    data['righttoleft'] = corp.get_conf('RIGHTTOLEFT')
    data['pos'] = pos
    data['maxdetail'] = maxdetail
    return data
Exemple #2
0
  def run(self):
    """Validate the query settings, run the query and hand hits to the processor.

    Reads the query configuration from attributes of ``self`` (corpus,
    subcorpus, attributes, structures, references, container, string,
    processor, max_hits, random_subset, context_left, context_right, bloom).
    On completion sets ``self.hits``, ``self.querytime``, ``self.duplicates``
    and ``self.elapsed``. ``self.subcorpus`` is rewritten to a full path when
    only a subcorpus name was given.

    If the processor is a ``Nonprocessor`` the result stream is not iterated;
    only the hit count reported by the query result is stored.

    Raises:
      QueryError: if a required setting is missing, the processor has the
        wrong type, or the requested subcorpus file does not exist.
    """

    # Check whether query is prepared.
    if self.corpus is None:
      raise QueryError('You must specify the corpus to do a search.')
    if self.attributes is None:
      raise QueryError('You must specify at least one attribute to do a search.')
    if self.structures is None:
      raise QueryError('You must specify at least one structure to do a search.')
    if self.references is None:
      raise QueryError('You must specify at least one reference to do a search.')
    if self.container is None and not issubclass(type(self.processor), Nonprocessor):
      raise QueryError('You must specify the container to do a search.')
    # Compare by value: identity comparison with a literal ('is') is unreliable.
    if self.string is None or self.string == '':
      raise QueryError('You must set the string property to a search string.')

    # Check whether processor is of the proper type.
    if self.processor and not issubclass(type(self.processor), Processor):
      raise QueryError('The processor class must inherit from SeaCOW.Processor.')

    # Emit a heuristic warning that the container might end up being too small.
    # This warns about the behaviour reported 2020 by EP.
    # A Nonprocessor run may legitimately have no container; skip the check
    # then, because the pattern cannot be built from None.
    if self.container is not None:
      q_pattern = r'.* within *<' + self.container + r'(| [^>]+)/>.*'
      q_string = r'within <' + self.container + r'/>'
      if not re.match(q_pattern, self.string):
        print("WARNING! Your query should probably end in '" + q_string + "' or your match might exceed the exported container.")
        if self.context_left == 0 or self.context_right == 0:
          print(" ... especially because at least one of your contexts is 0!")
        print(" ... Watch out for 'Index anomaly' warnings.")
        # Blank separator line (a bare 'print' does nothing on Python 3).
        print("")

    # Allow the processor to engage in preparatory action/check whether everything is fine.
    if self.processor:
      self.processor.prepare(self)

    # Set up and run query.
    h_corpus = manatee.Corpus(self.corpus)
    if self.subcorpus is not None:
      # If subcorpus name is given (instead of path), figure out full path to subcorpus .subc file.
      if "/" not in self.subcorpus:
        subc_name = re.sub(r"\.subc$", "", self.subcorpus.strip(" /"))
        self.subcorpus = h_corpus.get_conf("PATH") + "subcorp/" + subc_name + ".subc"
      if os.path.exists(self.subcorpus):
        h_corpus = manatee.SubCorpus(h_corpus, self.subcorpus)
      else:
        raise QueryError('The requested subcorpus cannot be found.')

    count_only = issubclass(type(self.processor), Nonprocessor)

    # Region/container/reference handles are only needed when hits are exported.
    if not count_only:
      h_region = manatee.CorpRegion(h_corpus, ','.join(self.attributes), ','.join(self.structures))
      h_cont = h_corpus.get_struct(self.container)
      h_refs = [h_corpus.get_attr(r) for r in self.references]

    start_time = time.time()
    results = h_corpus.eval_query(self.string)

    # Process results.
    counter = 0
    dup_no = 0

    if count_only:
      # Do not process the stream; store the hit count as reported.
      self.hits = results.count_rest()
    else:
      while not results.end() and (self.max_hits < 0 or counter < self.max_hits):

        # Skip randomly if random subset desired.
        if self.random_subset > 0 and random.random() > self.random_subset:
          results.next()
          continue

        kwic_beg = results.peek_beg()  # Match begin.
        kwic_end = results.peek_end()  # Match end.

        # Both container numbers are derived from the container holding the
        # match *begin*, widened by the requested number of context containers.
        # NOTE(review): a match spanning several containers is therefore not
        # fully covered on the right - see the warning printed above.
        hit_cont_num = h_cont.num_at_pos(kwic_beg)
        cont_beg_num = hit_cont_num - self.context_left
        cont_end_num = hit_cont_num + self.context_right

        # If hit not in desired region, drop.
        if cont_beg_num < 0 or cont_end_num < 0:
          results.next()
          continue

        cont_beg_pos = h_cont.beg(cont_beg_num)  # Pos at container begin.
        cont_end_pos = h_cont.end(cont_end_num)  # Pos at container end.

        refs = [h_ref.pos2str(kwic_beg) for h_ref in h_refs]
        region = h_region.region(cont_beg_pos, cont_end_pos, '\t', '\t')

        # Deduping: the key is built from every (1 + number of attributes)-th
        # element of the region, stripped and lower-cased.
        if type(self.bloom) is pybloom_live.ScalableBloomFilter:
          dd_region = ''.join([region[i].strip().lower() for i in range(0, len(region), 1+len(self.attributes))])
          # Key format kept exactly as before so existing filters stay valid.
          if {dd_region : 0} in self.bloom:
            dup_no += 1
            results.next()
            continue
          else:
            self.bloom.add({dd_region : 0})

        # Call the processor with the region, the references, the match offset
        # inside the region and the match length.
        if self.processor:
          self.processor.process(self, region, refs, kwic_beg - cont_beg_pos, kwic_end - kwic_beg)

        # Advance stream/loop.
        results.next()
        counter += 1

      # Only hits that were actually processed are counted here.
      self.hits = counter

    self.querytime = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    self.duplicates = dup_no
    self.elapsed = time.time()-start_time

    # Allow the processor to finalise its job.
    if self.processor:
      self.processor.finalise(self)
Exemple #3
0
def get_detail_context(corp,
                       pos,
                       hitlen=1,
                       detail_left_ctx=40,
                       detail_right_ctx=40,
                       addattrs=None,
                       structs='',
                       detail_ctx_incr=60):
    """Collect the data for a "detail" view around a concordance hit.

    Args:
        corp: corpus object; must provide ``get_conf(name)`` and be accepted
            by ``manatee.CorpRegion``.
        pos: corpus position where the hit (KWIC) starts.
        hitlen: number of positions covered by the hit.
        detail_left_ctx: requested number of positions left of the hit.
        detail_right_ctx: requested number of positions right of the hit.
        addattrs: attribute names fetched in addition to ``'word'``
            (any iterable; ``None`` means no extra attributes).
        structs: comma-separated structure names passed to the region reader.
        detail_ctx_incr: how much one "expand" link widens the context.

    Returns:
        A dict with the keys ``wrapdetail``, ``content``, ``leftlink``,
        ``rightlink``, ``righttoleft``, ``pos`` and ``maxdetail``;
        ``deletewrap`` is added only when the corpus defines WRAPDETAIL and
        that structure was not requested in ``structs``.
    """
    data = {}
    # Accept any iterable (e.g. a tuple); list concatenation below needs a list.
    addattrs = [] if addattrs is None else list(addattrs)
    corpus_encoding = corp.get_conf('ENCODING')

    # The WRAPDETAIL structure is always fetched; 'deletewrap' records that
    # the caller did not ask for it explicitly.
    wrapdetail = corp.get_conf('WRAPDETAIL')
    if wrapdetail:
        data['wrapdetail'] = '<%s>' % wrapdetail
        if wrapdetail not in structs.split(','):
            data['deletewrap'] = True
        structs = wrapdetail + ',' + structs
    else:
        data['wrapdetail'] = ''

    # MAXDETAIL falls back to MAXCONTEXT; 0 in both means "no limit".
    # Any failure to read/parse the settings deliberately disables the limit
    # (best effort), but KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        maxdetail = int(corp.get_conf('MAXDETAIL'))
        if maxdetail == 0:
            maxdetail = int(corp.get_conf('MAXCONTEXT'))
            if maxdetail == 0:
                # sys.maxsize exists on Python 2.6+ and 3.x (sys.maxint is
                # Python 2 only and would raise AttributeError on Python 3).
                maxdetail = sys.maxsize
    except Exception:
        maxdetail = 0

    if maxdetail:
        detail_left_ctx = min(detail_left_ctx, maxdetail)
        detail_right_ctx = min(detail_right_ctx, maxdetail)
    # Never reach before the first corpus position.
    if detail_left_ctx > pos:
        detail_left_ctx = pos

    attrs = ','.join(['word'] + addattrs)
    cr = manatee.CorpRegion(corp, attrs, structs)
    region_left = tokens2strclass(cr.region(pos - detail_left_ctx, pos))
    region_kwic = tokens2strclass(cr.region(pos, pos + hitlen))
    region_right = tokens2strclass(
        cr.region(pos + hitlen, pos + hitlen + detail_right_ctx))

    # Drop the '===NONE===' placeholder and convert from the corpus encoding.
    for seg in region_left + region_kwic + region_right:
        seg['str'] = import_string(seg['str'].replace('===NONE===', ''),
                                   from_encoding=corpus_encoding)
    # Segments of the hit that carry no class are marked as 'coll'.
    for seg in region_kwic:
        if not seg['class']:
            seg['class'] = 'coll'
    data['content'] = region_left + region_kwic + region_right

    # Query strings for re-requesting the detail with a wider context.
    refbase = 'pos=%i&' % pos
    if hitlen != 1:
        refbase += 'hitlen=%i&' % hitlen
    ctx_template = 'detail_left_ctx=%i&detail_right_ctx=%i'
    data['leftlink'] = refbase + ctx_template % (
        detail_left_ctx + detail_ctx_incr, detail_right_ctx)
    data['rightlink'] = refbase + ctx_template % (
        detail_left_ctx, detail_right_ctx + detail_ctx_incr)

    data['righttoleft'] = corp.get_conf('RIGHTTOLEFT')
    data['pos'] = pos
    data['maxdetail'] = maxdetail
    return data
Exemple #4
0
def get_detail_context(corp: KCorpus,
                       pos,
                       hitlen=1,
                       detail_left_ctx=40,
                       detail_right_ctx=40,
                       attrs=None,
                       structs='',
                       detail_ctx_incr=60):
    """Collect the data for a "detail" view around a concordance hit.

    Args:
        corp: corpus wrapper; must provide ``get_conf(name)`` and ``unwrap()``
            (the unwrapped object is handed to ``manatee.CorpRegion``).
        pos: corpus position where the hit (KWIC) starts.
        hitlen: number of positions covered by the hit.
        detail_left_ctx: requested number of positions left of the hit.
        detail_right_ctx: requested number of positions right of the hit.
        attrs: iterable of attribute names to fetch; ``None`` means ``'word'``.
        structs: comma-separated structure names passed to the region reader.
        detail_ctx_incr: how much one "expand" step widens the context.

    Returns:
        A dict with the keys ``wrapdetail``, ``content``,
        ``expand_left_args``, ``expand_right_args``, ``righttoleft``, ``pos``
        and ``maxdetail``; ``deletewrap`` is added only when the corpus
        defines WRAPDETAIL and that structure was not requested in
        ``structs``.
    """
    data = {}

    # The WRAPDETAIL structure is always fetched; 'deletewrap' records that
    # the caller did not ask for it explicitly.
    wrapdetail = corp.get_conf('WRAPDETAIL')
    if wrapdetail:
        data['wrapdetail'] = '<%s>' % wrapdetail
        if wrapdetail not in structs.split(','):
            data['deletewrap'] = True
        structs = wrapdetail + ',' + structs
    else:
        data['wrapdetail'] = ''

    # MAXDETAIL falls back to MAXCONTEXT; 0 in both means "no limit".
    # Any failure to read/parse the settings deliberately disables the limit
    # (best effort), but KeyboardInterrupt/SystemExit are no longer swallowed.
    try:
        maxdetail = int(corp.get_conf('MAXDETAIL'))
        if maxdetail == 0:
            maxdetail = int(corp.get_conf('MAXCONTEXT'))
            if maxdetail == 0:
                maxdetail = sys.maxsize
    except Exception:
        maxdetail = 0

    if maxdetail:
        detail_left_ctx = min(detail_left_ctx, maxdetail)
        detail_right_ctx = min(detail_right_ctx, maxdetail)
    # Never reach before the first corpus position.
    if detail_left_ctx > pos:
        detail_left_ctx = pos
    query_attrs = 'word' if attrs is None else ','.join(attrs)

    # The left and right regions deliberately overlap the KWIC region by one
    # position so that structures located between the regions are fetched too.
    cr = manatee.CorpRegion(corp.unwrap(), query_attrs, structs)
    region_left = tokens2strclass(cr.region(pos - detail_left_ctx, pos + 1))
    region_kwic = tokens2strclass(cr.region(pos, pos + hitlen))
    region_right = tokens2strclass(
        cr.region(pos + hitlen - 1, pos + hitlen + detail_right_ctx))
    for seg in region_left + region_kwic + region_right:
        seg['str'] = seg['str'].replace('===NONE===', '')

    # Subtract the overlapping KWIC token from the left and right regions.
    # The extra truthiness checks avoid an IndexError on an empty region and
    # a ValueError ("empty separator") when the boundary token text is empty.
    # NOTE(review): the boundary tokens are compared without the '===NONE==='
    # replacement applied above - confirm this is intended.
    left_kwic_part = tokens2strclass(cr.region(pos, pos + 1))[0]['str']
    if (left_kwic_part and region_left
            and region_left[-1]['str'].endswith(left_kwic_part)):
        region_left[-1]['str'] = region_left[-1]['str'].rsplit(
            left_kwic_part, 1)[0]
    right_kwic_part = tokens2strclass(cr.region(pos + hitlen - 1,
                                                pos + hitlen))[0]['str']
    if (right_kwic_part and region_right
            and region_right[0]['str'].startswith(right_kwic_part)):
        region_right[0]['str'] = region_right[0]['str'].split(
            right_kwic_part, 1)[1]

    # ...and remove segments that ended up empty.
    region_left = [v for v in region_left if v['str']]
    region_right = [v for v in region_right if v['str']]

    # Segments of the hit that carry no class are marked as 'coll'.
    for seg in region_kwic:
        if not seg['class']:
            seg['class'] = 'coll'
    data['content'] = region_left + region_kwic + region_right

    # Arguments for re-requesting the detail with a wider left/right context.
    refbase = [('pos', pos)]
    if hitlen != 1:
        refbase.append(('hitlen', hitlen))
    data['expand_left_args'] = dict(refbase + [
        ('detail_left_ctx', detail_left_ctx + detail_ctx_incr),
        ('detail_right_ctx', detail_right_ctx)])
    data['expand_right_args'] = dict(refbase + [
        ('detail_left_ctx', detail_left_ctx),
        ('detail_right_ctx', detail_right_ctx + detail_ctx_incr)])

    data['righttoleft'] = corp.get_conf('RIGHTTOLEFT')
    data['pos'] = pos
    data['maxdetail'] = maxdetail
    return data
Exemple #5
0
def cow_query(query,
              corpus=DEFAULT_CORPUS,
              container='s',
              max_hits=-1,
              random_subset=-1,
              deduping=False,
              context_left=0,
              context_right=0,
              attributes=DEFAULT_ATTRS,
              structures=DEFAULT_STRUCTURES,
              references=DEFAULT_REFS):
    """Run ``query`` on ``corpus`` and return the hits with query metadata.

    Each hit is exported as the region spanning from ``context_left``
    containers before to ``context_right`` containers after the container
    holding the match begin. A negative ``max_hits`` means no limit; a
    positive ``random_subset`` keeps each hit with that probability. With
    ``deduping`` set, hits whose normalised region text was already seen are
    skipped and counted as duplicates.

    Returns a dict describing the query settings plus ``hits``,
    ``datetime``, ``elapsed``, ``duplicates`` and ``concordance`` (a list of
    dicts with ``match_offset``, ``match_length``, ``meta`` and ``region``).
    """
    concordance = []

    # Open the corpus and the handles needed to export each hit.
    corpus_handle = manatee.Corpus(corpus)
    region_reader = manatee.CorpRegion(corpus_handle,
                                       ','.join(attributes),
                                       ','.join(structures))
    container_struct = corpus_handle.get_struct(container)
    ref_attrs = [corpus_handle.get_attr(name) for name in references]
    started = time.time()
    stream = corpus_handle.eval_query(query)

    hit_count = 0
    duplicate_count = 0
    if deduping:
        seen = {}

    while True:
        if stream.end():
            break
        if max_hits >= 0 and hit_count >= max_hits:
            break

        # Random subsampling: drop this hit with probability 1 - random_subset.
        if random_subset > 0 and random.random() > random_subset:
            stream.next()
            continue

        match_beg = stream.peek_beg()
        match_end = stream.peek_end()

        # Both container numbers are derived from the match begin.
        first_cont = container_struct.num_at_pos(match_beg) - context_left
        last_cont = container_struct.num_at_pos(match_beg) + context_right

        # Hits whose container window is out of range are dropped.
        if first_cont < 0 or last_cont < 0:
            stream.next()
            continue

        region_beg = container_struct.beg(first_cont)
        region_end = container_struct.end(last_cont)

        meta = [ref_attr.pos2str(match_beg) for ref_attr in ref_attrs]
        region = region_reader.region(region_beg, region_end, '\t', '\t')

        if deduping:
            # Key: every (1 + number of attributes)-th element of the region,
            # stripped and lower-cased, concatenated.
            step = 1 + len(attributes)
            pieces = []
            for idx in range(0, len(region), step):
                pieces.append(region[idx].strip().lower())
            dedupe_key = ''.join(pieces)
            if dedupe_key in seen:
                duplicate_count += 1
                stream.next()
                continue
            seen[dedupe_key] = 0

        hit = {
            'match_offset': match_beg - region_beg,
            'match_length': match_end - match_beg,
            'meta': meta,
            'region': region,
        }
        concordance.append(hit)

        stream.next()
        hit_count += 1

    finished = time.time()
    return {
        'query': query,
        'corpus': corpus,
        'container': '<' + container + '/>',
        'hits': hit_count,
        'max_hits': max_hits,
        'random_subset': random_subset,
        'context_left': context_left,
        'context_right': context_right,
        'attributes': attributes,
        'structures': structures,
        'references': references,
        'datetime': strftime("%Y-%m-%d %H:%M:%S", gmtime()),
        'elapsed': finished - started,
        'deduping': str(deduping),
        'duplicates': duplicate_count,
        'concordance': concordance,
    }