Esempio n. 1
0
 def _open_subcorpus(self, corpname: str, subcname: str, corp: Corpus,
                     spath: str, decode_desc: bool) -> Corpus:
     """
     Open a subcorpus file and attach descriptive attributes to the
     resulting manatee.SubCorpus object.

     arguments:
     corpname -- name of the parent corpus (stored converted to str)
     subcname -- name of the subcorpus
     corp -- an already opened parent corpus
     spath -- path to the subcorpus file
     decode_desc -- if True, a non-empty description is passed through
                    k_markdown(); otherwise it is stored as is

     returns:
     a manatee.SubCorpus instance extended with the attributes corp,
     spath, corpname, subcname, subchash, created, is_published,
     orig_spath, orig_subcname, author, author_id and description
     """
     subc = manatee.SubCorpus(corp, spath)
     subc.corp = corp
     subc.spath = spath
     try:
         # Create/truncate the marker file derived from spath (last four
         # characters replaced by 'used'). The handle is closed right
         # away so no file descriptor is leaked; failures are ignored on
         # purpose (best-effort).
         with open(spath[:-4] + 'used', 'w'):
             pass
     except IOError:
         pass
     subc.corpname = str(corpname)  # never unicode (paths)
     subc.subcname = subcname
     # The hash is an MD5 digest of the raw subcorpus file contents.
     with open(spath, 'rb') as subcinfo:
         subc.subchash = md5(subcinfo.read()).hexdigest()
     subc.created = datetime.fromtimestamp(int(os.path.getctime(spath)))
     subc.is_published = subcorpus_is_published(spath)
     # Publication info is read from the sibling '.name' file.
     meta, desc = get_subcorp_pub_info(os.path.splitext(spath)[0] + '.name')
     if meta.subcpath:
         subc.orig_spath = meta.subcpath
         subc.orig_subcname = os.path.splitext(
             os.path.basename(meta.subcpath))[0]
     else:
         subc.orig_spath = None
         subc.orig_subcname = None
     subc.author = meta.author_name
     subc.author_id = meta.author_id
     if desc:
         subc.description = k_markdown(desc) if decode_desc else desc
     else:
         subc.description = None
     return subc
Esempio n. 2
0
 def get_Corpus(self, corpname, subcname=''):
     """
     Open a corpus or, if a subcorpus name is given, one of its subcorpora.

     args:
         corpname: corpus name; the form 'corpus:subcorpus' is accepted
                   too, in which case the part after the first colon
                   overrides the subcname argument
         subcname: optional subcorpus name

     returns:
         a manatee.Corpus if no subcorpus is requested, otherwise a
         manatee.SubCorpus loaded from the first matching '.subc' file
         found in self.subcpath or in the default subcorpus directory

     raises:
         RuntimeError if the requested subcorpus file is not found
     """
     if ':' in corpname:
         corpname, subcname = corpname.split(':', 1)
     corp = manatee.Corpus(corpname)
     corp.corpname = str(corpname)  # never unicode (paths)
     corp.cm = self
     dsubcpath = self.default_subcpath(corp)
     if not subcname:
         return corp
     for sp in self.subcpath + [dsubcpath]:
         # The default directory holds the files directly; the other
         # directories contain one sub-directory per corpus.
         if sp == dsubcpath:
             spath = os.path.join(sp, subcname + '.subc')
         else:
             spath = os.path.join(sp, corpname, subcname + '.subc')
         if isinstance(spath, unicode):
             spath = spath.encode("utf-8")
         if not os.path.isfile(spath):
             continue
         subc = manatee.SubCorpus(corp, spath)
         subc.corp = corp
         subc.spath = spath
         try:
             # Create/truncate the '...used' marker file and close the
             # handle immediately; this is best-effort only, so I/O
             # failures are ignored.
             with open(spath[:-4] + 'used', 'w'):
                 pass
         except (IOError, OSError):
             pass
         subc.corpname = str(corpname)  # never unicode (paths)
         subc.subcname = subcname
         subc.cm = self
         # Hash the raw file contents; the handle is closed explicitly.
         with open(spath, 'rb') as subcinfo:
             subc.subchash = md5(subcinfo.read()).hexdigest()
         subc.created = datetime.fromtimestamp(int(os.path.getctime(spath)))
         return subc
     raise RuntimeError(_('Subcorpus "%s" not found') % subcname)
Esempio n. 3
0
 def _open_subcorpus(self, corpname, subcname, corp, spath, decode_desc):
     """
     Open a subcorpus file and attach descriptive attributes to the
     resulting manatee.SubCorpus object.

     arguments:
     corpname -- name of the parent corpus (stored converted to str)
     subcname -- name of the subcorpus
     corp -- an already opened parent corpus
     spath -- path to the subcorpus file
     decode_desc -- if True, a non-empty description is passed through
                    k_markdown(); otherwise it is stored as is

     returns:
     a manatee.SubCorpus instance extended with the attributes corp,
     spath, corpname, subcname, cm, subchash, created, is_published,
     orig_spath, orig_subcname, author and description
     """
     subc = manatee.SubCorpus(corp, spath)
     subc.corp = corp
     subc.spath = spath
     try:
         # Create/truncate the marker file derived from spath (last four
         # characters replaced by 'used') and close it right away;
         # failures are ignored on purpose (best-effort).
         with open(spath[:-4] + 'used', 'w'):
             pass
     except IOError:
         pass
     subc.corpname = str(corpname)  # never unicode (paths)
     subc.subcname = subcname
     subc.cm = self
     # Hash the raw file contents; the handle is closed explicitly.
     with open(spath, 'rb') as subcinfo:
         subc.subchash = md5(subcinfo.read()).hexdigest()
     subc.created = datetime.fromtimestamp(int(os.path.getctime(spath)))
     subc.is_published = subcorpus_is_published(spath)
     # Publication info is read from the sibling '.name' file.
     orig_path, author, desc = get_subcorp_pub_info(
         os.path.splitext(spath)[0] + '.name')
     if orig_path:
         subc.orig_spath = orig_path
         subc.orig_subcname = os.path.splitext(
             os.path.basename(orig_path))[0]
     else:
         subc.orig_spath = None
         subc.orig_subcname = None
     subc.author = author
     if desc:
         subc.description = k_markdown(desc) if decode_desc else desc
     else:
         subc.description = None
     return subc
Esempio n. 4
0
def _load_corp(corp_id, subc_path):
    """
    Build the corpus object to work with.

    arguments:
    corp_id -- a corpus identifier
    subc_path -- path to a subcorpus; a falsy value means the whole
                 corpus is used

    returns:
    a manatee.Corpus, or a manatee.SubCorpus wrapping it when subc_path
    is given; in both cases its 'corpname' attribute is set to corp_id
    """
    base = manatee.Corpus(corp_id)
    loaded = manatee.SubCorpus(base, subc_path) if subc_path else base
    loaded.corpname = corp_id
    return loaded
Esempio n. 5
0
 def get_Corpus(self, corpname, corp_variant='', subcname=''):
     """
     Open a corpus or, if a subcorpus name is given, one of its subcorpora.

     args:
         corpname: corpus name; the form 'corpus:subcorpus' is accepted
                   too, in which case the part after the first colon
                   overrides the subcname argument
         corp_variant: a registry file path prefix for (typically) limited variant of a corpus;
                       please note that in many cases this can be omitted as only in case user
                       wants to see a continuous text (e.g. kwic context) we must make sure he
                       sees only a 'legal' chunk.
         subcname: optional subcorpus name

     returns:
         a manatee.Corpus if no subcorpus is requested, otherwise a
         manatee.SubCorpus loaded from the first matching '.subc' file
         found in self.subcpath or in the default subcorpus directory

     raises:
         RuntimeError if the requested subcorpus file is not found
     """
     if ':' in corpname:
         corpname, subcname = corpname.split(':', 1)
     registry_file = os.path.join(corp_variant,
                                  corpname) if corp_variant else corpname
     corp = manatee.Corpus(registry_file)
     corp.corpname = str(corpname)  # never unicode (paths)
     corp.cm = self
     dsubcpath = self.default_subcpath(corp)
     if not subcname:
         return corp
     for sp in self.subcpath + [dsubcpath]:
         # The default directory holds the files directly; the other
         # directories contain one sub-directory per corpus.
         if sp == dsubcpath:
             spath = os.path.join(sp, subcname + '.subc')
         else:
             spath = os.path.join(sp, corpname, subcname + '.subc')
         if isinstance(spath, unicode):
             spath = spath.encode("utf-8")
         if not os.path.isfile(spath):
             continue
         subc = manatee.SubCorpus(corp, spath)
         subc.corp = corp
         subc.spath = spath
         try:
             # Create/truncate the '...used' marker file and close the
             # handle immediately; failures are ignored on purpose
             # (best-effort).
             with open(spath[:-4] + 'used', 'w'):
                 pass
         except IOError:
             pass
         subc.corpname = str(corpname)  # never unicode (paths)
         subc.subcname = subcname
         subc.cm = self
         # Hash the raw file contents; the handle is closed explicitly.
         with open(spath, 'rb') as subcinfo:
             subc.subchash = md5(subcinfo.read()).hexdigest()
         subc.created = datetime.fromtimestamp(
             int(os.path.getctime(spath)))
         return subc
     raise RuntimeError(_('Subcorpus "%s" not found') % subcname)
Esempio n. 6
0
  def run(self):
    """
    Validate the query settings, evaluate the query and feed the results
    to the processor.

    Reads the attributes corpus, subcorpus, attributes, structures,
    references, container, string, processor, max_hits, random_subset,
    context_left, context_right and bloom. On completion it sets hits,
    querytime, duplicates and elapsed. If subcorpus is given as a bare
    name (no '/'), it is replaced by the full path to the '.subc' file.

    Raises QueryError if a required setting is missing, if the processor
    does not inherit from Processor, or if the subcorpus file does not
    exist.
    """

    # Check whether query is prepared.
    if self.corpus is None:
      raise QueryError('You must specify the corpus to do a search.')
    if self.attributes is None:
      raise QueryError('You must specify at least one attribute to do a search.')
    if self.structures is None:
      raise QueryError('You must specify at least one structure to do a search.')
    if self.references is None:
      raise QueryError('You must specify at least one reference to do a search.')
    if self.container  is None and not issubclass(type(self.processor), Nonprocessor):
      raise QueryError('You must specify the container to do a search.')
    # Compare by value: identity comparison against a literal is unreliable.
    if self.string is None or self.string == '':
      raise QueryError('You must set the string property to a search string.')

    # Check whether processor of proper type
    if self.processor and not issubclass(type(self.processor), Processor):
      raise QueryError('The processor class must inherit from SeaCOW.Processor.')

    # Emit heuristic warning that container might end up being too small.
    # This warns about the behaviour reported 2020 by EP.
    # The container may legitimately be None with a Nonprocessor; there is
    # nothing to check then (and concatenating None would raise TypeError).
    if self.container is not None:
      q_pattern = r'.* within *<' + self.container + r'(| [^>]+)/>.*'
      q_string = r'within <' + self.container + r'/>'
      if not re.match(q_pattern, self.string):
        print("WARNING! Your query should probably end in '" + q_string + "' or your match might exceed the exported container.")
        if self.context_left == 0 or self.context_right == 0:
          print(" ... especially because at least one of your contexts is 0!")
        print(" ... Watch out for 'Index anomaly' warnings.")
        # Blank separator line (a bare `print` is a no-op under Python 3).
        print("")


    # Allow the processor to engage in preparatory action/check whether everything is fine.
    if self.processor:
      self.processor.prepare(self)

    # Set up and run query.
    h_corpus      = manatee.Corpus(self.corpus)
    if self.subcorpus is not None:
        # If subcorpus name is given (instead of path), figure out full path to subcorpus .subc file.
        if "/" not in self.subcorpus:
            self.subcorpus = h_corpus.get_conf("PATH") + "subcorp/" + re.sub(r"\.subc$", "", self.subcorpus.strip(" /")) + ".subc"
        if os.path.exists(self.subcorpus):
            h_corpus = manatee.SubCorpus (h_corpus, self.subcorpus)
        else:
            raise QueryError('The requested subcorpus cannot be found.')

    # Region/container/reference handles are only needed when hits are processed.
    if not issubclass(type(self.processor), Nonprocessor):
      h_region      = manatee.CorpRegion(h_corpus, ','.join(self.attributes), ','.join(self.structures))
      h_cont        = h_corpus.get_struct(self.container)
      h_refs        = [h_corpus.get_attr(r) for r in self.references]

    start_time    = time.time()
    results       = h_corpus.eval_query(self.string)

    # Process results.
    counter  = 0
    dup_no   = 0

    # In case class is "Nonprocessor", we do not process the stream.
    if issubclass(type(self.processor), Nonprocessor):

      # Store the hit count as reported.
      self.hits = results.count_rest()
    else:
      # A negative max_hits means "no limit".
      while not results.end() and (self.max_hits < 0 or counter < self.max_hits):

        # Skip randomly if random subset desired.
        if self.random_subset > 0 and random.random() > self.random_subset:
          results.next()
          continue

        kwic_beg = results.peek_beg()                                  # Match begin.
        kwic_end = results.peek_end()                                  # Match end.
        cont_beg_num = h_cont.num_at_pos(kwic_beg)-self.context_left   # Container at match begin.
        # NOTE(review): this is computed from kwic_beg as well, i.e. relative
        # to the container at the match begin - confirm whether kwic_end was intended.
        cont_end_num = h_cont.num_at_pos(kwic_beg)+self.context_right  # Container at match end.

        # If hit not in desired region, drop.
        if cont_beg_num < 0 or cont_end_num < 0:
          results.next()
          continue

        cont_beg_pos = h_cont.beg(cont_beg_num)                   # Pos at container begin.
        cont_end_pos = h_cont.end(cont_end_num)                   # Pos at container end.

        refs = [h_ref.pos2str(kwic_beg) for h_ref in h_refs]
        region = h_region.region(cont_beg_pos, cont_end_pos, '\t', '\t')

        # Deduping.
        if type(self.bloom) is pybloom_live.ScalableBloomFilter:
          # Key: every (1 + number of attributes)-th region item, stripped and lowercased.
          dd_region = ''.join([region[i].strip().lower() for i in range(0, len(region), 1+len(self.attributes))])
          if {dd_region : 0} in self.bloom:
            dup_no += 1
            results.next()
            continue
          else:
            self.bloom.add({dd_region : 0})

        # Call the processor.
        if self.processor:
          self.processor.process(self, region, refs, kwic_beg - cont_beg_pos, kwic_end - kwic_beg)

        # Advance stream/loop.
        results.next()
        counter = counter + 1

      # After loop but inside "if not Nonprocessor", set hit count.
      self.hits          = counter

    self.querytime     = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    self.duplicates    = dup_no
    self.elapsed       = time.time()-start_time

    # Allow the processor to finalise its job.
    if self.processor:
      self.processor.finalise(self)
Esempio n. 7
0
def _load_corp(corp_id, subc_path):
    """
    Build the corpus object to work with.

    arguments:
    corp_id -- a corpus identifier
    subc_path -- path to a subcorpus; a falsy value means the whole
                 corpus is used

    returns:
    a manatee.Corpus, or a manatee.SubCorpus wrapping it when subc_path
    is given; in both cases its 'corpname' attribute is set to corp_id
    """
    base = manatee.Corpus(corp_id)
    loaded = manatee.SubCorpus(base, subc_path) if subc_path else base
    loaded.corpname = corp_id
    return loaded