def get_Corpus(self, corpname, subcname=''):
    """
    Open a manatee.Corpus, or a manatee.SubCorpus when a subcorpus name
    is provided (either via `subcname` or embedded as 'corpname:subcname').

    arguments:
    corpname -- corpus identifier; may contain a ':'-separated subcorpus name
    subcname -- optional subcorpus name

    returns:
    a manatee.Corpus or manatee.SubCorpus instance

    raises:
    RuntimeError -- if the requested subcorpus cannot be found
    """
    if ':' in corpname:
        corpname, subcname = corpname.split(':', 1)
    corp = manatee.Corpus(corpname)
    corp.corpname = str(corpname)  # never unicode (paths)
    corp.cm = self
    dsubcpath = self.default_subcpath(corp)
    if subcname:
        for sp in self.subcpath + [dsubcpath]:
            # the default subcorpus directory is flat while user
            # directories are organized per-corpus
            if sp == dsubcpath:
                spath = os.path.join(sp, subcname + '.subc')
            else:
                spath = os.path.join(sp, corpname, subcname + '.subc')
            if type(spath) == unicode:
                spath = spath.encode("utf-8")
            if os.path.isfile(spath):
                subc = manatee.SubCorpus(corp, spath)
                subc.corp = corp
                subc.spath = spath
                try:
                    # touch a '...used' marker file recording last usage;
                    # best-effort only - failures are deliberately ignored
                    with open(spath[:-4] + 'used', 'w'):
                        pass
                except Exception:
                    pass
                subc.corpname = str(corpname)  # never unicode (paths)
                subc.subcname = subcname
                subc.cm = self
                # hash of the subcorpus data file identifies cached results;
                # use 'with' so the handle is not leaked
                with open(spath) as subc_file:
                    subc.subchash = md5(subc_file.read()).hexdigest()
                subc.created = datetime.fromtimestamp(int(os.path.getctime(spath)))
                return subc
        raise RuntimeError(_('Subcorpus "%s" not found') % subcname)
    else:
        return corp
def _get_cached_conc(corp, subchash, q, pid_dir, minsize):
    """
    Loads a concordance from cache.

    arguments:
    corp -- a manatee.Corpus instance
    subchash -- a subcorpus hash (or None for a whole corpus)
    q -- a query representation (sequence of operation strings)
    pid_dir -- directory where calculation PID files are stored
    minsize -- a minimum concordance size passed to _wait_for_conc

    returns:
    a 2-tuple (index within 'q' where non-cached operations start,
    concordance instance or None on a complete cache miss)
    """
    start_time = time.time()
    q = tuple(q)
    if not os.path.isdir(pid_dir):
        os.makedirs(pid_dir, mode=0o775)
    cache_map = plugins.get('conc_cache').get_mapping(corp)
    cache_map.refresh_map()
    # a shuffle operation makes longer cached prefixes unusable,
    # so only the first operation may be looked-up in that case
    if _contains_shuffle_seq(q):
        srch_from = 1
    else:
        srch_from = len(q)
    ans = (0, None)
    # try the longest cached prefix of q first, then progressively shorter ones
    for i in range(srch_from, 0, -1):
        cachefile = cache_map.cache_file_path(subchash, q[:i])
        if cachefile:
            pidfile = cache_map.get_stored_pidfile(subchash, q[:i])
            _wait_for_conc(corp=corp, q=q, subchash=subchash, cachefile=cachefile,
                           cache_map=cache_map, pidfile=pidfile, minsize=minsize)
            if not os.path.exists(cachefile):  # broken cache
                cache_map.del_entry(subchash, q)
                try:
                    os.remove(pidfile)
                except OSError:
                    pass
                continue
            conccorp = corp
            for qq in reversed(q[:i]):  # find the right main corp, if aligned
                if qq.startswith('x-'):
                    conccorp = manatee.Corpus(qq[2:])
                    break
            conc = PyConc(conccorp, 'l', cachefile, orig_corp=corp)
            if not _is_conc_alive(pidfile, minsize) and not conc.finished():
                # unfinished and dead concordance - drop its cache record and files
                cache_map.del_entry(subchash, q)
                try:
                    os.remove(cachefile)
                except OSError:
                    pass
                try:
                    os.remove(pidfile)
                except OSError:
                    pass
                continue
            ans = (i, conc)
            break
    logging.getLogger(__name__).debug(
        'get_cached_conc(%s, [%s]) -> %s, %01.4f' %
        (corp.corpname, ','.join(q), 'hit' if ans[1] else 'miss',
         time.time() - start_time))
    return ans
def __init__(self, directory, cache_size=int(1e7)):
    """
    Open a phrase extractor stored in `directory`.

    `cache_size` caps the number of vocabulary items that can be held
    in memory at once.
    """
    self.directory = directory
    self.cache_size = cache_size
    # Output files produced by build_lexicon(...)
    self.fn_phrase_types = path.join(self.directory, "phrase_types.pickle")
    self.fn_lexicon = path.join(self.directory, "lexicon.vert")
    self.fn_discarded = path.join(self.directory, "discarded.vert")
    # Read the workspace configuration written by `prepare`
    fn_config = path.join(self.directory, ManateeExtractor.FN_CONFIG)
    if not path.isfile(fn_config):
        raise WorkspaceNotFound(
            "Configuration file `%s` does not exist."
            "Did you forget to `ManateeExtractor.prepare` the directory?"
            % fn_config
        )
    with our_open(fn_config) as config_file:
        self.config = json.load(config_file)
    # Set up Manatee handles for the configured corpus
    self.corpus = manatee.Corpus(self.config["corpus"])
    self.struct = self.corpus.get_struct(self.config["struct"])
    self.lex_attr = self.corpus.get_attr(self.config["lex_attr"])
    if self.config["cql_attr"] is not None:
        self.corpus.set_default_attr(self.config["cql_attr"])
    self.struct_size = self.struct.size()
def get_data_path(self, corp_id):
    """
    Return the data directory of a corpus (its 'PATH' conf value with
    any trailing slash removed), or None if the corpus cannot be read.
    """
    try:
        corpus = manatee.Corpus(os.path.join(self._reg_path, corp_id))
        data_path = corpus.get_conf('PATH')
        return data_path.rstrip('/')
    except Exception as ex:
        logging.getLogger(__name__).warning(ex)
        return None
def get_info(self, corpus_id: str) -> DefaultManateeCorpusInfo:
    """
    Return (and cache) basic information about a corpus.

    On any failure (e.g. a misconfigured/missing corpus) an
    EmptyCorpus-backed info object is returned instead.
    """
    try:
        if corpus_id not in self._cache:
            self._cache[corpus_id] = DefaultManateeCorpusInfo(
                manatee.Corpus(corpus_id), corpus_id)
        return self._cache[corpus_id]
    # narrowed from a bare 'except:' which also swallowed
    # SystemExit/KeyboardInterrupt
    except Exception:
        # probably a misconfigured/missing corpus
        return DefaultManateeCorpusInfo(EmptyCorpus(corpname=corpus_id), corpus_id)
def analyze_corpus(corpname, corp_info):
    """
    Return a (size, web, keywords) tuple for a corpus.

    Opening the corpus is best-effort: if it fails, size stays 0 while
    web/keywords are still derived from corp_info.
    """
    size = 0
    try:
        corp = manatee.Corpus(os.path.join(REGISTRY_PATH, corpname))
        size = corp.size()
    # narrowed from a bare 'except:' which also swallowed
    # SystemExit/KeyboardInterrupt; missing/broken corpus -> size 0
    except Exception:
        pass
    return (size, get_web(corp_info), get_keywords(corp_info))
def make(directory, corpus, struct, lex_attr, cql_attr=None):
    """
    Prepares a workspace for phrase extraction in a given directory.

    # Arguments
    directory: All intermediate data generated during phrase extraction
        will be saved to this directory.
    corpus: Name of a compiled corpus.
    struct: Name of a structure (typically sentence).
    lex_attr: Lexical attribute (typically lemma).
    cql_attr: Default CQL attribute. It can be used to shorten queries,
        e.g. you can use "N.*" instead of [tag="N.*"] if the default
        attribute is set to "tag".

    # Raises
    WorkspaceReserved: If there already exists a configuration file in
        the given directory.
    """
    fn_config = path.join(directory, ManateeExtractor.FN_CONFIG)
    if path.exists(fn_config):
        raise WorkspaceReserved(
            "Configuration file `%s` already exists." % fn_config
        )
    # Fail early if the provided corpus/structure/attributes
    # cannot actually be opened.
    c = manatee.Corpus(corpus)
    c.get_struct(struct)
    c.get_attr(lex_attr)
    if cql_attr is not None:
        c.get_attr(cql_attr)
    # Create the base directory plus the required subdirectories.
    needed_dirs = [
        directory,
        path.join(directory, ManateeExtractor.DN_RAW_POCS),
        path.join(directory, ManateeExtractor.DN_RAW_VOCABS),
    ]
    for dn in needed_dirs:
        if not path.exists(dn):
            makedirs(dn)
    # Persist basic info about this extractor as its configuration file.
    config = dict(
        corpus=corpus,
        struct=struct,
        lex_attr=lex_attr,
        cql_attr=cql_attr,
    )
    with our_open(fn_config, "w") as config_file:
        json.dump(config, config_file)
def get_info(self, canonical_corpus_id):
    """
    Return (and cache) basic information about a corpus.

    On any failure (e.g. a misconfigured/missing corpus) an
    EmptyCorpus-backed info object is returned instead.
    """
    try:
        if canonical_corpus_id not in self._cache:
            self._cache[canonical_corpus_id] = ManateeCorpusInfo(
                manatee.Corpus(canonical_corpus_id), canonical_corpus_id)
        return self._cache[canonical_corpus_id]
    # narrowed from a bare 'except:' which also swallowed
    # SystemExit/KeyboardInterrupt
    except Exception:
        # probably a misconfigured/missing corpus
        return ManateeCorpusInfo(EmptyCorpus(corpname=canonical_corpus_id),
                                 canonical_corpus_id)
def get_info(self, corpus_id):
    """
    Provide cached information about a corpus; on any failure fall
    back to an EmptyCorpus-backed info object.
    """
    try:
        info = self._cache.get(corpus_id)
        if info is None:
            info = DefaultManateeCorpusInfo(manatee.Corpus(corpus_id), corpus_id)
            self._cache[corpus_id] = info
        return info
    except Exception as ex:
        logging.getLogger(__name__).warning(ex)
        # probably a misconfigured/missing corpus
        return DefaultManateeCorpusInfo(EmptyCorpus(corpname=corpus_id), corpus_id)
def _load_corp(corp_id, subc_path):
    """
    Instantiate a manatee.Corpus (or manatee.SubCorpus) instance

    arguments:
    corp_id -- a corpus identifier
    subc_path -- path to a subcorpus
    """
    ans = manatee.Corpus(corp_id)
    if subc_path:
        ans = manatee.SubCorpus(ans, subc_path)
    ans.corpname = corp_id
    return ans
def get_Corpus(self, corpname: str, corp_variant: str = '', subcname: str = '',
               decode_desc: bool = True) -> Corpus:
    """
    Open (and cache) a corpus or subcorpus instance.

    args:
    corp_variant: a registry file path prefix for (typically) limited variant of a corpus;
    please note that in many cases this can be omitted as only in case user wants to see
    a continuous text (e.g. kwic context) we must make sure he sees only a 'legal' chunk.
    subcname: an optional subcorpus name; may also arrive embedded in
    corpname as 'corpname:subcname'

    raises:
    RuntimeError -- if a requested subcorpus file cannot be found
    """
    if ':' in corpname:
        corpname, subcname = corpname.split(':', 1)
    public_subcname = self.get_subc_public_name(corpname, subcname)
    # serve a previously opened instance when available
    cache_key = (corpname, corp_variant, subcname, public_subcname)
    if cache_key in self._cache:
        return self._cache[cache_key]
    registry_file = os.path.join(corp_variant, corpname) if corp_variant else corpname
    # the registry file must exist before manatee tries to open it
    self._ensure_reg_file(registry_file, corp_variant)
    corp = manatee.Corpus(registry_file)
    corp.corpname = str(corpname)  # never unicode (paths)
    corp.is_published = False
    corp.author = None
    corp.author_id = None
    # NOTE: line corp.cm = self (as present in NoSke and older KonText versions) has
    # been causing file descriptor leaking for some operations (e.g. corp.get_attr).
    # KonText does not need such an attribute but to keep developers informed I leave
    # the comment here.
    if subcname:
        if public_subcname:
            subcname = public_subcname
        # search all configured subcorpus directories for the '.subc' file
        for sp in self.subcpath:
            spath = os.path.join(sp, corpname, subcname + '.subc')
            if os.path.isfile(spath):
                subc = self._open_subcorpus(corpname, subcname, corp, spath, decode_desc)
                self._cache[cache_key] = subc
                return subc
        raise RuntimeError(_('Subcorpus "%s" not found') % subcname)
    else:
        self._cache[cache_key] = corp
        return corp
def get_existing_conc(corp: manatee.Corpus, q: Tuple[str, ...]) -> manatee.Concordance:
    """
    Load an already calculated concordance from the cache.

    raises:
    ConcNotFoundException -- no cache record exists for the query
    BrokenConcordanceException -- a record exists but is not finished/readable
    """
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    subchash = getattr(corp, 'subchash', None)
    status = cache_map.get_calc_status(subchash, q)
    if status is None:
        raise ConcNotFoundException('Concordance not found.')
    if not (status.finished and status.readable):
        raise BrokenConcordanceException(
            'Concordance broken. File: {}, error: {}'.format(
                status.cachefile, status.error))
    # find the right main corp, if aligned
    main_corp = corp
    for op in reversed(q):
        if op.startswith('x-'):
            main_corp = manatee.Corpus(op[2:])
            break
    return PyConc(main_corp, 'l', status.cachefile, orig_corp=corp)
def get_Corpus(self, corpname, corp_variant='', subcname='', decode_desc=True):
    """
    Open (and cache) a corpus or subcorpus instance.

    args:
    corp_variant: a registry file path prefix for (typically) limited variant of a corpus;
    please note that in many cases this can be omitted as only in case user wants to see
    a continuous text (e.g. kwic context) we must make sure he sees only a 'legal' chunk.
    subcname: an optional subcorpus name; may also arrive embedded in
    corpname as 'corpname:subcname'

    raises:
    RuntimeError -- if a requested subcorpus file cannot be found
    """
    if ':' in corpname:
        corpname, subcname = corpname.split(':', 1)
    public_subcname = self.get_subc_public_name(corpname, subcname)
    # serve a previously opened instance when available
    cache_key = (corpname, corp_variant, subcname, public_subcname)
    if cache_key in self._cache:
        return self._cache[cache_key]
    registry_file = os.path.join(corp_variant, corpname) if corp_variant else corpname
    # the registry file must exist before manatee tries to open it
    self._ensure_reg_file(registry_file, corp_variant)
    corp = manatee.Corpus(registry_file)
    corp.corpname = str(corpname)  # never unicode (paths)
    corp.is_published = False
    corp.cm = self
    if subcname:
        if public_subcname:
            subcname = public_subcname
        # search all configured subcorpus directories for the '.subc' file
        for sp in self.subcpath:
            spath = os.path.join(sp, corpname, subcname + '.subc')
            # Python 2: manatee expects byte-string paths
            if type(spath) == unicode:
                spath = spath.encode('utf-8')
            if os.path.isfile(spath):
                subc = self._open_subcorpus(corpname, subcname, corp, spath, decode_desc)
                self._cache[cache_key] = subc
                return subc
        raise RuntimeError(_('Subcorpus "%s" not found') % subcname)
    else:
        self._cache[cache_key] = corp
        return corp
def get_Corpus(self, corpname, corp_variant='', subcname=''):
    """
    Open a manatee.Corpus, or a manatee.SubCorpus when a subcorpus name
    is provided (either via `subcname` or embedded as 'corpname:subcname').

    args:
    corp_variant: a registry file path prefix for (typically) limited variant of a corpus;
    please note that in many cases this can be omitted as only in case user wants to see
    a continuous text (e.g. kwic context) we must make sure he sees only a 'legal' chunk.

    raises:
    RuntimeError -- if a requested subcorpus file cannot be found
    """
    if ':' in corpname:
        corpname, subcname = corpname.split(':', 1)
    registry_file = os.path.join(corp_variant, corpname) if corp_variant else corpname
    corp = manatee.Corpus(registry_file)
    corp.corpname = str(corpname)  # never unicode (paths)
    corp.cm = self
    dsubcpath = self.default_subcpath(corp)
    if subcname:
        for sp in self.subcpath + [dsubcpath]:
            # the default subcorpus directory is flat while user
            # directories are organized per-corpus
            if sp == dsubcpath:
                spath = os.path.join(sp, subcname + '.subc')
            else:
                spath = os.path.join(sp, corpname, subcname + '.subc')
            if type(spath) == unicode:
                spath = spath.encode("utf-8")
            if os.path.isfile(spath):
                subc = manatee.SubCorpus(corp, spath)
                subc.corp = corp
                subc.spath = spath
                try:
                    # touch a '...used' marker file recording last usage;
                    # best-effort only - failures are deliberately ignored
                    with open(spath[:-4] + 'used', 'w'):
                        pass
                except IOError:
                    pass
                subc.corpname = str(corpname)  # never unicode (paths)
                subc.subcname = subcname
                subc.cm = self
                # hash of the subcorpus data file identifies cached results;
                # use 'with' so the handle is not leaked
                with open(spath) as subc_file:
                    subc.subchash = md5(subc_file.read()).hexdigest()
                subc.created = datetime.fromtimestamp(
                    int(os.path.getctime(spath)))
                return subc
        raise RuntimeError(_('Subcorpus "%s" not found') % subcname)
    else:
        return corp
def get_corpus(self, corpname: str, corp_variant: str = '', subcname: str = '',
               decode_desc: bool = True) -> AbstractKCorpus:
    """
    Open (and cache) a KCorpus, or a KSubcorpus when a subcorpus name
    is provided.

    args:
    corp_variant: a registry file path prefix for (typically) limited variant of a corpus;
    please note that in many cases this can be omitted as only in case user wants to see
    a continuous text (e.g. kwic context) we must make sure he sees only a 'legal' chunk.

    raises:
    RuntimeError -- if a requested subcorpus file cannot be found
    """
    public_subcname = self.get_subc_public_name(corpname, subcname)
    # the registry file must exist before manatee tries to open it
    registry_file = self._ensure_reg_file(corpname, corp_variant)
    # serve a previously opened instance when available
    cache_key = (registry_file, subcname, public_subcname)
    if cache_key in self._cache:
        return self._cache[cache_key]
    corp = manatee.Corpus(registry_file)
    # NOTE: line corp.cm = self (as present in NoSke and older KonText versions) has
    # been causing file descriptor leaking for some operations (e.g. corp.get_attr).
    # KonText does not need such an attribute but to keep developers informed I leave
    # the comment here.
    if subcname:
        if public_subcname:
            subcname = public_subcname
        # search all configured subcorpus directories for the '.subc' file
        for sp in self.subcpath:
            spath = os.path.join(sp, corpname, subcname + '.subc')
            if os.path.isfile(spath):
                subc = KSubcorpus.load(corp, corpname, subcname, spath, decode_desc)
                self._cache[cache_key] = subc
                return subc
        raise RuntimeError(_('Subcorpus "{}" not found').format(
            subcname))  # TODO error type
    else:
        kcorp = KCorpus(corp, corpname)
        self._cache[cache_key] = kcorp
        return kcorp
def fcs_scan(corpname, scan_query, max_ter, start):
    """
    aux function for federated content search: operation=scan

    arguments:
    corpname -- corpus identifier
    scan_query -- query in the form 'attr=value' (an EXACT keyword may
        replace '=' to request exact matching)
    max_ter -- maximum number of terms to return
    start -- index of the first returned term

    returns:
    a list of (term, frequency) pairs

    raises:
    Exception(code, param, message) -- following the FCS diagnostics
    protocol (codes 7, 10, 16)
    """
    if not scan_query:
        raise Exception(7, 'scan_query', 'Mandatory parameter not supplied')
    query = scan_query.replace('+', ' ')  # convert URL spaces
    exact_match = False
    # e.g. 'lemma ExacT "dog"' -> 'lemma ="dog"'
    if 'exact' in query.lower() and '=' not in query:
        pos = query.lower().index('exact')  # first occurence of EXACT
        query = query[:pos] + '=' + query[pos + 5:]  # 1st exact > =
        exact_match = True
    corp = manatee.Corpus(corpname)
    attrs = corp.get_conf('ATTRLIST').split(',')  # list of available attrs
    try:
        if '=' in query:
            attr, value = query.split('=')
            attr = attr.strip()
            value = value.strip()
        else:  # must be in format attr = value
            raise Exception
        if '"' in attr:
            raise Exception
        # value may be quoted; strip balanced quotes, reject unbalanced ones
        if '"' in value:
            if value[0] == '"' and value[-1] == '"':
                value = value[1:-1].strip()
            else:
                raise Exception
    except Exception:
        raise Exception(10, scan_query, 'Query syntax error')
    if attr not in attrs:
        raise Exception(16, attr, 'Unsupported index')
    import corplib
    if exact_match:
        wlpattern = '^' + value + '$'
    else:
        wlpattern = '.*' + value + '.*'
    wl = corplib.wordlist(corp, wlattr=attr, wlpat=wlpattern, wlsort='f')
    return [(d['str'], d['freq']) for d in wl][start:][:max_ter]
def _get_cached_conc(corp, subchash, q, minsize):
    """
    Loads a concordance from cache. The function tries to find at least
    a sublist of 'q' (starting from zero) to avoid full concordance
    search if possible.

    arguments:
    corp -- a respective manatee.Corpus object
    subchash -- a subcorpus hash (generated by PyConc)
    q -- a query representation list
    minsize -- a minimum concordance size to return immediately (synchronously)

    returns:
    a 2-tuple [an index within 'q' where to start with non-cached results],
    [a concordance instance]
    """
    start_time = time.time()
    q = tuple(q)
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    cache_map.refresh_map()
    # a shuffle operation makes longer cached prefixes unusable,
    # so only the first operation may be looked-up in that case
    if _contains_shuffle_seq(q):
        srch_from = 1
    else:
        srch_from = len(q)
    ans = (0, None)
    # try to find the most complete cached operation
    # (e.g. query + filter + sample)
    for i in range(srch_from, 0, -1):
        cachefile = cache_map.cache_file_path(subchash, q[:i])
        if cachefile:
            try:
                _wait_for_conc(cache_map=cache_map, subchash=subchash,
                               q=q[:i], minsize=minsize)
            except ConcCalculationControlException as ex:
                # waiting failed -> drop the broken record and try a shorter prefix
                _cancel_async_task(cache_map, subchash, q[:i])
                logging.getLogger(__name__).warning(
                    'Removed broken concordance cache record. Original error: %s' % (ex,))
                continue
            conccorp = corp
            for qq in reversed(q[:i]):  # find the right main corp, if aligned
                if qq.startswith('x-'):
                    conccorp = manatee.Corpus(qq[2:])
                    break
            conc = None
            try:
                # only open the cached file once it has reached minsize
                if not _min_conc_unfinished(cache_map=cache_map, subchash=subchash,
                                            q=q[:i], minsize=minsize):
                    conc = PyConc(conccorp, 'l', cachefile, orig_corp=corp)
            except (ConcCalculationControlException, manatee.FileAccessError) as ex:
                logging.getLogger(__name__).error(
                    'Failed to join unfinished calculation: {0}'.format(ex))
                _cancel_async_task(cache_map, subchash, q[:i])
                continue
            ans = (i, conc)
            break
    logging.getLogger(__name__).debug(
        'get_cached_conc(%s, [%s]) -> %s, %01.4f' %
        (corp.corpname, ','.join(q), 'hit' if ans[1] else 'miss',
         time.time() - start_time))
    return ans
def _open_corpus(self, corpname):
    """Return a manatee.Corpus for the given name, caching instances per name."""
    corp = self._corp_cache.get(corpname)
    if corp is None:
        corp = manatee.Corpus(corpname)
        self._corp_cache[corpname] = corp
    return corp
def _load_corp(corp_id, subc_path):
    """
    Create a manatee.Corpus for corp_id, wrapped into a manatee.SubCorpus
    when a subcorpus path is provided.
    """
    main = manatee.Corpus(corp_id)
    result = manatee.SubCorpus(main, subc_path) if subc_path else main
    result.corpname = corp_id
    return result
def get_corpus_size(self, corp_id):
    """Return the number of positions in the corpus identified by corp_id."""
    registry_path = os.path.join(self._reg_path, corp_id)
    return manatee.Corpus(registry_path).size()
def corp_factory(reg_path):
    """Instantiate a manatee.Corpus from the given registry file path."""
    corpus = manatee.Corpus(reg_path)
    return corpus
def get_corpus_name(self, corp_id):
    """
    Return the corpus 'NAME' conf value decoded using the corpus
    encoding, or None if the corpus cannot be read.
    """
    try:
        c = manatee.Corpus(os.path.join(self._reg_path, corp_id))
        return c.get_conf('NAME').decode(self.get_corpus_encoding(corp_id))
    # narrowed from a bare 'except:' which also swallowed
    # SystemExit/KeyboardInterrupt
    except Exception:
        return None
def get_corpus_description(self, corp_id):
    """
    Return the corpus 'INFO' conf value decoded using the corpus
    encoding, or None if the corpus cannot be read.
    """
    try:
        c = manatee.Corpus(os.path.join(self._reg_path, corp_id))
        return c.get_conf('INFO').decode(self.get_corpus_encoding(corp_id))
    # narrowed from a bare 'except:' which also swallowed
    # SystemExit/KeyboardInterrupt
    except Exception:
        return None
def get_corpus_encoding(self, corp_id):
    """
    Return the corpus 'ENCODING' conf value, or None if the corpus
    cannot be read.
    """
    try:
        c = manatee.Corpus(os.path.join(self._reg_path, corp_id))
        return c.get_conf('ENCODING')
    # narrowed from a bare 'except:' which also swallowed
    # SystemExit/KeyboardInterrupt
    except Exception:
        return None
def open_corpus(*args, **kwargs):
    """
    Creates a manatee.Corpus instance
    """
    corpus = manatee.Corpus(*args, **kwargs)
    return corpus
def find_cached_conc_base(
        corp: manatee.Corpus, subchash: Optional[str], q: Tuple[str, ...],
        minsize: int) -> Tuple[Optional[int], manatee.Concordance]:
    """
    Load a concordance from cache starting from a complete operation q[:],
    then trying q[:-1], q[:-2], q:[:-i] etc. A possible found concordance
    can be used to skip calculation of already available operations q[:-i].

    arguments:
    minsize -- a minimum concordance size to return immediately (synchronously);
    please note that unlike wait_for_conc here we accept also 0

    returns:
    a 2-tuple [an index within 'q' where to start with non-cached results],
    [a concordance instance]
    """
    start_time = time.time()
    cache_map = plugins.runtime.CONC_CACHE.instance.get_mapping(corp)
    cache_map.refresh_map()
    calc_status = cache_map.get_calc_status(subchash, q)
    if calc_status:
        if calc_status.error is None:
            # drop a cached result older than the corpus indices
            corp_mtime = corplib_corp_mtime(corp)
            if calc_status.created - corp_mtime < 0:
                logging.getLogger(__name__).warning(
                    'Removed outdated cache file (older than corpus indices)')
                cache_map.del_full_entry(subchash, q)
        else:
            # a previous calculation failed - drop its record
            logging.getLogger(__name__).warning(
                'Removed failed calculation cache record (error: {0}'.format(
                    calc_status.error))
            cache_map.del_full_entry(subchash, q)
    # a shuffle operation makes longer cached prefixes unusable,
    # so only the first operation may be looked-up in that case
    if _contains_shuffle_seq(q):
        srch_from = 1
    else:
        srch_from = len(q)
    conc = EmptyConc(corp=corp)
    ans = (0, conc)
    # try to find the most complete cached operation
    # (e.g. query + filter + sample)
    for i in range(srch_from, 0, -1):
        cache_path = cache_map.cache_file_path(subchash, q[:i])
        # now we know that someone already calculated the conc (but it might not be finished yet)
        if cache_path:
            try:
                ready = wait_for_conc(cache_map=cache_map, subchash=subchash,
                                      q=q[:i], minsize=minsize)
                if not ready:
                    if minsize != 0:
                        cancel_async_task(cache_map, subchash, q[:i])
                        logging.getLogger(__name__).warning(
                            'Removed unfinished concordance cache record due to exceeded time limit')
                    continue
                _, finished = _check_result(cache_map=cache_map, subchash=subchash,
                                            q=q[:i], minsize=minsize)
                if finished:
                    mcorp = corp
                    for qq in reversed(q[:i]):  # find the right main corp, if aligned
                        if qq.startswith('x-'):
                            mcorp = manatee.Corpus(qq[2:])
                            break
                    conc = PyConc(mcorp, 'l', cache_path, orig_corp=corp)
            except (ConcCalculationStatusException, manatee.FileAccessError) as ex:
                logging.getLogger(__name__).error(
                    f'Failed to use cached concordance for {q[:i]}: {ex}')
                cancel_async_task(cache_map, subchash, q[:i])
                continue
            ans = (i, conc)
            break
    # NOTE(review): if 'q' is empty the loop above never runs and 'i' in the
    # debug message below is unbound (NameError); presumably callers always
    # pass a non-empty q - verify.
    logging.getLogger(__name__).debug(
        f'get_cached_conc({corp.get_conffile()}, [{", ".join(q)}]), '
        f'conc: {conc.__class__.__name__}, '
        f'missing ops start idx: {i if i < len(q) else "none"}, '
        f'time: {(time.time() - start_time):.4f}')
    return ans
def get_corpus_size(corpus_id, reg_dir):
    """Return the corpus size in positions (0 if no corpus object was obtained)."""
    registry = os.path.join(reg_dir, corpus_id)
    corp = manatee.Corpus(registry)
    if corp:
        return corp.size()
    return 0
def get_data_path(self, corp_id):
    """
    Return the data directory of a corpus (its 'PATH' conf value with
    any trailing slash removed), or None if the corpus cannot be read.
    """
    try:
        c = manatee.Corpus(os.path.join(self._reg_path, corp_id))
        return c.get_conf('PATH').rstrip('/')
    # narrowed from a bare 'except:' which also swallowed
    # SystemExit/KeyboardInterrupt
    except Exception:
        return None
def run(self):
    """
    Validate the configured query, evaluate it against the corpus (or
    subcorpus) and feed each hit region to the attached processor.

    Sets self.hits, self.querytime, self.duplicates and self.elapsed.

    Raises QueryError when the query configuration is incomplete or the
    requested subcorpus does not exist.
    """
    # Check whether query is prepared.
    if self.corpus is None:
        raise QueryError('You must specify the corpus to do a search.')
    if self.attributes is None:
        raise QueryError('You must specify at least one attribute to do a search.')
    if self.structures is None:
        raise QueryError('You must specify at least one structure to do a search.')
    if self.references is None:
        raise QueryError('You must specify at least one reference to do a search.')
    if self.container is None and not issubclass(type(self.processor), Nonprocessor):
        raise QueryError('You must specify the container to do a search.')
    # fixed: original compared with "is ''" (identity test against a
    # literal - fragile and a SyntaxWarning on modern Python)
    if self.string is None or self.string == '':
        raise QueryError('You must set the string property to a search string.')

    # Check whether processor of proper type
    if self.processor and not issubclass(type(self.processor), Processor):
        raise QueryError('The processor class must inherit from SeaCOW.Processor.')

    # Emit heuristic warning that container might end up being to small.
    # This warns about the behviour reported 2020 by EP.
    q_pattern = r'.* within *<' + self.container + r'(| [^>]+)/>.*'
    q_string = r'within <' + self.container + r'/>'
    if not re.match(q_pattern, self.string):
        print("WARNING! Your query should probably end in '" + q_string +
              "' or your match might exceed the exported container.")
        if self.context_left == 0 or self.context_right == 0:
            print(" ... especially because at least one of your contexts is 0!")
            print(" ... Watch out for 'Index anomaly' warnings.")
        # fixed: bare 'print' (a no-op expression on Python 3)
        print()

    # Allow the processor to engage in preparatory action/check whether everything is fine.
    if self.processor:
        self.processor.prepare(self)

    # Set up and run query.
    h_corpus = manatee.Corpus(self.corpus)
    if self.subcorpus is not None:
        # If subcorpus name is given (instead of path), figure out full path to subcorpus .subc file.
        if "/" not in self.subcorpus:
            # fixed: raw string for the regex (avoids invalid-escape warning)
            self.subcorpus = (h_corpus.get_conf("PATH") + "subcorp/" +
                              re.sub(r"\.subc$", "", self.subcorpus.strip(" /")) + ".subc")
        if os.path.exists(self.subcorpus):
            h_corpus = manatee.SubCorpus(h_corpus, self.subcorpus)
        else:
            raise QueryError('The requested subcorpus cannot be found.')
    if not issubclass(type(self.processor), Nonprocessor):
        h_region = manatee.CorpRegion(h_corpus, ','.join(self.attributes),
                                      ','.join(self.structures))
        h_cont = h_corpus.get_struct(self.container)
        h_refs = [h_corpus.get_attr(r) for r in self.references]
    start_time = time.time()
    results = h_corpus.eval_query(self.string)

    # Process results.
    counter = 0
    dup_no = 0
    # In case class is "Noprocessor", we do not process the stream.
    if issubclass(type(self.processor), Nonprocessor):
        # Store the hit count as reported.
        self.hits = results.count_rest()
    else:
        while not results.end() and (self.max_hits < 0 or counter < self.max_hits):
            # Skip randomly if random subset desired.
            if self.random_subset > 0 and random.random() > self.random_subset:
                results.next()
                continue
            kwic_beg = results.peek_beg()                                 # Match begin.
            kwic_end = results.peek_end()                                 # Match end.
            cont_beg_num = h_cont.num_at_pos(kwic_beg) - self.context_left   # Container at match begin.
            cont_end_num = h_cont.num_at_pos(kwic_beg) + self.context_right  # Container at match end.

            # If hit not in desired region, drop.
            if cont_beg_num < 0 or cont_end_num < 0:
                results.next()
                continue
            cont_beg_pos = h_cont.beg(cont_beg_num)                       # Pos at container begin.
            cont_end_pos = h_cont.end(cont_end_num)                       # Pos at container end.

            refs = [h_refs[i].pos2str(kwic_beg) for i in range(0, len(h_refs))]
            region = h_region.region(cont_beg_pos, cont_end_pos, '\t', '\t')

            # Deduping.
            if type(self.bloom) is pybloom_live.ScalableBloomFilter:
                dd_region = ''.join([region[i].strip().lower()
                                     for i in range(0, len(region), 1 + len(self.attributes))])
                if {dd_region: 0} in self.bloom:
                    dup_no += 1
                    results.next()
                    continue
                else:
                    self.bloom.add({dd_region: 0})

            # Call the processor.
            if self.processor:
                self.processor.process(self, region, refs,
                                       kwic_beg - cont_beg_pos, kwic_end - kwic_beg)

            # Advance stream/loop.
            results.next()
            counter = counter + 1

        # After loop but inside "if not Nonprocessor", set hit count.
        self.hits = counter

    self.querytime = strftime("%Y-%m-%d %H:%M:%S", gmtime())
    self.duplicates = dup_no
    self.elapsed = time.time() - start_time

    # Allow the processor to finalise its job.
    if self.processor:
        self.processor.finalise(self)
def main():
    """
    CLI entry point: join corpus text with an annotation database and
    export labels.csv plus a data.csv dataset.
    """
    fmt = '[%(asctime)-15s] %(levelname)s: %(message)s'
    logging.basicConfig(level=logging.INFO, format=fmt)
    m = argparse.ArgumentDefaultsHelpFormatter
    p = argparse.ArgumentParser(description="", formatter_class=m)
    p.add_argument("-c", "--corpus", type=str, required=True)
    p.add_argument("-d", "--db", type=str, required=True)
    p.add_argument("-o", "--outfile", type=str, required=True)
    args = p.parse_args()

    log.info("opening database {}".format(args.db))
    db = sqlite3.connect(args.db)
    db.isolation_level = None  # I want to handle transactions myself

    log.info("opening corpus {}".format(args.corpus))
    corp = manatee.Corpus(args.corpus)
    log.info("corpus has %d positions" % corp.size())

    log.info("reading annotations")
    attrs = read_annots(corp, db)

    headers_simple = []
    headers_multi = []
    # attributes with more than two possible values are expanded into
    # one column per value
    # fixed: .iteritems() is Python-2 only; .items() works on both
    for k, v in default_annot_values.items():
        if k not in simple_attributes:
            continue
        if len(v) > 2:
            for vv in v:
                headers_multi.append((k, vv, (k + "_" + vv).replace(' ', '-')))
        else:
            headers_simple.append(k)

    log.info("reading corpus text")
    with open('labels.csv', 'w') as lf:
        for x in headers_simple:
            print(x, file=lf)
        for x, y, z in headers_multi:
            # NOTE(review): on Python 3 this writes the bytes repr
            # (b'...'); if this script targets Python 3 only, drop
            # the .encode('utf-8') - verify
            print(z.encode('utf-8'), file=lf)

    print("Grouping most common answer")
    most_common = attrs.groupby(["docid", "name"]).agg(lambda x: pd.Series.mode(x)[0])
    print("Converting dataset")
    df = most_common.unstack()
    # drop documents with any missing annotation
    df = df[~df.isna().any(axis=1)]
    df.columns = df.columns.droplevel()
    df.columns.name = None
    df.index.name = None
    print("Getting text")
    df["text"] = df.index.map(lambda docid: text(corp, docid, 'word'))
    print("Writing dataset")
    df.to_csv("data.csv", index=False)