def load(
    path: os.PathLike[AnyStr],
    sep: str = "\t",
    qid_field: int = 0,
    query_field: int = 1,
) -> TrecQuery:
    content = [
        line.strip("\n").split(sep) for line in with_iter(open(path, "r"))
    ]
    qno_map = OrderedDict([(x[qid_field], x[query_field]) for x in content])
    return TrecQuery(qno_map)
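# Usage sketch for load() above. The file name and contents are hypothetical;
# the assumed input is one "qid<TAB>query text" pair per line, matching the
# default sep/qid_field/query_field arguments.
#
#   queries = load("queries.tsv")
#   # queries is a TrecQuery wrapping an OrderedDict such as
#   # {"q1": "first example query", "q2": "second example query"}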
def process(self, dry_run):
    def _put_line(line):
        return 1

    self.logger.info('Reading Ensembl gene info from %s' % self.ensembl_filename)
    lines = more_itertools.with_iter(
        URLZSource(self.ensembl_filename).open())

    with URLZSource(self.es_mappings).open() as mappings_file:
        mappings = json.load(mappings_file)

    with URLZSource(self.es_settings).open() as settings_file:
        settings = json.load(settings_file)

    es = new_es_client(self.es_hosts)
    with ElasticsearchBulkIndexManager(es, self.es_index, settings, mappings):
        # write into elasticsearch
        chunk_size = 1000  # TODO make configurable
        actions = elasticsearch_actions(lines, self.es_index, self.es_doc)
        failcount = 0

        if not dry_run:
            results = None
            if self.workers_write > 0:
                results = elasticsearch.helpers.parallel_bulk(
                    es, actions,
                    thread_count=self.workers_write,
                    queue_size=self.queue_write,
                    chunk_size=chunk_size)
            else:
                results = elasticsearch.helpers.streaming_bulk(
                    es, actions, chunk_size=chunk_size)

            for success, details in results:
                if not success:
                    failcount += 1

            if failcount:
                raise RuntimeError("%s relations failed to index" % failcount)
async def lc(self, ctx, target=None):
    cmd = self.bot.get_command(target) if target else None
    cog = self.bot.get_cog(target)
    ext = self.bot.get_ext(target)
    if cmd:
        length = len(inspect.getsourcelines(cmd.callback)[0])
    elif cog:
        length = len(inspect.getsourcelines(cog.__class__)[0])
    elif ext:
        length = len(inspect.getsourcelines(ext)[0])
    else:
        length = sum(
            ilen(with_iter(p.open(encoding='utf-8')))
            for p in pathlib.Path('.').rglob('*.py')
            if not str(p.parent).startswith('venv'))
        return await ctx.send(f'**Total Lines:** `{length}`')
    await ctx.send(f'**{target} Lines:** `{length}`')
def __init__(self, path: os.PathLike[AnyStr], default: float = 0.0):
    lines = [
        line.strip("\n").split() for line in with_iter(open(path, "r"))
    ]
    assert all(len(x) == 3 for x in lines), "Malformed eval file"

    self._content: Dict[str, Dict[str, float]] = OrderedDict()
    self._agg: Dict[str, float] = OrderedDict()
    for metric, qid, value in lines:
        self._content.setdefault(metric, OrderedDict())
        if qid == "all":
            self._agg[metric] = float(value)
        else:
            self._content[metric][qid] = float(value)
    self._default = default

    # Check the qids are consistent across metrics
    first_qids = set(first(self._content.values()).keys())
    assert all(set(x.keys()) == first_qids for x in self._content.values())
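# Sketch of the file format the constructor above expects: whitespace-separated
# "metric qid value" triples, one per line, with qid "all" carrying the
# aggregate score for that metric (as produced by tools such as trec_eval -q).
# The metric names, qids, and values below are illustrative only.
#
#   map     301   0.2511
#   map     302   0.4102
#   map     all   0.3306
#   ndcg    301   0.3870
#   ndcg    302   0.5124
#   ndcg    all   0.4497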
def process(self, ensembl_filename, dry_run):
    def _put_line(line):
        return 1

    self.logger.info('Reading Ensembl gene info from %s' % ensembl_filename)

    # setup elasticsearch
    if not dry_run:
        self.loader.create_new_index(
            Const.ELASTICSEARCH_ENSEMBL_INDEX_NAME)
        # need to directly get the versioned index name for this function
        self.loader.prepare_for_bulk_indexing(
            self.loader.get_versioned_index(
                Const.ELASTICSEARCH_ENSEMBL_INDEX_NAME))

    inserted_lines = 0
    for line in more_itertools.with_iter(
            URLZSource(ensembl_filename).open()):
        entry = json.loads(line)
        # store in elasticsearch if not dry running
        if not dry_run:
            self.loader.put(Const.ELASTICSEARCH_ENSEMBL_INDEX_NAME,
                            Const.ELASTICSEARCH_ENSEMBL_DOC_NAME,
                            entry['id'], line)
        inserted_lines += 1

    self.logger.info("Read %d lines from %s", inserted_lines, ensembl_filename)
    self.logger.info("flush index")

    # cleanup elasticsearch
    if not dry_run:
        self.loader.flush_all_and_wait(
            Const.ELASTICSEARCH_ENSEMBL_INDEX_NAME)
        # restore old pre-load settings
        # note this automatically does all prepared indexes
        self.loader.restore_after_bulk_indexing()
def iter_train_sentences(
        self, segmentize_func: Optional[Callable[[str], List[str]]] = None):
    segmentize_func = segmentize_func or self.language_config.segmentize_text
    return map(segmentize_func, with_iter(tqdm_open(self.path.train)))
def open_to_read(filename):
    """Return an iterator of (filename, (line_number, line)) pairs, counting lines from 1."""
    it = more_itertools.with_iter(URLZSource(filename).open())
    return zip(itertools.cycle([filename]), enumerate(it, start=1))
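# Hypothetical usage of open_to_read above (assuming URLZSource accepts a local
# path as well as a URL); the file name and handle_line helper are illustrative
# only.
#
#   for name, (lineno, line) in open_to_read("data/genes.jsonl.gz"):
#       handle_line(name, lineno, line)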
def _process_text_file(filepath) -> np.ndarray:
    print(f"Load corpus data from {format_path(filepath)}")
    return tokenizer.texts_to_array(with_iter(tqdm_open(filepath)))
def open_to_read(filename):
    """Return an iterator of (filename, (line_number, line)) pairs, counting lines from 1."""
    _l.debug('generate an iterator of (filename, enumerate) for filename %s',
             filename)
    it = more_itertools.with_iter(URLZSource(filename).open())
    # Python 2 variant: itertools.izip is the lazy equivalent of Python 3's zip.
    return itertools.izip(itertools.cycle([filename]), enumerate(it, start=1))
def __init__(self, path: os.PathLike[AnyStr], default_rel: int = 0):
    lines = [QrelLine(line) for line in with_iter(open(path, "r"))]
    # Accumulate the nested lookups instead of building them with dict
    # comprehensions, which would keep only the last judgment per key when a
    # query has several judged documents (or a document several queries).
    self._qno_map: Dict[str, Dict[str, int]] = {}
    self._dno_map: Dict[str, Dict[str, int]] = {}
    for x in lines:
        self._qno_map.setdefault(x.qno, {})[x.dno] = x.rel
        self._dno_map.setdefault(x.dno, {})[x.qno] = x.rel
    self._default_rel = default_rel
def count_lines(filepath):
    return ilen(with_iter(open(filepath)))
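# A minimal sketch of the with_iter pattern used throughout these snippets:
# more_itertools.with_iter wraps a context manager, yields its items, and
# closes it once the iterator is exhausted, so the explicit with-block below
# should count the same number of lines as count_lines above. The helper name
# and the example path are assumptions for illustration.
from more_itertools import ilen, with_iter


def count_lines_with_block(filepath):
    # Equivalent written with an explicit context manager.
    with open(filepath) as f:
        return ilen(f)


# assert count_lines("example.txt") == count_lines_with_block("example.txt")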