Example #1
import os
from collections import OrderedDict
from typing import AnyStr

from more_itertools import with_iter

# TrecQuery comes from the surrounding project and is not shown here.
def load(
    path: os.PathLike[AnyStr],
    sep: str = "\t",
    qid_field: int = 0,
    query_field: int = 1,
) -> TrecQuery:
    # Stream the file line by line; with_iter closes it once exhausted.
    content = [
        line.strip("\n").split(sep) for line in with_iter(open(path, "r"))
    ]
    qno_map = OrderedDict([(x[qid_field], x[query_field]) for x in content])
    return TrecQuery(qno_map)
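
All of these snippets rely on more_itertools.with_iter, which takes a
context manager and yields its items from inside a with block, so the
underlying file is closed as soon as the iterator is exhausted. A minimal
sketch of the idea:

# with_iter(open(path)) behaves roughly like this generator:
def with_iter_sketch(context_manager):
    with context_manager as iterable:
        for item in iterable:
            yield item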
Example #2
    def process(self, dry_run):
        def _put_line(line):
            return 1

        self.logger.info('Reading Ensembl gene info from %s',
                         self.ensembl_filename)

        lines = more_itertools.with_iter(
            URLZSource(self.ensembl_filename).open())

        with URLZSource(self.es_mappings).open() as mappings_file:
            mappings = json.load(mappings_file)

        with URLZSource(self.es_settings).open() as settings_file:
            settings = json.load(settings_file)

        es = new_es_client(self.es_hosts)
        with ElasticsearchBulkIndexManager(es, self.es_index, settings,
                                           mappings):
            # write into Elasticsearch
            chunk_size = 1000  # TODO: make configurable
            actions = elasticsearch_actions(lines, self.es_index, self.es_doc)
            failcount = 0

            if not dry_run:
                results = None
                if self.workers_write > 0:
                    results = elasticsearch.helpers.parallel_bulk(
                        es,
                        actions,
                        thread_count=self.workers_write,
                        queue_size=self.queue_write,
                        chunk_size=chunk_size)
                else:
                    results = elasticsearch.helpers.streaming_bulk(
                        es, actions, chunk_size=chunk_size)
                # both bulk helpers return generators; iterating drives the requests
                for success, details in results:
                    if not success:
                        failcount += 1

                if failcount:
                    raise RuntimeError("%s relations failed to index" %
                                       failcount)
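
URLZSource and elasticsearch_actions are helpers from the surrounding
project; URLZSource appears to open plain or compressed, local or remote
sources as a file-like context manager, which is why it pairs naturally
with with_iter. For orientation only, a hypothetical sketch of what
elasticsearch_actions might yield, based on the action dicts that the
elasticsearch.helpers bulk functions expect (the project's real
implementation may differ):

import json

def elasticsearch_actions_sketch(lines, index, doc_type):
    # Hypothetical: one bulk action per JSON line.
    for line in lines:
        entry = json.loads(line)
        yield {
            "_index": index,
            "_type": doc_type,
            "_id": entry["id"],
            "_source": line,
        }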
Example #3
    async def lc(self, ctx, target=None):
        # Resolve the target as a command, cog, or extension, in that order.
        cmd = self.bot.get_command(target) if target else None
        cog = self.bot.get_cog(target)
        ext = self.bot.get_ext(target)

        if cmd:
            length = len(inspect.getsourcelines(cmd.callback)[0])
        elif cog:
            length = len(inspect.getsourcelines(cog.__class__)[0])
        elif ext:
            length = len(inspect.getsourcelines(ext)[0])
        else:
            # No match: count the lines of every .py file in the project,
            # skipping anything under venv.
            length = sum(
                ilen(with_iter(p.open(encoding='utf-8')))
                for p in pathlib.Path('.').rglob('*.py')
                if not str(p.parent).startswith('venv'))
            return await ctx.send(f'**Total Lines:** `{length}`')

        await ctx.send(f'**{target} Lines:** `{length}`')
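
inspect.getsourcelines returns a (source_lines, starting_line_number)
tuple, so each branch takes the length of element [0] to count the source
lines of the matched command, cog, or extension.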
Example #4
    def __init__(self, path: os.PathLike[AnyStr], default: float = 0.0):
        lines = [
            line.strip("\n").split() for line in with_iter(open(path, "r"))
        ]
        assert all(len(x) == 3 for x in lines), "Malformed eval file"

        self._content: Dict[str, Dict[str, float]] = OrderedDict()
        self._agg: Dict[str, float] = OrderedDict()
        for metric, qid, value in lines:
            self._content.setdefault(metric, OrderedDict())
            if qid == "all":
                self._agg[metric] = float(value)
            else:
                self._content[metric][qid] = float(value)

        self._default = default

        # Check the qids are consistent across metrics
        first_qids = set(first(self._content.values()).keys())
        assert all(set(x.keys()) == first_qids for x in self._content.values())
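
The expected layout matches trec_eval-style output: whitespace-separated
metric/qid/value triples, with the pseudo-qid "all" carrying the aggregate
score that lands in self._agg. Illustrative lines (not from the project):

map     301     0.2431
map     all     0.2210
P_10    301     0.4000
P_10    all     0.3500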
Example #5
    def process(self, ensembl_filename, dry_run):
        def _put_line(line):
            return 1

        self.logger.info('Reading Ensembl gene info from %s',
                         ensembl_filename)

        # set up Elasticsearch
        if not dry_run:
            self.loader.create_new_index(
                Const.ELASTICSEARCH_ENSEMBL_INDEX_NAME)
            # need to directly get the versioned index name for this function
            self.loader.prepare_for_bulk_indexing(
                self.loader.get_versioned_index(
                    Const.ELASTICSEARCH_ENSEMBL_INDEX_NAME))

        inserted_lines = 0
        for line in more_itertools.with_iter(
                URLZSource(ensembl_filename).open()):
            entry = json.loads(line)
            # store in Elasticsearch if not doing a dry run
            if not dry_run:
                self.loader.put(Const.ELASTICSEARCH_ENSEMBL_INDEX_NAME,
                                Const.ELASTICSEARCH_ENSEMBL_DOC_NAME,
                                entry['id'], line)
            inserted_lines += 1

        self.logger.info("Read %d lines from %s", inserted_lines,
                         ensembl_filename)

        self.logger.info("flush index")

        # clean up Elasticsearch
        if not dry_run:
            self.loader.flush_all_and_wait(
                Const.ELASTICSEARCH_ENSEMBL_INDEX_NAME)
            # restore old pre-load settings
            # note: this automatically does all prepared indexes
            self.loader.restore_after_bulk_indexing()
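
Unlike Example #2, which hands the same line stream to the
elasticsearch.helpers bulk functions, this variant issues one loader.put
call per document; bulk batching is generally the faster choice for large
dumps, at the cost of the extra index setup shown in that example.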
Example #6
    def iter_train_sentences(
            self, segmentize_func: Optional[Callable[[str], List[str]]] = None):
        # Fall back to the language config's segmentizer and map it lazily
        # over the training file, with a progress bar from tqdm_open.
        segmentize_func = segmentize_func or self.language_config.segmentize_text
        return map(segmentize_func, with_iter(tqdm_open(self.path.train)))
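
tqdm_open is a helper from the surrounding project, not part of tqdm
itself. A hypothetical sketch of such a helper, written as the context
manager that with_iter expects:

from contextlib import contextmanager
from tqdm import tqdm

@contextmanager
def tqdm_open_sketch(path, encoding="utf-8"):
    # Hypothetical: yield a tqdm-wrapped line iterator; the file is
    # closed when with_iter exits the with block.
    with open(path, "r", encoding=encoding) as f:
        yield tqdm(f, unit=" lines")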
Example #7
def open_to_read(filename):
    """Return an iterator of (filename, (line_number, line)) pairs,
    with line numbers starting at 1."""
    it = more_itertools.with_iter(URLZSource(filename).open())
    return zip(itertools.cycle([filename]), enumerate(it, start=1))
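
A usage sketch (the filename is illustrative):

for filename, (lineno, line) in open_to_read("genes.json.gz"):
    print(filename, lineno, line.rstrip())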
Example #8
def _process_text_file(filepath) -> np.ndarray:
    print(f"Load corpus data from {format_path(filepath)}")
    # Stream the file's lines (with a progress bar) into the tokenizer.
    return tokenizer.texts_to_array(with_iter(tqdm_open(filepath)))
Example #9
def open_to_read(filename):
    """Return an iterator of (filename, (line_number, line)) pairs,
    with line numbers starting at 1."""
    _l.debug('generate an iterator of (filename,enumerate) for filename %s',
             filename)
    it = more_itertools.with_iter(URLZSource(filename).open())
    # Python 2 only: itertools.izip became the builtin zip in Python 3.
    return itertools.izip(itertools.cycle([filename]), enumerate(it, start=1))
Example #10
    def __init__(self, path: os.PathLike[AnyStr], default_rel: int = 0):
        lines = [QrelLine(line) for line in with_iter(open(path, "r"))]
        # setdefault merges every judgment per query and per document; a
        # plain dict comprehension would keep only the last one seen.
        self._qno_map = {}
        self._dno_map = {}
        for x in lines:
            self._qno_map.setdefault(x.qno, {})[x.dno] = x.rel
            self._dno_map.setdefault(x.dno, {})[x.qno] = x.rel
        self._default_rel = default_rel
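
QrelLine is the project's own parser and is not shown here. Assuming
standard TREC qrels lines of the form "qno iteration dno rel", a
hypothetical stand-in could look like:

class QrelLine:
    # Hypothetical: parse one "qno iteration dno rel" qrels line.
    def __init__(self, line: str):
        self.qno, _, self.dno, rel = line.split()
        self.rel = int(rel)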
Example #11
def count_lines(filepath):
    # ilen exhausts the iterator to count lines; with_iter closes the file.
    return ilen(with_iter(open(filepath)))
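
Usage is a single call (the path is illustrative):

total = count_lines("corpus.txt")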