Example #1
import types

# `grouped` is assumed to be imported from the module under test.
def test_grouped():
    items = [1, 2, 3, 4]
    result = grouped(items, 2)
    expected_result = [[1, 2], [3, 4]]

    assert isinstance(result, types.GeneratorType)
    assert list(result) == expected_result

    result = grouped(items, 2, as_list=True)
    assert isinstance(result, list)
    assert result == expected_result

    result = grouped(items, 3, as_list=True)
    assert result == [[1, 2, 3], [4]]  # No None items, a shorter last group
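
A minimal sketch of a grouped helper that would satisfy this test (the real implementation may differ):

def grouped(iterable, group_size, as_list=False):
    """Yield lists of at most group_size items; the last group may be shorter."""
    def _generate():
        group = []
        for item in iterable:
            group.append(item)
            if len(group) == group_size:
                yield group
                group = []
        if group:  # shorter last group instead of padding with None
            yield group

    generator = _generate()
    return list(generator) if as_list else generator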
Example #2
    def _batch_query(self, ids):
        """
        Uses myvariant.info service to query many IDs across the scopes defined
        by the class variable SCOPES. It returns a dict of {id: result, ... }
        with the IDs that were found (i.e. leaves out the not found ones) and
        info from the fields defined in the class variable FIELDS.

        The successful results are cached.
        """
        if not hasattr(self, 'mv'):
            self.mv = MyVariantInfo()

        grouped_ids = list(grouped(ids, self.BATCH_SIZE))
        for batch_of_ids in tqdm(grouped_ids):
            logger.debug('{} query {} IDs'.format(self.name,
                                                  len(batch_of_ids)))
            hits = self.mv.querymany(batch_of_ids,
                                     scopes=self.SCOPES,
                                     fields=self.FIELDS,
                                     verbose=self.VERBOSE)

            batch_annotations = {}
            for query, query_hits in groupby(hits, itemgetter('query')):
                query_hits = list(query_hits)
                if 'notfound' not in query_hits[0]:
                    batch_annotations[query] = query_hits

            yield batch_annotations
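
Since each iteration yields a per-batch dict, a caller would typically merge the batches; a hypothetical consumer (annotator and variant_ids are illustrative names, not from the source):

annotations = {}
for batch_annotations in annotator._batch_query(variant_ids):
    annotations.update(batch_annotations)

not_found = set(variant_ids) - set(annotations)  # IDs the service left out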
Example #3
    def _batch_query(self, ids):
        if self.proxies:
            logger.info('{} using proxies: {}'.format(self.name, self.proxies))

        if self.full_info:
            logger.warning('{} using full_info (quite slow)'.format(self.name))

        for group_of_ids in tqdm(grouped(ids, self.BATCH_SIZE, as_list=True)):
            yield self._post_query(group_of_ids)
            time.sleep(self.SLEEP_TIME)
Example #4
    def _batch_query(self, ids):
        """
        Uses mygene.info service to query many Entrez gene IDs. It returns a
        dict of {id-1: result-1, id-2: ... } with the IDs that were found (i.e.
        leaves out the not found ones).
        """
        if not hasattr(self, 'mg'):
            self.mg = MyGeneInfo()

        for batch_of_ids in grouped(ids, self.BATCH_SIZE):
            batch_annotations = {}
            for hit in self.mg.querymany(batch_of_ids,
                                         scopes='entrezgene',
                                         fields='all',
                                         verbose=self.VERBOSE):
                if 'notfound' not in hit and hit['taxid'] == self.TAXID:
                    batch_annotations[hit['query']] = hit
            yield batch_annotations
Example #5
    def _batch_query(self, ids):
        """
        Query a group of IDs using <class>.BATCH_SIZE threads, sleeping
        <class>.SLEEP_TIME between batches.

        Set <class>.RANDOMIZE_SLEEP_TIME = True to make the sleep time between
        batches random, in order to fool crawler detection.

        Yields tuples of (id_, annotation).
        """
        grouped_ids = list(grouped(ids, self.BATCH_SIZE))
        msg = ('{}: get {} entries in {} batches '
               '({} items/batch & sleep {}{}s between batches)')
        logger.info(
            msg.format(self.name, len(ids), len(grouped_ids), self.BATCH_SIZE,
                       ('~' if self.RANDOMIZE_SLEEP_TIME else ''),
                       self.SLEEP_TIME))

        if self.proxies:
            logger.info('{} using proxies: {}'.format(self.name, self.proxies))
        elif self.proxies == {}:
            logger.info('{} explicitly set to use NO PROXIES!'.format(
                self.name))
        else:
            if self.MANDATORY_PROXIES:
                message = (
                    'No proxies set for {}. Please set self.proxies to '
                    'avoid a possible ban or explicitly set proxies as an '
                    'empty dict (self.proxies={{}}) if you want to proceed '
                    'anyway.'.format(self.name))
                raise NoProxiesException(message)
            else:
                logger.warning('{} not using proxies!'.format(self.name))

        with ThreadPoolExecutor(max_workers=self.BATCH_SIZE) as executor:
            sys.stdout.flush()  # Hack to display tqdm progress bar correctly
            iterator = tqdm(grouped_ids, total=len(grouped_ids))
            for i, batch_of_ids in enumerate(iterator):
                if i > 0:
                    time.sleep(self.sleep_time)
                annotations = executor.map(self._query, batch_of_ids)
                yield dict(zip(batch_of_ids, annotations))
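
The loop sleeps self.sleep_time (lowercase) while the log line reports SLEEP_TIME, which suggests a property that applies RANDOMIZE_SLEEP_TIME. A purely illustrative sketch of such a property, assuming random is imported at module level:

    @property
    def sleep_time(self):
        # Hypothetical: jitter the pause between batches when RANDOMIZE_SLEEP_TIME
        # is set, so the request pattern looks less like an automated crawler.
        if self.RANDOMIZE_SLEEP_TIME:
            return random.uniform(0.5 * self.SLEEP_TIME, 1.5 * self.SLEEP_TIME)
        return self.SLEEP_TIME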
    def _esummary_query(self, ids):
        # Ceiling division so tqdm's total counts the (possibly shorter) last batch.
        n_batches = (len(ids) + self.batch_size - 1) // self.batch_size
        for ids_group in tqdm(grouped(ids, self.batch_size), total=n_batches):
            handle = Entrez.esummary(db=self.ENTREZ_PARAMS['db'],
                                     id=','.join(ids_group))
            yield ids_group, handle

    def _efetch_query(self, ids):
        # Ceiling division so tqdm's total counts the (possibly shorter) last batch.
        n_batches = (len(ids) + self.batch_size - 1) // self.batch_size
        for ids_group in tqdm(grouped(ids, self.batch_size), total=n_batches):
            handle = Entrez.efetch(id=','.join(ids_group),
                                   **self.ENTREZ_PARAMS)
            yield ids_group, handle
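
Both methods yield (ids_group, handle) pairs; a hypothetical consumer would parse each handle with Biopython (annotator and entrez_ids are illustrative names, not from the source):

from Bio import Entrez

Entrez.email = 'you@example.org'  # NCBI requires a contact address for E-utilities requests

for ids_group, handle in annotator._esummary_query(entrez_ids):
    records = Entrez.read(handle)  # parse the XML summaries returned by esummary
    handle.close()
    # ... map records back to ids_group here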