Example #1
def _job_shares(n_jobs, trials):
    if n_jobs == -1:
        n_jobs = cpu_count()
    shares = [trials // n_jobs] * n_jobs
    for i in range(trials - sum(shares)):
        shares[i] += 1
    return shares
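A quick sanity check of the remainder handling above (values are hypothetical; cpu_count comes from multiprocessing or joblib):

# _job_shares(4, 10): 10 // 4 == 2 per job, remainder 2 goes to the first two jobs
print(_job_shares(4, 10))              # [3, 3, 2, 2] -- shares differ by at most one
print(sum(_job_shares(4, 10)) == 10)   # True: every trial is assigned exactly once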
Example #2
def _hdbscan_boruvka_balltree(X, min_samples=5, alpha=1.0,
                              metric='minkowski', p=2, leaf_size=40,
                              approx_min_span_tree=True,
                              gen_min_span_tree=False,
                              core_dist_n_jobs=4, **kwargs):
    if leaf_size < 3:
        leaf_size = 3

    if core_dist_n_jobs < 1:
        core_dist_n_jobs = max(cpu_count() + 1 + core_dist_n_jobs, 1)

    if X.dtype != np.float64:
        X = X.astype(np.float64)

    tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
    alg = BallTreeBoruvkaAlgorithm(tree, min_samples, metric=metric,
                                   leaf_size=leaf_size // 3,
                                   approx_min_span_tree=approx_min_span_tree,
                                   n_jobs=core_dist_n_jobs, **kwargs)
    min_spanning_tree = alg.spanning_tree()
    # Sort edges of the min_spanning_tree by weight
    min_spanning_tree = min_spanning_tree[np.argsort(min_spanning_tree.T[2]), :]
    # Convert edge list into standard hierarchical clustering format
    single_linkage_tree = label(min_spanning_tree)

    if gen_min_span_tree:
        return single_linkage_tree, min_spanning_tree
    else:
        return single_linkage_tree, None
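The negative core_dist_n_jobs branch above follows the joblib convention; a minimal sketch of the arithmetic on a hypothetical 8-CPU machine:

cpus = 8  # hypothetical cpu_count()
for requested in (-1, -2, -9):
    print(requested, '->', max(cpus + 1 + requested, 1))
# -1 -> 8 (all CPUs), -2 -> 7 (all but one), -9 -> 1 (clamped to at least one job)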
Example #3
def _get_n_jobs(n_jobs):
    """Get number of jobs for the computation.
    See sklearn/utils/__init__.py for more information.

    This function reimplements the logic of joblib to determine the actual
    number of jobs depending on the cpu count. If -1 all CPUs are used.
    If 1 is given, no parallel computing code is used at all, which is useful
    for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are used.
    Thus for n_jobs = -2, all CPUs but one are used.

    Parameters
    ----------
    n_jobs : int
        Number of jobs stated in joblib convention.

    Returns
    -------
    n_jobs : int
        The actual number of jobs as a positive integer.

    Examples
    --------
    >>> from sklearn.utils import _get_n_jobs
    >>> _get_n_jobs(4)
    4
    >>> jobs = _get_n_jobs(-2)
    >>> assert jobs == max(cpu_count() - 1, 1)
    >>> _get_n_jobs(0)
    Traceback (most recent call last):
    ...
    ValueError: Parameter n_jobs == 0 has no meaning.
    """
    if n_jobs < 0:
        return max(cpu_count() + 1 + n_jobs, 1)
    elif n_jobs == 0:
        raise ValueError('Parameter n_jobs == 0 has no meaning.')
    else:
        return n_jobs
Example #4
def compute_geodesic_distance_matrix(verts, tris):
    print("precomputing geodesic distance...")
    n_chunks = cpu_count()
    chunk_size = int(np.ceil(len(verts) / float(n_chunks)))
    sources = np.arange(len(verts))
    D = Parallel(n_chunks)(
        delayed(compute_geodesic_distances)(verts, tris, sources[i: i + chunk_size])
        for i in range(0, len(verts), chunk_size))
    return np.vstack(D)
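Chunking by cpu_count() means the last chunk may be shorter, and slicing past the end of an array is safe; a sketch with hypothetical sizes:

import numpy as np
n_verts, n_chunks = 10, 4                       # hypothetical sizes
chunk_size = int(np.ceil(n_verts / n_chunks))   # 3
print([list(range(i, min(i + chunk_size, n_verts)))
       for i in range(0, n_verts, chunk_size)])
# [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]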
Example #5
def junction_make(config, *args, **kwargs):
    click.echo(green_fg("\n{}  Junction Make  {}\n".format(">" * 10,
                                                           "<" * 10)))
    threads = kwargs['threads'] if kwargs['threads'] else parallel.cpu_count()
    input_data_folder = 'unmapped_sam_files' if kwargs['unmapped'] else 'sam_files'
    junction_folder = 'junction_files'  # Manage name of junction reads output folder here
    blast_results_folder = 'blast_results'  # Manage name of blast results output folder here
    blast_results_query = 'blast_results_query'  # Manage name of blast results dictionary output folder here
    junction_sequence = junction_sequences[kwargs['genome']].replace(
        " ", "").split(",")
    if kwargs['seq'] != "":
        junction_sequence = kwargs['seq'].replace(" ", "").split(",")
    exclusion_sequence = kwargs['exclude_seq'].replace(" ", "")
    blast_db = blast_dbs[kwargs['genome']]
    gene_list_file = gene_lists[kwargs['genome']]
    # verify if the options provided are valid
    verify_options(*args, **kwargs)
    # create folders for junction make
    check_and_create_folders(
        kwargs['dir'],
        ['junction_files', 'blast_results', 'blast_results_query'],
        interactive=kwargs['interactive'])
    if kwargs['interactive']:
        if not click.confirm(
                magenta_fg('\nDo you want to search junctions and blast?')):
            click.echo(red_fg("...Skipping search junctions and blast..."))
        else:
            # search for junctions
            junction_search(kwargs['dir'], junction_folder, input_data_folder,
                            blast_results_folder, junction_sequence,
                            exclusion_sequence, threads)
            # blast the junctions
            blast_search(kwargs['dir'], blast_db, blast_results_folder)

        if not click.confirm(
                magenta_fg('\nDo you want to parse blast results?')):
            click.echo(red_fg("ABORTING..."))
            sys.exit(1)
        else:
            # parse blast results
            parse_blast_results(kwargs['dir'], blast_results_folder,
                                blast_results_query, gene_list_file, threads)
    else:
        # search for junctions
        junction_search(kwargs['dir'], junction_folder, input_data_folder,
                        blast_results_folder, junction_sequence,
                        exclusion_sequence, threads)
        # blast the junctions
        blast_search(kwargs['dir'], blast_db, blast_results_folder)
        # parse blast results
        parse_blast_results(kwargs['dir'], blast_results_folder,
                            blast_results_query, gene_list_file, threads)
Example #6
def test_nested_parallelism_limit(backend):
    with parallel_backend(backend, n_jobs=2):
        backend_types_and_levels = _recursive_backend_info()

    if cpu_count() == 1:
        second_level_backend_type = 'SequentialBackend'
    else:
        second_level_backend_type = 'ThreadingBackend'

    top_level_backend_type = backend.title() + 'Backend'
    expected_types_and_levels = [
        (top_level_backend_type, 0),
        (second_level_backend_type, 1),
        ('SequentialBackend', 2),
        ('SequentialBackend', 3)
    ]
    assert backend_types_and_levels == expected_types_and_levels
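The demotion this test encodes can be observed with joblib's public API alone; a minimal sketch (output varies by machine and backend):

from joblib import Parallel, delayed, parallel_backend

def inner(i):
    # Nested Parallel calls are demoted by joblib: under a process-based outer
    # backend the workers fall back to a threading or sequential inner backend.
    return Parallel(n_jobs=2)(delayed(abs)(j) for j in range(2))

with parallel_backend('loky', n_jobs=2):
    print(Parallel()(delayed(inner)(i) for i in range(2)))  # [[0, 1], [0, 1]]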
Example #7
    def _calculate_n_jobs_and_actual_iters(self):
        # HpBandSter assigns n_iter iterations to every worker, so divide
        # n_iter among the workers to keep the total constant
        n_jobs = self.n_jobs
        if not n_jobs:
            n_jobs = 1
        elif n_jobs < 0:
            try:
                import psutil

                cpus = int(
                    os.environ.get("LOKY_MAX_CPU_COUNT",
                                   psutil.cpu_count(logical=False)))
            except Exception:
                cpus = cpu_count()
            n_jobs = max(cpus + 1 + n_jobs, 1)

        if n_jobs > self.n_iter:
            n_jobs = self.n_iter

        actual_iterations = self.n_iter // n_jobs + (self.n_iter % n_jobs > 0)
        return (n_jobs, actual_iterations)
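The final line is a ceiling division in disguise; with hypothetical numbers, n_iter=10 and n_jobs=4:

n_iter, n_jobs = 10, 4
print(n_iter // n_jobs + (n_iter % n_jobs > 0))  # 3, i.e. ceil(10 / 4)
print(-(-n_iter // n_jobs))                      # 3, an equivalent idiom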
Example #8
def blast_search(directory, db_name, blast_results_folder):
    suffix = ''
    if _platform.startswith('win'):
        suffix = '.exe'
    blast_path = os.path.join(os.path.expanduser('~'), ".deepn", "data",
                              "blast")
    db_path = os.path.join(os.path.expanduser('~'), ".deepn", db_name)
    click.echo(green_fg("\n>>> Selected Blast DB: %s" % db_name))
    file_list = get_file_list(directory, blast_results_folder, ".fa")
    for file_name in file_list:
        if not os.path.getsize(
                os.path.join(directory, blast_results_folder, file_name)) == 0:
            start = time.time()
            output_file = os.path.join(
                directory, blast_results_folder,
                file_name.replace(".junctions.fa", '.blast.txt'))
            click.echo(
                yellow_fg("\n>>> Running BLAST search for file: " + file_name))
            blast_command_list = [
                os.path.join(blast_path, 'blastn' + suffix),
                '-query', os.path.join(directory, blast_results_folder, file_name),
                '-db', db_path, '-task', 'blastn', '-dust', 'no',
                '-num_threads', str(parallel.cpu_count()),
                '-outfmt', '7', '-out', output_file,
                '-evalue', '0.2', '-max_target_seqs', '10'
            ]
            blast_pipe = subprocess.Popen(blast_command_list, shell=False)
            blast_pipe.wait()
            finish = time.time()
            hr, mins, sec = elapsed_time(start, finish)  # avoid shadowing builtin min
            click.echo(
                cyan_fg(
                    "\nFinished blasting file %s in time %d hr, %d min, %d sec"
                    % (file_name, hr, mins, sec)))
        else:
            click.echo(
                red_fg("\n>>> ERROR: File %s does not have any junctions, "
                       "please check if they right genome was chosen." %
                       file_name))
            sys.exit(1)
Example #9
from random import choice, randint  # needed by this fragment

def mutate(chromosome):
    # NOTE: the enclosing def was lost when this fragment was extracted; the
    # signature is assumed. streets, max_mutations and key_chromosome are
    # defined elsewhere in the original script.
    num_mutations = randint(1, max_mutations)

    for mutation in range(num_mutations):
        street = choice(streets)
        idx = choice(street)
        chromosome[idx] = randint(0, 6)

    return key_chromosome(chromosome)

if __name__ == '__main__':
    population_size = 50
    nr_populations = 10
    remain_perc = 0.2
    mutate_perc = 0.01

    population_size = cpu_count() * ((population_size // cpu_count()) + 1)
    population = set(generate_random() for _ in range(population_size))
    
    evaluated = {}
    
    for generation in range(1, nr_populations + 1):
        results = Parallel(n_jobs=-1, verbose=100)(delayed(calculate)(chromosome, buy_all) for chromosome in population)
        
        for score, chromosome in results:
            evaluated[chromosome] = score
        
        winners = []
        for chromosome, score in evaluated.items():
            winners.append((score, chromosome))
        winners.sort(reverse=True)
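Rounding the population up to a multiple of cpu_count() keeps the Parallel batches even; with a hypothetical 8-CPU machine:

cpus = 8                            # hypothetical cpu_count()
print(cpus * ((50 // cpus) + 1))    # 56, the next multiple of 8 above 50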
Example #10
def test_cpu_count():
    assert cpu_count() > 0
Example #11
def mutate(chromosome):
    # NOTE: as in Example #9, the def line and the num_mutations assignment
    # were lost in extraction; reconstructed from the matching fragment there.
    num_mutations = randint(1, max_mutations)
    for mutation in range(num_mutations):
        street = choice(streets)
        idx = choice(street)
        chromosome[idx] = randint(0, 6)

    return key_chromosome(chromosome)


if __name__ == '__main__':
    population_size = 50
    nr_populations = 25
    remain_perc = 0.2
    mutate_perc = 0.01

    population_size = cpu_count() * ((population_size // cpu_count()) + 1)

    population = set(generate_random() for _ in range(population_size))
    population.add(buy_all)
    # population.add(key_chromosome(map(int, '0,6,0,5,0,2,4,0,4,4,0,6,0,5,5,2,4,0,5,3,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,4,0,5,0,3'.split(","))))
    # population.add(key_chromosome(map(int, '0,6,0,6,0,4,4,0,4,4,0,3,0,5,5,3,4,0,5,6,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,4,0,5,0,2'.split(","))))

    for generation in range(1, nr_populations + 1):
        to_schedule = []

        for chromosome in population:
            get_player_stats(chromosome)

        for chromosome in population:
            opponents = [
                buy_all,
Example #12
class Significance(object):
    """Test for pairwise significance between systems"""

    METHODS = {
        'permute': count_permutation_trials,
        #'bootstrap': count_bootstrap_trials,
    }

    def __init__(self,
                 systems,
                 gold,
                 trials=10000,
                 method='permute',
                 n_jobs=1,
                 metrics=['precision', 'recall', 'fscore'],
                 fmt='json'):
        if len(systems) < 2:
            raise ValueError('Require at least two systems to compare')
        if method not in self.METHODS:
            raise ValueError('Unsupported method: {}'.format(method))
        # Check whether import worked, generate a more useful error.
        if Parallel is None:
            raise ImportError(
                'Package: "joblib" not available, please install to run significance tests.'
            )
        self.systems = systems
        self.gold = gold
        self.method = method
        self.trials = trials
        self.n_jobs = n_jobs
        self.metrics = metrics
        self.fmt = fmt if callable(fmt) else FMTS[fmt]

    def __call__(self):
        all_counts = defaultdict(dict)
        gold = sorted(Reader(open(self.gold)))
        for path in self.systems:
            system = sorted(Reader(open(path)))
            for match, per_doc, overall in Evaluate.count_all(system, gold):
                all_counts[match][path] = (per_doc, overall)

        results = [
            {
                'sys1': sys1,
                'sys2': sys2,
                'match': match,
                'stats': self.significance(match_counts[sys1],
                                           match_counts[sys2])
            } for sys1, sys2 in itertools.combinations(self.systems, 2)
            for match, match_counts in sorted(
                all_counts.items(), key=lambda item: MATCHES.index(item[0]))
        ]

        return self.fmt(results, self.metrics)

    def significance(self, counts1, counts2):
        # Python 3 removed tuple parameter unpacking, so unpack explicitly
        per_doc1, overall1 = counts1
        per_doc2, overall2 = counts2
        # TODO: limit to metrics
        base_diff = _result_diff(overall1, overall2)
        randomized_diffs = functools.partial(self.METHODS[self.method],
                                             per_doc1, per_doc2, base_diff)
        n_jobs = self.n_jobs
        if n_jobs == -1:
            n_jobs = cpu_count()
        shares = [self.trials // n_jobs] * n_jobs
        for i in range(self.trials - sum(shares)):
            shares[i] += 1

        results = Parallel(n_jobs=self.n_jobs)(delayed(randomized_diffs)(share)
                                               for share in shares)
        all_counts = []
        for result in results:
            metrics, counts = zip(*result.items())
            all_counts.append(counts)

        return {
            metric: {
                'diff': base_diff[metric],
                'p': (sum(counts) + 1) / (self.trials + 1)
            }
            for metric, counts in zip(metrics, zip(*all_counts))
        }
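The p-value uses add-one smoothing, (sum(counts) + 1) / (trials + 1), the standard estimator for permutation tests; it can never return exactly zero. A toy check with hypothetical counts:

trials = 10000
exceeding = 42   # hypothetical: permutations whose diff matched or beat the observed one
print((exceeding + 1) / (trials + 1))   # ~0.0043; floor is 1 / (trials + 1) when exceeding == 0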
Example #13
def druhg(X,
          max_ranking=16,
          limit1=None,
          limit2=None,
          exclude=None,
          fix_outliers=0,
          metric='minkowski',
          p=2,
          algorithm='best',
          leaf_size=40,
          verbose=False,
          core_n_jobs=None,
          **kwargs):
    """Perform DRUHG clustering from a vector array or distance matrix.

    Parameters
    ----------
    X : array matrix of shape (n_samples, n_features), or \
            array of shape (n_samples, n_samples)
        A feature array, or array of distances between samples if
        ``metric='precomputed'``.

    max_ranking : int, optional (default=16)
        The maximum number of neighbors to search.
        Affects performance vs precision.

    limit1 : float, optional (default=sqrt(size))
        Clusters smaller than this limit are treated as noise.
        Use 1 to find true outliers.
        Numbers under 1 are treated as a percentage of the dataset size.

    limit2 : float, optional (default=size/2)
        Clusters with a size OVER this limit are treated as noise.
        Use it to break down big clusters.
        Numbers under 1 are treated as a percentage of the dataset size.

    exclude : list, optional (default=None)
        Clusters with these indexes will not be formed.
        Use it for surgical cluster removal.

    fix_outliers : int, optional (default=0)
        In case of 1, all outliers will be assigned to the nearest cluster.

    metric : string or callable, optional (default='minkowski')
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string or callable, it must be one of
        the options allowed by metrics.pairwise.pairwise_distances for its
        metric parameter.
        If metric is "precomputed", X is assumed to be a distance matrix and
        must be square.

    p : int, optional (default=2)
        p value to use if using the minkowski metric.

    leaf_size : int, optional (default=40)
        Leaf size for trees responsible for fast nearest
        neighbour queries.

    algorithm : string, optional (default='best')
        Exactly, which algorithm to use; DRUHG has variants specialized
        for different characteristics of the data. By default, this is set
        to ``best`` which chooses the "best" algorithm given the nature of
        the data. You can force other options if you believe you know
        better. Options are:
            * ``best``
            * ``kdtree``
            * ``balltree``
        If you want it to be accurate add:
            * ``slow``

    core_n_jobs : int, optional (default=None)
        Number of parallel jobs to run in neighbors distance computations (if
        supported by the specific algorithm).
        None defaults to all CPUs; for negative values,
        (n_cpus + 1 + core_n_jobs) jobs are used.

    **kwargs : optional
        Arguments passed to the distance metric

    Returns
    -------
    labels : ndarray, shape (n_samples)
        Cluster labels for each point. Noisy samples are given the label -1.

    min_spanning_tree : ndarray, shape (2*n_samples - 2)
        The minimum spanning tree as edgepairs.

    values_edges : ndarray, shape (n_samples - 1)
        Values of the edges.


    References
    ----------

    None

    """
    if type(X) is list:
        raise ValueError('X must be array! Not a list!')

    size = X.shape[0]

    if core_n_jobs is None:
        core_n_jobs = max(cpu_count(), 1)
    elif core_n_jobs < 0:
        core_n_jobs = max(cpu_count() + 1 + core_n_jobs, 1)

    if max_ranking is not None:
        if type(max_ranking) is not int:
            raise ValueError('Max ranking must be integer!')
        if max_ranking < 0:
            raise ValueError('Max ranking must be non-negative integer!')

    if leaf_size < 1:
        raise ValueError('Leaf size must be greater than 0!')

    if metric == 'minkowski':
        if p is None:
            raise TypeError('Minkowski metric given but no p value supplied!')
        if p < 0:
            raise ValueError('Minkowski metric with negative p value is not'
                             ' defined!')
    printout = ''
    if max_ranking is None:
        max_ranking = 16
        printout += 'max_ranking is set to ' + str(max_ranking) + ', '

    max_ranking = min(size - 1, max_ranking)

    if limit1 is None:
        limit1 = int(np.sqrt(size))
        printout += 'limit1 is set to ' + str(limit1) + ', '
    else:
        if limit1 < 0:
            raise ValueError('Limit1 must be non-negative integer!')
        if limit1 < 1:
            limit1 = int(limit1 * size)

    if limit2 is None:
        limit2 = int(size / 2 + 1)
        printout += 'limit2 is set to ' + str(limit2) + ', '
    else:
        if limit2 < 0:
            raise ValueError('Limit2 must be non-negative integer!')
        if limit2 <= 1:
            limit2 = int(limit2 * size + 1)

    if algorithm == 'best':
        algorithm = 'kd_tree'

    if X.dtype != np.float64:
        print('Converting data to numpy float64')
        X = X.astype(np.float64)

    algo_code = 0
    if "precomputed" in algorithm.lower() or "precomputed" in metric.lower(
    ) or issparse(X):
        algo_code = 2
        if issparse(X):
            algo_code = 3
        elif len(X.shape) == 2 and X.shape[0] != X.shape[1]:
            raise ValueError('Precomputed matrix is not a square.')
        tree = X
    else:
        # The Cython routines used require contiguous arrays
        if not X.flags['C_CONTIGUOUS']:
            X = np.array(X, dtype=np.double, order='C')

        if "kd" in algorithm.lower() and "tree" in algorithm.lower():
            algo_code = 0
            if metric not in KDTree.valid_metrics:
                raise ValueError('Metric: %s\n'
                                 'Cannot be used with KDTree' % metric)
            tree = KDTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
        elif "ball" in algorithm.lower() and "tree" in algorithm.lower():
            algo_code = 1
            tree = BallTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
        else:
            algo_code = 0
            if metric not in KDTree.valid_metrics:
                raise ValueError('Metric: %s\n'
                                 'Cannot be used with KDTree' % metric)
            tree = KDTree(X, metric=metric, leaf_size=leaf_size, **kwargs)
            # raise TypeError('Unknown algorithm type %s specified' % algorithm)

    is_slow_and_deterministic = 0
    if "slow" in algorithm.lower():
        is_slow_and_deterministic = 1

    if printout:
        print('Druhg is using defaults for: ' + printout)

    ur = UniversalReciprocity(algo_code,
                              tree,
                              max_neighbors_search=max_ranking,
                              metric=metric,
                              leaf_size=leaf_size // 3,
                              is_slow=is_slow_and_deterministic,
                              n_jobs=core_n_jobs,
                              **kwargs)

    pairs, values = ur.get_tree()

    labels = label(pairs,
                   values,
                   size,
                   exclude=exclude,
                   limit1=int(limit1),
                   limit2=int(limit2),
                   fix_outliers=fix_outliers)

    return (labels, pairs, values)
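A minimal usage sketch for the function above, assuming the druhg package and its dependencies are installed (data and parameter values are illustrative):

import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=200, centers=3, random_state=0)
labels, mst_pairs, edge_values = druhg(X, max_ranking=16, core_n_jobs=-1)
print(np.unique(labels))   # cluster labels; -1 marks noise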