Example #1
    def add_minsq_partitions(self, partitions, batchsize=1, background=False):
        if isinstance(partitions, Partition):
            partitions = (partitions, )
        index_tuples = set(ix for partition in partitions
                           for ix in partition.get_membership()).difference(
                               self.minsq_cache.keys())

        if len(index_tuples) == 0:
            return

        # Collect argument list
        args = []
        for ix in index_tuples:
            conc = self.concatenate(ix)
            args.append(conc.get_tree_collection_strings())

        # Distribute work
        msg = 'Adding MinSq cluster trees'
        client = get_client()
        if client is None:
            map_result = sequential_map(tasks.minsq_task, args, msg)
        else:
            map_result = parallel_map(client, tasks.minsq_task, args, msg,
                                      batchsize, background)
            if background:
                return map_result
        # Process results
        pbar = setup_progressbar('Processing results', len(map_result))
        pbar.start()
        for i, (ix, result) in enumerate(zip(index_tuples, map_result)):
            self.minsq_cache[ix] = result
            pbar.update(i)
        pbar.finish()
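
The sequential-or-parallel dispatch seen here (try `get_client()`, use `parallel_map` when a client exists, otherwise fall back to `sequential_map`) recurs in every example below. A minimal standalone sketch of the same fallback idiom using only the standard library; `dispatch_map` and `square` are invented names for illustration, not part of the library:

from multiprocessing import Pool

def dispatch_map(task, args_list, pool=None):
    # Run task over args_list, in parallel when a pool is available,
    # otherwise sequentially - mirroring the get_client() fallback above
    if pool is None:
        # Sequential fallback: a plain loop, preserves input order
        return [task(*args) for args in args_list]
    # Parallel path: starmap also preserves input order, so results
    # can safely be zipped back against the inputs afterwards
    return pool.starmap(task, args_list)

def square(x):
    return x * x

if __name__ == '__main__':
    args = [(i,) for i in range(10)]
    print(dispatch_map(square, args))            # sequential
    with Pool(4) as pool:
        print(dispatch_map(square, args, pool))  # parallel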
Example #2
def _generic_matrix_calc(fn, trees, normalise, min_overlap=4):
    """(fn, trees, normalise)

    Calculates all pairwise distances between trees given in the parameter 'trees'.

    Distance functions:
        eucdist_matrix
        geodist_matrix
        rfdist_matrix
        wrfdist_matrix

    These wrap the leafset-checking functions. If the faster non-leafset-checking functions are needed, do this:
        scipy.spatial.distance.squareform([getDistance(t1.phylotree, t2.phylotree, normalise)
                                           for (t1, t2) in itertools.combinations(trees, 2)])
    for your choice of 'getDistance' out of:
        getEuclideanDistance
        getGeodesicDistance
        getRobinsonFouldsDistance
        getWeightedRobinsonFouldsDistance

    :param trees: list or tuple, or some other iterable container type containing Tree objects
    :param normalise: boolean
    :param min_overlap: int
    :return: numpy.array
    """
    jobs = itertools.combinations(trees, 2)
    results = []
    pbar = setup_progressbar('Calculating tree distances', len(trees) * (len(trees) - 1) // 2)
    pbar.start()
    for i, (t1, t2) in enumerate(jobs):
        results.append(_generic_distance_calc(fn, t1, t2, normalise, min_overlap))
        pbar.update(i)
    pbar.finish()
    return scipy.spatial.distance.squareform(results)
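
The condensed result vector that `_generic_matrix_calc` builds is expanded into a square, symmetric matrix by `scipy.spatial.distance.squareform`. A tiny runnable illustration of that final step (the distance values are invented):

import scipy.spatial.distance

# Three trees give 3 * (3 - 1) / 2 = 3 pairwise values, produced in
# itertools.combinations order: (0, 1), (0, 2), (1, 2)
condensed = [0.1, 0.2, 0.3]
print(scipy.spatial.distance.squareform(condensed))
# [[0.  0.1 0.2]
#  [0.1 0.  0.3]
#  [0.2 0.3 0. ]]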
Example #3
    def calc_trees(self,
                   model=None,
                   threads=1,
                   indices=None,
                   batchsize=1,
                   output_dir=None,
                   background=False):
        """
        Use pllpy to calculate maximum-likelihood trees
        :return: None (all side effects)
        """
        # Assemble argument lists
        if indices is None:
            indices = list(range(len(self)))
        args = []
        to_delete = []
        for i in indices:
            rec = self[i]
            filename, delete = rec.get_alignment_file(as_phylip=True)
            if delete:
                to_delete.append(filename)
            # Choose the model per record unless one was supplied explicitly;
            # assigning to `model` here would freeze the first record's choice
            # for every later record in a mixed DNA/protein collection
            rec_model = model if model is not None else ('DNA' if rec.is_dna() else 'LGX')
            if rec_model == 'AUTOX':
                rec_model = 'AUTO'
            partition = '{}, {} = 1 - {}'.format(rec_model, rec.name, len(rec))
            tree = rec.parameters.nj_tree if rec.parameters.nj_tree is not None else True
            if output_dir is not None and os.path.isdir(output_dir):
                output_file = os.path.join(output_dir,
                                           '{}.json'.format(rec.name))
                curr_args = (filename, partition, tree, threads,
                             PLL_RANDOM_SEED, None, output_file)
            else:
                curr_args = (filename, partition, tree, threads,
                             PLL_RANDOM_SEED)
            args.append(curr_args)

        # Dispatch work
        msg = 'Calculating ML trees'
        client = get_client()
        if client is None:
            map_result = sequential_map(tasks.pll_task, args, msg)
        else:
            map_result = parallel_map(client, tasks.pll_task, args, msg,
                                      batchsize, background)
            if background:
                return map_result

        # Process results
        with fileIO.TempFileList(to_delete):
            pbar = setup_progressbar('Processing results', len(map_result))
            j = 0
            pbar.start()
            for i, result in zip(indices, map_result):
                rec = self[i]
                rec.parameters.construct_from_dict(result)
                pbar.update(j + 1)
                j += 1
            pbar.finish()
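
The `partition` string assembled in the loop above is a RAxML-style single-partition description, `MODEL, name = start - end`, covering the whole alignment. A quick check of the format (the values are invented):

model, name, length = 'DNA', 'locus1', 1452
print('{}, {} = 1 - {}'.format(model, name, length))
# DNA, locus1 = 1 - 1452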
Example #4
    def add_lnl_partitions(self,
                           partitions,
                           threads=1,
                           use_calculated_freqs=True,
                           batchsize=1,
                           background=False):
        self.add_minsq_partitions(partitions)
        if isinstance(partitions, Partition):
            partitions = (partitions, )
        index_tuples = set(ix for partition in partitions
                           for ix in partition.get_membership()).difference(
                               self.lnl_cache.keys())
        if len(index_tuples) == 0:
            return

        # Collect argument list
        args = []
        to_delete = []
        for ix in index_tuples:
            conc = self.concatenate(ix)
            al = conc.alignment
            filename, delete = al.get_alignment_file(as_phylip=True)
            if delete:
                to_delete.append(filename)
            partition = conc.qfile(dna_model="GTR",
                                   protein_model="LG",
                                   ml_freqs=True)
            tree = self.minsq_cache[ix]['tree']
            if use_calculated_freqs:
                args.append((filename, partition, tree, threads,
                             PLL_RANDOM_SEED, conc.frequencies))
            else:
                args.append((filename, partition, tree, threads,
                             PLL_RANDOM_SEED, None))

        # Distribute work
        with fileIO.TempFileList(to_delete):
            msg = 'Adding ML cluster trees'
            client = get_client()
            if client is None:
                map_result = sequential_map(tasks.pll_task, args, msg)
            else:
                map_result = parallel_map(client, tasks.pll_task, args, msg,
                                          batchsize, background)
                if background:
                    return map_result

        # Process results
        pbar = setup_progressbar('Processing results', len(map_result))
        pbar.start()
        for i, (ix, result) in enumerate(zip(index_tuples, map_result)):
            self.lnl_cache[ix] = result
            pbar.update(i)
        pbar.finish()
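
`add_minsq_partitions` and `add_lnl_partitions` share one caching idiom: flatten every index tuple named by the partitions, subtract the keys already in the cache, and compute only the remainder. A stripped-down sketch of just that idiom (the cache contents and memberships are invented):

cache = {(0, 1): 'already computed'}
memberships = [[(0, 1), (2, 3)], [(2, 3), (4,)]]

# Flatten all index tuples, then drop those already cached
index_tuples = set(ix for membership in memberships
                   for ix in membership).difference(cache.keys())
print(sorted(index_tuples))  # [(2, 3), (4,)]

for ix in index_tuples:
    cache[ix] = 'result for {}'.format(ix)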
Example #5
    def calc_distances(self, batchsize=1, background=False):
        """
        Calculate intra-alignment pairwise distances and variances using ML
        model parameters (requires ML models to have been set up using `calc_trees`).
        :return: None (all side effects)
        """
        # Assemble argument lists
        args = []
        to_delete = []
        for rec in self:
            filename, delete = rec.get_alignment_file(as_phylip=True)
            if delete:
                to_delete.append(filename)
            # Get input dict
            model = {'partitions': {}}
            data = {
                'alpha': rec.parameters.partitions.alpha,
                'frequencies': rec.parameters.partitions.frequencies
            }
            if rec.is_dna():
                data['rates'] = rec.parameters.partitions.rates
            model['partitions'][0] = data
            args.append((model, filename))

        # Dispatch
        msg = 'Calculating ML distances'
        client = get_client()
        if client is None:
            map_result = sequential_map(tasks.calc_distances_task, args, msg)
        else:
            map_result = parallel_map(client, tasks.calc_distances_task, args,
                                      msg, batchsize, background)
            if background:
                return map_result

        # Process results
        with fileIO.TempFileList(to_delete):
            pbar = setup_progressbar('Processing results', len(map_result))
            j = 0
            pbar.start()
            for i, result in enumerate(map_result):
                rec = self[i]
                rec.parameters.partitions.distances = result['partitions'][0][
                    'distances']
                rec.parameters.partitions.variances = result['partitions'][0][
                    'variances']
                rec.parameters.nj_tree = result['nj_tree']
                pbar.update(j + 1)
                j += 1
            pbar.finish()
Example #6
    def fast_calc_distances(self, batchsize=1, background=False):
        """
        Calculate fast approximate intra-alignment pairwise distances and variances using
        Jukes-Cantor closed formulae.
        :return: None (all side effects)
        """
        # Assemble argument lists
        args = []
        to_delete = []
        for rec in self:
            filename, delete = rec.get_alignment_file(as_phylip=True)
            if delete:
                to_delete.append(filename)
            args.append((filename, ))

        # Dispatch work (either sequentially or in parallel)
        msg = 'Calculating fast distances'
        with fileIO.TempFileList(to_delete):
            client = get_client()
            if client is None:
                map_result = sequential_map(tasks.fast_calc_distances_task,
                                            args, msg)
            else:
                map_result = parallel_map(client,
                                          tasks.fast_calc_distances_task, args,
                                          msg, batchsize, background)
                if background:
                    return map_result

        # Process results
        pbar = setup_progressbar('Processing results', len(map_result))
        pbar.start()
        for i, result in enumerate(map_result):
            rec = self[i]
            distances = result['distances']
            variances = result['variances']
            tree = result['tree']
            rec.parameters.nj_tree = tree
            params = rec.parameters.partitions
            if params is None:
                params = PartitionParameters()
                rec.parameters.partitions = [params]
            params.distances = distances
            params.variances = variances
            pbar.update(i)
        pbar.finish()
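
The Jukes-Cantor "closed formulae" the docstring refers to are short: for an observed proportion p of mismatching sites over n compared sites, d = -(3/4) ln(1 - 4p/3), with variance p(1 - p) / (n (1 - 4p/3)^2). A standalone sketch of the textbook JC69 estimator (not necessarily treeCl's exact implementation):

import math

def jukes_cantor(p, n):
    # Closed-form JC69 distance and variance for an observed proportion p
    # of mismatching sites over n sites compared
    if p >= 0.75:
        raise ValueError('JC69 distance is undefined for p >= 0.75')
    w = 1.0 - 4.0 * p / 3.0
    distance = -0.75 * math.log(w)
    variance = p * (1.0 - p) / (n * w * w)
    return distance, variance

print(jukes_cantor(0.1, 1000))  # approx (0.1073, 0.00012)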
Example #7
    def read_parameters(self, input_dir):
        """ Read a directory full of tree files, matching them up to the
        already loaded alignments """

        pbar = setup_progressbar("Loading parameters", len(self.records))
        pbar.start()
        for i, rec in enumerate(self.records):
            hook = os.path.join(input_dir, '{}.json*'.format(rec.name))
            filenames = glob.glob(hook)
            try:
                with fileIO.freader(filenames[0]) as infile:
                    d = json.load(infile)

                rec.parameters.construct_from_dict(d)

            except (IOError, IndexError):
                continue

            finally:
                pbar.update(i)
        pbar.finish()
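
A self-contained version of the glob-then-load step above, covering both the plain and gzip-compressed files that the `.json*` wildcard is meant to match. `load_record_json` and the paths are illustrative, and the gzip/open switch stands in for the `fileIO.freader` helper:

import glob
import gzip
import json
import os

def load_record_json(input_dir, name):
    # Find name.json or name.json.gz and parse it; None when nothing matches
    matches = glob.glob(os.path.join(input_dir, '{}.json*'.format(name)))
    if not matches:
        return None
    opener = gzip.open if matches[0].endswith('.gz') else open
    with opener(matches[0], 'rt') as infile:
        return json.load(infile)

print(load_record_json('/tmp/params', 'locus1'))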
Example #8
    def read_alignments(self, input_dir, file_format, header_grep=None, compression=None):
        """ Get list of alignment files from an input directory *.fa, *.fas and
        *.phy files only

        Stores in self.files """

        optioncheck(compression, [None, 'gz', 'bz2'])

        if file_format == 'fasta':
            extensions = ['fa', 'fas', 'fasta']

        elif file_format == 'phylip':
            extensions = ['phy']

        else:
            extensions = []

        if compression:
            extensions = ['.'.join([x, compression]) for x in extensions]

        files = fileIO.glob_by_extensions(input_dir, extensions)
        files.sort(key=SORT_KEY)
        self._input_files = files
        records = []

        pbar = setup_progressbar("Loading files", len(files), simple_progress=True)
        pbar.start()

        for i, f in enumerate(files):
            if compression is not None:
                with fileIO.TempFile() as tmpfile:
                    with fileIO.freader(f, compression) as reader, fileIO.fwriter(tmpfile) as writer:
                        for line in reader:
                            writer.write(line)
                    try:
                        record = Alignment(tmpfile, file_format, True)
                    except RuntimeError:
                        record = Alignment(tmpfile, file_format, False)

            else:
                try:
                    record = Alignment(f, file_format, True)
                except RuntimeError:
                    record = Alignment(f, file_format, False)

            if header_grep:
                try:
                    datatype = 'dna' if record.is_dna() else 'protein'

                    record = Alignment([(header_grep(x), y) for (x, y) in record.get_sequences()], datatype)

                except TypeError:
                    raise TypeError("Couldn't apply header_grep to header\n"
                                    "alignment number={}, name={}\n"
                                    "header_grep={}".format(i, fileIO.strip_extensions(f), header_grep))
                except RuntimeError:
                    print('RuntimeError occurred processing alignment number={}, name={}'
                          .format(i, fileIO.strip_extensions(f)))
                    raise

            record.name = fileIO.strip_extensions(f)
            records.append(record)
            pbar.update(i)
        pbar.finish()
        return records
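
The decompress-to-a-temporary-file loop in the compressed branch can be reproduced with only the standard library; this sketch swaps the `fileIO` reader/writer helpers for `gzip` and `tempfile` (the function name is invented):

import gzip
import os
import tempfile

def decompress_to_tempfile(path):
    # Copy a gzip-compressed text file line by line into a named temporary
    # file; the caller is responsible for deleting the returned path
    fd, tmppath = tempfile.mkstemp()
    with gzip.open(path, 'rt') as reader, os.fdopen(fd, 'w') as writer:
        for line in reader:
            writer.write(line)
    return tmppath

# Usage: tmp = decompress_to_tempfile('alignment.fas.gz'); ...; os.remove(tmp)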