Beispiel #1
0
    def __init__(
        self,
        records=None,
        input_dir=None,
        file_format='fasta',
        datatype=None,
        tmpdir='/tmp',
        calc_distances=False,
        compression=None,
        analysis=None,
        ):
        """ Build the collection from ready-made records or from a directory
        of alignment files.

        records        - non-empty list of record objects; takes precedence
                         over input_dir when both are given
        input_dir      - directory searched for alignments via read_files()
        file_format    - 'fasta' or 'phylip' (only checked when reading
                         from input_dir)
        datatype       - 'dna' or 'protein'; with records it defaults to the
                         datatype of the first record
        tmpdir         - working directory, validated by directorycheck()
        calc_distances - when true, self.calc_distances() is called at the
                         end of construction
        compression    - passed through to read_files()
        analysis       - accepted but not used by this constructor

        Raises NoRecordsError when no records end up loaded, including the
        case where neither records nor input_dir was supplied. """

        self.tmpdir = directorycheck(tmpdir)

        if records:
            self.records = records
            self.datatype = datatype or records[0].datatype
            optioncheck(self.datatype, ['dna', 'protein'])

        elif input_dir:
            directorycheck(input_dir)
            self.datatype = optioncheck(datatype, ['dna', 'protein'])
            optioncheck(file_format, ['fasta', 'phylip'])
            self.records = self.read_files(input_dir, file_format, compression)

        else:
            print('Provide a list of records, or the path to a set of alignments')
            # Without this the check below fails with AttributeError rather
            # than the intended NoRecordsError.
            self.records = []

        if not self.records:
            raise NoRecordsError(file_format, input_dir, compression)

        if calc_distances:
            self.calc_distances()
Beispiel #2
0
    def embedding_plotter(
            self, coordinates, dimensions, partition=None, add_sphere=False,
            xlab='PCo1', ylab='PCo2', zlab='PCo3',
            title='Trees embedded in dimension-reduced space',
            outfile=False,
        ):
        """ Scatter-plot embedded coordinates in 2 or 3 dimensions.

        Points are coloured according to cluster membership specified
        by Partition object (or all black if no Partition specified).

        coordinates - array whose transpose is unpacked into ax.scatter
                      (presumably one row per point, one column per
                      dimension - confirm against callers)
        dimensions  - 2 or 3
        partition   - object exposing partition_vector, a sequence of integer
                      cluster indices used to pick colours
        add_sphere  - 3D only: pass the axes through self.sphere()
        xlab, ylab, zlab, title - axis labels and plot title
        outfile     - when truthy, the figure is saved as '<outfile>.pdf'

        Returns the matplotlib figure. """

        optioncheck(dimensions, [2,3])
        partition = partition or Partition(tuple([6]*len(self.collection.records)))

        colours = 'bgrcmyk'
        # Cycle through the palette so that partitions with more clusters
        # than colours reuse colours instead of raising IndexError.
        n_colours = len(colours)
        colour_mapping = np.array(
            [colours[i % n_colours] for i in partition.partition_vector])
        fig = plt.figure()

        if dimensions == 3:
            ax = fig.add_subplot(111, projection='3d',
                    xlabel=xlab, ylabel=ylab, zlabel=zlab, title=title)
            if add_sphere:
                ax = self.sphere(ax)

        else:
            ax = fig.add_subplot(111,
                    xlabel=xlab, ylabel=ylab, title=title)

        ax.scatter(*coordinates.T, color=colour_mapping)

        if outfile:
            fig.savefig('{0}.pdf'.format(outfile))

        return fig
Beispiel #3
0
 def get_decomp(self, method='MDS', **kwargs):
     """ Decompose self.dm with the requested method ('MDS' or 'spectral').

     Keyword arguments are forwarded to the spectral decomposition only;
     the MDS decomposition is called without arguments. """
     optioncheck(method, ['MDS', 'spectral'])
     clustering = Clustering(self.dm)
     if method == 'MDS':
         result = clustering.MDS_decomp()
     elif method == 'spectral':
         result = clustering.spectral_decomp(**kwargs)
     else:
         result = None
     return result
Beispiel #4
0
    def read_files(self, input_dir, file_format, compression=None):
        """ Build one TrClSeq record per alignment file found in input_dir.

        Files are matched by extension: fa, fas and fasta for 'fasta', phy
        for 'phylip'. When compression ('gz' or 'bz2') is given it is
        appended to each extension, e.g. 'fa.gz'.

        Returns the list of records, ordered by sort_key applied to the
        file paths. Nothing is stored on self; the method reads
        self.datatype and self.tmpdir. """

        optioncheck(compression, [None, 'gz', 'bz2'])
        # Reject unknown formats up front; previously an unsupported value
        # left `extensions` unbound and failed with UnboundLocalError.
        optioncheck(file_format, ['fasta', 'phylip'])

        extensions_by_format = {
            'fasta': ['fa', 'fas', 'fasta'],
            'phylip': ['phy'],
        }
        extensions = extensions_by_format[file_format]

        if compression:
            extensions = ['.'.join([x, compression]) for x in extensions]

        files = fileIO.glob_by_extensions(input_dir, extensions)
        files.sort(key=sort_key)

        return [
            TrClSeq(f,
                    file_format=file_format,
                    datatype=self.datatype,
                    name=get_name(f),
                    tmpdir=self.tmpdir) for f in files
        ]
Beispiel #5
0
    def __init__(
        self,
        records=None,
        input_dir=None,
        file_format='fasta',
        datatype=None,
        tmpdir='/tmp',
        calc_distances=False,
        compression=None,
        analysis=None,
    ):
        """ Build the collection from ready-made records or from a directory
        of alignment files.

        records        - non-empty list of record objects; takes precedence
                         over input_dir when both are given
        input_dir      - directory searched for alignments via read_files()
        file_format    - 'fasta' or 'phylip' (only checked when reading
                         from input_dir)
        datatype       - 'dna' or 'protein'; with records it defaults to the
                         datatype of the first record
        tmpdir         - working directory, validated by directorycheck()
        calc_distances - when true, self.calc_distances() is called at the
                         end of construction
        compression    - passed through to read_files()
        analysis       - accepted but not used by this constructor

        Raises NoRecordsError when no records end up loaded, including the
        case where neither records nor input_dir was supplied. """

        self.tmpdir = directorycheck(tmpdir)

        if records:
            self.records = records
            self.datatype = datatype or records[0].datatype
            optioncheck(self.datatype, ['dna', 'protein'])

        elif input_dir:
            directorycheck(input_dir)
            self.datatype = optioncheck(datatype, ['dna', 'protein'])
            optioncheck(file_format, ['fasta', 'phylip'])
            self.records = self.read_files(input_dir, file_format, compression)

        else:
            print('Provide a list of records, or the path to a set of alignments')
            # Without this the check below fails with AttributeError rather
            # than the intended NoRecordsError.
            self.records = []

        if not self.records:
            raise NoRecordsError(file_format, input_dir, compression)

        if calc_distances:
            self.calc_distances()
Beispiel #6
0
 def get_decomp(self, method='MDS', **kwargs):
     """ Return a decomposition of self.dm.

     method is 'MDS' or 'spectral'; extra keyword arguments are passed on
     to the spectral decomposition only. """
     optioncheck(method, ['MDS', 'spectral'])
     decomposer = Clustering(self.dm)

     if method == 'MDS':
         return decomposer.MDS_decomp()

     if method != 'spectral':
         return None

     return decomposer.spectral_decomp(**kwargs)
Beispiel #7
0
 def __new__(
     cls,
     trees,
     metric,
     tmpdir='/tmp',
     dtype=float,
     add_noise=False,
     normalise=False,
     ):
     """ Create the matrix as a view of the array returned by
     get_distance_matrix() for `trees` under `metric`.

     metric must be one of 'euc', 'geo', 'rf', 'wrf'. The metric and tmpdir
     are recorded on the new object. When add_noise is true the result of
     the object's own add_noise() method is returned instead. """
     optioncheck(metric, ['euc', 'geo', 'rf', 'wrf'])
     distances = get_distance_matrix(trees, metric, tmpdir,
                                     normalise=normalise)
     matrix = np.asarray(distances, dtype).view(cls)
     matrix.metric = metric
     matrix.tmpdir = tmpdir
     return matrix.add_noise() if add_noise else matrix
Beispiel #8
0
    def embedding_plotter(
        self,
        coordinates,
        dimensions,
        partition=None,
        add_sphere=False,
        xlab='PCo1',
        ylab='PCo2',
        zlab='PCo3',
        title='Trees embedded in dimension-reduced space',
        outfile=False,
    ):
        """ Scatter-plot embedded coordinates in 2 or 3 dimensions.

        Points are coloured according to cluster membership specified
        by Partition object (or all black if no Partition specified).

        coordinates - array whose transpose is unpacked into ax.scatter
                      (presumably one row per point, one column per
                      dimension - confirm against callers)
        dimensions  - 2 or 3
        partition   - object exposing partition_vector, a sequence of integer
                      cluster indices used to pick colours
        add_sphere  - 3D only: pass the axes through self.sphere()
        xlab, ylab, zlab, title - axis labels and plot title
        outfile     - when truthy, the figure is saved as '<outfile>.pdf'

        Returns the matplotlib figure. """

        optioncheck(dimensions, [2, 3])
        partition = partition or Partition(
            tuple([6] * len(self.collection.records)))

        colours = 'bgrcmyk'
        # Cycle through the palette so that partitions with more clusters
        # than colours reuse colours instead of raising IndexError.
        n_colours = len(colours)
        colour_mapping = np.array(
            [colours[i % n_colours] for i in partition.partition_vector])
        fig = plt.figure()

        if dimensions == 3:
            ax = fig.add_subplot(111,
                                 projection='3d',
                                 xlabel=xlab,
                                 ylabel=ylab,
                                 zlabel=zlab,
                                 title=title)
            if add_sphere:
                ax = self.sphere(ax)

        else:
            ax = fig.add_subplot(111, xlabel=xlab, ylabel=ylab, title=title)

        ax.scatter(*coordinates.T, color=colour_mapping)

        if outfile:
            fig.savefig('{0}.pdf'.format(outfile))

        return fig
Beispiel #9
0
    def __init__(
        self,
        records,
        analysis,
        max_guidetrees=10,
        tmpdir=None,
        datatype=None,
        verbosity=0,
    ):
        """ Store the records and settings for later use.

        analysis must be one of 'ml', 'nj', 'TreeCollection'. datatype and
        tmpdir fall back to the values held by the first record when not
        supplied; datatype must be 'protein' or 'dna'. Starts with an empty
        concats dict and an empty history list. """

        self.analysis = optioncheck(analysis, ['ml', 'nj', 'TreeCollection'])
        self.max_guidetrees = max_guidetrees
        self.records = records

        if datatype:
            self.datatype = datatype
        else:
            self.datatype = records[0].datatype

        self.verbosity = verbosity
        optioncheck(self.datatype, ['protein', 'dna'])

        if tmpdir:
            self.tmpdir = tmpdir
        else:
            self.tmpdir = records[0].tmpdir

        self.concats = {}
        self.history = []
Beispiel #10
0
    def __init__(
        self,
        records,
        analysis,
        max_guidetrees=10,
        tmpdir=None,
        datatype=None,
        verbosity=0,
        ):
        """ Record the inputs and settings on the instance.

        analysis is validated against 'ml', 'nj' and 'TreeCollection'; the
        datatype ('protein' or 'dna') and tmpdir default to those of the
        first record. concats and history start out empty. """

        allowed_analyses = ['ml', 'nj', 'TreeCollection']
        self.analysis = optioncheck(analysis, allowed_analyses)
        self.max_guidetrees = max_guidetrees
        self.records = records
        self.datatype = datatype if datatype else records[0].datatype
        self.verbosity = verbosity
        optioncheck(self.datatype, ['protein', 'dna'])
        self.tmpdir = tmpdir if tmpdir else records[0].tmpdir
        self.concats = dict()
        self.history = list()
Beispiel #11
0
 def __new__(
     cls,
     trees,
     metric,
     tmpdir='/tmp',
     dtype=float,
     add_noise=False,
     normalise=False,
 ):
     """ Build the matrix object from pairwise tree distances.

     The array from get_distance_matrix() is converted to `dtype` and viewed
     as `cls`; metric (one of 'euc', 'geo', 'rf', 'wrf') and tmpdir are
     attached as attributes. With add_noise set, the object returned by its
     add_noise() method is handed back instead. """
     optioncheck(metric, ['euc', 'geo', 'rf', 'wrf'])
     raw = get_distance_matrix(trees, metric, tmpdir, normalise=normalise)
     instance = np.asarray(raw, dtype).view(cls)
     instance.metric = metric
     instance.tmpdir = tmpdir
     if not add_noise:
         return instance
     return instance.add_noise()
Beispiel #12
0
    def read_files(self, input_dir, file_format, compression=None):
        """ Build one TrClSeq record per alignment file found in input_dir.

        Files are matched by extension: fa, fas and fasta for 'fasta', phy
        for 'phylip'. When compression ('gz' or 'bz2') is given it is
        appended to each extension, e.g. 'fa.gz'.

        Returns the list of records, ordered by sort_key applied to the
        file paths. Nothing is stored on self; the method reads
        self.datatype and self.tmpdir. """

        optioncheck(compression, [None, 'gz', 'bz2'])
        # Reject unknown formats up front; previously an unsupported value
        # left `extensions` unbound and failed with UnboundLocalError.
        optioncheck(file_format, ['fasta', 'phylip'])

        extensions_by_format = {
            'fasta': ['fa', 'fas', 'fasta'],
            'phylip': ['phy'],
        }
        extensions = extensions_by_format[file_format]

        if compression:
            extensions = ['.'.join([x, compression]) for x in extensions]

        files = fileIO.glob_by_extensions(input_dir, extensions)
        files.sort(key=sort_key)

        return [TrClSeq(f, file_format=file_format, datatype=self.datatype,
                name=get_name(f), tmpdir=self.tmpdir) for f in files]
Beispiel #13
0
    def simulate(self, index_list, model=None):
        """ Simulate a group of sequence alignments using ALF. Uses one of
        {(GCB, JTT, LG, WAG - protein), (CPAM, ECM and ECMu - DNA)}, WAG by
        default. TO DO: add parameterised models when I have a robust (probably
        PAML) method of estimating them from alignment+tree

        The code visible here only chooses and validates the model:
        protein data defaults to 'WAG', anything else to 'ECM'. An invalid
        model for non-protein data prints a hint and returns None; for
        protein data whatever optioncheck raises is left to propagate.

        NOTE(review): index_list is not used by the code shown, and the
        protein branch also accepts the DNA models - confirm both are
        intended. """

        if self.datatype == 'protein':  # set some defaults
            model = model or 'WAG'
            optioncheck(model, [
                'CPAM',
                'ECM',
                'ECMu',
                'WAG',
                'JTT',
                'GCB',
                'LG',
                ])
        else:
            model = model or 'ECM'
            try:
                optioncheck(model, ['CPAM', 'ECM', 'ECMu'])
            except OptionError as e:
                # 'as' and the single-argument print() are valid on both
                # Python 2.6+ and Python 3 and emit the same text as before.
                print('Choose a DNA-friendly model for simulation:\n' + str(e))
                return
Beispiel #14
0
    def simulate(self, index_list, model=None):
        """ Simulate a group of sequence alignments using ALF. Uses one of
        {(GCB, JTT, LG, WAG - protein), (CPAM, ECM and ECMu - DNA)}, WAG by
        default. TO DO: add parameterised models when I have a robust (probably
        PAML) method of estimating them from alignment+tree

        The code visible here only chooses and validates the model:
        protein data defaults to 'WAG', anything else to 'ECM'. An invalid
        model for non-protein data prints a hint and returns None; for
        protein data whatever optioncheck raises is left to propagate.

        NOTE(review): index_list is not used by the code shown, and the
        protein branch also accepts the DNA models - confirm both are
        intended. """

        if self.datatype == 'protein':  # set some defaults
            model = model or 'WAG'
            optioncheck(model, [
                'CPAM',
                'ECM',
                'ECMu',
                'WAG',
                'JTT',
                'GCB',
                'LG',
            ])
        else:
            model = model or 'ECM'
            try:
                optioncheck(model, ['CPAM', 'ECM', 'ECMu'])
            except OptionError as e:
                # 'as' and the single-argument print() are valid on both
                # Python 2.6+ and Python 3 and emit the same text as before.
                print('Choose a DNA-friendly model for simulation:\n' + str(e))
                return
Beispiel #15
0
import argparse

# Command-line interface: cluster count plus a set of string options.
parser = argparse.ArgumentParser(description='Clustering optimiser')
parser.add_argument('-n', '--nclusters', type=int)

# The remaining options differ only in their flags and default value.
for _flags, _default in [
    (('-f', '--format'), 'phylip'),
    (('-d', '--datatype'), 'protein'),
    (('-i', '--input_dir'), './'),
    (('-c', '--compression'), None),
    (('-t', '--tmpdir'), '/tmp/'),
    (('-m', '--method'), 's'),
    (('-o', '--output'), None),
]:
    parser.add_argument(*_flags, default=_default)

args = parser.parse_args()

# Both short and long spellings of each clustering method are accepted.
optioncheck(
    args.method,
    ['s', 'spectral', 'h', 'hierarchical', 'k', 'kmedoids', 'MDS', 'mds'])


def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    """Return a string of `size` characters drawn at random from `chars`.

    By default the characters are upper-case ASCII letters and digits.
    """
    picks = [random.choice(chars) for _ in range(size)]
    return ''.join(picks)

# Work in a freshly created scratch directory (prefix 'tmpwrap_mgp_') placed
# under the directory given by --tmpdir.
new_tmpdir = tempfile.mkdtemp(prefix='tmpwrap_mgp_', dir=args.tmpdir)

# Build the collection from the command-line options, pointing it at the
# scratch directory created above.
c = Collection(input_dir=args.input_dir,
               compression=args.compression, file_format=args.format, datatype=args.datatype,
               tmpdir=new_tmpdir)

# NOTE(review): presumably builds a neighbour-joining tree per record -- confirm.
c.calc_NJ_trees()

# Distance matrix under the 'euc' metric, handed to Clustering.
dm = c.distance_matrix('euc')
cl = Clustering(dm)
Beispiel #16
0
    def decomp_to_coords(self, decomp, dimensions, normalise=False):
        """ Extract coordinates in 2 or 3 dimensions from a decomposition.

        Takes the first item returned by decomp.coords_by_dimension() and,
        when normalise is true, returns its normalise_rows() result. """
        optioncheck(dimensions, [2,3])

        embedded = decomp.coords_by_dimension(dimensions)[0]
        if normalise:
            return embedded.normalise_rows()
        return embedded
Beispiel #17
0
    def decomp_to_coords(self, decomp, dimensions, normalise=False):
        """ Return the coordinates of `decomp` in `dimensions` (2 or 3)
        dimensions, passed through normalise_rows() when requested. """
        optioncheck(dimensions, [2, 3])

        by_dimension = decomp.coords_by_dimension(dimensions)
        points = by_dimension[0]
        if not normalise:
            return points
        return points.normalise_rows()