コード例 #1
0
    except AssertionError:
        print 'Can\'t get partitions to match'
        return -1
    return partition_list

def sort_key(item):
    """Return a natural-sort key built from ``item.name``.

    The name is split into alternating runs of digits and non-digits; digit
    runs become ints and everything else stays a string, so that e.g. a name
    ending in 10 sorts after one ending in 9.
    """
    pieces = []
    for digits, text in re.findall(r'(\d+)|(\D+)', item.name):
        # Exactly one of the two groups is non-empty for each match.
        pieces.append(int(digits) if digits else text)
    return tuple(pieces)

# Load the records selected by the first two command-line arguments and sort
# them with sort_key; the third argument names the records used to build the
# lookup dictionary `d`.
print 'loading recs...'
recs = sorted(load_records(sys.argv[1], sys.argv[2]), key=sort_key)
print 'loading crecs...'
crecs = load_records(sys.argv[3])
d = make_target_dict(crecs)
print 'done.'

# Reference labelling: four classes of 15 members each (60 labels).
t = [1]*15+[2]*15+[3]*15+[4]*15
# Start from an empty collection and attach the loaded records directly.
c = SequenceCollection()
c.records = recs

# For each metric, compute its distance matrix once, then store one rebuilt
# partition per clustering method under the key (metric, method, 4).
for metric in ['sym', 'euc', 'geo']:
    c.put_distance_matrices(metric, gtp_path = '/homes/kgori/research/clustering_project/class_files')
    for method in ['single', 'complete', 'average', 'ward', 'MDS', 'spectral', 'kmedoids']:
        p = order(rebuild_partitions(recs, d, metric, method))
        c.clustering.partitions[(metric, method, 4)] = p
# The reference labelling is stored next to the computed partitions.
c.clustering.partitions['true'] = t
c.put_clusters()
print '(Done).'

for rec in c.get_cluster_records():
    try:
        rec.tree = d[rec.name].tree
    except KeyError:
コード例 #2
0
ファイル: test_code.py プロジェクト: kgori/clustering_project
    
5)  Compare scores derived from clusters to random permutation of the original data
    either by making a copy of the SequenceCollection object, with clusters made up
    of the same number of genes with the same number of characters, or by randomising
    the alignments and performing hierarchical clustering on the randomised data

    if the former, do rand1 = col.make_randomised_copy
    if the latter, do rand2 = SequenceCollection(records=col.get_randomised_alignments(),
        datatype = 'protein')
"""


# Input directory; the commented-out line is an alternative data set.
# indir = '/Users/kgori/git/kevin/yeast_data/MSA'
indir = '/Users/kgori/git/kevin/data/simulated_data/eight/MSA'

# `col` holds the alignments read from indir; `ran` is built from the
# randomised alignments that `col` produces.
col = SequenceCollection(indir, datatype='protein')
ran = SequenceCollection(records=col.get_randomised_alignments(), datatype='protein')
# Both collections go through the same steps with the same settings:
# trees, then partitions, then clusters and cluster trees.
col.put_trees_parallel()
ran.put_trees_parallel()
col.put_partitions(metrics=['euc','rf','sym'], linkages=['ward'], nclasses=[2,3,4,5,6,7,8,9,10])
ran.put_partitions(metrics=['euc','rf','sym'], linkages=['ward'], nclasses=[2,3,4,5,6,7,8,9,10])
col.put_clusters()
col.put_cluster_trees_parallel()
ran.put_clusters()
ran.put_cluster_trees_parallel()
# Second randomisation, made from the already-clustered `col`.
rn2 = col.make_randomised_copy()

# Clusters of the two randomised collections and of the original one.
r1 = ran.get_clusters()
r2 = rn2.get_clusters()
cl = col.get_clusters()
コード例 #3
0
        '\t')[1])

# Every record from both tree-building programs is marked as DNA data.
for rec in phymlrecords:
    rec.datatype = 'dna'
for rec in bionjrecords:
    rec.datatype = 'dna'

# Both record sets must hold exactly 60 entries.  This is an explicit test
# rather than an assert wrapped in a bare try/except: asserts are stripped
# under `python -O`, and a bare except would also hide unrelated errors.
if not (len(phymlrecords) == len(bionjrecords) == 60):
    print('Missing records in {0}'.format(indir))
    sys.exit(1)

# One collection per tree-building program, built with identical settings.
phyml_sc = SequenceCollection(records=phymlrecords,
                              datatype='dna',
                              helper=os.environ['DARWINHELPER'],
                              tmpdir=tmpdir,
                              get_distances=False)
bionj_sc = SequenceCollection(records=bionjrecords,
                              datatype='dna',
                              helper=os.environ['DARWINHELPER'],
                              tmpdir=tmpdir,
                              get_distances=False)

# Partition the phyml collection into 4 classes for every metric/method pair.
phyml_sc.put_partitions(
    ['geo', 'euc', 'sym'],
    ['single', 'complete', 'average', 'ward', 'kmedoids', 'MDS', 'spectral'],
    4,
    gtp_path=
    '/net/isilon7/nobackup/research/goldman/kevin/clustering_project/class_files',
    tmpdir=tmpdir)
コード例 #4
0
# NOTE(review): inside the character class, '.-_' is a range from '.' to '_'
# and therefore also matches '/', so a path in argv[0] is kept in full.
# Left unchanged so the reported program name stays the same.
progname = re.compile('[A-Za-z0-9.-_]+').search(sys.argv[0]).group()
desc = 'Read in a SequenceCollection from disk and dump records'
input_help = 'Filepath+name of gzipped SequenceCollection object'
output_help = 'Directory to dump files in'
choice_help = \
    '\n'.join(['Choose to dump post-clustering concatenated records',
              'instead of pre-clustering single records'])
parser = argparse.ArgumentParser(prog=progname, description=desc)
# Both paths are mandatory: without them the script used to fail later with
# an AttributeError on None instead of a usage message.
parser.add_argument('-i', dest='input_file', help=input_help, type=str,
                    required=True)
parser.add_argument('-o', dest='output_dir', help=output_help, type=str,
                    required=True)
# choice_help was built but never attached to the flag it describes.
parser.add_argument('-c', dest='cluster_recs', action='store_true',
                    help=choice_help)

args = parser.parse_args()
input_file = args.input_file
output_dir = args.output_dir.rstrip('/')
cluster_recs = args.cluster_recs

filecheck_and_quit(input_file)
directorycheck_and_make(output_dir)

# Imported only after the arguments have been checked.
from sequence_collection import SequenceCollection

sc = SequenceCollection.gunzip(input_file)
# Pick the record set explicitly in both cases, then dump it once.
if cluster_recs:
    records = sc.get_cluster_records()
else:
    records = sc.get_records()
sc.dump_records(output_dir, records)
コード例 #5
0
        ])
else:
    helper = os.environ['DARWINHELPER']

# Temporary directory comes from the environment, falling back to /tmp.
# A plain mapping lookup with a default replaces the bare `except:`, which
# would also have swallowed unrelated errors such as KeyboardInterrupt.
TMPDIR = os.environ.get('TEMPORARY_DIRECTORY', '/tmp')

### MAIN
# Distances are only requested when the tree program is 'treecollection'.
get_distances = (treeprog == 'treecollection')
sc = SequenceCollection(indir, file_format=file_format,
                        gtp_path=gtp_path, datatype=datatype,
                        get_distances=get_distances, tmpdir=TMPDIR, helper=helper)

sc.put_trees(program=treeprog)
# Recalculate partitions for three metrics and ten clustering methods.
sc.put_partitions(['geo', 'euc', 'rf'], [
    'average',
    'complete',
    'kmedoids',
    'MDS',
    'single',
    'spectral00',
    'spectral01',
    'spectral10',
    'spectral11',
    'ward',
    ], nclasses, recalculate=True)
コード例 #6
0
    'average',
    'ward',
    'kmedoids',
    'spectral',
    'MDS',
    ]
calc_varinf = False

# Stop with exit status 1 when the output file is already present.
if os.path.isfile(outf):
    sys.exit(1)

sc = SequenceCollection(seqdir, tmpdir=TMPDIR, gtp_path=GTP_PATH,
                        helper=HELPER, file_format=format,
                        datatype=datatype, parallel_load=False,
                        get_distances=False)

sc.put_trees(
    program='bionj',
    model='GTR',
    tmpdir=TMPDIR,
    ncat=4,
    datatype='nt',
)

# sc.put_distance_matrices(['rf'])
# print sc.get_distance_matrices()['rf']
# sys.exit()

# Same methods and class counts for each metric, one call per metric and in
# the same order as before: rf, euc, geo.
for metric in ('rf', 'euc', 'geo'):
    sc.put_partitions(metric, methods, nclasses, recalculate=True)
コード例 #7
0
#!/usr/bin/python
# -*- coding: utf-8 -*-

from sequence_collection import SequenceCollection
import cPickle
import time

indir = '/Users/kgori/git/kevin/data/simulated_data/small/MSA'

print 'test directory = ', indir

# Timed span 1: building the collection from the directory (and printing it).
load_start = time.time()
print 'loading sequences (parallel)'
col = SequenceCollection(indir, datatype='protein')
print col
load_end = time.time()

# Timed span 2: tree building; printing each record's name and tree happens
# inside the timed span.
tcpar_start = time.time()
print 'putting TC trees (parallel)'
col.put_trees_parallel(program='treecollection', tmpdir='/tmp')
for rec in col.records:
    print rec.name
    print rec.tree
tcpar_end = time.time()

# Timed span 3 starts here: partitions for 2 metrics x 2 linkages x 4 class
# counts.
par_start = time.time()
print 'Putting partitions'
col.put_partitions(metrics=['sym', 'euc'],
                   linkages=['ward', 'single'],
                   nclasses=[3, 4, 5, 6])
print col.get_partitions()
コード例 #8
0
from pylab import *
print 'done.'


def print_dict(d):
    """Print one ``key value`` line per entry of *d*, in sorted key order.

    The previous body printed the entire dictionary once for every key
    instead of printing the entry belonging to that key.
    """
    for k in sorted(d):
        # Single-argument form prints the same text under Python 2 and 3.
        print('{0} {1}'.format(k, d[k]))


# Wide, 3-decimal numpy output for the matrices printed by this script.
np.set_printoptions(linewidth=200, precision=3)
# Collection built from the phylip protein alignments in the scratch
# directory; distance calculation is switched off and parallel loading is on.
sc = SequenceCollection(
    '/Users/kgori/scratch/chk/aa_alignments/',
    get_distances=False,
    file_format='phylip',
    helper='/Users/kgori/git/kevin/clustering_project/class_files/DV_wrapper.drw'
        ,
    parallel_load=True,
    gtp_path='/Users/kgori/git/kevin/clustering_project/class_files/',
    tmpdir='/tmp',
    datatype='protein',
    )

sc_yeast = SequenceCollection(
    '/Users/kgori/scratch/yeast_MSA',
    get_distances=False,
    file_format='phylip',
    helper='/Users/kgori/git/kevin/clustering_project/class_files/DV_wrapper.drw'
        ,
    parallel_load=True,
    gtp_path='/Users/kgori/git/kevin/clustering_project/class_files/',
    tmpdir='/tmp/yeast',
コード例 #9
0
from tree import Tree
from clustering import Clustering
import cPickle
import time
import os
import copy
import numpy as np
# Wide, 2-decimal numpy output.
np.set_printoptions(precision=2, linewidth=200)

indir = '/Users/kgori/git/kevin/data/simulated_data/small/MSA'

print 'test directory = ', indir

# Timed span 1: building the collection from the directory (and printing it).
load_start = time.time()
print 'loading sequences (parallel)'
col = SequenceCollection(indir, datatype='protein')
print col
load_end = time.time()

# Timed span 2: sequential tree building; the per-record printing is inside
# the timed span.
tcseq_start = time.time()
print 'getting TC trees (sequential)'
col.get_trees(program='treecollection', tmpdir='/tmp')
for rec in col.records:
    print rec.name
    print rec.tree
tcseq_end = time.time()

# Timed span 3 starts here: the same trees again through the parallel call.
tcpar_start = time.time()
print 'getting TC trees (parallel)'
col.get_trees_parallel(program='treecollection', tmpdir='/tmp')
for rec in col.records:
コード例 #10
0
from pylab import *
print 'done.'


def print_dict(d):
    """Print one ``key value`` line per entry of *d*, in sorted key order.

    The previous body printed the entire dictionary once for every key
    instead of printing the entry belonging to that key.
    """
    for k in sorted(d):
        # Single-argument form prints the same text under Python 2 and 3.
        print('{0} {1}'.format(k, d[k]))


# Wide, 3-decimal numpy output for the matrices printed by this script.
np.set_printoptions(linewidth=200, precision=3)
# Collection built from the phylip protein alignments in the scratch
# directory; distance calculation is switched off and parallel loading is on.
sc = SequenceCollection(
    '/Users/kgori/scratch/chk/aa_alignments/',
    get_distances=False,
    file_format='phylip',
    helper=
    '/Users/kgori/git/kevin/clustering_project/class_files/DV_wrapper.drw',
    parallel_load=True,
    gtp_path='/Users/kgori/git/kevin/clustering_project/class_files/',
    tmpdir='/tmp',
    datatype='protein',
)

sc_yeast = SequenceCollection(
    '/Users/kgori/scratch/yeast_MSA',
    get_distances=False,
    file_format='phylip',
    helper=
    '/Users/kgori/git/kevin/clustering_project/class_files/DV_wrapper.drw',
    parallel_load=True,
    gtp_path='/Users/kgori/git/kevin/clustering_project/class_files/',
    tmpdir='/tmp/yeast',
コード例 #11
0
#!/usr/bin/python
# -*- coding: utf-8 -*-

from sequence_collection import SequenceCollection
import cPickle
import time

indir = "/Users/kgori/git/kevin/data/simulated_data/small/MSA"

print "test directory = ", indir

# Timed span 1: building the collection from the directory (and printing it).
load_start = time.time()
print "loading sequences (parallel)"
col = SequenceCollection(indir, datatype="protein")
print col
load_end = time.time()

# Timed span 2: tree building; printing each record's name and tree happens
# inside the timed span.
tcpar_start = time.time()
print "putting TC trees (parallel)"
col.put_trees_parallel(program="treecollection", tmpdir="/tmp")
for rec in col.records:
    print rec.name
    print rec.tree
tcpar_end = time.time()

# Timed span 3: partitions for 2 metrics x 2 linkages x 4 class counts,
# including printing the result.
par_start = time.time()
print "Putting partitions"
col.put_partitions(metrics=["sym", "euc"], linkages=["ward", "single"], nclasses=[3, 4, 5, 6])
print col.get_partitions()
par_end = time.time()
コード例 #12
0
                    type=fpath,
                    default='/tmp')
parser.add_argument('-data', '--datatype', help='datatype', default=None)

args = vars(parser.parse_args())
outdir = args['directory']
program = args['program']
model = args['model']
ncat = args['ncat']
datatype = args['datatype']
gtp_path = os.environ['GTP_PATH']
tmpdir = os.environ['TEMPORARY_DIRECTORY']
tmpdir = args['tmpdir']
print 'Reading alignments into SequenceRecord object'
seq = SequenceCollection('{0}/dna_alignments'.format(outdir),
                         datatype='dna',
                         tmpdir=tmpdir,
                         helper=os.environ['DARWINHELPER'])
print 'Calculating trees'
print program, model, datatype, ncat, tmpdir
seq.put_trees(program=program,
              model=model,
              datatype=datatype,
              ncat=ncat,
              tmpdir=tmpdir)
print 'doing geodesic distance matrices'
seq.put_distance_matrices('geo', gtp_path=gtp_path, tmpdir=tmpdir)
print 'doing euc distance matrices'
seq.put_distance_matrices('euc')
print 'doing sym distance matrices'
seq.put_distance_matrices('sym')
コード例 #13
0
        tree.read_from_file('{0}/trees/{1}.nwk'.format(working_dir,
                            name))

    dv_matrix_strip_header = '\n'.join(dv_matrix.split('\n'
            )[2:]).rstrip()
    labels_strip_header = labels.split('\n')[1].rstrip()
    record = TCSeqRec()
    record.dv = [(dv_matrix_strip_header, labels_strip_header)]
    record.tree = tree
    record.name = name
    record.headers = labels_strip_header.split()
    record.sequences = ['' for _ in record.headers]
    record._update()
    records.append(record)

# Build a collection from the records assembled above, without computing
# distances at construction time.
collection = SequenceCollection(records=records, get_distances=False,
                                gtp_path=os.environ['GTP_PATH'])
collection.put_distance_matrices('rf')
# Cluster on the 'rf' distance matrix.  T is used below both as a dict key
# and as an argument to max(), so it has to be hashable and iterable
# (presumably a tuple of cluster labels -- TODO confirm).
T = \
    collection.Clustering.run_spectral_rotate(collection.distance_matrices['rf'
        ])
collection.partitions[T] = Partition(T)
# max(T) fills the third slot of the key, where the other scripts put the
# number of classes.
collection.clusters_to_partitions[('rf', 'spectral_rotate', max(T))] = T
collection.concatenate_records()
cluster_recs = collection.get_cluster_records()

number_of_clusters = len(cluster_recs)
for j in range(number_of_clusters):
    record = cluster_recs[j]
    record_dv = record.dv[0]
    labels = record.dv[1]
コード例 #14
0
datatype = args['datatype']
score = args['score']

# Directory checks run before the collection is built.
directorycheck_and_quit(input_dir)
directorycheck_and_make(tmpdir)

gtp_path = os.environ['GTP_PATH']
helper = os.environ['DARWINHELPER']

from sequence_collection import SequenceCollection

sc = SequenceCollection(input_dir, file_format='phylip', datatype=datatype,
                        helper=helper, gtp_path=gtp_path, tmpdir=tmpdir,
                        overwrite=True)

sc.load_phyml_results(input_dir, program=None)

# Keep the second element of every autotune result, keyed by distance.
sc.quality_scores = {}
for dist in distance:
    _, qs = sc.autotune(dist, min_groups=min_clusters,
                        max_groups=max_clusters)
    sc.quality_scores[dist] = qs

# min_clusters..max_clusters inclusive, with 1 put in front when the range
# starts above 1.
cluster_range = list(range(min_clusters, max_clusters + 1))
if min_clusters > 1:
    cluster_range = [1] + cluster_range
コード例 #15
0
        (simdir, 'bionj_clustering'))),
                             key=sort_key)
else:
    records = sorted(load_records('/'.join((simdir, record_dir)),
                                  '*.ml.pickle'),
                     key=sort_key)
    cluster_records = sorted(load_records('/'.join(
        (simdir, 'phyml_clustering'))),
                             key=sort_key)

# Lookup dictionary made from the sorted cluster records.
cluster_dic = make_target_dict(cluster_records)
print '(Done).'

#rebuild sequenceCollection object

# Start from an empty collection and attach the sorted records directly.
sc = SequenceCollection()
sc.records = records

print 'Generating distance matrices...'
for metric in ['euc', 'sym', 'geo']:
    sc.put_distance_matrices(
        metric,
        gtp_path='/homes/kgori/research/clustering_project/class_files',
        tmpdir=tmpdir)
    for method in [
            'single', 'complete', 'ward', 'average', 'spectral', 'MDS',
            'kmedoids'
    ]:
        partition = rebuild_partitions(records,
                                       cluster_dic,
                                       metric=metric,
コード例 #16
0
    'average',
    'ward',
    'kmedoids',
    'spectral',
    'MDS',
]
calc_varinf = False

# Stop with exit status 1 (and no message) when the output file already exists.
if os.path.isfile(outf):
    sys.exit(1)

# Collection built without parallel loading and without distance calculation.
# NOTE(review): `format` is also the name of a builtin; confirm the script
# assigns it before this point.
sc = SequenceCollection(
    seqdir,
    tmpdir=TMPDIR,
    gtp_path=GTP_PATH,
    helper=HELPER,
    file_format=format,
    datatype=datatype,
    parallel_load=False,
    get_distances=False,
)

# Trees from 'bionj' with the GTR model and 4 rate categories.
sc.put_trees(program='bionj',
             model='GTR',
             tmpdir=TMPDIR,
             ncat=4,
             datatype='nt')

# sc.put_distance_matrices(['rf'])
# print sc.get_distance_matrices()['rf']
# sys.exit()
コード例 #17
0
parser.add_argument('-m', '--model', help='which model to use in phylogenetic inference', default=None)
parser.add_argument('-n', '--ncat', help='number of categories for gamma distributed rates', default=1)
parser.add_argument('-t', '--tmpdir', help='temporary directory', type=fpath, default='/tmp')
parser.add_argument('-data', '--datatype', help='datatype', default=None)

args = vars(parser.parse_args())
outdir = args['directory']
program = args['program']
model = args['model']
ncat = args['ncat']
datatype = args['datatype']
gtp_path = os.environ['GTP_PATH']
tmpdir = os.environ['TEMPORARY_DIRECTORY']
tmpdir = args['tmpdir']
print 'Reading alignments into SequenceRecord object'
seq = SequenceCollection('{0}/dna_alignments'.format(outdir), datatype='dna', tmpdir=tmpdir, helper=os.environ['DARWINHELPER'])
print 'Calculating trees'
print program,model,datatype,ncat,tmpdir
seq.put_trees(program=program, model=model, datatype=datatype, ncat=ncat, tmpdir=tmpdir)
print 'doing geodesic distance matrices'
seq.put_distance_matrices('geo', gtp_path=gtp_path, tmpdir=tmpdir)
print 'doing euc distance matrices'
seq.put_distance_matrices('euc')
print 'doing sym distance matrices'
seq.put_distance_matrices('sym')

print 'getting score for true clustering'
with open('{0}/treedistances.txt'.format(outdir)) as file:
	T = file.readline().rstrip().split('\t')[1][1:-1].split(', ')
seq.clustering.partitions['true'] = T
seq.put_clusters()
コード例 #18
0
def _load_pickle(path):
    """Unpickle one object from *path*, closing the file afterwards."""
    with open(path) as handle:
        return cPickle.load(handle)


# Each pickle is now closed as soon as it has been read, instead of leaving
# one open handle per file to the garbage collector.
bionjrecords = sorted([_load_pickle(x) for x in bionjpickles],
                      key=lambda x: sort_key(x.name))

# The reference partition is the second tab-separated field on the first
# line of treedistances.txt.
# NOTE(review): eval() executes whatever that field contains; this is only
# acceptable while the file is produced by this project.  Consider
# ast.literal_eval if the field is always a plain literal.
with open('{0}/treedistances.txt'.format(indir)) as handle:
    true = eval(handle.read().split('\n')[0].split('\t')[1])

for rec in phymlrecords:
    rec.datatype = 'dna'
for rec in bionjrecords:
    rec.datatype = 'dna'

# Explicit test instead of an assert inside a bare try/except: asserts are
# stripped under `python -O`, and a bare except hides unrelated errors.
if not (len(phymlrecords) == len(bionjrecords) == 60):
    print('Missing records in {0}'.format(indir))
    sys.exit(1)

phyml_sc = SequenceCollection(records=phymlrecords, datatype='dna', helper=os.environ['DARWINHELPER'], tmpdir=tmpdir, get_distances=False)
bionj_sc = SequenceCollection(records=bionjrecords, datatype='dna', helper=os.environ['DARWINHELPER'], tmpdir=tmpdir, get_distances=False)

# Same metrics, methods and class count for both collections.
phyml_sc.put_partitions(['geo','euc','sym'],['single','complete','average','ward','kmedoids','MDS','spectral'], 4, gtp_path='/net/isilon7/nobackup/research/goldman/kevin/clustering_project/class_files', tmpdir=tmpdir)
bionj_sc.put_partitions(['geo','euc','sym'],['single','complete','average','ward','kmedoids','MDS','spectral'], 4, gtp_path='/net/isilon7/nobackup/research/goldman/kevin/clustering_project/class_files', tmpdir=tmpdir)
phyml_sc.clustering.partitions['true'] = true
bionj_sc.clustering.partitions['true'] = true

phyml_sc.put_clusters()
bionj_sc.put_clusters()

# Create the two output directories next to the input data if needed.
for subdir in ('phyml_clustering', 'bionj_clustering'):
    target = '{0}/{1}'.format(indir, subdir)
    if not os.path.isdir(target):
        os.mkdir(target)
コード例 #19
0
ファイル: test_code.py プロジェクト: haehn/clustering_project
    col.put_cluster_trees_parallel()
    
5)  Compare scores derived from clusters to random permutation of the original data
    either by making a copy of the SequenceCollection object, with clusters made up
    of the same number of genes with the same number of characters, or by randomising
    the alignments and performing hierarchical clustering on the randomised data

    if the former, do rand1 = col.make_randomised_copy
    if the latter, do rand2 = SequenceCollection(records=col.get_randomised_alignments(),
        datatype = 'protein')
"""

# Input directory; the commented-out line is an alternative data set.
# indir = '/Users/kgori/git/kevin/yeast_data/MSA'
indir = '/Users/kgori/git/kevin/data/simulated_data/eight/MSA'

# `col` holds the alignments read from indir; `ran` is built from the
# randomised alignments that `col` produces.
col = SequenceCollection(indir, datatype='protein')
ran = SequenceCollection(records=col.get_randomised_alignments(),
                         datatype='protein')
# Both collections go through the same steps with the same settings:
# trees, then partitions, then clusters and cluster trees.
col.put_trees_parallel()
ran.put_trees_parallel()
col.put_partitions(metrics=['euc', 'rf', 'sym'],
                   linkages=['ward'],
                   nclasses=[2, 3, 4, 5, 6, 7, 8, 9, 10])
ran.put_partitions(metrics=['euc', 'rf', 'sym'],
                   linkages=['ward'],
                   nclasses=[2, 3, 4, 5, 6, 7, 8, 9, 10])
col.put_clusters()
col.put_cluster_trees_parallel()
ran.put_clusters()
ran.put_cluster_trees_parallel()
# Second randomisation, made from the already-clustered `col`.
rn2 = col.make_randomised_copy()
コード例 #20
0
"""
print 'loading sequences...'
col = SequenceCollection(indir, helper=helper, tmpdir=tmpdir)

print 'getting trees...'
col.put_trees_parallel(program='phyml',tmpdir=tmpdir)

print 'getting partitions...'
col.put_partitions(metrics=['sym'],linkages=['ward'],nclasses=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17])
#print 'randomizing bytes...'
col.put_clusters()

#print 'immanentizing the eschaton...'
col.put_cluster_trees_parallel(program='phyml',tmpdir=tmpdir)
"""
plottable = []
#plottable.append(add_to_plot(col,'sym','ward'))
# Load the previously pickled collection; the handle is closed as soon as
# the object has been read (file() left it open).
with open('col.pickle') as handle:
    col = cPickle.load(handle)
#print 'whipping into frenzy...'
for i in range(1):
    # Build a collection from randomised alignments of `col` and run it
    # through trees, partitions (1 to 17 classes), clusters and cluster trees.
    r = SequenceCollection(records=col.get_randomised_alignments(), helper=helper, tmpdir=tmpdir)
    r.put_trees_parallel(program='phyml', tmpdir=tmpdir)
    r.put_partitions(metrics=['sym'], linkages=['ward'], nclasses=list(range(1, 18)))
    r.put_clusters()
    r.put_cluster_trees_parallel(program='phyml', tmpdir=tmpdir)
    plottable.append(add_to_plot(r, 'sym', 'ward'))

# Closing the output file explicitly guarantees the pickle is flushed to
# disk before the script goes on.
with open('plottable{0}.pickle'.format(index), 'w') as handle:
    cPickle.dump(plottable, handle)
#cPickle.dump(col, file('col.pickle','w'))

コード例 #21
0
#!/usr/bin/python
# -*- coding: utf-8 -*-


from sequence_collection import SequenceCollection
import time

indir = '/Users/kgori/git/kevin/data/real_data/yeast_data/MSA'

print 'test directory = ', indir

# Only this span is timed: building the collection and printing it.
load_start = time.time()
print 'loading sequences (parallel)'
col = SequenceCollection(indir, datatype='dna')
print col
load_end = time.time()

# Untimed: trees, partitions for 2 to 16 classes, clusters, cluster trees.
# NOTE(review): confirm that 'geodesic' is a metric name put_partitions
# accepts.
col.put_trees_parallel()
col.put_partitions(metrics=['sym','geodesic'], linkages=['ward'], nclasses=[2,3,4,5,6,7,8,9,10,11,12,13,14,15,16])
col.put_clusters()
col.put_cluster_trees_parallel()

# One-element list, so the format string below receives just the load time.
timings = [
load_end - load_start
]

print 'time = {0:.3f}'.format(*timings)
コード例 #22
0
from tree import Tree
from clustering import Clustering
import cPickle
import time
import os
import copy
import numpy as np
# Wide, 2-decimal numpy output.
np.set_printoptions(precision=2, linewidth=200)

indir = '/Users/kgori/git/kevin/data/simulated_data/small/MSA'

print 'test directory = ', indir

# Timed span 1: building the collection from the directory (and printing it).
load_start = time.time()
print 'loading sequences (parallel)'
col = SequenceCollection(indir, datatype='protein')
print col
load_end = time.time()

# Timed span 2: sequential tree building; the per-record printing is inside
# the timed span.
tcseq_start = time.time()
print 'getting TC trees (sequential)'
col.get_trees(program='treecollection', tmpdir='/tmp')
for rec in col.records:
    print rec.name
    print rec.tree
tcseq_end = time.time()

# Timed span 3 starts here: the same trees again through the parallel call.
tcpar_start = time.time()
print 'getting TC trees (parallel)'
col.get_trees_parallel(program='treecollection', tmpdir='/tmp')
for rec in col.records:
コード例 #23
0
datatype = args['datatype']
score = args['score']

# Directory checks run before the collection is built.
directorycheck_and_quit(input_dir)
directorycheck_and_make(tmpdir)

gtp_path = os.environ['GTP_PATH']
helper = os.environ['DARWINHELPER']

from sequence_collection import SequenceCollection

sc = SequenceCollection(input_dir, file_format='phylip', datatype=datatype,
                        helper=helper, gtp_path=gtp_path, tmpdir=tmpdir,
                        overwrite=True)

sc.load_phyml_results(input_dir, program=None)

# Keep the second element of every autotune result, keyed by distance.
sc.quality_scores = {}
for dist in distance:
    _, qs = sc.autotune(dist, min_groups=min_clusters,
                        max_groups=max_clusters)
    sc.quality_scores[dist] = qs

# min_clusters..max_clusters inclusive, with 1 put in front when the range
# starts above 1.
cluster_range = list(range(min_clusters, max_clusters + 1))
if min_clusters > 1:
    cluster_range = [1] + cluster_range
sc.put_partitions(distance, method, cluster_range)
コード例 #24
0
ファイル: findings.py プロジェクト: kgori/clustering_project
print 'Reading records...'

# some initialisation
if program == 'bionj':
    records = sorted(load_records('/'.join((simdir, record_dir)), '*.nj.pickle'), key=sort_key)
    cluster_records = sorted(load_records('/'.join((simdir, 'bionj_clustering'))), key=sort_key)
else:
    records = sorted(load_records('/'.join((simdir, record_dir)), '*.ml.pickle'), key=sort_key)
    cluster_records = sorted(load_records('/'.join((simdir, 'phyml_clustering'))), key=sort_key)

cluster_dic = make_target_dict(cluster_records)
print '(Done).'

#rebuild sequenceCollection object

sc = SequenceCollection()
sc.records = records

print 'Generating distance matrices...'
for metric in ['euc', 'sym', 'geo']:
    sc.put_distance_matrices(metric, gtp_path = '/homes/kgori/research/clustering_project/class_files', tmpdir=tmpdir)
    for method in ['single', 'complete', 'ward', 'average', 'spectral', 'MDS', 'kmedoids']:
        partition = rebuild_partitions(records, cluster_dic, metric=metric, method=method)
        sc.clustering.partitions[(metric, method, 4)] = partition

sc.clustering.partitions['true'] = [1]*15 + [2]*15 + [3]*15 + [4]*15

sc.put_clusters()
print '(Done).'

for rec in sc.get_cluster_records():
コード例 #25
0
import argparse
import re
import sys
from errors import filecheck_and_quit, directorycheck_and_quit

# NOTE(review): inside the character class, '.-_' is a range from '.' to '_'
# and therefore also matches '/', so a path in argv[0] is kept in full.
# Left unchanged so the reported program name stays the same.
progname = re.compile('[A-Za-z0-9.-_]+').search(sys.argv[0]).group()
desc = 'Read in a SequenceCollection from disk and print scores'
input_help = 'Filepath+name of gzipped SequenceCollection object'
# -t used to show the help text of -i; it names the directory handed to
# load_phyml_results below.
phyml_help = 'Directory containing the phyml results to load'
category_choices = ['Observed', 'Randomised', 'Simulated', 'NA']

parser = argparse.ArgumentParser(prog=progname, description=desc)
# Both options are mandatory: without -t the script used to fail with an
# AttributeError on None instead of a usage message.
parser.add_argument('-i', dest='input_file', help=input_help, type=str,
                    required=True)
parser.add_argument('-t', dest='phyml_dir', help=phyml_help, type=str,
                    required=True)

args = parser.parse_args()
input_file = args.input_file
phyml_dir = args.phyml_dir.rstrip('/')

filecheck_and_quit(input_file)
directorycheck_and_quit(phyml_dir)

# Imported only after the arguments have been checked.
from sequence_collection import SequenceCollection

# Load the collection, attach the phyml results to its cluster records,
# refresh the scores and write the collection back to the same file.
sc = SequenceCollection.gunzip(input_file)
cluster_records = sc.get_cluster_records()
sc.load_phyml_results(phyml_dir, records=cluster_records,
                      use_hashname=True)
sc.update_scores()
sc.gzip(input_file)
コード例 #26
0
print(indir)

rob.r('library(ape)')
rob.r('library(phangorn)')
print('r libraries loaded')

alignments_dir = '{0}/dna_alignments'.format(indir)

# The collection is cached as a pickle next to the input data.  Both the
# load and the dump now close their file handle; the dump in particular
# relied on garbage collection to flush the file.
pickle_path = '{0}/tmpPickle.pkl'.format(indir)
if os.path.isfile(pickle_path):
    with open(pickle_path) as handle:
        sc = cPickle.load(handle)
else:

    sc = SequenceCollection(
        alignments_dir,
        file_format='fasta',
        datatype='dna',
        gtp_path=gtp_path,
        helper=helper,
        tmpdir=tmpdir,
        )

    # bionj trees, a 4-class spectral partition on the 'geo' metric, then
    # concatenated cluster records and their trees.
    sc.put_trees(program='bionj')
    sc.put_partitions('geo', 'spectral', 4)
    sc.concatenate_records()
    sc.put_cluster_trees(program='bionj')
    with open(pickle_path, 'w') as handle:
        cPickle.dump(sc, handle)
print('SC object available')

# Plot the heatmap of the distance matrix

dm = sc.get_distance_matrices()['geo']
# Look up the partition stored for the ('geo', 'spectral', 4) clustering.
p = sc.partitions[sc.clusters_to_partitions[('geo', 'spectral', 4)]]