Python SequenceCollection Examples, sequence_collection.SequenceCollection Python Examples

Example #1

0

Show file

    except AssertionError:
        print 'Can\'t get partitions to match'
        return -1
    return partition_list

def sort_key(item):
    """Build a natural-sort key from ``item.name``.

    The name is split into alternating runs of digits and non-digits; digit
    runs are converted to ints and every other run is kept as a string, so
    that e.g. 'gene2' orders before 'gene10'.
    """
    chunks = re.findall(r'(\d+)|(\D+)', item.name)
    return tuple(int(digits) if digits else text for (digits, text) in chunks)

# Load the records named by sys.argv[1] and sys.argv[2] (ordered with
# sort_key) and a second set of records from sys.argv[3], which is turned
# into a lookup via make_target_dict.
print 'loading recs...'
recs = sorted(load_records(sys.argv[1], sys.argv[2]), key=sort_key)
print 'loading crecs...'
crecs = load_records(sys.argv[3])
d = make_target_dict(crecs)
print 'done.'

# Hard-coded reference labelling: 60 entries in four consecutive groups of 15.
t = [1]*15+[2]*15+[3]*15+[4]*15
# Start from an empty collection and attach the loaded records directly.
c = SequenceCollection()
c.records = recs

# For each metric, compute its distance matrix, then rebuild one partition
# per clustering method and store it under the key (metric, method, 4).
for metric in ['sym', 'euc', 'geo']:
    c.put_distance_matrices(metric, gtp_path = '/homes/kgori/research/clustering_project/class_files')
    for method in ['single', 'complete', 'average', 'ward', 'MDS', 'spectral', 'kmedoids']:
        p = order(rebuild_partitions(recs, d, metric, method))
        c.clustering.partitions[(metric, method, 4)] = p
# The reference labelling is stored alongside the rebuilt partitions under
# the key 'true' before clusters are generated.
c.clustering.partitions['true'] = t
c.put_clusters()
print '(Done).'

for rec in c.get_cluster_records():
    try:
        rec.tree = d[rec.name].tree
    except KeyError:

Example #2

0

Show file

File: test_code.py Project: kgori/clustering_project

    
5)  Compare scores derived from clusters to random permutation of the original data
    either by making a copy of the SequenceCollection object, with clusters made up
    of the same number of genes with the same number of characters, or by randomising
    the alignments and performing hierarchical clustering on the randomised data

    if the former, do rand1 = col.make_randomised_copy
    if the latter, do rand2 = SequenceCollection(records=col.get_randomised_alignments(),
        datatype = 'protein')
"""


# indir = '/Users/kgori/git/kevin/yeast_data/MSA'
indir = '/Users/kgori/git/kevin/data/simulated_data/eight/MSA'

# `col` is built from the alignments in `indir`; `ran` is a second collection
# built from col's randomised alignments (the "rand2" route described in the
# docstring above).
col = SequenceCollection(indir, datatype='protein')
ran = SequenceCollection(records=col.get_randomised_alignments(), datatype='protein')
# Both collections go through the same pipeline with identical settings:
# trees, then partitions for 2..10 classes, then clusters and cluster trees.
col.put_trees_parallel()
ran.put_trees_parallel()
col.put_partitions(metrics=['euc','rf','sym'], linkages=['ward'], nclasses=[2,3,4,5,6,7,8,9,10])
ran.put_partitions(metrics=['euc','rf','sym'], linkages=['ward'], nclasses=[2,3,4,5,6,7,8,9,10])
col.put_clusters()
col.put_cluster_trees_parallel()
ran.put_clusters()
ran.put_cluster_trees_parallel()
# The "rand1" route from the docstring: a randomised copy of the already
# clustered collection.
rn2 = col.make_randomised_copy()

# Fetch the clusters of all three collections for comparison.
r1 = ran.get_clusters()
r2 = rn2.get_clusters()
cl = col.get_clusters()

Example #3

0

Show file

File: doclustering.py Project: haehn/clustering_project

        '\t')[1])

for rec in phymlrecords:
    rec.datatype = 'dna'
for rec in bionjrecords:
    rec.datatype = 'dna'

# Both record sets must contain exactly 60 records before clustering starts.
# This is an explicit test rather than an ``assert`` wrapped in a bare
# ``except:``: asserts are stripped under ``python -O`` (the check would
# silently vanish) and the bare except reported any unrelated error as
# "Missing records".
if not (len(phymlrecords) == len(bionjrecords) == 60):
    print('Missing records in {0}'.format(indir))
    sys.exit(1)

phyml_sc = SequenceCollection(records=phymlrecords,
                              datatype='dna',
                              helper=os.environ['DARWINHELPER'],
                              tmpdir=tmpdir,
                              get_distances=False)
bionj_sc = SequenceCollection(records=bionjrecords,
                              datatype='dna',
                              helper=os.environ['DARWINHELPER'],
                              tmpdir=tmpdir,
                              get_distances=False)

phyml_sc.put_partitions(
    ['geo', 'euc', 'sym'],
    ['single', 'complete', 'average', 'ward', 'kmedoids', 'MDS', 'spectral'],
    4,
    gtp_path=
    '/net/isilon7/nobackup/research/goldman/kevin/clustering_project/class_files',
    tmpdir=tmpdir)

Example #4

0

Show file

File: load_and_dump_records.py Project: haehn/clustering_project

# Program name shown in the argparse usage line: the trailing run of filename
# characters in argv[0], i.e. the script name without any directory part.
# The previous character class '[A-Za-z0-9.-_]' contained the accidental
# range '.'..'_', which matched '/' (so a whole path was kept) but not '-'
# (so hyphenated script names were cut short).  If nothing matches, argv[0]
# is used unchanged instead of failing on ``None.group()``.
_progname_match = re.search(r'[A-Za-z0-9._-]+$', sys.argv[0])
progname = _progname_match.group() if _progname_match else sys.argv[0]
desc = 'Read in a SequenceCollection from disk and dump records'
input_help = 'Filepath+name of gzipped SequenceCollection object'
output_help = 'Directory to dump files in'
choice_help = \
    '\n'.join(['Choose to dump post-clustering concatenated records',
              'instead of pre-clustering single records'])
parser = argparse.ArgumentParser(prog=progname, description=desc)
# -i and -o are mandatory: output_dir is rstripped below and input_file is
# checked for existence afterwards, so leaving one out should be reported as
# a usage error rather than surfacing as an AttributeError on None.
parser.add_argument('-i', dest='input_file', help=input_help, type=str,
                    required=True)
parser.add_argument('-o', dest='output_dir', help=output_help, type=str,
                    required=True)
# choice_help was written for this flag but never attached to it.
parser.add_argument('-c', dest='cluster_recs', action='store_true',
                    help=choice_help)

args = parser.parse_args()
input_file = args.input_file
# Drop trailing slashes from the output directory.
output_dir = args.output_dir.rstrip('/')
cluster_recs = args.cluster_recs

filecheck_and_quit(input_file)
directorycheck_and_make(output_dir)

from sequence_collection import SequenceCollection

# Restore the collection from the gzipped file, choose which record set to
# write, then dump it into the output directory.
sc = SequenceCollection.gunzip(input_file)
# With -c the cluster records are dumped; otherwise the plain records are
# fetched and passed explicitly (they should be the default anyway).
records = sc.get_cluster_records() if cluster_recs else sc.get_records()
sc.dump_records(output_dir, records)

Example #5

0

Show file

        ])
else:
    helper = os.environ['DARWINHELPER']

# Scratch directory: use $TEMPORARY_DIRECTORY when it is set, else /tmp.
# A lookup with a default replaces the former bare ``except:``, which would
# also have swallowed unrelated errors such as KeyboardInterrupt.
TMPDIR = os.environ.get('TEMPORARY_DIRECTORY', '/tmp')

### MAIN
# Distances are requested at load time only for the 'treecollection' program.
get_distances = (treeprog == 'treecollection')
sc = SequenceCollection(indir, file_format=file_format,
                        gtp_path=gtp_path, datatype=datatype,
                        get_distances=get_distances, tmpdir=TMPDIR, helper=helper)

sc.put_trees(program=treeprog)
sc.put_partitions(['geo', 'euc', 'rf'], [
    'average',
    'complete',
    'kmedoids',
    'MDS',
    'single',
    'spectral00',
    'spectral01',
    'spectral10',
    'spectral11',
    'ward',
    ], nclasses, recalculate=True)

Example #6

0

Show file

File: calc_scores.py Project: kgori/clustering_project

    'average',
    'ward',
    'kmedoids',
    'spectral',
    'MDS',
    ]
calc_varinf = False

if os.path.isfile(outf):
    sys.exit(1)

# Load the alignments in `seqdir` sequentially (parallel_load=False) and
# without computing distances at load time.
# NOTE(review): `format` is Python's builtin unless this script rebinds it
# earlier (outside the visible excerpt) — confirm it holds the file format.
sc = SequenceCollection(
    seqdir,
    tmpdir=TMPDIR,
    gtp_path=GTP_PATH,
    helper=HELPER,
    file_format=format,
    datatype=datatype,
    parallel_load=False,
    get_distances=False,
    )

# Build trees with the 'bionj' program using fixed settings (model GTR,
# ncat=4, datatype 'nt').
sc.put_trees(program='bionj', model='GTR', tmpdir=TMPDIR, ncat=4,
             datatype='nt')

# Disabled debugging aid: print the 'rf' distance matrix and stop.
# sc.put_distance_matrices(['rf'])
# print sc.get_distance_matrices()['rf']
# sys.exit()

# Compute partitions for each metric in turn with the same methods and
# class counts; recalculate=True is passed on every call.
sc.put_partitions('rf', methods, nclasses, recalculate=True)
sc.put_partitions('euc', methods, nclasses, recalculate=True)
sc.put_partitions('geo', methods, nclasses, recalculate=True)

Example #7

0

Show file

File: test_get_cluster_trees.py Project: haehn/clustering_project

#!/usr/bin/python
# -*- coding: utf-8 -*-

from sequence_collection import SequenceCollection
import cPickle
import time

# Hard-coded location of the test alignments.
indir = '/Users/kgori/git/kevin/data/simulated_data/small/MSA'

print 'test directory = ', indir

# Phase 1: load the collection.  The timed window also covers printing it.
load_start = time.time()
print 'loading sequences (parallel)'
col = SequenceCollection(indir, datatype='protein')
print col
load_end = time.time()

# Phase 2: build trees with the 'treecollection' program, then print each
# record's name and tree.  The printing is inside the timed window.
tcpar_start = time.time()
print 'putting TC trees (parallel)'
col.put_trees_parallel(program='treecollection', tmpdir='/tmp')
for rec in col.records:
    print rec.name
    print rec.tree
tcpar_end = time.time()

# Phase 3: compute partitions for every combination of the listed metrics,
# linkages and class counts, then print them.
par_start = time.time()
print 'Putting partitions'
col.put_partitions(metrics=['sym', 'euc'],
                   linkages=['ward', 'single'],
                   nclasses=[3, 4, 5, 6])
print col.get_partitions()

Example #8

0

Show file

File: test_new_dictionary.py Project: kgori/clustering_project

from pylab import *
print 'done.'


def print_dict(d):
    """Print the entries of ``d`` one per line, ordered by key.

    Each line has the form ``key: value``.  Previously the loop printed the
    entire dict once per key instead of the entry for that key.
    """
    for k in sorted(d):
        # Single-argument call form prints the same text under Python 2 and 3.
        print('{0}: {1}'.format(k, d[k]))


np.set_printoptions(linewidth=200, precision=3)
sc = SequenceCollection(
    '/Users/kgori/scratch/chk/aa_alignments/',
    get_distances=False,
    file_format='phylip',
    helper='/Users/kgori/git/kevin/clustering_project/class_files/DV_wrapper.drw'
        ,
    parallel_load=True,
    gtp_path='/Users/kgori/git/kevin/clustering_project/class_files/',
    tmpdir='/tmp',
    datatype='protein',
    )

sc_yeast = SequenceCollection(
    '/Users/kgori/scratch/yeast_MSA',
    get_distances=False,
    file_format='phylip',
    helper='/Users/kgori/git/kevin/clustering_project/class_files/DV_wrapper.drw'
        ,
    parallel_load=True,
    gtp_path='/Users/kgori/git/kevin/clustering_project/class_files/',
    tmpdir='/tmp/yeast',

Example #9

0

Show file

File: test_scoll.py Project: haehn/clustering_project

from tree import Tree
from clustering import Clustering
import cPickle
import time
import os
import copy
import numpy as np
np.set_printoptions(precision=2, linewidth=200)

indir = '/Users/kgori/git/kevin/data/simulated_data/small/MSA'

print 'test directory = ', indir

load_start = time.time()
print 'loading sequences (parallel)'
col = SequenceCollection(indir, datatype='protein')
print col
load_end = time.time()

tcseq_start = time.time()
print 'getting TC trees (sequential)'
col.get_trees(program='treecollection', tmpdir='/tmp')
for rec in col.records:
    print rec.name
    print rec.tree
tcseq_end = time.time()

tcpar_start = time.time()
print 'getting TC trees (parallel)'
col.get_trees_parallel(program='treecollection', tmpdir='/tmp')
for rec in col.records:

Example #10

0

Show file

from pylab import *
print 'done.'


def print_dict(d):
    """Print the entries of ``d`` one per line, ordered by key.

    Each line has the form ``key: value``.  Previously the loop printed the
    entire dict once per key instead of the entry for that key.
    """
    for k in sorted(d):
        # Single-argument call form prints the same text under Python 2 and 3.
        print('{0}: {1}'.format(k, d[k]))


np.set_printoptions(linewidth=200, precision=3)
sc = SequenceCollection(
    '/Users/kgori/scratch/chk/aa_alignments/',
    get_distances=False,
    file_format='phylip',
    helper=
    '/Users/kgori/git/kevin/clustering_project/class_files/DV_wrapper.drw',
    parallel_load=True,
    gtp_path='/Users/kgori/git/kevin/clustering_project/class_files/',
    tmpdir='/tmp',
    datatype='protein',
)

sc_yeast = SequenceCollection(
    '/Users/kgori/scratch/yeast_MSA',
    get_distances=False,
    file_format='phylip',
    helper=
    '/Users/kgori/git/kevin/clustering_project/class_files/DV_wrapper.drw',
    parallel_load=True,
    gtp_path='/Users/kgori/git/kevin/clustering_project/class_files/',
    tmpdir='/tmp/yeast',

Example #11

0

Show file

File: test_get_cluster_trees.py Project: kgori/clustering_project

#!/usr/bin/python
# -*- coding: utf-8 -*-

from sequence_collection import SequenceCollection
import cPickle
import time

indir = "/Users/kgori/git/kevin/data/simulated_data/small/MSA"

print "test directory = ", indir

load_start = time.time()
print "loading sequences (parallel)"
col = SequenceCollection(indir, datatype="protein")
print col
load_end = time.time()

tcpar_start = time.time()
print "putting TC trees (parallel)"
col.put_trees_parallel(program="treecollection", tmpdir="/tmp")
for rec in col.records:
    print rec.name
    print rec.tree
tcpar_end = time.time()

par_start = time.time()
print "Putting partitions"
col.put_partitions(metrics=["sym", "euc"], linkages=["ward", "single"], nclasses=[3, 4, 5, 6])
print col.get_partitions()
par_end = time.time()

Example #12

0

Show file

File: pickle_simulation.py Project: haehn/clustering_project

                    type=fpath,
                    default='/tmp')
parser.add_argument('-data', '--datatype', help='datatype', default=None)

args = vars(parser.parse_args())
outdir = args['directory']
program = args['program']
model = args['model']
ncat = args['ncat']
datatype = args['datatype']
gtp_path = os.environ['GTP_PATH']
# The working directory comes from the parsed command-line arguments alone.
# The former os.environ['TEMPORARY_DIRECTORY'] lookup was overwritten on the
# very next line, so its only effect was a KeyError when the variable was
# not set.
tmpdir = args['tmpdir']
print 'Reading alignments into SequenceRecord object'
seq = SequenceCollection('{0}/dna_alignments'.format(outdir),
                         datatype='dna',
                         tmpdir=tmpdir,
                         helper=os.environ['DARWINHELPER'])
print 'Calculating trees'
print program, model, datatype, ncat, tmpdir
seq.put_trees(program=program,
              model=model,
              datatype=datatype,
              ncat=ncat,
              tmpdir=tmpdir)
print 'doing geodesic distance matrices'
seq.put_distance_matrices('geo', gtp_path=gtp_path, tmpdir=tmpdir)
print 'doing euc distance matrices'
seq.put_distance_matrices('euc')
print 'doing sym distance matrices'
seq.put_distance_matrices('sym')

Example #13

0

Show file

File: cluster_TC_input.py Project: haehn/clustering_project

        tree.read_from_file('{0}/trees/{1}.nwk'.format(working_dir,
                            name))

    dv_matrix_strip_header = '\n'.join(dv_matrix.split('\n'
            )[2:]).rstrip()
    labels_strip_header = labels.split('\n')[1].rstrip()
    record = TCSeqRec()
    record.dv = [(dv_matrix_strip_header, labels_strip_header)]
    record.tree = tree
    record.name = name
    record.headers = labels_strip_header.split()
    record.sequences = ['' for _ in record.headers]
    record._update()
    records.append(record)

collection = SequenceCollection(records=records, get_distances=False,
                                gtp_path=os.environ['GTP_PATH'])
collection.put_distance_matrices('rf')
T = \
    collection.Clustering.run_spectral_rotate(collection.distance_matrices['rf'
        ])
collection.partitions[T] = Partition(T)
collection.clusters_to_partitions[('rf', 'spectral_rotate', max(T))] = T
collection.concatenate_records()
cluster_recs = collection.get_cluster_records()

number_of_clusters = len(cluster_recs)
for j in range(number_of_clusters):
    record = cluster_recs[j]
    record_dv = record.dv[0]
    labels = record.dv[1]

Example #14

0

Show file

File: read_alignments_and_cluster.py Project: haehn/clustering_project

datatype = args['datatype']
score = args['score']

directorycheck_and_quit(input_dir)
directorycheck_and_make(tmpdir)

gtp_path = os.environ['GTP_PATH']
helper = os.environ['DARWINHELPER']

from sequence_collection import SequenceCollection

sc = SequenceCollection(
    input_dir,
    file_format='phylip',
    datatype=datatype,
    helper=helper,
    gtp_path=gtp_path,
    tmpdir=tmpdir,
    overwrite=True,
)

sc.load_phyml_results(input_dir, program=None)
sc.quality_scores = {}
for dist in distance:
    (_, qs) = sc.autotune(dist,
                          max_groups=max_clusters,
                          min_groups=min_clusters)
    sc.quality_scores[dist] = qs
# Cluster counts to evaluate: min_clusters..max_clusters inclusive, with 1
# placed in front whenever the range itself starts above 1.
cluster_range = list(range(min_clusters, max_clusters + 1))
if min_clusters > 1:
    cluster_range = [1] + cluster_range

Example #15

0

Show file

        (simdir, 'bionj_clustering'))),
                             key=sort_key)
else:
    records = sorted(load_records('/'.join((simdir, record_dir)),
                                  '*.ml.pickle'),
                     key=sort_key)
    cluster_records = sorted(load_records('/'.join(
        (simdir, 'phyml_clustering'))),
                             key=sort_key)

cluster_dic = make_target_dict(cluster_records)
print '(Done).'

#rebuild sequenceCollection object

sc = SequenceCollection()
sc.records = records

print 'Generating distance matrices...'
for metric in ['euc', 'sym', 'geo']:
    sc.put_distance_matrices(
        metric,
        gtp_path='/homes/kgori/research/clustering_project/class_files',
        tmpdir=tmpdir)
    for method in [
            'single', 'complete', 'ward', 'average', 'spectral', 'MDS',
            'kmedoids'
    ]:
        partition = rebuild_partitions(records,
                                       cluster_dic,
                                       metric=metric,

Example #16

0

Show file

File: calc_scores.py Project: haehn/clustering_project

    'average',
    'ward',
    'kmedoids',
    'spectral',
    'MDS',
]
calc_varinf = False

if os.path.isfile(outf):
    sys.exit(1)

sc = SequenceCollection(
    seqdir,
    tmpdir=TMPDIR,
    gtp_path=GTP_PATH,
    helper=HELPER,
    file_format=format,
    datatype=datatype,
    parallel_load=False,
    get_distances=False,
)

sc.put_trees(program='bionj',
             model='GTR',
             tmpdir=TMPDIR,
             ncat=4,
             datatype='nt')

# sc.put_distance_matrices(['rf'])
# print sc.get_distance_matrices()['rf']
# sys.exit()

Example #17

0

Show file

File: pickle_simulation.py Project: kgori/clustering_project

parser.add_argument('-m', '--model', help='which model to use in phylogenetic inference', default=None)
parser.add_argument('-n', '--ncat', help='number of categories for gamma distributed rates', default=1)
parser.add_argument('-t', '--tmpdir', help='temporary directory', type=fpath, default='/tmp')
parser.add_argument('-data', '--datatype', help='datatype', default=None)

args = vars(parser.parse_args())
outdir = args['directory']
program = args['program']
model = args['model']
ncat = args['ncat']
datatype = args['datatype']
gtp_path = os.environ['GTP_PATH']
# The working directory comes from the -t/--tmpdir option alone (it defaults
# to '/tmp').  The former os.environ['TEMPORARY_DIRECTORY'] lookup was
# overwritten on the very next line, so its only effect was a KeyError when
# the variable was not set.
tmpdir = args['tmpdir']
print 'Reading alignments into SequenceRecord object'
seq = SequenceCollection('{0}/dna_alignments'.format(outdir), datatype='dna', tmpdir=tmpdir, helper=os.environ['DARWINHELPER'])
print 'Calculating trees'
print program,model,datatype,ncat,tmpdir
seq.put_trees(program=program, model=model, datatype=datatype, ncat=ncat, tmpdir=tmpdir)
print 'doing geodesic distance matrices'
seq.put_distance_matrices('geo', gtp_path=gtp_path, tmpdir=tmpdir)
print 'doing euc distance matrices'
seq.put_distance_matrices('euc')
print 'doing sym distance matrices'
seq.put_distance_matrices('sym')

print 'getting score for true clustering'
# Read the reference partition from the first line of treedistances.txt:
# take the second tab-separated field, drop its first and last characters
# and split the remainder on ', '.  The handle is not named ``file`` so the
# Python 2 builtin of that name stays usable for the rest of the script.
with open('{0}/treedistances.txt'.format(outdir)) as distances_file:
    T = distances_file.readline().rstrip().split('\t')[1][1:-1].split(', ')
seq.clustering.partitions['true'] = T
seq.put_clusters()

Example #18

0

Show file

File: doclustering.py Project: kgori/clustering_project

bionjrecords = sorted([cPickle.load(file(x)) for x in bionjpickles], key=lambda x:sort_key(x.name))

# The reference partition is the second tab-separated field on the first
# line of treedistances.txt.  The file is closed as soon as it has been read
# instead of being left open until garbage collection.
# NOTE(review): eval() executes whatever that field contains.  If the file
# is not fully trusted, ast.literal_eval would be a safer parser — confirm
# the field is always a plain literal before switching.
with open('{0}/treedistances.txt'.format(indir)) as distances_file:
    true = eval(distances_file.read().split('\n')[0].split('\t')[1])

for rec in phymlrecords:
    rec.datatype='dna'
for rec in bionjrecords:
    rec.datatype='dna'

# Both record sets must contain exactly 60 records before clustering starts.
# This is an explicit test rather than an ``assert`` wrapped in a bare
# ``except:``: asserts are stripped under ``python -O`` (the check would
# silently vanish) and the bare except reported any unrelated error as
# "Missing records".
if not (len(phymlrecords) == len(bionjrecords) == 60):
    print('Missing records in {0}'.format(indir))
    sys.exit(1)

phyml_sc = SequenceCollection(records=phymlrecords, datatype='dna', helper=os.environ['DARWINHELPER'],tmpdir=tmpdir, get_distances=False)
bionj_sc = SequenceCollection(records=bionjrecords, datatype='dna', helper=os.environ['DARWINHELPER'],tmpdir=tmpdir, get_distances=False)

phyml_sc.put_partitions(['geo','euc','sym'],['single','complete','average','ward','kmedoids','MDS','spectral'], 4, gtp_path='/net/isilon7/nobackup/research/goldman/kevin/clustering_project/class_files', tmpdir=tmpdir)
bionj_sc.put_partitions(['geo','euc','sym'],['single','complete','average','ward','kmedoids','MDS','spectral'], 4, gtp_path='/net/isilon7/nobackup/research/goldman/kevin/clustering_project/class_files', tmpdir=tmpdir)
phyml_sc.clustering.partitions['true']=true
bionj_sc.clustering.partitions['true']=true

phyml_sc.put_clusters()
bionj_sc.put_clusters()

if not os.path.isdir('{0}/phyml_clustering'.format(indir)):
    os.mkdir('{0}/phyml_clustering'.format(indir))
if not os.path.isdir('{0}/bionj_clustering'.format(indir)):
    os.mkdir('{0}/bionj_clustering'.format(indir))

Example #19

0

Show file

File: test_code.py Project: haehn/clustering_project

    col.put_cluster_trees_parallel()
    
5)  Compare scores derived from clusters to random permutation of the original data
    either by making a copy of the SequenceCollection object, with clusters made up
    of the same number of genes with the same number of characters, or by randomising
    the alignments and performing hierarchical clustering on the randomised data

    if the former, do rand1 = col.make_randomised_copy
    if the latter, do rand2 = SequenceCollection(records=col.get_randomised_alignments(),
        datatype = 'protein')
"""

# indir = '/Users/kgori/git/kevin/yeast_data/MSA'
indir = '/Users/kgori/git/kevin/data/simulated_data/eight/MSA'

col = SequenceCollection(indir, datatype='protein')
ran = SequenceCollection(records=col.get_randomised_alignments(),
                         datatype='protein')
col.put_trees_parallel()
ran.put_trees_parallel()
col.put_partitions(metrics=['euc', 'rf', 'sym'],
                   linkages=['ward'],
                   nclasses=[2, 3, 4, 5, 6, 7, 8, 9, 10])
ran.put_partitions(metrics=['euc', 'rf', 'sym'],
                   linkages=['ward'],
                   nclasses=[2, 3, 4, 5, 6, 7, 8, 9, 10])
col.put_clusters()
col.put_cluster_trees_parallel()
ran.put_clusters()
ran.put_cluster_trees_parallel()
rn2 = col.make_randomised_copy()

Example #20

0

Show file

File: test_randomiser.py Project: haehn/clustering_project

"""
print 'loading sequences...'
col = SequenceCollection(indir, helper=helper, tmpdir=tmpdir)

print 'getting trees...'
col.put_trees_parallel(program='phyml',tmpdir=tmpdir)

print 'getting partitions...'
col.put_partitions(metrics=['sym'],linkages=['ward'],nclasses=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17])
#print 'randomizing bytes...'
col.put_clusters()

#print 'immanentizing the eschaton...'
col.put_cluster_trees_parallel(program='phyml',tmpdir=tmpdir)
"""
plottable = []
#plottable.append(add_to_plot(col,'sym','ward'))
# Load the previously pickled collection.  The handle is closed explicitly;
# the open mode is left at the default so it stays consistent with however
# col.pickle was written.
with open('col.pickle') as col_pickle:
    col = cPickle.load(col_pickle)
#print 'whipping into frenzy...'
# One randomised replicate: rebuild a collection from col's randomised
# alignments, run the same phyml/sym/ward pipeline on it and keep its
# plottable summary.
for i in range(1):
    r = SequenceCollection(records=col.get_randomised_alignments(), helper=helper, tmpdir=tmpdir)
    r.put_trees_parallel(program='phyml', tmpdir=tmpdir)
    r.put_partitions(metrics=['sym'], linkages=['ward'],
                     nclasses=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17])
    r.put_clusters()
    r.put_cluster_trees_parallel(program='phyml', tmpdir=tmpdir)
    plottable.append(add_to_plot(r, 'sym', 'ward'))

# Write the results for this run; the context manager guarantees the pickle
# is flushed and closed before the script exits.
with open('plottable{0}.pickle'.format(index), 'w') as plottable_pickle:
    cPickle.dump(plottable, plottable_pickle)
#cPickle.dump(col, file('col.pickle','w'))

Example #21

0

Show file

File: test_get_trees.py Project: kgori/clustering_project

#!/usr/bin/python
# -*- coding: utf-8 -*-


from sequence_collection import SequenceCollection
import time

# Hard-coded location of the alignments used for this run.
indir = '/Users/kgori/git/kevin/data/real_data/yeast_data/MSA'

print 'test directory = ', indir

# Time the loading of the collection (the window also covers printing it).
load_start = time.time()
print 'loading sequences (parallel)'
col = SequenceCollection(indir, datatype='dna')
print col
load_end = time.time()

# Run the rest of the pipeline: trees, partitions for 2..16 classes,
# clusters and cluster trees.  None of these steps is timed.
col.put_trees_parallel()
col.put_partitions(metrics=['sym','geodesic'], linkages=['ward'], nclasses=[2,3,4,5,6,7,8,9,10,11,12,13,14,15,16])
col.put_clusters()
col.put_cluster_trees_parallel()

# Only the load duration is collected and reported.
timings = [
load_end - load_start
]

print 'time = {0:.3f}'.format(*timings)

Example #22

0

Show file

File: test_scoll.py Project: kgori/clustering_project

from tree import Tree
from clustering import Clustering
import cPickle
import time
import os
import copy
import numpy as np
np.set_printoptions(precision=2, linewidth=200)

indir = '/Users/kgori/git/kevin/data/simulated_data/small/MSA'

print 'test directory = ', indir

load_start = time.time()
print 'loading sequences (parallel)'
col = SequenceCollection(indir, datatype='protein')
print col
load_end = time.time()

tcseq_start = time.time()
print 'getting TC trees (sequential)'
col.get_trees(program='treecollection', tmpdir='/tmp')
for rec in col.records:
    print rec.name
    print rec.tree
tcseq_end = time.time()

tcpar_start = time.time()
print 'getting TC trees (parallel)'
col.get_trees_parallel(program='treecollection', tmpdir='/tmp')
for rec in col.records:

Example #23

0

Show file

File: read_alignments_and_cluster.py Project: kgori/clustering_project

datatype = args['datatype']
score = args['score']

directorycheck_and_quit(input_dir)
directorycheck_and_make(tmpdir)

gtp_path = os.environ['GTP_PATH']
helper = os.environ['DARWINHELPER']

from sequence_collection import SequenceCollection

sc = SequenceCollection(
    input_dir,
    file_format='phylip',
    datatype=datatype,
    helper=helper,
    gtp_path=gtp_path,
    tmpdir=tmpdir,
    overwrite=True,
    )

sc.load_phyml_results(input_dir, program=None)
sc.quality_scores = {}
for dist in distance:
    (_, qs) = sc.autotune(dist, max_groups=max_clusters,
                          min_groups=min_clusters)
    sc.quality_scores[dist] = qs
# Cluster counts to evaluate: min_clusters..max_clusters inclusive, with 1
# placed in front whenever the range itself starts above 1.
cluster_range = list(range(min_clusters, max_clusters + 1))
if min_clusters > 1:
    cluster_range = [1] + cluster_range
sc.put_partitions(distance, method, cluster_range)

Example #24

0

Show file

File: findings.py Project: kgori/clustering_project

print 'Reading records...'

# some initialisation
if program == 'bionj':
    records = sorted(load_records('/'.join((simdir, record_dir)), '*.nj.pickle'), key=sort_key)
    cluster_records = sorted(load_records('/'.join((simdir, 'bionj_clustering'))), key=sort_key)
else:
    records = sorted(load_records('/'.join((simdir, record_dir)), '*.ml.pickle'), key=sort_key)
    cluster_records = sorted(load_records('/'.join((simdir, 'phyml_clustering'))), key=sort_key)

cluster_dic = make_target_dict(cluster_records)
print '(Done).'

#rebuild sequenceCollection object

# Start from an empty collection and attach the already-loaded records.
sc = SequenceCollection()
sc.records = records

print 'Generating distance matrices...'
# For each metric, compute its distance matrix, then rebuild one partition
# per clustering method and store it under the key (metric, method, 4).
for metric in ['euc', 'sym', 'geo']:
    sc.put_distance_matrices(metric, gtp_path = '/homes/kgori/research/clustering_project/class_files', tmpdir=tmpdir)
    for method in ['single', 'complete', 'ward', 'average', 'spectral', 'MDS', 'kmedoids']:
        partition = rebuild_partitions(records, cluster_dic, metric=metric, method=method)
        sc.clustering.partitions[(metric, method, 4)] = partition

# Hard-coded reference labelling stored under 'true': 60 entries in four
# consecutive groups of 15.
sc.clustering.partitions['true'] = [1]*15 + [2]*15 + [3]*15 + [4]*15

sc.put_clusters()
print '(Done).'

for rec in sc.get_cluster_records():

Example #25

0

Show file

File: load_final_results.py Project: kgori/clustering_project

import argparse
import re
import sys
from errors import filecheck_and_quit, directorycheck_and_quit

# Program name shown in the argparse usage line: the trailing run of filename
# characters in argv[0], i.e. the script name without any directory part.
# The previous character class '[A-Za-z0-9.-_]' contained the accidental
# range '.'..'_', which matched '/' (so a whole path was kept) but not '-'
# (so hyphenated script names were cut short).  If nothing matches, argv[0]
# is used unchanged instead of failing on ``None.group()``.
_progname_match = re.search(r'[A-Za-z0-9._-]+$', sys.argv[0])
progname = _progname_match.group() if _progname_match else sys.argv[0]
desc = 'Read in a SequenceCollection from disk and print scores'
input_help = 'Filepath+name of gzipped SequenceCollection object'
category_choices = ['Observed', 'Randomised', 'Simulated', 'NA']

# -t previously reused the help text of -i; it names the directory that is
# later passed to sc.load_phyml_results, so it gets its own description.
phyml_help = 'Directory containing the phyml results to load'

parser = argparse.ArgumentParser(prog=progname, description=desc)
# Both options are mandatory: phyml_dir is rstripped below and input_file is
# checked for existence afterwards, so leaving one out should be reported as
# a usage error rather than surfacing as an AttributeError on None.
parser.add_argument('-i', dest='input_file', help=input_help, type=str,
                    required=True)
parser.add_argument('-t', dest='phyml_dir', help=phyml_help, type=str,
                    required=True)

args = parser.parse_args()
input_file = args.input_file
# Drop trailing slashes from the results directory.
phyml_dir = args.phyml_dir.rstrip('/')

filecheck_and_quit(input_file)
directorycheck_and_quit(phyml_dir)

from sequence_collection import SequenceCollection

sc = SequenceCollection.gunzip(input_file)
cluster_records = sc.get_cluster_records()
sc.load_phyml_results(phyml_dir, records=cluster_records,
                      use_hashname=True)
sc.update_scores()
sc.gzip(input_file)

Example #26

0

Show file

print indir

rob.r('library(ape)')
rob.r('library(phangorn)')
print 'r libraries loaded'

alignments_dir = '{0}/dna_alignments'.format(indir)

# Reuse the cached collection when a pickle exists in `indir`; otherwise
# build it from the alignments and cache it for the next run.  Both pickle
# handles are now closed explicitly (they used to be left open), and the
# open modes are unchanged so existing cache files remain readable.
pickle_path = '{0}/tmpPickle.pkl'.format(indir)
if os.path.isfile(pickle_path):
    with open(pickle_path) as pickle_file:
        sc = cPickle.load(pickle_file)
else:

    sc = SequenceCollection(
        alignments_dir,
        file_format='fasta',
        datatype='dna',
        gtp_path=gtp_path,
        helper=helper,
        tmpdir=tmpdir,
        )

    # bionj trees -> one ('geo', 'spectral', 4) partition -> concatenated
    # records -> bionj cluster trees.
    sc.put_trees(program='bionj')
    sc.put_partitions('geo', 'spectral', 4)
    sc.concatenate_records()
    sc.put_cluster_trees(program='bionj')
    with open(pickle_path, 'w') as pickle_file:
        cPickle.dump(sc, pickle_file)
print 'SC object available'

# Plot the heatmap of the distance matrix

dm = sc.get_distance_matrices()['geo']
p = sc.partitions[sc.clusters_to_partitions[('geo', 'spectral', 4)]]