Python SequenceCollectionの例、sequence_collection.SequenceCollection Pythonの例

コード例 #1

0

ファイルを表示

    except AssertionError:
        print 'Can\'t get partitions to match'
        return -1
    return partition_list

def sort_key(item):
    """Return a natural-sort key built from ``item.name``.

    The name is split into alternating runs of digits and non-digits; digit
    runs are converted to ints and every other run is kept as a string, so
    that e.g. 'gene2' orders before 'gene10'.
    """
    parts = []
    for digits, text in re.findall(r'(\d+)|(\D+)', item.name):
        parts.append(int(digits) if digits else text)
    return tuple(parts)

print 'loading recs...'
recs = sorted(load_records(sys.argv[1], sys.argv[2]), key=sort_key)
print 'loading crecs...'
crecs = load_records(sys.argv[3])
d = make_target_dict(crecs)
print 'done.'

t = [1]*15+[2]*15+[3]*15+[4]*15
c = SequenceCollection()
c.records = recs

for metric in ['sym', 'euc', 'geo']:
    c.put_distance_matrices(metric, gtp_path = '/homes/kgori/research/clustering_project/class_files')
    for method in ['single', 'complete', 'average', 'ward', 'MDS', 'spectral', 'kmedoids']:
        p = order(rebuild_partitions(recs, d, metric, method))
        c.clustering.partitions[(metric, method, 4)] = p
c.clustering.partitions['true'] = t
c.put_clusters()
print '(Done).'

for rec in c.get_cluster_records():
    try:
        rec.tree = d[rec.name].tree
    except KeyError:

コード例 #2

0

ファイルを表示

ファイル: test_code.py プロジェクト: kgori/clustering_project

    
5)  Compare scores derived from clusters to random permutation of the original data
    either by making a copy of the SequenceCollection object, with clusters made up
    of the same number of genes with the same number of characters, or by randomising
    the alignments and performing hierarchical clustering on the randomised data

    if the former, do rand1 = col.make_randomised_copy
    if the latter, do rand2 = SequenceCollection(records=col.get_randomised_alignments(),
        datatype = 'protein')
"""


# indir = '/Users/kgori/git/kevin/yeast_data/MSA'
indir = '/Users/kgori/git/kevin/data/simulated_data/eight/MSA'

col = SequenceCollection(indir, datatype='protein')
ran = SequenceCollection(records=col.get_randomised_alignments(), datatype='protein')
col.put_trees_parallel()
ran.put_trees_parallel()
col.put_partitions(metrics=['euc','rf','sym'], linkages=['ward'], nclasses=[2,3,4,5,6,7,8,9,10])
ran.put_partitions(metrics=['euc','rf','sym'], linkages=['ward'], nclasses=[2,3,4,5,6,7,8,9,10])
col.put_clusters()
col.put_cluster_trees_parallel()
ran.put_clusters()
ran.put_cluster_trees_parallel()
rn2 = col.make_randomised_copy()

r1 = ran.get_clusters()
r2 = rn2.get_clusters()
cl = col.get_clusters()

コード例 #3

0

ファイルを表示

ファイル: doclustering.py プロジェクト: haehn/clustering_project

        '\t')[1])

for rec in phymlrecords:
    rec.datatype = 'dna'
for rec in bionjrecords:
    rec.datatype = 'dna'

# Both record lists must hold exactly 60 entries.  An explicit comparison is
# used instead of ``assert`` inside a bare ``except`` so that the check still
# runs under ``python -O`` and unrelated errors are not silently swallowed.
if not (len(phymlrecords) == len(bionjrecords) == 60):
    print('Missing records in {0}'.format(indir))
    sys.exit(1)

phyml_sc = SequenceCollection(records=phymlrecords,
                              datatype='dna',
                              helper=os.environ['DARWINHELPER'],
                              tmpdir=tmpdir,
                              get_distances=False)
bionj_sc = SequenceCollection(records=bionjrecords,
                              datatype='dna',
                              helper=os.environ['DARWINHELPER'],
                              tmpdir=tmpdir,
                              get_distances=False)

phyml_sc.put_partitions(
    ['geo', 'euc', 'sym'],
    ['single', 'complete', 'average', 'ward', 'kmedoids', 'MDS', 'spectral'],
    4,
    gtp_path=
    '/net/isilon7/nobackup/research/goldman/kevin/clustering_project/class_files',
    tmpdir=tmpdir)

コード例 #4

0

ファイルを表示

ファイル: load_and_dump_records.py プロジェクト: haehn/clustering_project

progname = re.compile('[A-Za-z0-9.-_]+').search(sys.argv[0]).group()
desc = 'Read in a SequenceCollection from disk and dump records'
input_help = 'Filepath+name of gzipped SequenceCollection object'
output_help = 'Directory to dump files in'
choice_help = \
    '\n'.join(['Choose to dump post-clustering concatenated records',
              'instead of pre-clustering single records'])
parser = argparse.ArgumentParser(prog=progname, description=desc)
parser.add_argument('-i', dest='input_file', help=input_help, type=str)
parser.add_argument('-o', dest='output_dir', help=output_help, type=str)
parser.add_argument('-c', dest='cluster_recs', action='store_true')

args = parser.parse_args()
input_file = args.input_file
output_dir = args.output_dir.rstrip('/')
cluster_recs = args.cluster_recs

filecheck_and_quit(input_file)
directorycheck_and_make(output_dir)

from sequence_collection import SequenceCollection

sc = SequenceCollection.gunzip(input_file)
# With -c, dump the post-clustering records; otherwise dump the plain records.
# The record set is always passed explicitly rather than relying on whatever
# dump_records would use by default.
records = sc.get_cluster_records() if cluster_recs else sc.get_records()
sc.dump_records(output_dir, records)

コード例 #5

0

ファイルを表示

        ])
else:
    helper = os.environ['DARWINHELPER']

# Use TEMPORARY_DIRECTORY when it is set and fall back to /tmp otherwise.
# A lookup with a default replaces the bare ``except``, which would also have
# hidden errors unrelated to a missing variable.
TMPDIR = os.environ.get('TEMPORARY_DIRECTORY', '/tmp')

### MAIN
# Distances are requested only when the tree program is 'treecollection'.
get_distances = (treeprog == 'treecollection')
sc = SequenceCollection(indir, file_format=file_format,
                        gtp_path=gtp_path, datatype=datatype,
                        get_distances=get_distances, tmpdir=TMPDIR, helper=helper)

sc.put_trees(program=treeprog)
sc.put_partitions(['geo', 'euc', 'rf'], [
    'average',
    'complete',
    'kmedoids',
    'MDS',
    'single',
    'spectral00',
    'spectral01',
    'spectral10',
    'spectral11',
    'ward',
    ], nclasses, recalculate=True)

コード例 #6

0

ファイルを表示

ファイル: calc_scores.py プロジェクト: kgori/clustering_project

    'average',
    'ward',
    'kmedoids',
    'spectral',
    'MDS',
    ]
calc_varinf = False

if os.path.isfile(outf):
    sys.exit(1)

sc = SequenceCollection(
    seqdir,
    tmpdir=TMPDIR,
    gtp_path=GTP_PATH,
    helper=HELPER,
    file_format=format,
    datatype=datatype,
    parallel_load=False,
    get_distances=False,
    )

sc.put_trees(program='bionj', model='GTR', tmpdir=TMPDIR, ncat=4,
             datatype='nt')

# sc.put_distance_matrices(['rf'])
# print sc.get_distance_matrices()['rf']
# sys.exit()

sc.put_partitions('rf', methods, nclasses, recalculate=True)
sc.put_partitions('euc', methods, nclasses, recalculate=True)
sc.put_partitions('geo', methods, nclasses, recalculate=True)

コード例 #7

0

ファイルを表示

ファイル: test_get_cluster_trees.py プロジェクト: haehn/clustering_project

#!/usr/bin/python
# -*- coding: utf-8 -*-

from sequence_collection import SequenceCollection
import cPickle
import time

indir = '/Users/kgori/git/kevin/data/simulated_data/small/MSA'

print 'test directory = ', indir

load_start = time.time()
print 'loading sequences (parallel)'
col = SequenceCollection(indir, datatype='protein')
print col
load_end = time.time()

tcpar_start = time.time()
print 'putting TC trees (parallel)'
col.put_trees_parallel(program='treecollection', tmpdir='/tmp')
for rec in col.records:
    print rec.name
    print rec.tree
tcpar_end = time.time()

par_start = time.time()
print 'Putting partitions'
col.put_partitions(metrics=['sym', 'euc'],
                   linkages=['ward', 'single'],
                   nclasses=[3, 4, 5, 6])
print col.get_partitions()

コード例 #8

0

ファイルを表示

ファイル: test_new_dictionary.py プロジェクト: kgori/clustering_project

from pylab import *
print 'done.'


def print_dict(d):
    """Print each entry of *d* on its own line, ordered by key.

    The previous body printed the whole dictionary once per key instead of
    the entry belonging to that key.
    """
    for k in sorted(d):
        # Single-argument print call: same output on Python 2 and Python 3.
        print('{0} {1}'.format(k, d[k]))


np.set_printoptions(linewidth=200, precision=3)
sc = SequenceCollection(
    '/Users/kgori/scratch/chk/aa_alignments/',
    get_distances=False,
    file_format='phylip',
    helper='/Users/kgori/git/kevin/clustering_project/class_files/DV_wrapper.drw'
        ,
    parallel_load=True,
    gtp_path='/Users/kgori/git/kevin/clustering_project/class_files/',
    tmpdir='/tmp',
    datatype='protein',
    )

sc_yeast = SequenceCollection(
    '/Users/kgori/scratch/yeast_MSA',
    get_distances=False,
    file_format='phylip',
    helper='/Users/kgori/git/kevin/clustering_project/class_files/DV_wrapper.drw'
        ,
    parallel_load=True,
    gtp_path='/Users/kgori/git/kevin/clustering_project/class_files/',
    tmpdir='/tmp/yeast',

コード例 #9

0

ファイルを表示

ファイル: test_scoll.py プロジェクト: haehn/clustering_project

from tree import Tree
from clustering import Clustering
import cPickle
import time
import os
import copy
import numpy as np
np.set_printoptions(precision=2, linewidth=200)

indir = '/Users/kgori/git/kevin/data/simulated_data/small/MSA'

print 'test directory = ', indir

load_start = time.time()
print 'loading sequences (parallel)'
col = SequenceCollection(indir, datatype='protein')
print col
load_end = time.time()

tcseq_start = time.time()
print 'getting TC trees (sequential)'
col.get_trees(program='treecollection', tmpdir='/tmp')
for rec in col.records:
    print rec.name
    print rec.tree
tcseq_end = time.time()

tcpar_start = time.time()
print 'getting TC trees (parallel)'
col.get_trees_parallel(program='treecollection', tmpdir='/tmp')
for rec in col.records:

コード例 #10

0

ファイルを表示

from pylab import *
print 'done.'


def print_dict(d):
    """Print each entry of *d* on its own line, ordered by key.

    The previous body printed the whole dictionary once per key instead of
    the entry belonging to that key.
    """
    for k in sorted(d):
        # Single-argument print call: same output on Python 2 and Python 3.
        print('{0} {1}'.format(k, d[k]))


np.set_printoptions(linewidth=200, precision=3)
sc = SequenceCollection(
    '/Users/kgori/scratch/chk/aa_alignments/',
    get_distances=False,
    file_format='phylip',
    helper=
    '/Users/kgori/git/kevin/clustering_project/class_files/DV_wrapper.drw',
    parallel_load=True,
    gtp_path='/Users/kgori/git/kevin/clustering_project/class_files/',
    tmpdir='/tmp',
    datatype='protein',
)

sc_yeast = SequenceCollection(
    '/Users/kgori/scratch/yeast_MSA',
    get_distances=False,
    file_format='phylip',
    helper=
    '/Users/kgori/git/kevin/clustering_project/class_files/DV_wrapper.drw',
    parallel_load=True,
    gtp_path='/Users/kgori/git/kevin/clustering_project/class_files/',
    tmpdir='/tmp/yeast',

コード例 #11

0

ファイルを表示

ファイル: test_get_cluster_trees.py プロジェクト: kgori/clustering_project

#!/usr/bin/python
# -*- coding: utf-8 -*-

from sequence_collection import SequenceCollection
import cPickle
import time

indir = "/Users/kgori/git/kevin/data/simulated_data/small/MSA"

print "test directory = ", indir

load_start = time.time()
print "loading sequences (parallel)"
col = SequenceCollection(indir, datatype="protein")
print col
load_end = time.time()

tcpar_start = time.time()
print "putting TC trees (parallel)"
col.put_trees_parallel(program="treecollection", tmpdir="/tmp")
for rec in col.records:
    print rec.name
    print rec.tree
tcpar_end = time.time()

par_start = time.time()
print "Putting partitions"
col.put_partitions(metrics=["sym", "euc"], linkages=["ward", "single"], nclasses=[3, 4, 5, 6])
print col.get_partitions()
par_end = time.time()

コード例 #12

0

ファイルを表示

ファイル: pickle_simulation.py プロジェクト: haehn/clustering_project

                    type=fpath,
                    default='/tmp')
parser.add_argument('-data', '--datatype', help='datatype', default=None)

args = vars(parser.parse_args())
outdir = args['directory']
program = args['program']
model = args['model']
ncat = args['ncat']
datatype = args['datatype']
gtp_path = os.environ['GTP_PATH']
# Only the command-line value (argparse default '/tmp') is ever used.  The
# former os.environ['TEMPORARY_DIRECTORY'] lookup was overwritten on the very
# next line, so its only observable effect was a KeyError when the variable
# was unset.
tmpdir = args['tmpdir']
print 'Reading alignments into SequenceRecord object'
seq = SequenceCollection('{0}/dna_alignments'.format(outdir),
                         datatype='dna',
                         tmpdir=tmpdir,
                         helper=os.environ['DARWINHELPER'])
print 'Calculating trees'
print program, model, datatype, ncat, tmpdir
seq.put_trees(program=program,
              model=model,
              datatype=datatype,
              ncat=ncat,
              tmpdir=tmpdir)
print 'doing geodesic distance matrices'
seq.put_distance_matrices('geo', gtp_path=gtp_path, tmpdir=tmpdir)
print 'doing euc distance matrices'
seq.put_distance_matrices('euc')
print 'doing sym distance matrices'
seq.put_distance_matrices('sym')

コード例 #13

0

ファイルを表示

ファイル: cluster_TC_input.py プロジェクト: haehn/clustering_project

        tree.read_from_file('{0}/trees/{1}.nwk'.format(working_dir,
                            name))

    dv_matrix_strip_header = '\n'.join(dv_matrix.split('\n'
            )[2:]).rstrip()
    labels_strip_header = labels.split('\n')[1].rstrip()
    record = TCSeqRec()
    record.dv = [(dv_matrix_strip_header, labels_strip_header)]
    record.tree = tree
    record.name = name
    record.headers = labels_strip_header.split()
    record.sequences = ['' for _ in record.headers]
    record._update()
    records.append(record)

collection = SequenceCollection(records=records, get_distances=False,
                                gtp_path=os.environ['GTP_PATH'])
collection.put_distance_matrices('rf')
T = \
    collection.Clustering.run_spectral_rotate(collection.distance_matrices['rf'
        ])
collection.partitions[T] = Partition(T)
collection.clusters_to_partitions[('rf', 'spectral_rotate', max(T))] = T
collection.concatenate_records()
cluster_recs = collection.get_cluster_records()

number_of_clusters = len(cluster_recs)
for j in range(number_of_clusters):
    record = cluster_recs[j]
    record_dv = record.dv[0]
    labels = record.dv[1]

コード例 #14

0

ファイルを表示

ファイル: read_alignments_and_cluster.py プロジェクト: haehn/clustering_project

datatype = args['datatype']
score = args['score']

directorycheck_and_quit(input_dir)
directorycheck_and_make(tmpdir)

gtp_path = os.environ['GTP_PATH']
helper = os.environ['DARWINHELPER']

from sequence_collection import SequenceCollection

sc = SequenceCollection(
    input_dir,
    file_format='phylip',
    datatype=datatype,
    helper=helper,
    gtp_path=gtp_path,
    tmpdir=tmpdir,
    overwrite=True,
)

sc.load_phyml_results(input_dir, program=None)
sc.quality_scores = {}
for dist in distance:
    (_, qs) = sc.autotune(dist,
                          max_groups=max_clusters,
                          min_groups=min_clusters)
    sc.quality_scores[dist] = qs
cluster_range = range(min_clusters, max_clusters + 1)
if min_clusters > 1:
    cluster_range.insert(0, 1)

コード例 #15

0

ファイルを表示

        (simdir, 'bionj_clustering'))),
                             key=sort_key)
else:
    records = sorted(load_records('/'.join((simdir, record_dir)),
                                  '*.ml.pickle'),
                     key=sort_key)
    cluster_records = sorted(load_records('/'.join(
        (simdir, 'phyml_clustering'))),
                             key=sort_key)

cluster_dic = make_target_dict(cluster_records)
print '(Done).'

#rebuild sequenceCollection object

sc = SequenceCollection()
sc.records = records

print 'Generating distance matrices...'
for metric in ['euc', 'sym', 'geo']:
    sc.put_distance_matrices(
        metric,
        gtp_path='/homes/kgori/research/clustering_project/class_files',
        tmpdir=tmpdir)
    for method in [
            'single', 'complete', 'ward', 'average', 'spectral', 'MDS',
            'kmedoids'
    ]:
        partition = rebuild_partitions(records,
                                       cluster_dic,
                                       metric=metric,

コード例 #16

0

ファイルを表示

ファイル: calc_scores.py プロジェクト: haehn/clustering_project

    'average',
    'ward',
    'kmedoids',
    'spectral',
    'MDS',
]
calc_varinf = False

if os.path.isfile(outf):
    sys.exit(1)

sc = SequenceCollection(
    seqdir,
    tmpdir=TMPDIR,
    gtp_path=GTP_PATH,
    helper=HELPER,
    file_format=format,
    datatype=datatype,
    parallel_load=False,
    get_distances=False,
)

sc.put_trees(program='bionj',
             model='GTR',
             tmpdir=TMPDIR,
             ncat=4,
             datatype='nt')

# sc.put_distance_matrices(['rf'])
# print sc.get_distance_matrices()['rf']
# sys.exit()

コード例 #17

0

ファイルを表示

ファイル: pickle_simulation.py プロジェクト: kgori/clustering_project

parser.add_argument('-m', '--model', help='which model to use in phylogenetic inference', default=None)
parser.add_argument('-n', '--ncat', help='number of categories for gamma distributed rates', default=1)
parser.add_argument('-t', '--tmpdir', help='temporary directory', type=fpath, default='/tmp')
parser.add_argument('-data', '--datatype', help='datatype', default=None)

args = vars(parser.parse_args())
outdir = args['directory']
program = args['program']
model = args['model']
ncat = args['ncat']
datatype = args['datatype']
gtp_path = os.environ['GTP_PATH']
# Only the command-line value (argparse default '/tmp') is ever used.  The
# former os.environ['TEMPORARY_DIRECTORY'] lookup was overwritten on the very
# next line, so its only observable effect was a KeyError when the variable
# was unset.
tmpdir = args['tmpdir']
print 'Reading alignments into SequenceRecord object'
seq = SequenceCollection('{0}/dna_alignments'.format(outdir), datatype='dna', tmpdir=tmpdir, helper=os.environ['DARWINHELPER'])
print 'Calculating trees'
print program,model,datatype,ncat,tmpdir
seq.put_trees(program=program, model=model, datatype=datatype, ncat=ncat, tmpdir=tmpdir)
print 'doing geodesic distance matrices'
seq.put_distance_matrices('geo', gtp_path=gtp_path, tmpdir=tmpdir)
print 'doing euc distance matrices'
seq.put_distance_matrices('euc')
print 'doing sym distance matrices'
seq.put_distance_matrices('sym')

print 'getting score for true clustering'
# Read the true partition from the first line of treedistances.txt: take the
# second tab-separated field, drop its first and last characters (presumably
# the enclosing brackets) and split on ', '.  The entries stay strings.
# The handle is no longer called ``file`` so the Python 2 builtin of that name
# is not rebound at module level.
with open('{0}/treedistances.txt'.format(outdir)) as handle:
    T = handle.readline().rstrip().split('\t')[1][1:-1].split(', ')
seq.clustering.partitions['true'] = T
seq.put_clusters()

コード例 #18

0

ファイルを表示

ファイル: doclustering.py プロジェクト: kgori/clustering_project

bionjrecords = sorted([cPickle.load(file(x)) for x in bionjpickles], key=lambda x:sort_key(x.name))

# Read the first line of treedistances.txt inside a ``with`` block so the
# file handle is closed deterministically.
with open('{0}/treedistances.txt'.format(indir)) as handle:
    first_line = handle.read().split('\n')[0]
# NOTE(review): eval() executes whatever expression the file contains; this is
# only acceptable while treedistances.txt is produced by trusted code.
# Consider ast.literal_eval if the field is always a plain literal.
true = eval(first_line.split('\t')[1])

for rec in phymlrecords:
    rec.datatype='dna'
for rec in bionjrecords:
    rec.datatype='dna'

# Both record lists must hold exactly 60 entries.  An explicit comparison is
# used instead of ``assert`` inside a bare ``except`` so that the check still
# runs under ``python -O`` and unrelated errors are not silently swallowed.
if not (len(phymlrecords) == len(bionjrecords) == 60):
    print('Missing records in {0}'.format(indir))
    sys.exit(1)

phyml_sc = SequenceCollection(records=phymlrecords, datatype='dna', helper=os.environ['DARWINHELPER'],tmpdir=tmpdir, get_distances=False)
bionj_sc = SequenceCollection(records=bionjrecords, datatype='dna', helper=os.environ['DARWINHELPER'],tmpdir=tmpdir, get_distances=False)

phyml_sc.put_partitions(['geo','euc','sym'],['single','complete','average','ward','kmedoids','MDS','spectral'], 4, gtp_path='/net/isilon7/nobackup/research/goldman/kevin/clustering_project/class_files', tmpdir=tmpdir)
bionj_sc.put_partitions(['geo','euc','sym'],['single','complete','average','ward','kmedoids','MDS','spectral'], 4, gtp_path='/net/isilon7/nobackup/research/goldman/kevin/clustering_project/class_files', tmpdir=tmpdir)
phyml_sc.clustering.partitions['true']=true
bionj_sc.clustering.partitions['true']=true

phyml_sc.put_clusters()
bionj_sc.put_clusters()

if not os.path.isdir('{0}/phyml_clustering'.format(indir)):
    os.mkdir('{0}/phyml_clustering'.format(indir))
if not os.path.isdir('{0}/bionj_clustering'.format(indir)):
    os.mkdir('{0}/bionj_clustering'.format(indir))

コード例 #19

0

ファイルを表示

ファイル: test_code.py プロジェクト: haehn/clustering_project

    col.put_cluster_trees_parallel()
    
5)  Compare scores derived from clusters to random permutation of the original data
    either by making a copy of the SequenceCollection object, with clusters made up
    of the same number of genes with the same number of characters, or by randomising
    the alignments and performing hierarchical clustering on the randomised data

    if the former, do rand1 = col.make_randomised_copy
    if the latter, do rand2 = SequenceCollection(records=col.get_randomised_alignments(),
        datatype = 'protein')
"""

# indir = '/Users/kgori/git/kevin/yeast_data/MSA'
indir = '/Users/kgori/git/kevin/data/simulated_data/eight/MSA'

col = SequenceCollection(indir, datatype='protein')
ran = SequenceCollection(records=col.get_randomised_alignments(),
                         datatype='protein')
col.put_trees_parallel()
ran.put_trees_parallel()
col.put_partitions(metrics=['euc', 'rf', 'sym'],
                   linkages=['ward'],
                   nclasses=[2, 3, 4, 5, 6, 7, 8, 9, 10])
ran.put_partitions(metrics=['euc', 'rf', 'sym'],
                   linkages=['ward'],
                   nclasses=[2, 3, 4, 5, 6, 7, 8, 9, 10])
col.put_clusters()
col.put_cluster_trees_parallel()
ran.put_clusters()
ran.put_cluster_trees_parallel()
rn2 = col.make_randomised_copy()

コード例 #20

0

ファイルを表示

ファイル: test_randomiser.py プロジェクト: haehn/clustering_project

"""
print 'loading sequences...'
col = SequenceCollection(indir, helper=helper, tmpdir=tmpdir)

print 'getting trees...'
col.put_trees_parallel(program='phyml',tmpdir=tmpdir)

print 'getting partitions...'
col.put_partitions(metrics=['sym'],linkages=['ward'],nclasses=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17])
#print 'randomizing bytes...'
col.put_clusters()

#print 'immanentizing the eschaton...'
col.put_cluster_trees_parallel(program='phyml',tmpdir=tmpdir)
"""
plottable = []
#plottable.append(add_to_plot(col,'sym','ward'))
col = cPickle.load(file('col.pickle'))
#print 'whipping into frenzy...'
for i in range(1):
    r = SequenceCollection(records=col.get_randomised_alignments(), helper=helper, tmpdir=tmpdir)
    r.put_trees_parallel(program='phyml',tmpdir=tmpdir)
    r.put_partitions(metrics=['sym'],linkages=['ward'],nclasses=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17])
    r.put_clusters()
    r.put_cluster_trees_parallel(program='phyml',tmpdir=tmpdir)   
    plottable.append(add_to_plot(r, 'sym', 'ward'))
    
cPickle.dump(plottable, file('plottable{0}.pickle'.format(index),'w'))
#cPickle.dump(col, file('col.pickle','w'))

コード例 #21

0

ファイルを表示

ファイル: test_get_trees.py プロジェクト: kgori/clustering_project

#!/usr/bin/python
# -*- coding: utf-8 -*-


from sequence_collection import SequenceCollection
import time

# Input directory handed to SequenceCollection below (hard-coded path).
indir = '/Users/kgori/git/kevin/data/real_data/yeast_data/MSA'

print 'test directory = ', indir

# Time the construction of the collection.
# NOTE(review): the timed window also includes the 'print col' call.
load_start = time.time()
print 'loading sequences (parallel)'
col = SequenceCollection(indir, datatype='dna')
print col
load_end = time.time()

# Remaining pipeline steps: trees, partitions for 2..16 classes, clusters and
# cluster trees.  None of these calls is timed.
col.put_trees_parallel()
# NOTE(review): the metric is spelled 'geodesic' here - confirm put_partitions
# accepts this name.
col.put_partitions(metrics=['sym','geodesic'], linkages=['ward'], nclasses=[2,3,4,5,6,7,8,9,10,11,12,13,14,15,16])
col.put_clusters()
col.put_cluster_trees_parallel()

# Only the load duration is collected and reported.
timings = [
load_end - load_start
]

print 'time = {0:.3f}'.format(*timings)

コード例 #22

0

ファイルを表示

ファイル: test_scoll.py プロジェクト: kgori/clustering_project

from tree import Tree
from clustering import Clustering
import cPickle
import time
import os
import copy
import numpy as np
np.set_printoptions(precision=2, linewidth=200)

indir = '/Users/kgori/git/kevin/data/simulated_data/small/MSA'

print 'test directory = ', indir

load_start = time.time()
print 'loading sequences (parallel)'
col = SequenceCollection(indir, datatype='protein')
print col
load_end = time.time()

tcseq_start = time.time()
print 'getting TC trees (sequential)'
col.get_trees(program='treecollection', tmpdir='/tmp')
for rec in col.records:
    print rec.name
    print rec.tree
tcseq_end = time.time()

tcpar_start = time.time()
print 'getting TC trees (parallel)'
col.get_trees_parallel(program='treecollection', tmpdir='/tmp')
for rec in col.records:

コード例 #23

0

ファイルを表示

ファイル: read_alignments_and_cluster.py プロジェクト: kgori/clustering_project

datatype = args['datatype']
score = args['score']

directorycheck_and_quit(input_dir)
directorycheck_and_make(tmpdir)

gtp_path = os.environ['GTP_PATH']
helper = os.environ['DARWINHELPER']

from sequence_collection import SequenceCollection

sc = SequenceCollection(
    input_dir,
    file_format='phylip',
    datatype=datatype,
    helper=helper,
    gtp_path=gtp_path,
    tmpdir=tmpdir,
    overwrite=True,
    )

sc.load_phyml_results(input_dir, program=None)
sc.quality_scores = {}
for dist in distance:
    (_, qs) = sc.autotune(dist, max_groups=max_clusters,
                          min_groups=min_clusters)
    sc.quality_scores[dist] = qs
cluster_range = range(min_clusters, max_clusters+1)
if min_clusters > 1:
    cluster_range.insert(0, 1)
sc.put_partitions(distance, method, cluster_range)

コード例 #24

0

ファイルを表示

ファイル: findings.py プロジェクト: kgori/clustering_project

print 'Reading records...'

# some initialisation
if program == 'bionj':
    records = sorted(load_records('/'.join((simdir, record_dir)), '*.nj.pickle'), key=sort_key)
    cluster_records = sorted(load_records('/'.join((simdir, 'bionj_clustering'))), key=sort_key)
else:
    records = sorted(load_records('/'.join((simdir, record_dir)), '*.ml.pickle'), key=sort_key)
    cluster_records = sorted(load_records('/'.join((simdir, 'phyml_clustering'))), key=sort_key)

cluster_dic = make_target_dict(cluster_records)
print '(Done).'

#rebuild sequenceCollection object

sc = SequenceCollection()
sc.records = records

print 'Generating distance matrices...'
for metric in ['euc', 'sym', 'geo']:
    sc.put_distance_matrices(metric, gtp_path = '/homes/kgori/research/clustering_project/class_files', tmpdir=tmpdir)
    for method in ['single', 'complete', 'ward', 'average', 'spectral', 'MDS', 'kmedoids']:
        partition = rebuild_partitions(records, cluster_dic, metric=metric, method=method)
        sc.clustering.partitions[(metric, method, 4)] = partition

sc.clustering.partitions['true'] = [1]*15 + [2]*15 + [3]*15 + [4]*15

sc.put_clusters()
print '(Done).'

for rec in sc.get_cluster_records():

コード例 #25

0

ファイルを表示

ファイル: load_final_results.py プロジェクト: kgori/clustering_project

import argparse
import re
import sys
from errors import filecheck_and_quit, directorycheck_and_quit

progname = re.compile('[A-Za-z0-9.-_]+').search(sys.argv[0]).group()
desc = 'Read in a SequenceCollection from disk and print scores'
input_help = 'Filepath+name of gzipped SequenceCollection object'
category_choices = ['Observed', 'Randomised', 'Simulated', 'NA']

parser = argparse.ArgumentParser(prog=progname, description=desc)
parser.add_argument('-i', dest='input_file', help=input_help, type=str)
parser.add_argument('-t', dest='phyml_dir', help=input_help, type=str)

args = parser.parse_args()
input_file = args.input_file
phyml_dir = args.phyml_dir.rstrip('/')

filecheck_and_quit(input_file)
directorycheck_and_quit(phyml_dir)

from sequence_collection import SequenceCollection

sc = SequenceCollection.gunzip(input_file)
cluster_records = sc.get_cluster_records()
sc.load_phyml_results(phyml_dir, records=cluster_records,
                      use_hashname=True)
sc.update_scores()
sc.gzip(input_file)

コード例 #26

0

ファイルを表示

print indir

rob.r('library(ape)')
rob.r('library(phangorn)')
print 'r libraries loaded'

alignments_dir = '{0}/dna_alignments'.format(indir)

# Reuse a previously pickled collection when one exists next to the input;
# otherwise build it from the alignments and cache it for the next run.
# File handles are opened in ``with`` blocks so they are closed (and the dump
# flushed) deterministically instead of being left to garbage collection.
pickle_path = '{0}/tmpPickle.pkl'.format(indir)
if os.path.isfile(pickle_path):
    # NOTE(review): unpickling runs code embedded in the file; only load
    # caches written by this script.
    with open(pickle_path) as handle:
        sc = cPickle.load(handle)
else:

    sc = SequenceCollection(
        alignments_dir,
        file_format='fasta',
        datatype='dna',
        gtp_path=gtp_path,
        helper=helper,
        tmpdir=tmpdir,
        )

    sc.put_trees(program='bionj')
    sc.put_partitions('geo', 'spectral', 4)
    sc.concatenate_records()
    sc.put_cluster_trees(program='bionj')
    with open(pickle_path, 'w') as handle:
        cPickle.dump(sc, handle)
print 'SC object available'

# Plot the heatmap of the distance matrix

dm = sc.get_distance_matrices()['geo']
p = sc.partitions[sc.clusters_to_partitions[('geo', 'spectral', 4)]]