#!/usr/bin/env python
# Example script: print the first successful annotation from a partis annotation csv,
# then print the partitions stored in a partis partition csv.
import csv
import sys

partis_path = '.'  # edit this if you're not running from the main partis dir
sys.path.insert(1, partis_path + '/python')  # must come before the partis imports below
import utils
import glutils
from clusterpath import ClusterPath

reference_dir = partis_path + '/test/reference-results'

# default germline info
glfo = glutils.read_glfo(partis_path + '/data/germlines/human', locus='igh')

print('first parse an annotation csv file:')
with open(reference_dir + '/annotate-new-simu.csv') as annotation_file:
    for annotation in csv.DictReader(annotation_file):
        if annotation['v_gene'] != '':  # an empty v_gene means it failed (i.e. couldn't find an annotation), so keep looking
            utils.process_input_line(annotation)
            utils.add_implicit_info(glfo, annotation)
            utils.print_reco_event(annotation)
            break  # only the first successful annotation is printed

print('then parse a partition csv file:')
cpath = ClusterPath()
cpath.readfile(reference_dir + '/seed-partition-new-simu.csv')
cpath.print_partitions(abbreviate=True)
def _drop_duplicate_seqfos(seqfos):
    """Return <seqfos> without the entries whose sequence already occurred earlier in the list."""
    seen_seqs = set()
    names_to_remove = set()
    for sfo in seqfos:
        if sfo['seq'] in seen_seqs:
            names_to_remove.add(sfo['name'])  # remove any after the first one
        else:
            seen_seqs.add(sfo['seq'])
    # filter on name (not on position), so an entry that shares its name with a removed duplicate is dropped as well
    return [sfo for sfo in seqfos if sfo['name'] not in names_to_remove]


def _bios2mds_cmdlines(msafname, mdsfname, clusterfname, seed, n_components, n_clusters, max_runs, max_iterations, method):
    """Return the lines of the R script that runs bios2mds on <msafname> (n_components must not be None)."""
    cmdlines = [
        'options(rgl.useNULL=TRUE)',
        'require(bios2mds, quietly=TRUE)',
        'set.seed(%d)' % seed,
        'human <- import.fasta("%s")' % msafname,
        'active <- mat.dif(human, human)',  # mat.dif or mat.dis?
        'mmds_active <- mmds(active, pc=%d)' % n_components,
        'capture.output(mmds_active$coord, file="%s")' % mdsfname,
    ]
    if n_clusters is not None:
        cmdlines += [
            'kmeans.run1 <- kmeans.run(mmds_active$coord, nb.clus=%d, nb.run=%d, iter.max=%d, method="%s")' % (n_clusters, max_runs, max_iterations, method),
            'options(width=10000)',  # presumably so capture.output() doesn't wrap long lines -- TODO confirm
            'capture.output(kmeans.run1$clusters, file="%s")' % clusterfname,
            # NOTE kmeans.run1$elements is also available, and sil.score(mat, nb.clus=c(2:13), nb.run=100, iter.max=1000, method="euclidean")
            # could be used to run for every possible number of clusters (?)
        ]
    return cmdlines


def run_bios2mds(n_components, n_clusters, seqfos, base_workdir, seed, aligned=False, reco_info=None, region=None,
                 max_runs=100, max_iterations=1000, method='euclidean', plotdir=None, plotname='mds', queries_to_include=None, color_scale_vals=None, labels=None, title=None, remove_duplicates=False, debug=False):
    """Run the R package bios2mds on <seqfos>: mds, then (optionally) kmeans clustering and plotting.

    Arguments:
      n_components: number of components, passed to mmds() as <pc>. None is not implemented.
      n_clusters: if not None, number of clusters passed to kmeans.run() as <nb.clus>; if None no clustering is run.
      seqfos: list of dicts, each with a 'name' and a 'seq' key. Not modified.
      base_workdir: work files are written to (and afterwards removed from) <base_workdir>/mds.
      seed: integer passed to R's set.seed().
      aligned: if True, the sequences are written to the msa file as they are, otherwise utils.align_many_seqs() writes it.
      reco_info, region: if <reco_info> is set (and <plotdir> is set), an additional 'true-genes' plot is made, labeled with reco_info[uid][region + '_gene'].
      max_runs, max_iterations, method: passed to kmeans.run() as <nb.run>, <iter.max> and <method>.
      plotdir: if not None, plot_mds() is called with this directory.
      plotname, queries_to_include, color_scale_vals, labels, title: passed on to plot_mds().
      remove_duplicates: if True, keep only the first entry for each distinct sequence instead of raising an exception.
      debug: if True, print the kmeans partition; if False, print the time taken by the R step and by the plotting.

    Returns the partition from read_kmeans_clusterfile(), or None if <n_clusters> is None.

    Raises Exception if there are duplicate sequences (and <remove_duplicates> isn't set), or if <n_components> is None.
    """
    # check the arguments before touching the file system, so that a failure here doesn't leave any work files behind
    if len(set(sfo['seq'] for sfo in seqfos)) < len(seqfos):  # it'll just crash when it's running mds later, but this is faster
        if not remove_duplicates:
            raise Exception('duplicate sequences in seqfos')
        seqfos = _drop_duplicate_seqfos(seqfos)
    if n_components is None:
        raise Exception('need to implement')

    workdir = base_workdir + '/mds'
    msafname = workdir + '/msa.fa'
    mdsfname = workdir + '/components.txt'
    clusterfname = workdir + '/clusters.txt'
    if not os.path.exists(workdir):
        os.makedirs(workdir)

    if aligned:  # NOTE unlike the sklearn version below, this doesn't modify <seqfos>
        with open(msafname, 'w') as fastafile:
            for sfo in seqfos:
                fastafile.write('>%s\n%s\n' % (sfo['name'], sfo['seq']))
    else:
        utils.align_many_seqs(seqfos, outfname=msafname)

    # build the R cmd file
    cmdlines = _bios2mds_cmdlines(msafname, mdsfname, clusterfname, seed, n_components, n_clusters, max_runs, max_iterations, method)

    rstart = time.time()
    try:
        utils.run_r(cmdlines, workdir)  # , print_time='kmeans')
    except subprocess.CalledProcessError as e:  # typically happens because of complex eigenvalues
        print(e)
        print(' mds failed on cluster')  # NOTE will still crash in read_kmeans_clusterfile(), but I'm not using that a.t.m.
        title = (title if title is not None else '') + ' mds failed'
    pcvals = read_component_file(mdsfname, n_components, seqfos)
    partition = read_kmeans_clusterfile(clusterfname, seqfos) if n_clusters is not None else None
    rstop = time.time()
    if debug and partition is not None:
        print(' kmeans partition:')
        cp = ClusterPath(partition=partition)
        cp.print_partitions(abbreviate=False)

    os.remove(msafname)
    os.rmdir(workdir)  # only succeeds if the component and cluster files are gone by now -- presumably the read_*() fcns above remove them (verify)

    plotstart = time.time()
    if plotdir is not None:
        # utils.prep_dir(plotdir, wildlings=['*.svg'])
        # <partition> is already None if we didn't cluster
        plot_mds(n_components, pcvals, plotdir, plotname, partition=partition, queries_to_include=queries_to_include, color_scale_vals=color_scale_vals, labels=labels, title=title)
        if reco_info is not None:
            gene_labels = {uid : reco_info[uid][region + '_gene'] for uid in pcvals}
            plot_mds(n_components, pcvals, plotdir, 'true-genes', labels=gene_labels, queries_to_include=queries_to_include, color_scale_vals=color_scale_vals, title=title)

    if not debug:  # this isn't a great way to do this, but I don't want to deal with finding all the calling functions, I just want to add some debug printing to this fcn
        # NOTE the trailing comma is the python 2 print statement's way of suppressing the newline
        print(' %5.1f %5.1f' % (rstop - rstart, time.time() - plotstart)),

    return partition
#!/usr/bin/env python
# Read a partis partition csv file and print the partitions in it.
import sys
sys.path.insert(1, './python')  # must come before the partis imports below
import csv
csv.field_size_limit(sys.maxsize)  # raise the csv parser's field size limit, so that very large csv fields don't cause an error
import argparse

from clusterpath import ClusterPath
from seqfileopener import get_seqfile_info
import utils

parser = argparse.ArgumentParser()
parser.add_argument('--infname', required=True)  # partition csv file to read
parser.add_argument(
    '--dont-abbreviate',
    action='store_true',
    help="Print full seq IDs (otherwise just prints an 'o')",
)
parser.add_argument(
    '--n-to-print',
    type=int,
    help='How many partitions to print (centered on the best partition)',
)
parser.add_argument('--datadir', default='data/imgt')  # germline set is read from here
parser.add_argument('--simfname')  # if set, it's read with get_seqfile_info() and the resulting reco info is passed to the printing
parser.add_argument('--is-data', action='store_true')  # passed on to get_seqfile_info()
args = parser.parse_args()

glfo = utils.read_germline_set(args.datadir)

if args.simfname is None:
    reco_info = None
else:
    input_info, reco_info = get_seqfile_info(args.simfname, args.is_data, glfo=glfo)

cp = ClusterPath()
cp.readfile(args.infname)
abbreviate = not args.dont_abbreviate
cp.print_partitions(abbreviate=abbreviate, n_to_print=args.n_to_print, reco_info=reco_info)
# Example script: print the first successful annotation from a partis annotation csv (together with
# its naive and mature cdr3 sequences), then print the partitions stored in a partis partition csv.
import csv  # needed for csv.DictReader below
import sys

partis_path = '.'  # edit this if you're not running from the main partis dir
sys.path.insert(1, partis_path + '/python')  # must come before the partis imports below
import utils
import glutils
from clusterpath import ClusterPath

# read default germline info
glfo = glutils.read_glfo(partis_path + '/data/germlines/human', chain='h')

print('first parse an annotation csv file:')
with open(partis_path + '/test/reference-results/annotate-new-simu.csv') as csvfile:
    reader = csv.DictReader(csvfile)
    for line in reader:
        if line['v_gene'] == '':  # failed (i.e. couldn't find an annotation), so there's nothing to process
            continue
        utils.process_input_line(line)
        utils.add_implicit_info(glfo, line)
        utils.print_reco_event(glfo['seqs'], line)
        # slice from the v codon position through the j codon position plus three (presumably so the whole j codon is included -- TODO confirm)
        cdr3_start = line['codon_positions']['v']
        cdr3_stop = line['codon_positions']['j'] + 3
        print('')
        print(' should match the above:')
        print(' %s naive cdr3' % line['naive_seq'][cdr3_start:cdr3_stop])
        print(' %s mature' % line['indel_reversed_seqs'][0][cdr3_start:cdr3_stop])
        print('')
        break  # only the first successful annotation is printed

print('then parse a partition csv file:')
cp = ClusterPath()
cp.readfile(partis_path + '/test/reference-results/seed-partition-new-simu.csv')
cp.print_partitions(abbreviate=True)