def simulate(): rearrange() glfo, naive_event_list, cpath = utils.read_output(naive_fname()) assert len(naive_event_list) == args.n_sim_events outdirs = [ '%s/event-%d' % (simdir(), i) for i in range(len(naive_event_list)) ] for ievent, (naive_line, outdir) in enumerate(zip(naive_event_list, outdirs)): run_bcr_phylo(naive_line, outdir, ievent) if utils.output_exists( args, simfname(), outlabel='mutated simu', offset=4 ): # i guess if it crashes during the plotting just below, this'll get confused return mutated_events = [] for ievent, (naive_line, outdir) in enumerate(zip(naive_event_list, outdirs)): mutated_events.append( parse_bcr_phylo_output(glfo, naive_line, outdir, ievent)) print ' writing annotations to %s' % simfname() utils.write_annotations(simfname(), glfo, mutated_events, utils.simulation_headers) import plotting for outdir, event in zip(outdirs, mutated_events): plotting.plot_bcr_phylo_simulation(outdir, event, args.extrastr, args.metric_for_target_distance)
def test_advanced(output_test_path, rendered_template, helper, client):
    """Render AdvancedForm through a three-row layout and compare it with the stored "test_advanced.html"."""
    form = AdvancedForm()
    template_pack = helper.template_pack

    # rows are built in the same order the layout lists them
    fields_row = Row(
        Column('simple', css_class='six'),
        Column('opt_in', css_class='six'),
    )
    text_row = Row(
        Column('longtext'),
    )
    submit_row = Row(
        Column(
            ButtonHolder(Submit('submit', 'Submit')),
        ),
        css_class="large",
    )
    helper.layout = Layout(fields_row, text_row, submit_row)

    rendered = rendered_template(form, helper=helper)
    expected = read_output(output_test_path, template_pack, "test_advanced.html")
    #write_output(output_test_path, template_pack, "test_advanced.html", rendered)
    assert rendered == expected
def __init__(self, root_user_id="", max_population=24, \
        max_friends_per_user=5, community_file="", new=True, safe=False):
    '''Start a new community or load an existing one to add to.

    When new is false, the community is read from community_file and
    root_user_id is ignored.  community_file is also remembered as the
    path the community is written to.
    '''
    logging.basicConfig(
        filename=LOG_FILENAME,
        level=logging.DEBUG,
        format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
        datefmt='%m-%d %H:%M',
    )

    self._new = new
    self._write_path = community_file
    self._max_population = max_population
    self._max_friends_per_user = max_friends_per_user
    self._safe = safe

    if not new:
        # add to existing community
        loaded = utils.read_output(community_file)
        self._community = loaded
        self._root_user_id = loaded['root_user_id']
        self._node_pool = loaded['node_pool']
        self._community_members = loaded['members']
    elif root_user_id:
        # new community
        self._root_user_id = root_user_id
        self._node_pool = {}
        self._community_members = []
        self._community = {
            'root_user_id': root_user_id,
            'node_pool': {},
            'members': [],
        }
    # NOTE(review): with new true and an empty root_user_id neither branch
    # runs, so none of the community attributes above get set.
def simulate(): rearrange() glfo, naive_event_list, cpath = utils.read_output('%s/naive-simu.yaml' % simdir(args.stype)) assert len(naive_event_list) == args.n_sim_events outdirs = [ '%s/event-%d' % (simdir(args.stype), i) for i in range(len(naive_event_list)) ] print ' running bcr-phylo for %d naive rearrangements' % len( naive_event_list) for ievent, (naive_line, outdir) in enumerate(zip(naive_event_list, outdirs)): run_bcr_phylo(naive_line, outdir, ievent) mutated_events = [] for ievent, (naive_line, outdir) in enumerate(zip(naive_event_list, outdirs)): mutated_events.append( parse_bcr_phylo_output(glfo, naive_line, outdir, ievent)) print ' writing annotations to %s' % simfname(args.stype) utils.write_annotations(simfname(args.stype), glfo, mutated_events, utils.simulation_headers) import plotting for outdir, event in zip(outdirs, mutated_events): plotting.plot_bcr_phylo_simulation(outdir, event, args.extrastr)
def __init__(self, root_user_id="", max_population=24, \
        max_friends_per_user=5, community_file="", new=True, safe=False):
    '''Set up a community: a fresh one rooted at root_user_id, or one
    loaded from community_file when new is false (root_user_id is then
    unused).  community_file doubles as the path the community is
    saved to.'''
    log_options = {
        'filename': LOG_FILENAME,
        'level': logging.DEBUG,
        'format': '%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
        'datefmt': '%m-%d %H:%M',
    }
    logging.basicConfig(**log_options)

    self._new = new
    self._write_path = community_file
    self._max_population = max_population
    self._max_friends_per_user = max_friends_per_user
    self._safe = safe

    starting_fresh = new and root_user_id
    if starting_fresh:
        #new community
        self._root_user_id = root_user_id
        self._node_pool = {}
        self._community_members = []
        self._community = {
            'root_user_id': root_user_id,
            'node_pool': {},
            'members': [],
        }
    elif not new:
        #add to existing community
        self._community = utils.read_output(community_file)
        existing = self._community
        self._root_user_id = existing['root_user_id']
        self._node_pool = existing['node_pool']
        self._community_members = existing['members']
    # NOTE(review): new=True together with an empty root_user_id falls
    # through both branches, leaving the community attributes unset.
def read_partis_output(partition_file, glfo_dir=None, locus=None):
    """Read a partis output file and return (glfo, annotation_list, cpath).

    For anything other than a ".yaml" file, germline info is first read from
    glfo_dir (default_glfo_dir when glfo_dir is falsy) and handed to
    utils.read_output().
    """
    if utils.getsuffix(partition_file) == ".yaml":
        seed_glfo = None
    else:
        seed_glfo = glutils.read_glfo(glfo_dir or default_glfo_dir, locus)

    # returns glfo from the file if it's there, otherwise it returns the one we passed in
    glfo, annotation_list, cpath = utils.read_output(partition_file, glfo=seed_glfo)
    return glfo, annotation_list, cpath
def processed_data(args): """Uses args to find the correct partition, cluster pair and all associated information. Cluster information is returned as by process_cluster.""" print("calling utils.read_output with args:", args.partition_file, args.glfo) file_glfo, annotation_list, cpath = utils.read_output(args.partition_file, glfo=args.glfo) if annotation_list is None: raise Exception('cluster annotation file not found') if file_glfo: # will only be set if we're reading a yaml file args.glfo = file_glfo # select partition, relative to best partition ipart = cpath.i_best + args.partition # select cluster; unique_ids takes highest precedence if args.unique_ids: cluster_unique_ids = args.unique_ids # default to seed, when possibile elif cpath.seed_unique_id and not args.cluster: cluster_unique_ids = next(cluster for cluster in cpath.partitions[ipart] if cpath.seed_unique_id in cluster) # otherwise, assume we have args.cluster or default it to 0 else: clusters = sorted(cpath.partitions[ipart], key=len, reverse=True) cluster_unique_ids = clusters[args.cluster or 0] # Get cluster annotation and put together into annotations = [ l for l in annotation_list if l['unique_ids'] == cluster_unique_ids ] if len(annotations) == 0: raise ValueError( 'requested uids %s not found in %s' % (cluster_unique_ids, args.partition_file) ) # it was a value error before, so I'm leaving it at that elif len(annotations) > 1: print '%s more than one annotation with requested uids %s found in %s' % ( utils.color('red', 'warning'), cluster_unique_ids, args.partition_file) # shouldn't be possible cluster_annotation = annotations[0] data = { 'n_clusters': len(cpath.partitions[ipart]), 'logprob': cpath.logprobs[ipart], 'partition_file': args.partition_file, 'last_modified': time.ctime(os.path.getmtime(args.partition_file)) } if args.seqs_out: data['seqs_file'] = os.path.relpath(args.seqs_out, args.paths_relative_to) # Process the annotation file specific details/data data.update(process_cluster(args, 
cluster_annotation, cpath.seed_unique_id)) return data
def test_basic(output_test_path, rendered_template, helper, client):
    """Render BasicInputForm with the helper as given and compare it with the stored "test_basic.html"."""
    form = BasicInputForm()
    template_pack = helper.template_pack

    rendered = rendered_template(form, helper=helper)
    expected = read_output(output_test_path, template_pack, "test_basic.html")
    #write_output(output_test_path, template_pack, "test_basic.html", rendered)
    assert rendered == expected
def test_layout(output_test_path, rendered_template, helper, client):
    """Render BasicInputFormLayoutIncluded (built with the helper) and compare it with the stored "test_layout.html"."""
    form = BasicInputFormLayoutIncluded(helper=helper)
    template_pack = helper.template_pack

    # the helper goes to the form constructor here, not to rendered_template
    rendered = rendered_template(form)
    expected = read_output(output_test_path, template_pack, "test_layout.html")
    #write_output(output_test_path, template_pack, "test_layout.html", rendered)
    assert rendered == expected
def main(): community = read_output('pickled_populations/lizardbill_11_20_2010') s = PopulationStats(community['members']) community_member_names = s.all_user_names() for user in community['members']: print '' print 'User:'******'screen_name'] print 'ID:', user['uid'] print 'Tweet Count:', len(user['tweets']) print 'Friend IDs:', len(user['friend_ids']) print 'Follower IDs:', len(user['follower_ids']) print 'Community Members:', len(community['members'])
def read_rearrangements():
    """Read the naive simulation output and return (glfos, naive_events).

    In the single-locus case glfos is a one-element list; with args.paired_loci
    it is whatever paircluster.concat_heavy_chain() returns first.
    """
    if not args.paired_loci:
        single_glfo, naive_events, _ = utils.read_output(naive_fname(None))
        return [single_glfo], naive_events

    lp_infos = paircluster.read_lpair_output_files(lpairs(), naive_fname, dbgstr='naive simulation')
    naive_events = paircluster.get_both_lpair_antn_pairs(lpairs(), lp_infos)
    # per-locus glfos with concat'd heavy chain
    glfos, _, _ = paircluster.concat_heavy_chain(lpairs(), lp_infos)
    return glfos, naive_events
def test_inlinefield(output_test_path, rendered_template, helper, client):
    """Render BasicInputForm with an InlineField layout and compare it with the stored "test_inlinefield.html"."""
    form = BasicInputForm()
    template_pack = helper.template_pack

    inline_field = InlineField(
        'simple',
        label_column='large-7',
        input_column='large-5',
        label_class='foobar',
    )
    helper.layout = Layout(inline_field)

    rendered = rendered_template(form, helper=helper)
    expected = read_output(output_test_path, template_pack, "test_inlinefield.html")
    #write_output(output_test_path, template_pack, "test_inlinefield.html", rendered)
    assert rendered == expected
def test_buttongroup(output_test_path, rendered_template, helper, client):
    """Render BasicInputForm with a field plus a ButtonGroup and compare it with the stored "test_buttongroup.html"."""
    form = BasicInputForm()
    template_pack = helper.template_pack

    buttons = ButtonGroup(
        Submit('Save', 'Save'),
        Button('Cancel', 'Cancel'),
    )
    helper.layout = Layout('simple', buttons)

    rendered = rendered_template(form, helper=helper)
    expected = read_output(output_test_path, template_pack, "test_buttongroup.html")
    #write_output(output_test_path, template_pack, "test_buttongroup.html", rendered)
    assert rendered == expected
def test_tab(output_test_path, rendered_template, helper, client):
    """Render AdvancedForm inside a three-tab TabHolder and compare it with the stored "test_tab.html"."""
    form = AdvancedForm()
    template_pack = helper.template_pack

    tabs = TabHolder(
        TabItem('My tab 1', 'simple'),
        TabItem('My tab 2', 'opt_in'),
        TabItem('My tab 3', 'longtext'),
        css_id="meep-meep",
    )
    helper.layout = Layout(tabs)

    rendered = rendered_template(form, helper=helper)
    expected = read_output(output_test_path, template_pack, "test_tab.html")
    #write_output(output_test_path, template_pack, "test_tab.html", rendered)
    assert expected == rendered
def test_inlineswitchfield(output_test_path, rendered_template, helper, client):
    """Render BoolInputForm with an InlineSwitchField layout and compare it with the stored "test_inlineswitchfield.html"."""
    form = BoolInputForm()
    template_pack = helper.template_pack

    switch_field = InlineSwitchField(
        'opt_in',
        label_column='large-8',
        input_column='large-4',
        label_class='foobar',
        switch_class="inline",
    )
    helper.layout = Layout(switch_field)

    rendered = rendered_template(form, helper=helper)
    expected = read_output(output_test_path, template_pack, "test_inlineswitchfield.html")
    #write_output(output_test_path, template_pack, "test_inlineswitchfield.html", rendered)
    assert rendered == expected
def test_accordion(output_test_path, rendered_template, helper, client):
    """Render AdvancedForm inside a three-item AccordionHolder and compare it with the stored "test_accordion.html"."""
    form = AdvancedForm()
    template_pack = helper.template_pack

    # A fixed 'css_id' keeps the test from failing on an automatically generated random ID
    accordion = AccordionHolder(
        AccordionItem('Group 1', 'simple'),
        AccordionItem('Group 2', 'opt_in'),
        AccordionItem('Group 3', 'longtext'),
        css_id="meep-meep",
    )
    helper.layout = Layout(accordion)

    rendered = rendered_template(form, helper=helper)
    expected = read_output(output_test_path, template_pack, "test_accordion.html")
    #write_output(output_test_path, template_pack, "test_accordion.html", rendered)
    assert expected == rendered
def simulate(): rearrange() glfo, naive_event_list, cpath = utils.read_output(naive_fname()) assert len(naive_event_list) == args.n_sim_events outdirs = ['%s/event-%d' % (simdir(), i) for i in range(len(naive_event_list))] start = time.time() cmdfos = [] if args.n_procs > 1: print ' starting %d events' % len(naive_event_list) uid_str_len = 6 + int(math.log(len(naive_event_list), 10)) # if the final sample's going to contain many trees, it's worth making the uids longer so there's fewer collisions/duplicates for ievent, (naive_line, outdir) in enumerate(zip(naive_event_list, outdirs)): if args.n_sim_events > 1 and args.n_procs == 1: print ' %s %d' % (utils.color('blue', 'ievent'), ievent) cfo = run_bcr_phylo(naive_line, outdir, ievent, len(naive_event_list), uid_str_len=uid_str_len) # if n_procs > 1, doesn't run, just returns cfo if cfo is not None: print ' %s %s' % (utils.color('red', 'run'), cfo['cmd_str']) cmdfos.append(cfo) if args.n_procs > 1 and len(cmdfos) > 0: utils.run_cmds(cmdfos, shell=True, n_max_procs=args.n_procs, batch_system='slurm' if args.slurm else None, allow_failure=True, debug='print') print ' bcr-phylo run time: %.1fs' % (time.time() - start) if utils.output_exists(args, simfname(), outlabel='mutated simu', offset=4): # i guess if it crashes during the plotting just below, this'll get confused return start = time.time() mutated_events = [] for ievent, (naive_line, outdir) in enumerate(zip(naive_event_list, outdirs)): mutated_events.append(parse_bcr_phylo_output(glfo, naive_line, outdir, ievent)) print ' parsing time: %.1fs' % (time.time() - start) print ' writing annotations to %s' % simfname() utils.write_annotations(simfname(), glfo, mutated_events, utils.simulation_headers) if not args.only_csv_plots: import lbplotting for outdir, event in zip(outdirs, mutated_events): lbplotting.plot_bcr_phylo_simulation(outdir, event, args.extrastr, lbplotting.metric_for_target_distance_labels[args.metric_for_target_distance])
'--max-family-size', type=int, help='subset each family down to this size before passing to treeutils') parser.add_argument( '--min-selection-metric-cluster-size', type=int, default=treeutils.default_min_selection_metric_cluster_size) parser.add_argument('--include-relative-affy-plots', action='store_true') # tree plots turned off in the treeutils fcn atm # parser.add_argument('--ete-path', default=('/home/%s/anaconda_ete/bin' % os.getenv('USER')) if os.getenv('USER') is not None else None) # parser.add_argument('--workdir') # only required to make ete trees args = parser.parse_args() if args.n_max_queries is not None: print ' --n-max-queries set to %d' % args.n_max_queries glfo, true_lines, _ = utils.read_output(args.infname, n_max_queries=args.n_max_queries) # numpy.random.seed(1) if args.max_family_size is not None: for line in [ l for l in true_lines if len(l['unique_ids']) > args.max_family_size ]: iseqs_to_keep = numpy.random.choice(range(len(line['unique_ids'])), args.max_family_size) utils.restrict_to_iseqs(line, iseqs_to_keep, glfo) if args.metric_method == 'dtr': treeutils.calculate_tree_metrics( None, args.lb_tau,
reverse=True) total_prob = 0. for naive_seq, prob in nseq_info: print ' %s %5.2f %s' % (utils.color_mutants( naive_seq if ref_seq is None else ref_seq, naive_seq), prob, utils.color(namecolor, namestr)) if ref_seq is None: ref_seq = naive_seq if 1. - total_prob < args.prob_to_ignore: break total_prob += prob return ref_seq glfo, annotation_list, cpath = utils.read_output( '%s/%s/partition-with-alternative-annotations.yaml' % (args.basedir, args.locus)) lh_info = read_linearham_output() # print annotations for the biggest cluster in the most likely partition annotations = { ':'.join(adict['unique_ids']): adict for adict in annotation_list } # collect the annotations in a dictionary so they're easier to access most_likely_partition = cpath.partitions[ cpath. i_best] # a partition is represented as a list of lists of strings, with each string a sequence id sorted_clusters = sorted(most_likely_partition, key=len, reverse=True) for cluster in sorted_clusters: line = annotations[':'.join(cluster)] print ':'.join(line['unique_ids'])
def run_train(args, hypers):
    """Train the encoder/decoder parser with a hand-written gradient-descent loop.

    Reads the train/dev inputs and the action sequences named in ``args``, builds the
    vocabularies and the neural components, then loops over the training instances,
    one instance per update.  Every ``args.check_per_update`` updates the average loss
    is printed.  Every ``args.eval_per_update`` updates the dev set is parsed, written
    to "tmp/dev.output.tmp" and scored, and the model state is saved under
    ``args.model_path_base`` when the score is at least the best seen so far.

    ``hypers`` is not used in this function.  The ``while True`` loop has no exit
    condition, so the function does not return on its own.
    """
    system_check_and_init(args)
    if args.gpu:
        print "GPU available"
    else:
        print "CPU only"

    word_v = vocabulary()
    char_v = vocabulary()
    actn_v = vocabulary()
    pretrain = PretrainedEmb(args.pretrain_path)

    #instances
    train_input = read_input(args.train_input)
    dev_input = read_input(args.dev_input)

    singleton_idx_dict, word_dict, word_v = get_singleton_dict(train_input, word_v)
    # one extra vocabulary per input column beyond the first
    extra_vl = [ vocabulary() for i in range(len(train_input[0])-1)]
    train_instance, word_v, char_v, extra_vl = input2instance(train_input, word_v, char_v, pretrain, extra_vl, word_dict, args, "train")
    # vocabularies are frozen after the training instances are built and before the dev instances are
    word_v.freeze()
    char_v.freeze()
    for i in range(len(extra_vl)):
        extra_vl[i].freeze()
    dev_instance, word_v, char_v, extra_vl = input2instance(dev_input, word_v, char_v, pretrain, extra_vl, {}, args, "dev")

    train_output = read_output(args.train_action)
    # NOTE(review): dev_output is read but not used below (its only use is commented out)
    dev_output = read_output(args.dev_action)
    train_action, actn_v = output2action(train_output, actn_v)
    #dev_actoin, actn_v = output2action(dev_output, actn_v)

    print "word vocabulary size:", word_v.size()
    print "char vocabulary size:", char_v.size() - 1
    print "pretrain vocabulary size:", pretrain.size() - 1
    extra_vl_size = []
    for i in range(len(extra_vl)):
        print "extra", i, "vocabulary size:", extra_vl[i].size()
        extra_vl_size.append(extra_vl[i].size())
    print "action vocaluary size:", actn_v.size() - 1
    actn_v.freeze()
    actn_v.dump()

    # neural components
    input_representation = token_representation(word_v.size(), char_v.size(), pretrain, extra_vl_size, args)
    encoder = None
    if args.encoder == "BILSTM":
        encoder = bilstm_encoder(args)
    elif args.encoder == "Transformer":
        encoder = transformer(args)
    assert encoder, "please specify encoder type"
    decoder = in_order_constituent_parser(actn_v.size(), actn_v.toidx("TERM"), args)
    mask = in_order_constituent_parser_mask(actn_v)
    if args.gpu:
        encoder = encoder.cuda()
        decoder = decoder.cuda()
        input_representation = input_representation.cuda()

    #training process
    model_parameters = list(encoder.parameters()) + list(decoder.parameters()) + list(input_representation.parameters())
    #model_optimizer = optimizer(args, model_parameters)
    lr = args.learning_rate_f

    # i starts at len(train_instance) so the first iteration takes the rollover
    # branch below, which sets i to 0 and epoch to 0
    i = len(train_instance)
    check_iter = 0  # number of updates so far
    check_loss = 0  # loss summed since the last progress print
    bscore = -1  # best dev score so far
    epoch = -1
    while True:
        # clear the gradients left over from the previous update
        for p in model_parameters:
            if p.grad is not None:
                p.grad.detach_()
                p.grad.zero_()
        if i == len(train_instance):
            # start of a new pass over the training instances: recompute the decayed learning rate
            i = 0
            epoch += 1
            lr = args.learning_rate_f / (1 + epoch * args.learning_rate_decay_f)

        check_iter += 1
        input_t = input_representation(train_instance[i], singleton_idx_dict=singleton_idx_dict, test=False)
        enc_rep_t = encoder(input_t, test=False)
        loss_t = decoder(enc_rep_t, mask, train_action[i], test=False)
        check_loss += loss_t.data.tolist()

        if check_iter % args.check_per_update == 0:
            print('epoch %.6f : %.10f [lr: %.6f]' % (check_iter*1.0/len(train_instance), check_loss*1.0 / args.check_per_update, lr))
            check_loss = 0

        if check_iter % args.eval_per_update == 0:
            trees = []
            for j, instance in enumerate(dev_instance):
                dev_input_embeddings = input_representation(instance)
                dev_enc_rep = encoder(dev_input_embeddings)
                dev_action_output = decoder(dev_enc_rep, mask)
                #print dev_action_output
                #print dev_input[j][0][1:-1]
                #print dev_input[j][-1][1:-1]
                trees.append(in_order_constituent_action2tree(dev_action_output, actn_v, dev_input[j][0][1:-1], dev_input[j][-1][1:-1]))
            with open("tmp/dev.output.tmp", "w") as w:
                for tree in trees:
                    w.write(tree+"\n")
                # NOTE(review): flush/close are explicit here even though the with block closes the file on exit
                w.flush()
                w.close()
            # NOTE(review): presumably constituent_parser_eval scores the file written just above; confirm
            score = constituent_parser_eval(args)
            print('dev F-score %.10f ' % (score))
            if score >= bscore:
                bscore = score
                torch.save({"encoder":encoder.state_dict(), "decoder":decoder.state_dict(), "input_representation": input_representation.state_dict()}, args.model_path_base+"/model")

        i += 1
        loss_t.backward()
        torch.nn.utils.clip_grad_value_(model_parameters, 5)
        #model_optimizer.step()
        # manual parameter update in place of an optimizer step
        # NOTE(review): add_(-lr, grad) looks like the legacy (alpha, tensor) call form; confirm against the installed torch
        for p in model_parameters:
            if p.requires_grad:
                p.data.add_(-lr, p.grad.data)
from clusterpath import ClusterPath parser = argparse.ArgumentParser() parser.add_argument('--fname', default=partis_dir + '/test/reference-results/partition-ref-simu.yaml') parser.add_argument('--glfo-dir', default=partis_dir + '/data/germlines/human') parser.add_argument('--locus', default='igh') args = parser.parse_args() glfo = None if utils.getsuffix(args.fname) == '.csv': print ' reading deprecated csv format, so need to read germline info from somewhere else, using --glfo-dir %s, hopefully it works' % args.glfo_dir glfo = glutils.read_glfo(args.glfo_dir, locus=args.locus) glfo, annotation_list, cpath = utils.read_output(args.fname, glfo=glfo) if cpath is None or len(cpath.partitions) == 0: print 'no partitions read from %s, so just printing first annotation:' % args.fname utils.print_reco_event(annotation_list[0]) sys.exit(0) print utils.color('green', 'list of partitions:') cpath.print_partitions( abbreviate=True ) # 'abbreviate' print little 'o's instead of the full sequence ids # print annotations for the biggest cluster in the most likely partition annotations = { ':'.join(adict['unique_ids']): adict for adict in annotation_list
tweet_days = {} for member in self._community_members: tweets = member['tweets'] for tweet in tweets: raw_timestamp = tweet['created_at'] formatted_timestamp = \ parse_twitter_timestamp(raw_timestamp) if formatted_timestamp in tweet_days: tweet_days[formatted_timestamp].append(tweet) else: tweet_days[formatted_timestamp] = [tweet] return tweet_days def words_by_day(self): word_counts = {} for day, tweets in self._tweets_by_day.items(): tweet_text = '' for tweet in tweets: tweet_text += tweet['text'] day_counts = WordCounter(tweet_text).get_word_data() word_counts[day] = day_counts return word_counts if __name__ == "__main__": community = read_output('pickled_populations/lizardbill_11_20_2010') dc = DayCounts(community) wc = dc.words_by_day() import pprint pprint.pprint(wc)
if 'extract-fasta.py' in sys.argv[0]: # if they're trying to run this old script, which is now just a link to this one, print a warning and rejigger the arguments so it still works print ' note: running deprecated script %s, which currently is just a link pointing to %s' % (os.path.basename(sys.argv[0]), os.path.basename(os.path.realpath( __file__))) print ' note: transferring deprecated arguments --input-file and --fasta-output-file to the first two positional arguments (this will continue to work, you only need to change things if you want this warning to go away)' utils.insert_in_arglist(sys.argv, [utils.get_val_from_arglist(sys.argv, '--input-file'), utils.get_val_from_arglist(sys.argv, '--fasta-output-file')], sys.argv[0]) utils.remove_from_arglist(sys.argv, '--input-file', has_arg=True) utils.remove_from_arglist(sys.argv, '--fasta-output-file', has_arg=True) args = parser.parse_args() args.extra_columns = utils.get_arg_list(args.extra_columns) assert utils.getsuffix(args.outfile) in ['.csv', '.tsv', '.fa', '.fasta'] default_glfo_dir = partis_dir + '/data/germlines/human' if utils.getsuffix(args.infile) == '.csv' and args.glfo_dir is None: print ' note: reading deprecated csv format, so need to get germline info from a separate directory; --glfo-dir was not set, so using default %s. If it doesn\'t crash, it\'s probably ok.' 
% default_glfo_dir args.glfo_dir = default_glfo_dir glfo, annotation_list, cpath = utils.read_output(args.infile, glfo_dir=args.glfo_dir, locus=args.locus) if args.plotdir is not None: from parametercounter import ParameterCounter setattr(args, 'region_end_exclusions', {r : [0 for e in ['5p', '3p']] for r in utils.regions}) # hackity hackity hackity pcounter = ParameterCounter(glfo, args) for line in annotation_list: pcounter.increment(line) pcounter.plot(args.plotdir) #, make_per_base_plots=True) #, only_overall=True, make_per_base_plots=True sys.exit(0) if cpath is None or cpath.i_best is None: clusters_to_use = [l['unique_ids'] for l in annotation_list] print ' no cluster path in input file, so just using all %d sequences (in %d clusters) in annotations' % (sum(len(c) for c in clusters_to_use), len(clusters_to_use)) else: ipartition = cpath.i_best if args.partition_index is None else args.partition_index
parser.add_argument('--locus', default='igh') args = parser.parse_args() if args.title == 'good': args.title = 'none' elif args.title == 'chimeras': args.title = 'all chimeras' def gk(uids): return ':'.join(uids) glfo = None if utils.getsuffix(args.infile) == '.csv': glfo = glutils.read_glfo(args.glfo_dir, args.locus) glfo, annotation_list, _ = utils.read_output(args.infile, glfo=glfo) annotations = collections.OrderedDict( (line['unique_ids'][0], line) for line in annotation_list) chfo = { uid: { k: v for k, v in zip( ('imax', 'max_abs_diff'), utils.get_chimera_max_abs_diff( annotations[uid], iseq=0, chunk_len=args.chunk_len)) } for uid in annotations } biggest_adiffs = sorted(chfo, key=lambda q: chfo[q]['max_abs_diff'],
# NOTE(review): the two defs below take self and words_by_day is called on a
# DayCounts instance further down, so they look like DayCounts methods whose
# class header is outside this view; confirm the enclosing indentation.
def _split_tweets(self):
    """Group the tweets of every community member by parse_twitter_timestamp(tweet['created_at'])."""
    by_day = {}
    for member in self._community_members:
        for tweet in member['tweets']:
            day_key = parse_twitter_timestamp(tweet['created_at'])
            by_day.setdefault(day_key, []).append(tweet)
    return by_day

def words_by_day(self):
    """Return {day: WordCounter(...).get_word_data()} over the concatenated tweet text of each day."""
    counts = {}
    for day, tweets in self._tweets_by_day.items():
        # texts are joined with no separator between tweets
        day_text = ''.join(tweet['text'] for tweet in tweets)
        counts[day] = WordCounter(day_text).get_word_data()
    return counts

if __name__ == "__main__":
    community = read_output('pickled_populations/lizardbill_11_20_2010')
    dc = DayCounts(community)
    wc = dc.words_by_day()
    import pprint
    pprint.pprint(wc)
type=int, help= 'take only the first N seqs from both the fasta file and the annotation in the partis output file (e.g. for testing when the family is huge)' ) args = parser.parse_args() new_seqfos = utils.read_fastx(args.new_seq_file, sanitize_seqs=True) print ' read %d seqs from %s' % (len(new_seqfos), args.new_seq_file) glfo = None if utils.getsuffix(args.partis_output_file) == '.csv': print ' reading deprecated csv format, so need to read germline info from somewhere else, using --glfo-dir %s, hopefully it works' % args.glfo_dir glfo = glutils.read_glfo(args.glfo_dir, locus=args.locus) glfo, annotation_list, cpath = utils.read_output(args.partis_output_file, glfo=glfo, locus=args.locus) if args.partition_index is not None: print ' using non-best partition index %d (best is %d)' % ( args.partition_index, cpath.i_best) partition = cpath.partitions[cpath.i_best if args. partition_index is None else args.partition_index] print ' read partition with %d clusters from %s' % (len(partition), args.partis_output_file) new_uids = set(sfo['name'] for sfo in new_seqfos) clusters_with_overlap = [] for cluster in partition: overlap_uids = set(cluster) & new_uids if len(overlap_uids) > 0: clusters_with_overlap.append((cluster, overlap_uids))
parser.add_argument('--seed-unique-id', help='if set, take sequences only from the cluster containing this seed sequence, rather than the default of taking all sequences from all clusters') parser.add_argument('--cluster-index', type=int, help='if set, take sequences only from the cluster at this index in the partition, rather than the default of taking all sequences from all clusters') parser.add_argument('--indel-reversed-seqs', action='store_true', help='if set, take sequences that have had any shm indels "reversed" (i.e. insertions are reversed, and deletions are replaced with the germline bases) rather than the default of using sequences from the original input file. Indel-reversed sequences can be convenient because they are by definition the same length as and aligned to the naive sequence.') parser.add_argument('--glfo-dir', help='Directory with germline info. Only necessary for old-style csv output files. Equivalent to a parameter dir with \'/hmm/germline-sets\' appended.') parser.add_argument('--locus', default='igh', help='only used for old-style csv output files') args = parser.parse_args() glfo = None if utils.getsuffix(args.input_file) == '.csv': default_glfo_dir = partis_dir + '/data/germlines/human' if args.glfo_dir is None: print ' note: reading deprecated csv format, so need to get germline info from a separate directory; --glfo-dir was not set, so using default %s. If it doesn\'t crash, it\'s probably ok.' 
% default_glfo_dir args.glfo_dir = default_glfo_dir glfo = glutils.read_glfo(args.glfo_dir, locus=args.locus) glfo, annotation_list, cpath = utils.read_output(args.input_file, glfo=glfo) if cpath is None: clusters_to_use = [l['unique_ids'] for l in annotation_list] print ' no cluster path in input file, so just using all %d sequences (in %d clusters) in annotations' % (sum(len(c) for c in clusters_to_use), len(clusters_to_use)) else: ipartition = cpath.i_best if args.partition_index is None else args.partition_index print ' found %d clusters in %s' % (len(cpath.partitions[ipartition]), 'best partition' if args.partition_index is None else 'partition at index %d (of %d)' % (ipartition, len(cpath.partitions))) if args.cluster_index is None: clusters_to_use = cpath.partitions[ipartition] print ' taking all %d clusters' % len(clusters_to_use) else: clusters_to_use = [cpath.partitions[ipartition][args.cluster_index]] print ' taking cluster at index %d' % args.cluster_index if args.seed_unique_id is not None: clusters_to_use = [c for c in clusters_to_use if args.seed_unique_id in c] # NOTE can result in more than one cluster with the seed sequence (e.g. if this file contains intermediate annotations from seed partitioning))