def test_with_cluster_map(self):
    """Check an AlignmentSet built with an explicit cluster map.

    Two clusters (and two per-cluster alignments) are expected, and only
    'seq1.3' and 'seq2.3' should come back with a truthy 'hm_pos' call.
    """
    aln_set = AlignmentSet(self.seqs, cluster_map=self.cluster_map)
    for collection in (aln_set.clusters, aln_set.cluster_alns):
        self.assertEqual(len(collection), 2)

    expected_positive = ['seq1.3', 'seq2.3']
    for result in aln_set.multiple_context_analysis([old_pattern]):
        call = result['call']
        hm_pos = call['hm_pos']
        # Pick the assertion that matches the expectation for this sequence.
        check = (self.assertTrue if call['sequence'] in expected_positive
                 else self.assertFalse)
        check(hm_pos)
def test_without_cluster_map(self):
    """Check an AlignmentSet built without any cluster map.

    Everything is expected to land in a single cluster keyed 'all', and
    only 'seq2.3' should come back with a truthy 'hm_pos' call.
    """
    aln_set = AlignmentSet(self.seqs)
    for collection in (aln_set.clusters, aln_set.cluster_alns):
        self.assertEqual(len(collection), 1)
    self.assertIn('all', aln_set.cluster_alns)

    for result in aln_set.multiple_context_analysis([old_pattern]):
        call = result['call']
        hm_pos = call['hm_pos']
        # Pick the assertion that matches the expectation for this sequence.
        check = (self.assertTrue if call['sequence'] == 'seq2.3'
                 else self.assertFalse)
        check(hm_pos)
def test_with_reference_seqs(self):
    """Check an AlignmentSet given a cluster map plus reference sequences.

    Two clusters are expected.  With the references below, exactly the
    sequences 'seq2.1', 'seq2.2' and 'seq2.3' should come back with a
    truthy 'hm_pos' call; every other sequence should be falsy.
    """
    # FASTA is line oriented: each header and each sequence needs its own
    # line, so the fixture is assembled line by line.
    # NOTE(review): assumes helpers.parse_fasta accepts unindented lines --
    # confirm against the helper.
    fasta_text = "\n".join([
        "",
        ">cluster1",
        "AAAAAAAAAAAAAAAAACCC",
        ">cluster2",
        "CCTTGGCCGGTTGGCCGCCC",
        "",
    ])
    ref_seqs = helpers.parse_fasta(fasta_text)
    aln_set = AlignmentSet(self.seqs,
                           cluster_map=self.cluster_map,
                           reference_sequences=ref_seqs)
    self.assertEqual(len(aln_set.clusters), 2)

    # Loop invariant: build the expected-positive ids once, not per result.
    expected_positive = ['seq2.{}'.format(i) for i in [1, 2, 3]]
    for result in aln_set.multiple_context_analysis([old_pattern]):
        hm_pos = result['call']['hm_pos']
        if result['call']['sequence'] in expected_positive:
            self.assertTrue(hm_pos)
        else:
            self.assertFalse(hm_pos)
def test_with_reference_seqs(self):
    """Check an AlignmentSet given a cluster map plus reference sequences.

    Two clusters are expected.  With the references below, exactly the
    sequences 'seq2.1', 'seq2.2' and 'seq2.3' should come back with a
    truthy 'hm_pos' call; every other sequence should be falsy.
    """
    # FASTA is line oriented: each header and each sequence needs its own
    # line, so the fixture is assembled line by line.
    # NOTE(review): assumes helpers.parse_fasta accepts unindented lines --
    # confirm against the helper.
    fasta_text = "\n".join([
        "",
        ">cluster1",
        "AAAAAAAAAAAAAAAAACCC",
        ">cluster2",
        "CCTTGGCCGGTTGGCCGCCC",
        "",
    ])
    ref_seqs = helpers.parse_fasta(fasta_text)
    aln_set = AlignmentSet(self.seqs,
                           cluster_map=self.cluster_map,
                           reference_sequences=ref_seqs)
    self.assertEqual(len(aln_set.clusters), 2)

    # Loop invariant: build the expected-positive ids once, not per result.
    expected_positive = ['seq2.{}'.format(i) for i in [1, 2, 3]]
    for result in aln_set.multiple_context_analysis([old_pattern]):
        hm_pos = result['call']['hm_pos']
        if result['call']['sequence'] in expected_positive:
            self.assertTrue(hm_pos)
        else:
            self.assertFalse(hm_pos)
def analyze(args):
    """Run the hypermutation analysis described by the parsed options ``args``.

    Reads the FASTA alignment from ``args.alignment``, optionally loads
    reference sequences and a cluster map, builds an ``AlignmentSet`` and
    analyses it against the mutation patterns named in ``args.patterns``.
    When ``args.cluster_threshold`` is set, the sequences are re-clustered
    ``args.cluster_iterations - 1`` times on the alignment with the called
    hypermutation columns split out, and the final clustering is written to
    ``<prefix>.clst.csv``.  The final analysis is written via
    ``write_analysis``; ``args.interactive`` opens an interactive console
    first, and ``args.write_references`` also writes reference sequences.

    The input handles on ``args`` are closed before returning, including
    when the analysis raises.
    """
    import logging
    logging.captureWarnings(True)

    try:
        # Fetch sequence records and analysis patterns
        seq_records = SeqIO.to_dict(SeqIO.parse(args.alignment, 'fasta'))
        patterns = [mut_pattern.patterns[p] for p in args.patterns]
        pattern_names = [p.name for p in patterns]
        prefix = path.join(args.out_dir, args.prefix)
        analysis_settings = dict(
            rpr_cutoff=args.rpr_cutoff,
            significance_level=args.significance_level,
            quants=args.quants,
            pos_quants_only=args.pos_quants_only,
            caller=args.caller,
            prior=args.prior,
            cdfs=args.cdfs,
            quadr_maxiter=args.quadr_maxiter,
            optim_maxiter=args.optim_maxiter)

        # Need to think about how best to fork things here; for instance, it
        # might make sense to let the user specify the initial clusters for
        # whatever reason... However, specifying the reference sequences
        # shouldn't make any sense there
        if args.reference_sequences:
            reference_sequences = SeqIO.to_dict(
                SeqIO.parse(args.reference_sequences, 'fasta'))
        else:
            reference_sequences = None

        # This lets the cluster map be optional, so that this script can be
        # used for naive hm filtering/analysis
        if args.cluster_map:
            cluster_map = load_cluster_map(args.cluster_map,
                                           cluster_col=args.cluster_col)
        else:
            cluster_map = None

        alignments = AlignmentSet(seq_records, cluster_map,
                                  consensus_threshold=args.consensus_threshold,
                                  reference_sequences=reference_sequences)

        # Create the analysis generator
        analysis = alignments.multiple_context_analysis(patterns,
                                                        **analysis_settings)

        if args.cluster_threshold:
            # Stays None when no iteration runs (cluster_iterations <= 1), so
            # the write below can be skipped instead of hitting a NameError.
            clustering = None
            for hm_it in range(args.cluster_iterations - 1):
                # Same output as the Python 2 print statement, on 2 and 3.
                print(" ..On hm/cluster iteration %s" % hm_it)
                # Grab the HM columns from the most recent analysis and split
                # out the pos sites
                hm_columns = []
                for result in analysis:
                    hm_columns += result['call']['mut_columns']
                hm_neg_aln = Alignment(
                    seq_records.values()).split_hypermuts(hm_columns).hm_neg_aln
                # Cluster with the specified settings
                clustering = alnclst.Clustering(hm_neg_aln,
                                                args.cluster_threshold,
                                                args.consensus_threshold)
                clustering = clustering.recenter(args.recentering_iterations)
                clustering.merge_small_clusters(args.min_per_cluster)
                cluster_map = parse_clusters(clustering.mapping_iterator(),
                                             cluster_key=0, sequence_key=1)
                # Create the Alignment set
                clustered_alignment = AlignmentSet(
                    seq_records, cluster_map,
                    consensus_threshold=args.consensus_threshold)
                analysis = clustered_alignment.multiple_context_analysis(
                    patterns, **analysis_settings)

            # Write out the final clusters; the context manager closes the
            # output file once it has been written.
            if clustering is not None:
                with open(prefix + '.clst.csv', 'w') as clusterout_handle:
                    clustering.write(clusterout_handle)

        if args.interactive:
            local = copy.copy(locals())
            import hyperfreq
            local.update(dict(hyperfreq=hyperfreq,
                              Alignment=Alignment,
                              AlignmentSet=AlignmentSet,
                              mut_pattern=mut_pattern,
                              write_analysis=write_analysis))
            code.interact(local=local)

        # Write the final analysis to file
        write_analysis(analysis, prefix, pattern_names, args.quants,
                       args.cdfs, call_only=args.call_only)

        if args.write_references:
            # NOTE(review): this passes the initial AlignmentSet even when
            # re-clustering produced a newer one -- confirm that is intended.
            write_reference_seqs(alignments, prefix)
    finally:
        # Closing files, whether or not the analysis succeeded
        args.alignment.close()
        if args.cluster_map:
            args.cluster_map.close()
        # The reference input may be a handle or something without close()
        # (e.g. a path), so only close it when it can be closed.
        close_reference = getattr(args.reference_sequences, 'close', None)
        if close_reference is not None:
            close_reference()
def analyze(args):
    """Run the hypermutation analysis described by the parsed options ``args``.

    Reads the FASTA alignment from ``args.alignment``, optionally loads
    reference sequences and a cluster map, builds an ``AlignmentSet`` and
    analyses it against the mutation patterns named in ``args.patterns``.
    When ``args.cluster_threshold`` is set, the sequences are re-clustered
    ``args.cluster_iterations - 1`` times on the alignment with the called
    hypermutation columns split out, and the final clustering is written to
    ``<prefix>.clst.csv``.  The final analysis is written via
    ``write_analysis``; ``args.interactive`` opens an interactive console
    first, and ``args.write_references`` also writes reference sequences.

    The input handles on ``args`` are closed before returning, including
    when the analysis raises.
    """
    import logging
    logging.captureWarnings(True)

    try:
        # Fetch sequence records and analysis patterns
        seq_records = SeqIO.to_dict(SeqIO.parse(args.alignment, 'fasta'))
        patterns = [mut_pattern.patterns[p] for p in args.patterns]
        pattern_names = [p.name for p in patterns]
        prefix = path.join(args.out_dir, args.prefix)
        analysis_settings = dict(
            rpr_cutoff=args.rpr_cutoff,
            significance_level=args.significance_level,
            quants=args.quants,
            pos_quants_only=args.pos_quants_only,
            caller=args.caller,
            prior=args.prior,
            cdfs=args.cdfs,
            quadr_maxiter=args.quadr_maxiter,
            optim_maxiter=args.optim_maxiter)

        # Need to think about how best to fork things here; for instance, it
        # might make sense to let the user specify the initial clusters for
        # whatever reason... However, specifying the reference sequences
        # shouldn't make any sense there
        if args.reference_sequences:
            reference_sequences = SeqIO.to_dict(
                SeqIO.parse(args.reference_sequences, 'fasta'))
        else:
            reference_sequences = None

        # This lets the cluster map be optional, so that this script can be
        # used for naive hm filtering/analysis
        if args.cluster_map:
            cluster_map = load_cluster_map(args.cluster_map,
                                           cluster_col=args.cluster_col)
        else:
            cluster_map = None

        alignments = AlignmentSet(seq_records, cluster_map,
                                  consensus_threshold=args.consensus_threshold,
                                  reference_sequences=reference_sequences)

        # Create the analysis generator
        analysis = alignments.multiple_context_analysis(patterns,
                                                        **analysis_settings)

        if args.cluster_threshold:
            # Stays None when no iteration runs (cluster_iterations <= 1), so
            # the write below can be skipped instead of hitting a NameError.
            clustering = None
            for hm_it in range(args.cluster_iterations - 1):
                # Same output as the Python 2 print statement, on 2 and 3.
                print(" ..On hm/cluster iteration %s" % hm_it)
                # Grab the HM columns from the most recent analysis and split
                # out the pos sites
                hm_columns = []
                for result in analysis:
                    hm_columns += result['call']['mut_columns']
                hm_neg_aln = Alignment(
                    seq_records.values()).split_hypermuts(hm_columns).hm_neg_aln
                # Cluster with the specified settings
                clustering = alnclst.Clustering(hm_neg_aln,
                                                args.cluster_threshold,
                                                args.consensus_threshold)
                clustering = clustering.recenter(args.recentering_iterations)
                clustering.merge_small_clusters(args.min_per_cluster)
                cluster_map = parse_clusters(clustering.mapping_iterator(),
                                             cluster_key=0, sequence_key=1)
                # Create the Alignment set
                clustered_alignment = AlignmentSet(
                    seq_records, cluster_map,
                    consensus_threshold=args.consensus_threshold)
                analysis = clustered_alignment.multiple_context_analysis(
                    patterns, **analysis_settings)

            # Write out the final clusters; the context manager closes the
            # output file once it has been written.
            if clustering is not None:
                with open(prefix + '.clst.csv', 'w') as clusterout_handle:
                    clustering.write(clusterout_handle)

        if args.interactive:
            local = copy.copy(locals())
            import hyperfreq
            local.update(dict(hyperfreq=hyperfreq,
                              Alignment=Alignment,
                              AlignmentSet=AlignmentSet,
                              mut_pattern=mut_pattern,
                              write_analysis=write_analysis))
            code.interact(local=local)

        # Write the final analysis to file
        write_analysis(analysis, prefix, pattern_names, args.quants,
                       args.cdfs, call_only=args.call_only)

        if args.write_references:
            # NOTE(review): this passes the initial AlignmentSet even when
            # re-clustering produced a newer one -- confirm that is intended.
            write_reference_seqs(alignments, prefix)
    finally:
        # Closing files, whether or not the analysis succeeded
        args.alignment.close()
        if args.cluster_map:
            args.cluster_map.close()
        # The reference input may be a handle or something without close()
        # (e.g. a path), so only close it when it can be closed.
        close_reference = getattr(args.reference_sequences, 'close', None)
        if close_reference is not None:
            close_reference()