def test_weighted_counts_pattern1(self):
    """Check weighted counts for ``pattern1`` when every base has weight 2.

    Verifies the text produced by ``format(0)`` and that each entry of
    ``data`` sums to twice the matching entry of ``seq_lengths``.
    """
    char_weights = {'A': 2, 'C': 2, 'G': 2, 'T': 2}
    model = word_vector.WeightModel(char_weights)
    counts = word_vector.CountsWeight(
        self.dna_records.length_list, self.pattern1, model)

    expected = "\n".join([
        "A\t16 8 12",
        "G\t12 12 6",
        "C\t12 6 8",
        "T\t10 10 4",
    ])
    self.assertEqual(counts.format(0), expected)

    # With a uniform weight of 2 each total must be double the length.
    for idx, row in enumerate(counts.data):
        self.assertEqual(sum(row), counts.seq_lengths[idx] * 2)
def test_weighted_freqs_pattern1(self):
    """Check weighted frequencies for ``pattern1`` with all weights set to 2.

    Compares the text returned by ``format()`` against the expected
    tab-separated rows.
    """
    char_weights = {'A': 2, 'C': 2, 'G': 2, 'T': 2}
    model = word_vector.WeightModel(char_weights)
    freqs = word_vector.FreqsWeight(
        self.dna_records.length_list, self.pattern1, model)

    expected_rows = (
        "A\t0.640 0.444 0.800",
        "G\t0.480 0.667 0.400",
        "C\t0.480 0.333 0.533",
        "T\t0.400 0.556 0.267",
    )
    expected = "\n".join(expected_rows)
    self.assertEqual(freqs.format(), expected)
def test_weighted_counts_pattern2(self):
    """Check weighted counts for ``pattern2`` when every base has weight 2.

    Compares the text returned by ``format(0)`` against the expected
    tab-separated rows.
    """
    char_weights = {'A': 2, 'C': 2, 'G': 2, 'T': 2}
    model = word_vector.WeightModel(char_weights)
    counts = word_vector.CountsWeight(
        self.dna_records.length_list, self.pattern2, model)

    expected_rows = (
        "TA\t12 12 8",
        "AC\t16 4 8",
        "GG\t4 16 8",
        "AG\t4 8 4",
        "CT\t0 12 4",
        "AA\t8 0 4",
        "AT\t4 4 4",
        "CA\t4 0 8",
        "CC\t8 0 4",
        "CG\t12 0 0",
        "GA\t4 4 4",
        "GT\t12 0 0",
        "TT\t4 4 0",
        "TC\t0 4 0",
        "TG\t4 0 0",
    )
    expected = "\n".join(expected_rows)
    self.assertEqual(counts.format(0), expected)
def test_weighted_freqs_pattern2(self):
    """Check weighted frequencies for ``pattern2`` with all weights set to 2.

    This test exercises ``self.pattern2``. It is named ``..._pattern2`` so
    that it does not reuse the name of the ``pattern1`` frequency test: two
    methods with the same name in one class leave only the later definition
    in place, so the earlier test would silently never run.
    """
    weights = {'A': 2, 'C': 2, 'G': 2, 'T': 2}
    weightmodel = word_vector.WeightModel(weights)
    fw = word_vector.FreqsWeight(
        self.dna_records.length_list, self.pattern2, weightmodel)
    exp = [
        "TA\t0.500 0.706 0.571",
        "GG\t0.167 0.941 0.571",
        "AC\t0.667 0.235 0.571",
        "CT\t0.000 0.706 0.286",
        "AG\t0.167 0.471 0.286",
        "CA\t0.167 0.000 0.571",
        "AT\t0.167 0.235 0.286",
        "GA\t0.167 0.235 0.286",
        "AA\t0.333 0.000 0.286",
        "CC\t0.333 0.000 0.286",
        "CG\t0.500 0.000 0.000",
        "GT\t0.500 0.000 0.000",
        "TT\t0.167 0.235 0.000",
        "TC\t0.000 0.235 0.000",
        "TG\t0.167 0.000 0.000",
    ]
    self.assertEqual(fw.format(), "\n".join(exp))
def main():
    """Build a word-vector distance matrix from a FASTA input and output it.

    Steps:
      1. Obtain validated arguments via ``get_parser`` / ``validate_args``.
      2. Read the sequence records from ``args.fasta``.
      3. Create the word pattern from ``args.word_size`` when it is set,
         otherwise read it from ``args.word_pattern``.
      4. Build the vector: ``Counts``/``Freqs`` (or their weighted variants
         when ``args.char_weights`` is given) for the ``counts`` and
         ``freqs`` vector types, otherwise ``FreqsStd`` with an
         ``EqualFreqs`` or ``EquilibriumFreqs`` model.
      5. Compute the distance matrix and write it to ``args.out`` when set,
         otherwise display it.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)

    if args.word_size:
        p = word_pattern.create(seq_records.seq_list, args.word_size)
    else:
        p = word_pattern.read(args.word_pattern)

    veccls = {'counts': word_vector.Counts, 'freqs': word_vector.Freqs}
    vecclsw = {
        'counts': word_vector.CountsWeight,
        'freqs': word_vector.FreqsWeight,
    }

    if args.vector in ('counts', 'freqs'):
        if args.char_weights is None:
            vec = veccls[args.vector](seq_records.length_list, p)
        else:
            weightmodel = word_vector.WeightModel(
                char_weights=args.char_weights)
            vec = vecclsw[args.vector](
                seq_records.length_list, p, weightmodel)
    else:
        # Any other vector type uses FreqsStd, which needs a frequency model.
        if args.alphabet_size:
            freqmodel = word_vector.EqualFreqs(
                alphabet_size=args.alphabet_size)
        else:
            freqmodel = word_vector.EquilibriumFreqs(args.char_freqs)
        vec = word_vector.FreqsStd(seq_records.length_list, p, freqmodel)

    dist = word_distance.Distance(vec, args.distance)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the file even if writing raises.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)
def main():
    """Build a ``word_d2`` distance matrix over a range of word sizes.

    Steps:
      1. Obtain validated arguments via ``get_parser`` / ``validate_args``.
      2. Read the sequence records from ``args.fasta``.
      3. Create one word pattern for every word size from
         ``args.min_word_size`` to ``args.max_word_size`` inclusive.
      4. Build one vector per pattern, using the weighted classes when
         ``args.char_weights`` is given and the plain ones otherwise.
      5. Compute the distance matrix and write it to ``args.out`` when set,
         otherwise display it.

    Raises:
        KeyError: if ``args.vector`` is neither ``'counts'`` nor ``'freqs'``.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)

    patterns = [
        word_pattern.create(seq_records.seq_list, i)
        for i in range(args.min_word_size, args.max_word_size + 1)
    ]

    if args.char_weights is not None:
        weightmodel = word_vector.WeightModel(char_weights=args.char_weights)
        vecklas = {
            'counts': word_vector.CountsWeight,
            'freqs': word_vector.FreqsWeight,
        }[args.vector]
        kwargs = {
            'seq_lengths': seq_records.length_list,
            'weightmodel': weightmodel,
        }
    else:
        vecklas = {
            'counts': word_vector.Counts,
            'freqs': word_vector.Freqs,
        }[args.vector]
        kwargs = {'seq_lengths': seq_records.length_list}

    # One vector per word size; the constructor arguments are shared.
    vecs = [vecklas(patterns=p, **kwargs) for p in patterns]

    dist = word_d2.Distance(vecs)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the file even if writing raises.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)
matrix.display() elif method == "d2": patterns = [] for i in range(1, 5 + 1): p = word_pattern.create(seq_records.seq_list, i) patterns.append(p) counts = [] for p in patterns: c = word_vector.Counts(seq_records.length_list, p) counts.append(c) countsweight = [] weights = seqcontent.get_weights('protein') weightmodel = word_vector.WeightModel(weights) for p in patterns: c = word_vector.CountsWeight(seq_records, p, weightmodel) countsweight.append(c) dist = word_d2.Distance(countsweight) matrix = distmatrix.create(seq_records.id_list, dist) matrix.display() elif method == "lempelziv": distance = lempelziv.Distance(seq_records) l = ['d', 'd_star', 'd1', 'd1_star', 'd1_star2'] for el in l: distance.set_disttype(el) matrix = distmatrix.create(seq_records.id_list, distance) matrix.display()
def test_weightmodel_invalid_wtype(self):
    """Constructing a ``WeightModel`` with an unknown type must raise.

    Also checks that the exception text contains ``'unknown weight model'``.
    """
    weights = {'A': 2, 'C': 2, 'G': 2, 'T': 2}
    with self.assertRaises(Exception) as context:
        # The result is deliberately discarded; only the raise matters.
        word_vector.WeightModel(weights, 'nonexistent')
    # The message is checked after the ``with`` block: any statement placed
    # after the raising call inside the block would never execute.
    self.assertIn('unknown weight model', str(context.exception))