def main():
    """Compute pairwise word-frequency distances and output the matrix.

    Parses the command line, reads the FASTA records, obtains a word
    pattern (created from the sequences when ``args.word_size`` is set,
    otherwise read from ``args.word_pattern``), optionally reduces the
    alphabet and merges reverse complements, then builds a frequency
    vector, a distance object and a distance matrix. The matrix is written
    to ``args.out`` when given, otherwise displayed.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)

    if args.word_size:
        p = word_pattern.create(seq_records.seq_list, args.word_size)
    else:
        p = word_pattern.read(args.word_pattern)

    if args.reduce_alphabet:
        p = p.reduce_alphabet(seqcontent.get_reduced_alphabet(args.molecule))
    if args.merge_revcomp:
        p = p.merge_revcomp()

    freqs = word_vector.Freqs(seq_records.length_list, p)
    dist = word_distance.Distance(freqs, args.distance)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the handle even when writing raises.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)
def __init__(self, *args, **kwargs):
    """Prepare a size-2 DNA word pattern plus its count and frequency vectors.

    Test data is installed through ``utils.ModulesCommonTest.set_test_data()``
    before ``self.dna_records`` is read.
    """
    super(DistanceTest, self).__init__(*args, **kwargs)
    utils.ModulesCommonTest.set_test_data()

    records = self.dna_records
    self.pattern = word_pattern.create(records.seq_list, 2)

    lengths = records.length_list
    self.counts = word_vector.Counts(lengths, self.pattern)
    self.freqs = word_vector.Freqs(lengths, self.pattern)
def test_word_pattern_create_wordsize1_wordposFalse(self):
    """Check the formatted pattern for word_size=1 with wordpos=False."""
    expected = "\n".join([
        "18\t3\tA 0:8 1:4 2:6",
        "15\t3\tG 0:6 1:6 2:3",
        "13\t3\tC 0:6 1:3 2:4",
        "12\t3\tT 0:5 1:5 2:2",
    ])
    pattern = word_pattern.create(self.dna_records.seq_list,
                                  word_size=1, wordpos=False)
    self.assertEqual(pattern.format(), expected)
def test_reduce_alphabet_wordsize1(self):
    """Check reduce_alphabet on a word_size=1 pattern with an R/Y mapping."""
    mapping = {'A': 'R', 'C': 'Y', 'T': 'Y', 'G': 'R'}
    expected = "\n".join([
        '33\t3\tR 0:14 1:10 2:9',
        '25\t3\tY 0:11 1:8 2:6',
    ])
    original = word_pattern.create(self.dna_records.seq_list,
                                   word_size=1, wordpos=False)
    reduced = original.reduce_alphabet(mapping)
    self.assertEqual(reduced.format(), expected)
def __init__(self, *args, **kwargs):
    """Prepare peptide word patterns for word sizes 1-4 and their vectors.

    For each word size, the pattern, its ``Counts`` vector and its ``Freqs``
    vector are appended to ``self.patterns``, ``self.counts`` and
    ``self.freqs`` respectively, so the three lists are index-aligned.
    """
    super(DistanceTest, self).__init__(*args, **kwargs)
    utils.ModulesCommonTest.set_test_data()

    self.patterns = []
    self.counts = []
    self.freqs = []
    for word_size in range(1, 5):
        pattern = word_pattern.create(self.pep_records.seq_list, word_size)
        self.patterns.append(pattern)
        self.counts.append(
            word_vector.Counts(self.pep_records.length_list, pattern))
        self.freqs.append(
            word_vector.Freqs(self.pep_records.length_list, pattern))
def test_reduce_alphabet_wordsize2(self):
    """Check reduce_alphabet on a word_size=2 pattern with an R/Y mapping."""
    mapping = {'A': 'R', 'C': 'Y', 'T': 'Y', 'G': 'R'}
    expected = "\n".join([
        '17\t3\tRR 0:5 1:7 2:5',
        '15\t3\tYR 0:8 1:3 2:4',
        '13\t3\tRY 0:8 1:2 2:3',
        '10\t3\tYY 0:3 1:5 2:2',
    ])
    original = word_pattern.create(self.dna_records.seq_list,
                                   word_size=2, wordpos=False)
    reduced = original.reduce_alphabet(mapping)
    self.assertEqual(reduced.format(), expected)
def test_input_output_file_pattern(self):
    """Round-trip a pattern through a file, with and without positions.

    For each ``wordpos`` setting the formatted pattern is written to a
    scratch file, read back with ``word_pattern.read`` and compared to the
    original formatting.
    """
    path = utils.get_test_data('pattern.txt')
    for wordpos in [True, False]:
        p1 = word_pattern.create(self.dna_records.seq_list,
                                 word_size=1, wordpos=wordpos)
        try:
            with open(path, 'w') as oh:
                oh.write(p1.format())
            with open(path) as fh:
                p2 = word_pattern.read(fh)
            self.assertEqual(p1.format(), p2.format())
        finally:
            # Remove the scratch file even when reading or the assertion
            # fails, so a failing run does not leave it in the data dir.
            if os.path.exists(path):
                os.remove(path)
def test_word_pattern_format_teiresias(self):
    """Check the 'teiresias' formatting of a word_size=1 positional pattern."""
    row_a = ('18\t3\tA 0 0 0 1 0 5 0 8 0 12 0 13 0 17 0 22 1 2 1 7 1 11 1 ' +
             '15 2 2 2 6 2 7 2 9 2 11 2 14')
    row_g = ('15\t3\tG 0 3 0 11 0 15 0 20 0 23 0 24 1 3 1 4 1 5 1 6 1 16 1 ' +
             '17 2 3 2 4 2 5')
    row_c = ('13\t3\tC 0 2 0 6 0 7 0 14 0 18 0 19 1 0 1 8 1 13 2 0 2 8 2 12 ' +
             '2 13')
    row_t = ('12\t3\tT 0 4 0 9 0 10 0 16 0 21 1 1 1 9 1 10 1 12 1 14 2 ' +
             '1 2 10')
    expected = "\n".join([row_a, row_g, row_c, row_t])

    pattern = word_pattern.create(self.dna_records.seq_list,
                                  word_size=1, wordpos=True)
    self.assertEqual(pattern.format('teiresias'), expected)
def test_equilibrium_freqs_pattern2(self):
    """Check FreqsStd formatting under an EquilibriumFreqs model (size 2)."""
    expected = "\n".join([
        "TA\t0.111 0.186 0.166",
        "GG\t0.033 0.219 0.147",
        "AC\t0.151 0.063 0.169",
        "CT\t0.000 0.181 0.081",
        "AG\t0.040 0.132 0.089",
        "CA\t0.038 0.000 0.169",
        "GA\t0.040 0.066 0.089",
        "AT\t0.037 0.062 0.083",
        "AA\t0.062 0.000 0.070",
        "CC\t0.057 0.000 0.065",
        "CG\t0.115 0.000 0.000",
        "GT\t0.113 0.000 0.000",
        "TT\t0.028 0.046 0.000",
        "TC\t0.000 0.060 0.000",
        "TG\t0.038 0.000 0.000",
    ])
    pattern = word_pattern.create(self.dna_records.seq_list, 2, True)
    base_freqs = {'A': 0.24, 'C': 0.26, 'G': 0.23, 'T': 0.27}
    model = word_vector.EquilibriumFreqs(base_freqs)
    standardized = word_vector.FreqsStd(self.dna_records.length_list,
                                        pattern, model)
    self.assertEqual(standardized.format(), expected)
def test_equal_freqs_pattern2(self):
    """Check FreqsStd formatting under an EqualFreqs model (word size 2).

    The expected values use ``EqualFreqs(alphabet_size=4)``.
    """
    # The result of this method is identical to that from decaf+py.
    p = word_pattern.create(self.dna_records.seq_list, 2, True)
    # An unused plain ``Freqs`` vector used to be built here; it played no
    # part in the assertion and has been dropped.
    freqmodel = word_vector.EqualFreqs(alphabet_size=4)
    freqs_std = word_vector.FreqsStd(self.dna_records.length_list,
                                     p, freqmodel)
    exp = [
        "TA\t0.113 0.189 0.169",
        "AC\t0.150 0.063 0.169",
        "GG\t0.030 0.201 0.135",
        "CT\t0.000 0.189 0.084",
        "AG\t0.038 0.126 0.084",
        "CA\t0.038 0.000 0.169",
        "AT\t0.038 0.063 0.084",
        "GA\t0.038 0.063 0.084",
        "AA\t0.060 0.000 0.067",
        "CC\t0.060 0.000 0.067",
        "CG\t0.113 0.000 0.000",
        "GT\t0.113 0.000 0.000",
        "TT\t0.030 0.050 0.000",
        "TC\t0.000 0.063 0.000",
        "TG\t0.038 0.000 0.000",
    ]
    self.assertEqual(freqs_std.format(), "\n".join(exp))
def test_reduce_alphabet_wordsize1(self):
    """Check that merge_revcomp matches a hand-built Pattern (word size 2).

    NOTE(review): despite its name this test exercises ``merge_revcomp``,
    not ``reduce_alphabet``. If another method with the same name exists in
    the same class, one of the two silently replaces the other -- confirm
    and consider renaming.
    """
    words = ['AA', 'AC', 'AG', 'CC', 'CA', 'CG', 'AT', 'GA', 'TA']
    occurrences = [
        {0: 3, 1: 1, 2: 1},
        {0: 7, 1: 1, 2: 2},
        {0: 1, 1: 5, 2: 2},
        {0: 3, 1: 4, 2: 3},
        {0: 2, 2: 2},
        {0: 3},
        {0: 1, 1: 1, 2: 1},
        {0: 1, 1: 2, 2: 1},
        {0: 3, 1: 3, 2: 2},
    ]
    created = word_pattern.create(self.dna_records.seq_list,
                                  word_size=2, wordpos=False)
    merged = created.merge_revcomp()
    reference = word_pattern.Pattern(words, occurrences, [])
    self.assertEqual(merged.format(), reference.format())
def main():
    """Compute pairwise distances on boolean word vectors and output them.

    Parses the command line, reads the FASTA records, obtains a word
    pattern (created from the sequences when ``args.word_size`` is set,
    otherwise read from ``args.word_pattern``), builds a ``Bools`` vector
    and a ``word_bool_distance.Distance`` from it, and writes the resulting
    matrix to ``args.out`` when given, otherwise displays it.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)

    if args.word_size:
        p = word_pattern.create(seq_records.seq_list, args.word_size)
    else:
        p = word_pattern.read(args.word_pattern)

    bools = word_vector.Bools(seq_records.length_list, p)
    dist = word_bool_distance.Distance(bools, args.distance)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the handle even when writing raises.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)
def main():
    """Compute pairwise word distances for the selected vector type.

    The word pattern is created from the sequences when ``args.word_size``
    is set, otherwise read from ``args.word_pattern``. The vector is chosen
    as follows:

    * ``args.vector`` is ``'counts'`` or ``'freqs'``: the plain class is
      used, or its weighted variant when ``args.char_weights`` is given.
    * any other ``args.vector``: ``FreqsStd`` with an ``EqualFreqs`` model
      when ``args.alphabet_size`` is set, else an ``EquilibriumFreqs``
      model built from ``args.char_freqs``.

    The distance matrix is written to ``args.out`` when given, otherwise
    displayed.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)

    if args.word_size:
        p = word_pattern.create(seq_records.seq_list, args.word_size)
    else:
        p = word_pattern.read(args.word_pattern)

    if args.vector == 'counts' or args.vector == 'freqs':
        if args.char_weights is None:
            veccls = {
                'counts': word_vector.Counts,
                'freqs': word_vector.Freqs,
            }
            vec = veccls[args.vector](seq_records.length_list, p)
        else:
            vecclsw = {
                'counts': word_vector.CountsWeight,
                'freqs': word_vector.FreqsWeight,
            }
            weightmodel = word_vector.WeightModel(
                char_weights=args.char_weights)
            vec = vecclsw[args.vector](seq_records.length_list, p,
                                       weightmodel)
    else:
        if args.alphabet_size:
            freqmodel = word_vector.EqualFreqs(
                alphabet_size=args.alphabet_size)
        else:
            freqmodel = word_vector.EquilibriumFreqs(args.char_freqs)
        vec = word_vector.FreqsStd(seq_records.length_list, p, freqmodel)

    dist = word_distance.Distance(vec, args.distance)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the handle even when writing raises.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)
def main():
    """Compute d2 distances over a range of word sizes and output the matrix.

    One word pattern is created for every word size from
    ``args.min_word_size`` to ``args.max_word_size`` inclusive. Each pattern
    is turned into a vector whose class is selected by ``args.vector``
    (``'counts'`` or ``'freqs'``), using the weighted variant when
    ``args.char_weights`` is given. The list of vectors is passed to
    ``word_d2.Distance``; the resulting matrix is written to ``args.out``
    when given, otherwise displayed.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)

    patterns = [
        word_pattern.create(seq_records.seq_list, i)
        for i in range(args.min_word_size, args.max_word_size + 1)
    ]

    if args.char_weights is not None:
        weightmodel = word_vector.WeightModel(char_weights=args.char_weights)
        vecklas = {
            'counts': word_vector.CountsWeight,
            'freqs': word_vector.FreqsWeight,
        }[args.vector]
        kwargs = {
            'seq_lengths': seq_records.length_list,
            'weightmodel': weightmodel,
        }
    else:
        vecklas = {
            'counts': word_vector.Counts,
            'freqs': word_vector.Freqs,
        }[args.vector]
        kwargs = {'seq_lengths': seq_records.length_list}

    vecs = [vecklas(patterns=p, **kwargs) for p in patterns]

    dist = word_d2.Distance(vecs)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the handle even when writing raises.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)
def main():
    """Compute word_rtd distances and output the matrix.

    When ``args.word_size`` is set a word pattern is created from the
    sequences with the third positional argument of ``word_pattern.create``
    set to ``True``; otherwise ``args.word_pattern`` is used as is. The
    vector comes from ``word_rtd.create_vector`` and the matrix is written
    to ``args.out`` when given, otherwise displayed.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)

    if args.word_size:
        p = word_pattern.create(seq_records.seq_list, args.word_size, True)
    else:
        # NOTE(review): used without word_pattern.read(); presumably
        # validate_args already turned it into a pattern object -- confirm.
        p = args.word_pattern

    vector = word_rtd.create_vector(seq_records.count, p)
    dist = word_rtd.Distance(vector, args.distance)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the handle even when writing raises.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)
def test_word_pattern_create_wordsize2_wordposFalse(self):
    """Check the formatted pattern for word_size=2 with wordpos=False."""
    expected = "\n".join([
        "8\t3\tTA 0:3 1:3 2:2",
        "7\t3\tAC 0:4 1:1 2:2",
        "7\t3\tGG 0:1 1:4 2:2",
        "4\t3\tAG 0:1 1:2 2:1",
        "4\t2\tCT 1:3 2:1",
        "3\t3\tAT 0:1 1:1 2:1",
        "3\t3\tGA 0:1 1:1 2:1",
        "3\t2\tAA 0:2 2:1",
        "3\t2\tCA 0:1 2:2",
        "3\t2\tCC 0:2 2:1",
        "3\t1\tCG 0:3",
        "3\t1\tGT 0:3",
        "2\t2\tTT 0:1 1:1",
        "1\t1\tTC 1:1",
        "1\t1\tTG 0:1",
    ])
    pattern = word_pattern.create(self.dna_records.seq_list,
                                  word_size=2, wordpos=False)
    self.assertEqual(pattern.format(), expected)
def main():
    """Create a word pattern from FASTA input and write or print it.

    With ``args.teiresias`` set, the already-open input handle is closed
    and ``word_pattern.run_teiresias`` is run on the file name instead.
    Otherwise the records are read from the handle and the pattern is built
    with ``word_pattern.create``. The formatted pattern is written to
    ``args.out`` when given, otherwise printed.
    """
    parser = get_parser()
    args = validate_args(parser)

    if args.teiresias:
        # run_teiresias receives the file name, so the handle is released
        # before it runs.
        args.fasta.close()
        p = word_pattern.run_teiresias(args.fasta.name,
                                       w=args.word_size,
                                       l=args.l,
                                       k=args.k,
                                       output_filename=args.out)
    else:
        try:
            seq_records = seqrecords.read_fasta(args.fasta)
        finally:
            # Close the input even when parsing the FASTA data fails.
            args.fasta.close()
        p = word_pattern.create(seq_records.seq_list,
                                args.word_size,
                                args.word_position)

    if args.out:
        # The context manager closes the handle even when writing raises.
        with open(args.out, 'w') as oh:
            oh.write(p.format())
    else:
        print(p.format())
def main():
    """Compute composition-based distances and output the matrix.

    Uses ``args.word_patterns`` when provided; otherwise creates three word
    patterns for word sizes ``args.word_size``, ``args.word_size - 1`` and
    ``args.word_size - 2`` (in that order). The patterns feed
    ``word_vector.Composition`` and distances use the ``'angle_cos_diss'``
    function. The matrix is written to ``args.out`` when given, otherwise
    displayed.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)

    if args.word_patterns:
        patterns = args.word_patterns
    else:
        patterns = [
            word_pattern.create(seq_records.seq_list, i)
            for i in range(args.word_size, args.word_size - 3, -1)
        ]

    compos = word_vector.Composition(seq_records.length_list, *patterns)
    dist = word_distance.Distance(compos, 'angle_cos_diss')
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # The context manager closes the handle even when writing raises.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)
elif method == "ncd": dist = ncd.Distance(seq_records) matrix = distmatrix.create(seq_records.id_list, dist) matrix.display() elif method == "wmetric": matrix = subsmat.get('blosum62') dist = wmetric.Distance(seq_records, matrix) matrix = distmatrix.create(seq_records.id_list, dist) matrix.display() elif method == "d2": patterns = [] for i in range(1, 5 + 1): p = word_pattern.create(seq_records.seq_list, i) patterns.append(p) counts = [] for p in patterns: c = word_vector.Counts(seq_records.length_list, p) counts.append(c) countsweight = [] weights = seqcontent.get_weights('protein') weightmodel = word_vector.WeightModel(weights) for p in patterns: c = word_vector.CountsWeight(seq_records, p, weightmodel) countsweight.append(c) dist = word_d2.Distance(countsweight) matrix = distmatrix.create(seq_records.id_list, dist)
def __init__(self, *args, **kwargs):
    """Prepare DNA word patterns for word sizes 1, 2 and 3."""
    super(WordVectorTest, self).__init__(*args, **kwargs)
    utils.ModulesCommonTest.set_test_data()

    sequences = self.dna_records.seq_list
    self.pattern1 = word_pattern.create(sequences, 1)
    self.pattern2 = word_pattern.create(sequences, 2)
    self.pattern3 = word_pattern.create(sequences, 3)
def __init__(self, *args, **kwargs):
    """Prepare a size-2 peptide word pattern shared by the tests."""
    super(Test, self).__init__(*args, **kwargs)
    utils.ModulesCommonTest.set_test_data()

    sequences = self.pep_records.seq_list
    self.p = word_pattern.create(sequences, 2)