def main():
    """Compute a word-frequency distance matrix for a FASTA file.

    Arguments are obtained through ``get_parser()`` / ``validate_args()``.
    The word pattern is created from the sequences when ``args.word_size``
    is set, otherwise it is read from ``args.word_pattern``. It is then
    optionally reduced to the alphabet returned by
    ``seqcontent.get_reduced_alphabet(args.molecule)`` and optionally
    merged with reverse complements.

    The resulting matrix is written to ``args.out`` when given, otherwise
    it is displayed, in both cases using ``args.outfmt``.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)

    if args.word_size:
        p = word_pattern.create(seq_records.seq_list, args.word_size)
    else:
        p = word_pattern.read(args.word_pattern)

    if args.reduce_alphabet:
        p = p.reduce_alphabet(seqcontent.get_reduced_alphabet(args.molecule))
    if args.merge_revcomp:
        p = p.merge_revcomp()

    freqs = word_vector.Freqs(seq_records.length_list, p)
    dist = word_distance.Distance(freqs, args.distance)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # Context manager guarantees the handle is closed even if
        # writing the matrix raises.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)
def test_minkowski_freqs(self):
    """Check the 'minkowski' distance matrix built from ``self.freqs``."""
    expected = "\n".join((
        " 3",
        "seq1 0.0000000 0.3763512 0.2532387",
        "seq2 0.3763512 0.0000000 0.2603008",
        "seq3 0.2532387 0.2603008 0.0000000",
    ))
    distance = word_distance.Distance(self.freqs, 'minkowski')
    result = distmatrix.create(self.dna_records.id_list, distance)
    self.assertEqual(result.format(), expected)
def test_manhattan_freqs(self):
    """Check the 'manhattan' distance matrix built from ``self.freqs``."""
    expected = "\n".join((
        " 3",
        "seq1 0.0000000 1.2156863 0.7619048",
        "seq2 1.2156863 0.0000000 0.7899160",
        "seq3 0.7619048 0.7899160 0.0000000",
    ))
    distance = word_distance.Distance(self.freqs, 'manhattan')
    result = distmatrix.create(self.dna_records.id_list, distance)
    self.assertEqual(result.format(), expected)
def test_chebyshev_freqs(self):
    """Check the 'chebyshev' distance matrix built from ``self.freqs``."""
    expected = "\n".join((
        " 3",
        "seq1 0.0000000 0.1936275 0.1250000",
        "seq2 0.1936275 0.0000000 0.1428571",
        "seq3 0.1250000 0.1428571 0.0000000",
    ))
    distance = word_distance.Distance(self.freqs, 'chebyshev')
    result = distmatrix.create(self.dna_records.id_list, distance)
    self.assertEqual(result.format(), expected)
def test_lcc_freqs(self):
    """Check the 'lcc' distance matrix built from ``self.freqs``."""
    expected = "\n".join((
        " 3",
        "seq1 0.0000000 0.6205496 0.4017554",
        "seq2 0.6205496 0.0000000 0.2550506",
        "seq3 0.4017554 0.2550506 0.0000000",
    ))
    distance = word_distance.Distance(self.freqs, 'lcc')
    result = distmatrix.create(self.dna_records.id_list, distance)
    self.assertEqual(result.format(), expected)
def test_google_freqs(self):
    """Check the 'google' distance matrix built from ``self.freqs``."""
    expected = "\n".join((
        ' 3',
        'seq1 0.0000000 0.6078431 0.3809524',
        'seq2 0.6078431 0.0000000 0.3949580',
        'seq3 0.3809524 0.3949580 0.0000000',
    ))
    distance = word_distance.Distance(self.freqs, 'google')
    result = distmatrix.create(self.dna_records.id_list, distance)
    self.assertEqual(result.format(), expected)
def test_canberra_freqs(self):
    """Check the 'canberra' distance matrix built from ``self.freqs``."""
    expected = "\n".join((
        " 3",
        "seq1 0.0000000 10.3372258 7.1836838",
        "seq2 10.3372258 0.0000000 6.6280959",
        "seq3 7.1836838 6.6280959 0.0000000",
    ))
    distance = word_distance.Distance(self.freqs, 'canberra')
    result = distmatrix.create(self.dna_records.id_list, distance)
    self.assertEqual(result.format(), expected)
def test_kld_freqs(self):
    """Check the 'kld' distance matrix built from ``self.freqs``."""
    expected = "\n".join((
        " 3",
        "seq1 0.0000000 0.0932800 0.0435210",
        "seq2 0.0932800 0.0000000 0.0447391",
        "seq3 0.0435210 0.0447391 0.0000000",
    ))
    distance = word_distance.Distance(self.freqs, 'kld')
    result = distmatrix.create(self.dna_records.id_list, distance)
    self.assertEqual(result.format(), expected)
def test_diff_abs_mult_freqs(self):
    """Check the 'diff_abs_mult' distance matrix built from ``self.freqs``."""
    expected = "\n".join((
        " 3",
        "seq1 0.0000000 0.0621975 0.0404611",
        "seq2 0.0621975 0.0000000 0.0531478",
        "seq3 0.0404611 0.0531478 0.0000000",
    ))
    distance = word_distance.Distance(self.freqs, 'diff_abs_mult')
    result = distmatrix.create(self.dna_records.id_list, distance)
    self.assertEqual(result.format(), expected)
def test_braycurtis_freqs(self):
    """Check the 'braycurtis' distance matrix built from ``self.freqs``."""
    expected = "\n".join((
        " 3",
        "seq1 0.0000000 0.6078431 0.3809524",
        "seq2 0.6078431 0.0000000 0.3949580",
        "seq3 0.3809524 0.3949580 0.0000000",
    ))
    distance = word_distance.Distance(self.freqs, 'braycurtis')
    result = distmatrix.create(self.dna_records.id_list, distance)
    self.assertEqual(result.format(), expected)
def test_jsd_freqs(self):
    """Check the 'jsd' distance matrix built from ``self.freqs``."""
    expected = "\n".join((
        " 3",
        "seq1 0.0000000 0.4608882 0.2550278",
        "seq2 0.4608882 0.0000000 0.2457790",
        "seq3 0.2550278 0.2457790 0.0000000",
    ))
    distance = word_distance.Distance(self.freqs, 'jsd')
    result = distmatrix.create(self.dna_records.id_list, distance)
    self.assertEqual(result.format(), expected)
def test_euclid_norm_counts(self):
    """Check the 'euclid_norm' distance matrix built from ``self.counts``.

    The expected values are identical to those produced by decaf+py.
    """
    expected = "\n".join((
        ' 3',
        'seq1 0.0000000 7.5498344 5.4772256',
        'seq2 7.5498344 0.0000000 4.3588989',
        'seq3 5.4772256 4.3588989 0.0000000',
    ))
    distance = word_distance.Distance(self.counts, 'euclid_norm')
    result = distmatrix.create(self.dna_records.id_list, distance)
    self.assertEqual(result.format(), expected)
def test_angle_cos_diss_freqs(self):
    """Check the 'angle_cos_diss' distance matrix built from ``self.freqs``.

    The expected values are identical to those produced by decaf+py.
    """
    expected = "\n".join((
        ' 3',
        'seq1 0.0000000 0.2797355 0.1500672',
        'seq2 0.2797355 0.0000000 0.1261027',
        'seq3 0.1500672 0.1261027 0.0000000',
    ))
    distance = word_distance.Distance(self.freqs, 'angle_cos_diss')
    result = distmatrix.create(self.dna_records.id_list, distance)
    self.assertEqual(result.format(), expected)
def test_euclid_squared_freqs(self):
    """Check the 'euclid_squared' distance matrix built from ``self.freqs``.

    The expected values are identical to those produced by decaf+py.
    """
    expected = "\n".join((
        ' 3',
        'seq1 0.0000000 0.1416402 0.0641298',
        'seq2 0.1416402 0.0000000 0.0677565',
        'seq3 0.0641298 0.0677565 0.0000000',
    ))
    distance = word_distance.Distance(self.freqs, 'euclid_squared')
    result = distmatrix.create(self.dna_records.id_list, distance)
    self.assertEqual(result.format(), expected)
def test_euclid_seqlen2_freqs(self):
    """Check the 'euclid_seqlen2' distance matrix built from ``self.freqs``.

    The expected values are identical to those produced by decaf+py.
    """
    expected = "\n".join((
        ' 3',
        'seq1 0.0000000 0.0072101 0.0038263',
        'seq2 0.0072101 0.0000000 0.0039866',
        'seq3 0.0038263 0.0039866 0.0000000',
    ))
    distance = word_distance.Distance(self.freqs, 'euclid_seqlen2')
    result = distmatrix.create(self.dna_records.id_list, distance)
    self.assertEqual(result.format(), expected)
def test_euclid_norm_freqs(self):
    """Check the 'euclid_norm' distance matrix built from ``self.freqs``.

    The expected values are identical to those produced by decaf+py.
    """
    expected = "\n".join((
        ' 3',
        'seq1 0.0000000 0.3763512 0.2532387',
        'seq2 0.3763512 0.0000000 0.2603008',
        'seq3 0.2532387 0.2603008 0.0000000',
    ))
    distance = word_distance.Distance(self.freqs, 'euclid_norm')
    result = distmatrix.create(self.dna_records.id_list, distance)
    self.assertEqual(result.format(), expected)
def test_diff_abs_mult2_freqs(self):
    """Check the 'diff_abs_mult2' distance matrix built from ``self.freqs``.

    The expected values are identical to those produced by decaf+py.
    """
    expected = "\n".join((
        ' 3',
        'seq1 0.0000000 0.0621975 0.0404611',
        'seq2 0.0621975 0.0000000 0.0531478',
        'seq3 0.0404611 0.0531478 0.0000000',
    ))
    distance = word_distance.Distance(self.freqs, 'diff_abs_mult2')
    result = distmatrix.create(self.dna_records.id_list, distance)
    self.assertEqual(result.format(), expected)
def test_angle_cos_evol_freqs(self):
    """Check the 'angle_cos_evol' distance matrix built from ``self.freqs``.

    The expected values are identical to those produced by decaf+py.
    """
    expected = "\n".join((
        ' 3',
        'seq1 0.0000000 0.3281368 0.1625980',
        'seq2 0.3281368 0.0000000 0.1347925',
        'seq3 0.1625980 0.1347925 0.0000000',
    ))
    distance = word_distance.Distance(self.freqs, 'angle_cos_evol')
    result = distmatrix.create(self.dna_records.id_list, distance)
    self.assertEqual(result.format(), expected)
def test_diff_abs_add_freqs(self):
    """Check the 'diff_abs_add' distance matrix built from ``self.freqs``.

    The expected values are identical to those produced by decaf+py.
    """
    expected = "\n".join((
        ' 3',
        'seq1 0.0000000 0.0810458 0.0507937',
        'seq2 0.0810458 0.0000000 0.0526611',
        'seq3 0.0507937 0.0526611 0.0000000',
    ))
    distance = word_distance.Distance(self.freqs, 'diff_abs_add')
    result = distmatrix.create(self.dna_records.id_list, distance)
    self.assertEqual(result.format(), expected)
def test_create_matrix(self):
    """Build a 'minkowski' matrix from a hand-written 3x16 integer array."""
    rows = [
        [3, 6, 4, 1, 3, 4, 3, 0, 1, 1, 6, 4, 5, 0, 3, 4],
        [0, 3, 0, 3, 0, 0, 0, 2, 9, 0, 3, 3, 0, 6, 3, 6],
        [9, 0, 0, 3, 0, 0, 0, 2, 6, 0, 3, 3, 0, 3, 3, 3],
    ]
    expected = "\n".join((
        ' 3',
        'seq1 0.0000000 14.6969385 14.1774469',
        'seq2 14.6969385 0.0000000 10.8166538',
        'seq3 14.1774469 10.8166538 0.0000000',
    ))
    distance = word_distance.Distance(np.array(rows), 'minkowski')
    result = distmatrix.create(['seq1', 'seq2', 'seq3'], distance)
    self.assertEqual(result.format(), expected)
def main():
    """Compute a word-vector distance matrix for a FASTA file.

    Arguments are obtained through ``get_parser()`` / ``validate_args()``.
    The word pattern is created from the sequences when ``args.word_size``
    is set, otherwise it is read from ``args.word_pattern``.

    The vector type depends on ``args.vector``:

    * ``'counts'`` or ``'freqs'``: plain ``Counts``/``Freqs`` when
      ``args.char_weights`` is ``None``, otherwise the weighted variant
      built with a ``WeightModel``.
    * anything else: ``FreqsStd``, using ``EqualFreqs`` when
      ``args.alphabet_size`` is set and ``EquilibriumFreqs`` built from
      ``args.char_freqs`` otherwise.

    The resulting matrix is written to ``args.out`` when given, otherwise
    it is displayed, in both cases using ``args.outfmt``.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)

    if args.word_size:
        p = word_pattern.create(seq_records.seq_list, args.word_size)
    else:
        p = word_pattern.read(args.word_pattern)

    veccls = {'counts': word_vector.Counts, 'freqs': word_vector.Freqs}
    vecclsw = {
        'counts': word_vector.CountsWeight,
        'freqs': word_vector.FreqsWeight
    }

    if args.vector in ('counts', 'freqs'):
        if args.char_weights is None:
            vec = veccls[args.vector](seq_records.length_list, p)
        else:
            weightmodel = word_vector.WeightModel(
                char_weights=args.char_weights)
            vec = vecclsw[args.vector](seq_records.length_list, p, weightmodel)
    else:
        if args.alphabet_size:
            freqmodel = word_vector.EqualFreqs(
                alphabet_size=args.alphabet_size)
        else:
            freqmodel = word_vector.EquilibriumFreqs(args.char_freqs)
        vec = word_vector.FreqsStd(seq_records.length_list, p, freqmodel)

    dist = word_distance.Distance(vec, args.distance)
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # Context manager guarantees the handle is closed even if
        # writing the matrix raises.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)
def main():
    """Compute an 'angle_cos_diss' matrix from a word composition vector.

    Arguments are obtained through ``get_parser()`` / ``validate_args()``.
    When ``args.word_patterns`` is set those patterns are used as given;
    otherwise three patterns are created from the sequences, for word
    sizes ``args.word_size``, ``args.word_size - 1`` and
    ``args.word_size - 2`` (in that order).

    The resulting matrix is written to ``args.out`` when given, otherwise
    it is displayed, in both cases using ``args.outfmt``.
    """
    parser = get_parser()
    args = validate_args(parser)

    seq_records = seqrecords.read_fasta(args.fasta)

    if args.word_patterns:
        patterns = args.word_patterns
    else:
        # Descending word sizes: k, k-1, k-2.
        patterns = [
            word_pattern.create(seq_records.seq_list, size)
            for size in range(args.word_size, args.word_size - 3, -1)
        ]

    compos = word_vector.Composition(seq_records.length_list, *patterns)
    dist = word_distance.Distance(compos, 'angle_cos_diss')
    matrix = distmatrix.create(seq_records.id_list, dist)

    if args.out:
        # Context manager guarantees the handle is closed even if
        # writing the matrix raises.
        with open(args.out, 'w') as oh:
            matrix.write_to_file(oh, args.outfmt)
    else:
        matrix.display(args.outfmt)
def test_minkowski_throws_exception(self):
    """``pwdist_minkowski(0, 1, 0.2)`` must raise with the expected message."""
    distance = word_distance.Distance(self.freqs, 'minkowski')
    with self.assertRaises(Exception) as caught:
        distance.pwdist_minkowski(0, 1, 0.2)
    # Inspect the message only after the context manager has captured
    # the exception.
    message = str(caught.exception)
    self.assertIn('p must be at least 1', message)