def __init__(self, *args):
    """Set up the test case and load the shared alignment fixture.

    Stores the gapped sequences and their annotations returned by
    `DataLoader.loadSequence`, plus the `Fasta.alnToSeq` form of each
    sequence, on the instance.
    """
    unittest.TestCase.__init__(self, *args)
    loader = DataLoader()
    # The first element of the loaded record is not used by the tests.
    (_unused,
     self.seq_x, self.ann_x,
     self.seq_y, self.ann_y) = loader.loadSequence(
        "data/test_data/sequences/alignment.fa")
    self.seq_xs = Fasta.alnToSeq(self.seq_x)
    self.seq_ys = Fasta.alnToSeq(self.seq_y)
def _classification(self, sequence_x, ann_x, sequence_y, ann_y):
    """Classify every column of a gapped pairwise alignment.

    Column states: 0 = neither row has a gap, 1 = gap only in
    `sequence_y`, 2 = gap only in `sequence_x`.  A column with gaps in
    both rows is reported as state 0 but does not advance either sequence
    position.

    State-1 and state-2 columns are handed to
    `self.clf.multi_prepare_predict` (with the two sequences swapped for
    state 2); state-0 columns contribute a constant 0.  The state stream
    and the three result streams are combined by `merge`, whose result is
    returned.
    """
    assert len(sequence_y) == len(sequence_x)
    length = len(sequence_x)

    def column_state(col, both_gaps):
        # `both_gaps` is the value reported when both rows hold a gap.
        gap_x = sequence_x[col] == '-'
        gap_y = sequence_y[col] == '-'
        if gap_x and gap_y:
            return both_gaps
        if gap_x:
            return 2
        if gap_y:
            return 1
        return 0

    # Ungapped coordinates of every column, recorded before the column
    # itself is consumed.  Double-gap columns (-1) advance nothing.
    positions = []
    at_x = at_y = 0
    for col in range(length):
        positions.append((at_x, at_y))
        advance = column_state(col, -1)
        if advance in (0, 1):
            at_x += 1
        if advance in (0, 2):
            at_y += 1

    sequence_xs = Fasta.alnToSeq(sequence_x)
    sequence_ys = Fasta.alnToSeq(sequence_y)

    # For classification a double-gap column counts as state 0.
    states = [column_state(col, 0) for col in range(length)]
    match_cols = [col for col, st in enumerate(states) if st == 0]
    insert_x_cols = [col for col, st in enumerate(states) if st == 1]
    insert_y_cols = [col for col, st in enumerate(states) if st == 2]

    ret_match = (0 for _ in match_cols)
    ret_insert_x = self.clf.multi_prepare_predict(
        (sequence_xs, positions[col][0], ann_x,
         sequence_ys, positions[col][1], ann_y)
        for col in insert_x_cols
    )
    ret_insert_y = self.clf.multi_prepare_predict(
        (sequence_ys, positions[col][1], ann_y,
         sequence_xs, positions[col][0], ann_x)
        for col in insert_y_cols
    )
    return merge((st for st in states),
                 ret_match, ret_insert_x, ret_insert_y)
def __init__(self, *args):
    """Set up the test case and load the shared alignment fixture.

    Stores the gapped sequences and their annotations returned by
    `DataLoader.loadSequence`, plus the `Fasta.alnToSeq` form of each
    sequence, on the instance.
    """
    unittest.TestCase.__init__(self, *args)
    loader = DataLoader()
    # The first element of the loaded record is not used by the tests.
    (_unused,
     self.seq_x, self.ann_x,
     self.seq_y, self.ann_y) = loader.loadSequence(
        "data/test_data/sequences/alignment.fa")
    self.seq_xs = Fasta.alnToSeq(self.seq_x)
    self.seq_ys = Fasta.alnToSeq(self.seq_y)
def _classification(self, sequence_x, ann_x, sequence_y, ann_y):
    """Classify every column of a gapped pairwise alignment.

    Column states: 0 = neither row has a gap, 1 = gap only in
    `sequence_y`, 2 = gap only in `sequence_x`.  A column with gaps in
    both rows is reported as state 0 but does not advance either sequence
    position.

    State-1 and state-2 columns are handed to
    `self.clf.multi_prepare_predict` (with the two sequences swapped for
    state 2); state-0 columns contribute a constant 0.  The state stream
    and the three result streams are combined by `merge`, whose result is
    returned.
    """
    assert len(sequence_y) == len(sequence_x)
    length = len(sequence_x)

    def column_state(col, both_gaps):
        # `both_gaps` is the value reported when both rows hold a gap.
        gap_x = sequence_x[col] == '-'
        gap_y = sequence_y[col] == '-'
        if gap_x and gap_y:
            return both_gaps
        if gap_x:
            return 2
        if gap_y:
            return 1
        return 0

    # Ungapped coordinates of every column, recorded before the column
    # itself is consumed.  Double-gap columns (-1) advance nothing.
    positions = []
    at_x = at_y = 0
    for col in range(length):
        positions.append((at_x, at_y))
        advance = column_state(col, -1)
        if advance in (0, 1):
            at_x += 1
        if advance in (0, 2):
            at_y += 1

    sequence_xs = Fasta.alnToSeq(sequence_x)
    sequence_ys = Fasta.alnToSeq(sequence_y)

    # For classification a double-gap column counts as state 0.
    states = [column_state(col, 0) for col in range(length)]
    match_cols = [col for col, st in enumerate(states) if st == 0]
    insert_x_cols = [col for col, st in enumerate(states) if st == 1]
    insert_y_cols = [col for col, st in enumerate(states) if st == 2]

    ret_match = (0 for _ in match_cols)
    ret_insert_x = self.clf.multi_prepare_predict(
        (sequence_xs, positions[col][0], ann_x,
         sequence_ys, positions[col][1], ann_y)
        for col in insert_x_cols
    )
    ret_insert_y = self.clf.multi_prepare_predict(
        (sequence_ys, positions[col][1], ann_y,
         sequence_xs, positions[col][0], ann_x)
        for col in insert_y_cols
    )
    return merge((st for st in states),
                 ret_match, ret_insert_x, ret_insert_y)
def main(input_files, output_file):
    """Draw the alignment paths found in `input_files` into one image.

    Every existing input file is loaded with `Fasta.load`; missing files
    are skipped.  The canvas is sized from the two ungapped sequences of
    the first alignment that could be loaded.  Each alignment is drawn as
    a line through its coordinate pairs, shifted slightly per input index
    so overlapping paths stay distinguishable, and every coordinate whose
    annotation character is 'R' is outlined with a small rectangle.

    Reads the module-level `width`, the scale factor applied to sequence
    coordinates.

    Args:
        input_files: paths of the alignment files to draw.
        output_file: path the resulting image is saved to.

    Raises:
        ValueError: if no input file yields an alignment, so the canvas
            cannot be sized.
    """
    alignments = [
        list(Fasta.load(name, '', Alignment,
                        ['^sequence1', '^sequence2', '^[av].*']))
        if os.path.exists(name) else None
        for name in input_files
    ]

    # Missing/empty inputs are tolerated below, so do not insist that the
    # very first one is usable when sizing the canvas.
    loaded = [aln for aln in alignments if aln]
    if not loaded:
        raise ValueError('no alignment could be loaded from the input files')
    first = loaded[0][0]
    x_len = len(Fasta.alnToSeq(first.sequences[0]))
    y_len = len(Fasta.alnToSeq(first.sequences[1]))

    image = Image.new('RGB', (x_len * width + 50, y_len * width + 50),
                      (255, 255, 255))
    draw = ImageDraw.Draw(image)
    colors = [(0, 0, 0), (255, 0, 0), (0, 255, 0), (0, 0, 255),
              (255, 0, 255), (0, 255, 255)]

    for i, records in enumerate(alignments):
        if not records:
            # File was missing or contained no alignment.
            continue
        aln = records[0]
        # Reuse colours cyclically so inputs beyond the palette size are
        # still drawn instead of being dropped by the IndexError handler.
        color = colors[i % len(colors)]
        try:
            annotation = aln.sequences[2]
            coords = aln.getCoordPairs()
            print(coords)
            x_shift = width / 2 + 25 + i
            y_shift = width / 2 + 25 + i * 2
            draw.line([(x * width + x_shift, y * width + y_shift)
                       for x, y, _ in coords], fill=color)
            if annotation is not None:
                for x, y, ind in coords:
                    if annotation[ind] != 'R':
                        continue
                    draw.rectangle(
                        [(x * width + x_shift - width / 4,
                          y * width + y_shift - width / 4),
                         (x * width + x_shift + width / 4,
                          y * width + y_shift + width / 4)],
                        outline=color)
        except IndexError:
            # Best effort: e.g. the alignment has no third (annotation)
            # sequence; leave this alignment undrawn or partially drawn.
            pass
        except IOError:
            pass

    del draw
    image.save(output_file)
def prepare_training_data(
    self,
    sequence_x,
    annotations_x,
    sequence_y,
    annotations_y,
):
    """Prepare classifier training data from two gapped sequences.

    The positive examples come from `prepare_positive_data`, which also
    returns the matched positions, sequence size and weight set that are
    passed on to `prepare_negative_data`.

    Returns a 3-tuple in which each component is the positive component
    followed by the corresponding negative component.
    """
    assert len(sequence_y) == len(sequence_x)

    common_args = (
        sequence_x, Fasta.alnToSeq(sequence_x), annotations_x,
        sequence_y, Fasta.alnToSeq(sequence_y), annotations_y,
    )
    positives, matched_pos, seq_size, weights_set = \
        self.prepare_positive_data(*common_args)
    negative_args = common_args + (matched_pos, seq_size, weights_set)
    negatives = self.prepare_negative_data(*negative_args)

    return tuple(positives[part] + negatives[part] for part in range(3))
def main(input_files, output_file):
    """Draw the alignment paths found in `input_files` into one image.

    Every existing input file is loaded with `Fasta.load`; missing files
    are skipped.  The canvas is sized from the two ungapped sequences of
    the first alignment that could be loaded.  Each alignment is drawn as
    a line through its coordinate pairs, shifted slightly per input index
    so overlapping paths stay distinguishable, and every coordinate whose
    annotation character is 'R' is outlined with a small rectangle.

    Reads the module-level `width`, the scale factor applied to sequence
    coordinates.

    Args:
        input_files: paths of the alignment files to draw.
        output_file: path the resulting image is saved to.

    Raises:
        ValueError: if no input file yields an alignment, so the canvas
            cannot be sized.
    """
    alignments = [
        list(Fasta.load(name, '', Alignment,
                        ['^sequence1', '^sequence2', '^[av].*']))
        if os.path.exists(name) else None
        for name in input_files
    ]

    # Missing/empty inputs are tolerated below, so do not insist that the
    # very first one is usable when sizing the canvas.
    loaded = [aln for aln in alignments if aln]
    if not loaded:
        raise ValueError('no alignment could be loaded from the input files')
    first = loaded[0][0]
    x_len = len(Fasta.alnToSeq(first.sequences[0]))
    y_len = len(Fasta.alnToSeq(first.sequences[1]))

    image = Image.new('RGB', (x_len * width + 50, y_len * width + 50),
                      (255, 255, 255))
    draw = ImageDraw.Draw(image)
    colors = [(0, 0, 0), (255, 0, 0), (0, 255, 0), (0, 0, 255),
              (255, 0, 255), (0, 255, 255)]

    for i, records in enumerate(alignments):
        if not records:
            # File was missing or contained no alignment.
            continue
        aln = records[0]
        # Reuse colours cyclically so inputs beyond the palette size are
        # still drawn instead of being dropped by the IndexError handler.
        color = colors[i % len(colors)]
        try:
            annotation = aln.sequences[2]
            coords = aln.getCoordPairs()
            print(coords)
            x_shift = width / 2 + 25 + i
            y_shift = width / 2 + 25 + i * 2
            draw.line([(x * width + x_shift, y * width + y_shift)
                       for x, y, _ in coords], fill=color)
            if annotation is not None:
                for x, y, ind in coords:
                    if annotation[ind] != 'R':
                        continue
                    draw.rectangle(
                        [(x * width + x_shift - width / 4,
                          y * width + y_shift - width / 4),
                         (x * width + x_shift + width / 4,
                          y * width + y_shift + width / 4)],
                        outline=color)
        except IndexError:
            # Best effort: e.g. the alignment has no third (annotation)
            # sequence; leave this alignment undrawn or partially drawn.
            pass
        except IOError:
            pass

    del draw
    image.save(output_file)
def prepare_training_data(
    self,
    sequence_x,
    annotations_x,
    sequence_y,
    annotations_y,
):
    """Build insert/match training examples from a gapped alignment.

    `self.insert_sequence` selects the roles of the two rows: when it is
    0, `sequence_x` is the reference and `sequence_y` the row whose gaps
    are examined; otherwise the roles are swapped.

    Every column with a gap in the examined row and a residue in the
    reference yields one example labelled 1 (skipped when
    `self.prepare_data` returns None).  A random selection of columns
    where both rows hold a residue, at most as many as there are
    label-1 examples, yields the examples labelled 0.

    Returns:
        A 3-tuple `(data, labels, weights)` of parallel lists; every
        weight is 1.0.
    """
    if self.insert_sequence == 0:
        reference, annotations_r = sequence_x, annotations_x
        space, annotations_s = sequence_y, annotations_y
    else:
        reference, annotations_r = sequence_y, annotations_y
        space, annotations_s = sequence_x, annotations_x

    sequence_rs = Fasta.alnToSeq(reference)
    sequence_ss = Fasta.alnToSeq(space)

    data, labels, weights = [], [], []

    # Matched (reference, space) positions.  Both counters advance on
    # every match, so the pairs are unique and a list is sufficient; a
    # list is also what `sample` needs (NOTE(review): assuming `sample`
    # is `random.sample`, which rejects sets on Python 3.11+).
    match_pos = []
    pos_r = pos_s = 0
    for i, bs in enumerate(space):
        br = reference[i]
        if bs != '-':
            if br != '-':
                match_pos.append((pos_r, pos_s))
                pos_r += 1
            pos_s += 1
            continue
        if br == '-':
            # Gap in both rows: nothing to learn, nothing to advance.
            continue
        d = self.prepare_data(
            sequence_rs, pos_r, annotations_r,
            sequence_ss, pos_s, annotations_s,
            0,
        )
        if d is not None:
            data.append(d)
            labels.append(1)
            weights.append(1.0)
        pos_r += 1

    # Never ask for more matches than exist; `sample` would raise.
    wanted = min(len(data), len(match_pos))
    for x, y in sample(match_pos, wanted):
        d = self.prepare_data(
            sequence_rs, x, annotations_r,
            sequence_ss, y, annotations_s,
            0,
        )
        if d is None:
            # Same treatment as in the insert branch: no usable features.
            continue
        data.append(d)
        labels.append(0)
        weights.append(1.0)

    return data, labels, weights