def main(input_filename, output_filename):
    """Realign a batch of alignment files with MUSCLE.

    ``input_filename``/``output_filename`` are templates containing an
    ``{id}`` placeholder.  The set of ids to process is taken from the SGE
    array-job environment (SGE_TASK_ID / SGE_STEP_SIZE / SGE_TASK_LAST);
    outside of SGE it defaults to the single task id 1.
    """
    task_ids = [1]
    if os.environ.has_key('SGE_TASK_ID'):
        sge_task_id = int(os.environ['SGE_TASK_ID'])
        if not os.environ.has_key('SGE_STEP_SIZE'):
            # Plain (non-stepped) array job: one task per invocation.
            sge_step_size = 1
        else:
            sge_step_size = int(os.environ['SGE_STEP_SIZE'])
        sge_task_last = int(os.environ['SGE_TASK_LAST'])
        # This invocation covers [task_id, task_id + step), clipped to the
        # last task of the array job.
        task_ids = range(
            sge_task_id,
            min(sge_task_id + sge_step_size, sge_task_last + 1)
        )
        print task_ids, sge_task_id, sge_step_size, sge_task_last
    for task_id in task_ids:
        # SGE task ids are 1-based; the filename templates use 0-based ids.
        inp_filename = input_filename.format(id=task_id - 1)
        out_filename = output_filename.format(id=task_id - 1)
        aln = list(Fasta.load(
            inp_filename, '', Alignment,
            sequence_selectors=['sequence1', 'sequence2']))[0]
        # Write the loaded name/sequence pairs to a temp file to use as
        # muscle's input.
        tmp_filename = get_temp_filename()
        Fasta.save(zip(aln.names, aln.sequences), tmp_filename)
        os.system("muscle -in {inp} -out {out} 2> /dev/null".format(
            inp=tmp_filename,
            out=out_filename,
        ))
        # Carry the sidecar repeat annotation along with the realigned file.
        os.system("cp {inp}.repeats {out}.repeats".format(
            inp=inp_filename,
            out=out_filename,
        ))
def __init__(self, *args):
    """Build the shared test fixture from a sample alignment file.

    Loads two aligned, annotated sequences and keeps both the aligned
    (seq_x/seq_y) and the converted plain (seq_xs/seq_ys) forms.
    """
    unittest.TestCase.__init__(self, *args)
    fname = "data/test_data/sequences/alignment.fa"
    dl = DataLoader()
    _, self.seq_x, self.ann_x, self.seq_y, self.ann_y = dl.loadSequence(fname)
    # alnToSeq presumably strips alignment gaps — confirm in Fasta.
    self.seq_xs = Fasta.alnToSeq(self.seq_x)
    self.seq_ys = Fasta.alnToSeq(self.seq_y)
def realign_file(args, model, output_filename, alignment_filename):
    """Realign every alignment in ``alignment_filename`` with the
    configured algorithm and append the results to ``output_filename``.

    Returns 1 on error (fewer than two sequences in an alignment),
    otherwise None.
    """
    # begin of HACK
    # Temporarily add the 'trf_cons' track so compute_annotations produces
    # the consensus annotation needed for model expansion, then restore
    # args.tracks to its original state.
    if args.expand_model:
        old_tracks = args.tracks
        args.tracks.add('trf_cons')
    m = model
    if args.annotation_model:
        m = args.annotation_model
    annotations = compute_annotations(args, alignment_filename, m)
    if args.expand_model:
        consensuses = annotations['trf_cons']
        args.tracks = old_tracks
        if 'trf_cons' not in old_tracks:
            del args.tracks['trf_cons']
    # end of HACK
    with Open(output_filename, 'w') as output_file_object:
        for aln in Fasta.load(
                alignment_filename,
                args.alignment_regexp,
                Alignment,
                sequence_selectors=args.sequence_regexp):
            if len(aln.sequences) < 2:
                sys.stderr.write("ERROR: not enough sequences in file\n")
                return 1
            # When drawing is disabled, use the no-op (brainwashed) canvas.
            if len(args.draw) == 0:
                drawer = brainwash(AlignmentCanvas)()
            else:
                drawer = AlignmentCanvas()
            drawer.add_original_alignment(aln)
            aln, unmask_repeats = args.mask_repeats(aln, annotations)
            seq1, seq2 = tuple(map(Fasta.alnToSeq, aln.sequences[:2]))
            perf.msg("Data loaded in {time} seconds.")
            perf.replace()
            if args.expand_model:
                # Need to determine the consensuses for both sequences.
                A = consensuses[aln.names[0]]
                B = consensuses[aln.names[1]]
                cons = list(A.union(B))
                real_model = model.expandModel({'consensus': cons})
            else:
                real_model = model
            realigner = args.algorithm()
            realigner.setDrawer(drawer)
            realigner.prepareData(seq1, aln.names[0], seq2, aln.names[1],
                                  aln, real_model, annotations, args)
            aln = realigner.realign(0, len(seq1), 0, len(seq2))
            aln = unmask_repeats(aln)
            perf.msg("Sequence was realigned in {time} seconds.")
            perf.replace()
            if len(args.draw) > 0:
                drawer.add_sequence('X', seq1)
                drawer.add_sequence('Y', seq2)
                drawer.add_alignment_line(
                    101, (255, 0, 255, 255), 2,
                    AlignmentPositionGenerator(
                        Alignment([aln[0], aln[2]])))
                drawer.draw(args.draw, 2000, 2000)
                perf.msg("Image was drawn in {time} seconds.")
            # Save output_file
            Fasta.saveAlignmentPiece(aln, output_file_object)
def _classification(self, sequence_x, ann_x, sequence_y, ann_y):
    """Classify every column of a pairwise alignment.

    Each column gets a state: 0 = match (or double gap), 1 = gap in
    sequence_y, 2 = gap in sequence_x.  State-0 columns yield a constant
    0; state-1/2 columns are scored by the classifier.  Results are
    merged back into original column order by ``merge``.
    """
    def state(i):
        # Column state used for dispatching; a double-gap column is
        # treated like a match here (0).
        if sequence_x[i] == '-' and sequence_y[i] == '-':
            return 0
        if sequence_x[i] == '-':
            return 2
        if sequence_y[i] == '-':
            return 1
        return 0

    def get_pos():
        # Map each alignment column to (ungapped position in x,
        # ungapped position in y).
        def state(i):
            # NOTE: intentionally different from the outer state(): a
            # double-gap column returns -1 so that neither position
            # counter advances.
            if sequence_x[i] == '-' and sequence_y[i] == '-':
                return -1
            if sequence_x[i] == '-':
                return 2
            if sequence_y[i] == '-':
                return 1
            return 0
        pos_x, pos_y = 0, 0
        pos = list()
        for i in xrange(len(sequence_x)):
            pos.append((pos_x, pos_y))
            s = state(i)
            if s == 0:
                pos_x += 1
                pos_y += 1
            if s == 1:
                pos_x += 1
            if s == 2:
                pos_y += 1
        return pos

    assert len(sequence_y) == len(sequence_x)
    l = len(sequence_x)
    sequence_xs = Fasta.alnToSeq(sequence_x)
    sequence_ys = Fasta.alnToSeq(sequence_y)
    positions = get_pos()
    # One constant score per match column.
    ret_match = (
        0 for _ in filter(lambda x: state(x) == 0, xrange(l))
    )
    # Columns with a gap in y: classify x's position against y's context.
    ret_insertX = self.clf.multi_prepare_predict(
        (sequence_xs, positions[pos][0], ann_x,
         sequence_ys, positions[pos][1], ann_y)
        for pos in filter(lambda x: state(x) == 1, xrange(l))
    )
    # Columns with a gap in x: same, with the roles swapped.
    ret_insertY = self.clf.multi_prepare_predict(
        (sequence_ys, positions[pos][1], ann_y,
         sequence_xs, positions[pos][0], ann_x)
        for pos in filter(lambda x: state(x) == 2, xrange(l))
    )
    # merge() interleaves the three streams back into column order,
    # driven by the per-column state sequence.
    ret = merge((state(x) for x in xrange(l)),
                ret_match, ret_insertX, ret_insertY)
    return ret
def __init__(self, *args):
    """Load the test fixture: one alignment with two annotated sequences,
    kept in both aligned and converted plain form.
    """
    unittest.TestCase.__init__(self, *args)
    fname = "data/test_data/sequences/alignment.fa"
    dl = DataLoader()
    _, self.seq_x, self.ann_x, self.seq_y, self.ann_y = dl.loadSequence(
        fname)
    # alnToSeq presumably removes gap characters — confirm in Fasta.
    self.seq_xs = Fasta.alnToSeq(self.seq_x)
    self.seq_ys = Fasta.alnToSeq(self.seq_y)
def _classification(self, sequence_x, ann_x, sequence_y, ann_y):
    """Classify each alignment column (0 = match/double gap, 1 = gap in
    sequence_y, 2 = gap in sequence_x) and score gap columns with the
    classifier; results are merged back into column order.
    """
    def state(i):
        # Dispatch state; a double-gap column counts as a match (0) here.
        if sequence_x[i] == '-' and sequence_y[i] == '-':
            return 0
        if sequence_x[i] == '-':
            return 2
        if sequence_y[i] == '-':
            return 1
        return 0

    def get_pos():
        # For each column, the (ungapped x position, ungapped y position).
        def state(i):
            # NOTE: differs from the outer state(): -1 for a double gap
            # so neither counter advances.
            if sequence_x[i] == '-' and sequence_y[i] == '-':
                return -1
            if sequence_x[i] == '-':
                return 2
            if sequence_y[i] == '-':
                return 1
            return 0
        pos_x, pos_y = 0, 0
        pos = list()
        for i in xrange(len(sequence_x)):
            pos.append((pos_x, pos_y))
            s = state(i)
            if s == 0:
                pos_x += 1
                pos_y += 1
            if s == 1:
                pos_x += 1
            if s == 2:
                pos_y += 1
        return pos

    assert len(sequence_y) == len(sequence_x)
    l = len(sequence_x)
    sequence_xs = Fasta.alnToSeq(sequence_x)
    sequence_ys = Fasta.alnToSeq(sequence_y)
    positions = get_pos()
    # Constant score for match columns.
    ret_match = (0 for _ in filter(lambda x: state(x) == 0, xrange(l)))
    ret_insertX = self.clf.multi_prepare_predict(
        (sequence_xs, positions[pos][0], ann_x,
         sequence_ys, positions[pos][1], ann_y)
        for pos in filter(lambda x: state(x) == 1, xrange(l)))
    ret_insertY = self.clf.multi_prepare_predict(
        (sequence_ys, positions[pos][1], ann_y,
         sequence_xs, positions[pos][0], ann_x)
        for pos in filter(lambda x: state(x) == 2, xrange(l)))
    # Interleave the three result streams back into column order.
    ret = merge((state(x) for x in xrange(l)), ret_match, ret_insertX,
                ret_insertY)
    return ret
def create_fasta(fname, seq_basename, sequences, rename=False):
    """Write *sequences* (dicts with 'name' and 'sequence') to *fname* as
    FASTA and return the list of names used.

    With ``rename=True`` the records are renamed seq_basename1,
    seq_basename2, ... instead of keeping their own names.
    """
    if rename:
        labels = [seq_basename + str(index + 1)
                  for index in range(len(sequences))]
    else:
        labels = [record['name'] for record in sequences]
    assert (len(labels) == len(sequences))
    bodies = [record['sequence'].upper() for record in sequences]
    Fasta.save(zip(labels, bodies), fname)
    return labels
def create_fasta(fname, seq_basename, sequences, rename=False):
    """Save *sequences* (dicts with 'name'/'sequence' keys) to *fname* in
    FASTA format, upper-casing the sequence text, and return the names
    used.  With ``rename=True`` records get generated names
    seq_basename1, seq_basename2, ...
    """
    if rename:
        names = [seq_basename + str(i + 1) for i in range(len(sequences))]
    else:
        names = [s["name"] for s in sequences]
    assert len(names) == len(sequences)
    seq = [s["sequence"].upper() for s in sequences]
    Fasta.save(zip(names, seq), fname)
    return names
def main(input_files, output_file):
    """Draw the coordinate paths of several alignments into one image.

    Each existing input file contributes one polyline (one color per
    file); columns annotated 'R' (repeats) additionally get a small
    rectangle.  Image scale comes from the module-level ``width``.
    """
    global width
    # None for missing files so color indices stay aligned with inputs.
    alignments = [
        list(
            Fasta.load(name, '', Alignment,
                       ['^sequence1', '^sequence2', '^[av].*']))
        if os.path.exists(name) else None for name in input_files
    ]
    # Canvas size is derived from the first alignment's two sequences.
    x_len = len(Fasta.alnToSeq(alignments[0][0].sequences[0]))
    y_len = len(Fasta.alnToSeq(alignments[0][0].sequences[1]))
    I = Image.new('RGB', (x_len * width + 50, y_len * width + 50),
                  (255, 255, 255))
    D = ImageDraw.Draw(I)
    colors = [(0, 0, 0), (255, 0, 0), (0, 255, 0), (0, 0, 255),
              (255, 0, 255), (0, 255, 255)]
    i = -1
    for aln in alignments:
        i += 1
        if aln == None:
            continue
        aln = list(aln)
        if len(aln) == 0:
            continue
        aln = aln[0]
        try:
            annotation = aln.sequences[2]
            coords = aln.getCoordPairs()
            print coords
            # Small per-file offsets so overlapping lines stay visible.
            x_shift = width / 2 + 25 + i
            y_shift = width / 2 + 25 + i * 2
            D.line([(x * width + x_shift, y * width + y_shift)
                    for x, y, _ in coords], fill=colors[i])
            if annotation != None:
                for x, y, ind in coords:
                    if annotation[ind] != 'R':
                        continue
                    D.rectangle([(x * width + x_shift - width / 4,
                                  y * width + y_shift - width / 4),
                                 (x * width + x_shift + width / 4,
                                  y * width + y_shift + width / 4)],
                                outline=colors[i])
        # NOTE(review): errors for one input are silently skipped so the
        # remaining alignments still get drawn.
        except IndexError:
            pass
        except IOError:
            pass
    del D
    I.save(output_file)
def main(files_filename, output_filename, suffix, base_dir):
    """Concatenate 3-row alignment pieces listed in a JSON file into
    combined alignments.

    An empty string in the file list flushes the accumulated X/annotation/Y
    rows as one alignment piece.  Pieces whose name contains 'keep' (or
    whose realigned replacement is missing/empty) are taken verbatim with
    a '.'-filled annotation row.
    """
    X = ""
    Y = ""
    A = ""
    with Open(output_filename, 'w') as ff:
        files = json.load(Open(files_filename))
        total = len(files)
        done = 0
        for filename in files:
            if done % 100 == 0:
                print '{}/{} {:.2}%'.format(done, total,
                                            100.0 * done / total)
            if filename == "":
                # Flush the accumulated rows as one alignment piece.
                # NOTE(review): if "" is the first entry, X_name/Y_name/
                # A_name are not yet assigned -> NameError.
                Fasta.saveAlignmentPiece(
                    [(X_name, X), (Y_name, Y), (A_name, A)], ff)
                X = ""
                Y = ""
                A = ""
                continue
            done += 1
            old_filename = filename
            keep = False
            if filename.count('keep') == 0:
                # Swap in the realigned variant; assumes a 2-character
                # extension is being replaced — TODO confirm.
                filename = filename[:-2] + suffix
                if base_dir != None:
                    filename = base_dir + '/' + filename.split('/')[-1]
                try:
                    # Fall back to the original piece when the realigned
                    # file is empty.
                    with Open(filename, 'r') as f:
                        l = len(''.join(f).strip())
                        if l == 0:
                            filename = old_filename
                            keep = True
                except IOError:
                    # Fall back when the realigned file is missing.
                    filename = old_filename
                    keep = True
            if filename.count('keep') > 0:
                keep = True
            aln = list(Fasta.load(filename, ''))[0]
            assert (len(aln) == 3)
            assert (len(aln[0][1]) == len(aln[1][1]) == len(aln[2][1]))
            X += aln[0][1]
            if keep:
                # Kept pieces get a neutral '.' annotation row.
                A += '.' * len(aln[0][1])
            else:
                A += aln[1][1]
            Y += aln[2][1]
            X_name = aln[0][0]
            A_name = aln[1][0]
            Y_name = aln[2][0]
def main(files_filename, output_filename, suffix, base_dir):
    """Concatenate 3-row alignment pieces listed in a JSON file; an empty
    entry flushes the accumulated rows as one alignment.  Pieces marked
    'keep' (or whose realigned file is missing/empty) are copied verbatim
    with a '.'-filled annotation row.
    """
    X = ""
    Y = ""
    A = ""
    with Open(output_filename, "w") as ff:
        files = json.load(Open(files_filename))
        total = len(files)
        done = 0
        for filename in files:
            if done % 100 == 0:
                print "{}/{} {:.2}%".format(done, total,
                                            100.0 * done / total)
            if filename == "":
                # NOTE(review): an "" before any real piece leaves
                # X_name/Y_name/A_name unassigned -> NameError.
                Fasta.saveAlignmentPiece(
                    [(X_name, X), (Y_name, Y), (A_name, A)], ff)
                X = ""
                Y = ""
                A = ""
                continue
            done += 1
            old_filename = filename
            keep = False
            if filename.count("keep") == 0:
                # Assumes a 2-character extension is replaced by *suffix*.
                filename = filename[:-2] + suffix
                if base_dir != None:
                    filename = base_dir + "/" + filename.split("/")[-1]
                try:
                    with Open(filename, "r") as f:
                        l = len("".join(f).strip())
                        if l == 0:
                            filename = old_filename
                            keep = True
                except IOError:
                    filename = old_filename
                    keep = True
            if filename.count("keep") > 0:
                keep = True
            aln = list(Fasta.load(filename, ""))[0]
            assert len(aln) == 3
            assert len(aln[0][1]) == len(aln[1][1]) == len(aln[2][1])
            X += aln[0][1]
            if keep:
                A += "." * len(aln[0][1])
            else:
                A += aln[1][1]
            Y += aln[2][1]
            X_name = aln[0][0]
            A_name = aln[1][0]
            Y_name = aln[2][0]
def main(input_file, index1, index2, emissionOutput, transitionOutput):
    """Count emission and transition statistics over two selected rows of
    each alignment in *input_file* and dump both tables as JSON.
    """
    emissions = defaultdict(int)
    transitions = defaultdict(int)
    X = None
    Y = None

    def aggregate(X, Y):
        # Accumulate counts for one alignment into the outer dicts.
        pairs = zip(X, Y)
        for p in pairs:
            if skip(p):
                continue
            emissions[str(upper(p))] += 1
        # Transition counts over consecutive column types; double-gap
        # columns are dropped first.  getType/sortTypes are module
        # helpers — semantics not visible here.
        Types = list(sortTypes([getType(x) for x in pairs
                                if x != ('-', '-')]))
        for p in zip(Types, Types[1:]):
            transitions[str(p)] += 1

    for aln in Fasta.load(input_file, '[.][0-9]+$'):
        # Pick rows index1 and index2 by position.
        # NOTE(review): if an index is out of range, X/Y keep their value
        # from the previous alignment (or None on the first one).
        count = 0
        for _, sequence in aln:
            if count == index1:
                X = sequence
            if count == index2:
                Y = sequence
            count += 1
        aggregate(X, Y)
    with Open(emissionOutput, 'w') as f:
        json.dump(emissions, f, indent=4)
    with Open(transitionOutput, 'w') as f:
        json.dump(transitions, f, indent=4)
def main(arg):
    """Split each alignment into pieces whose length lies within
    [min_split_size, max_split_size] and write the non-empty pieces,
    one sequence per line, to ``arg.output``.
    """
    with open(arg.output, 'w') as f:
        for alignment_file in arg.alignment:
            alns = Fasta.load(alignment_file, arg.alignment_regexp,
                              Alignment,
                              sequence_selectors=arg.sequence_regexp)
            for aln in alns:
                l = len(aln.sequences[0])
                # Choose the piece size i whose leftover piece ("rest")
                # is largest, so the last piece is not degenerate.
                poss = []
                for i in range(arg.min_split_size,
                               arg.max_split_size + 1):
                    mod = l % i
                    rest = None
                    if mod >= arg.min_split_size:
                        rest = mod
                    if mod + i <= arg.max_split_size \
                            and mod + i >= arg.min_split_size:
                        rest = mod + i
                    if rest == None:
                        continue
                    rest = min(rest, i)
                    # Sort key: biggest rest first, then smallest i.
                    poss.append((-rest, i))
                poss.sort()
                _, best = poss[0]
                splits = [i for i in range(0, l, best)]
                # Merge the final short remainder into the last piece
                # when the combined length still fits.
                if best + l % best < arg.max_split_size:
                    splits.pop()
                splits.append(l)
                for fr, to in zip(splits, splits[1:]):
                    s1 = aln.sequences[0][fr:to]
                    s2 = aln.sequences[1][fr:to]
                    # Skip pieces where either sequence is all gaps.
                    if min(map(len, [s1.strip('-'), s2.strip('-')])) > 0:
                        f.write('{}.{}-{} {}\n'.format(
                            aln.names[0], fr, to, s1))
                        f.write('{}.{}-{} {}\n'.format(
                            aln.names[1], fr, to, s2))
def do_find_repeats(alignment_file, paramSeq, model, mathType, _stats,
                    separator):
    """Annotate repeats in every alignment of *alignment_file*.

    Runs TRF to collect candidate repeat consensuses per sequence, then
    runs the repeat-finding model restricted to those consensuses.
    Returns (annotations dict, aggregated statistics); statistics are
    only accumulated when ``_stats`` is not None.
    """
    modelParam = {
        "mathType": mathType,
        "modelFactory": model,
    }
    driver = TRFDriver()
    trf_repeats = driver.run(alignment_file, paramSeq)
    alignments = Fasta.load(alignment_file, separator, Alignment)
    D = dict()
    stats = defaultdict(int)
    count = 1;
    for alignment in alignments:
        print ("Annotating alignment {0}".format(count))
        count += 1
        # Candidate consensuses: union of TRF hits over all sequences in
        # this alignment, deduplicated.
        consensus_list = list(set([
            x.consensus for x in itertools.chain(*[
                trf_repeats[name] for name in alignment.names
            ])
        ]))
        repeats = find_repeats_in_alignment(alignment, consensus_list,
                                            modelParam)
        print repeats
        D.update(repeats)
        if _stats != None:
            s = compute_statistics(repeats)
            for key, value in s.iteritems():
                stats[key] += value
    return D, stats
def getSequences(self, fname, sequence_regexp=None):
    """Return the first two sequences of the first alignment in *fname*.

    ``sequence_regexp`` defaults to selecting rows named exactly
    'sequence1' and 'sequence2'; the selectors used are remembered on
    ``self.sequence_regexp``.  Raises ParseException when fewer than two
    sequences are found.
    """
    if sequence_regexp is None:
        sequence_regexp = ["^sequence1$", "^sequence2$"]
    self.sequence_regexp = sequence_regexp
    loader = Fasta.load(fname, '', Alignment, sequence_regexp)
    aln = next(loader)
    if aln is None or len(aln.sequences) < 2:
        raise ParseException('Not enough sequences in file\n')
    return aln.sequences[0], aln.sequences[1]
def getSequences(self, fname, sequence_regexp=None):
    """Return the first two sequences of the first alignment in *fname*,
    remembering the selectors on ``self.sequence_regexp``.  Raises
    ParseException when fewer than two sequences are found.
    """
    alignment_regexp = ''
    if sequence_regexp is None:
        # Default: rows named exactly 'sequence1' and 'sequence2'.
        sequence_regexp = ["^sequence1$", "^sequence2$"]
    self.sequence_regexp = sequence_regexp
    aln = next(
        Fasta.load(fname, alignment_regexp, Alignment, sequence_regexp)
    )
    if aln is None or len(aln.sequences) < 2:
        raise ParseException('Not enough sequences in file\n')
    seq1 = aln.sequences[0]
    seq2 = aln.sequences[1]
    return seq1, seq2
def prepare_training_data(
    self,
    sequence_x,
    annotations_x,
    sequence_y,
    annotations_y,
):
    """Take aligned sequences (with gaps) and prepare classifier
    training data.

    Builds positive examples from the alignment's matched positions and
    negative examples from the remaining positions, then concatenates
    the two sets component-wise (features, labels, weights).
    """
    assert len(sequence_y) == len(sequence_x)
    # Converted (alignment-row to plain sequence) versions, used
    # alongside the aligned ones.
    sequence_xs = Fasta.alnToSeq(sequence_x)
    sequence_ys = Fasta.alnToSeq(sequence_y)
    train_data1, matched_pos, seq_size, weights_set = \
        self.prepare_positive_data(
            sequence_x, sequence_xs, annotations_x,
            sequence_y, sequence_ys, annotations_y,
        )
    # Negative sampling reuses the positives' bookkeeping so the two
    # sets are comparable.
    train_data0 = self.prepare_negative_data(
        sequence_x, sequence_xs, annotations_x,
        sequence_y, sequence_ys, annotations_y,
        matched_pos, seq_size, weights_set,
    )
    return train_data1[0] + train_data0[0],\
        train_data1[1] + train_data0[1],\
        train_data1[2] + train_data0[2]
def main(input_files, output_file):
    """Render the coordinate paths of several alignments into one image:
    one colored polyline per existing input file, plus rectangles on
    columns annotated 'R'.  Scale comes from the module-level ``width``.
    """
    global width
    # Keep a None placeholder for missing files so color indices stay
    # aligned with the input order.
    alignments = [list(Fasta.load(name, '', Alignment,
                                  ['^sequence1', '^sequence2', '^[av].*']))
                  if os.path.exists(name) else None
                  for name in input_files]
    x_len = len(Fasta.alnToSeq(alignments[0][0].sequences[0]))
    y_len = len(Fasta.alnToSeq(alignments[0][0].sequences[1]))
    I = Image.new('RGB', (x_len * width + 50, y_len * width + 50),
                  (255, 255, 255))
    D = ImageDraw.Draw(I)
    colors = [(0, 0, 0), (255, 0, 0), (0, 255, 0), (0, 0, 255),
              (255, 0, 255), (0, 255, 255)]
    i = -1
    for aln in alignments:
        i += 1
        if aln == None:
            continue
        aln = list(aln)
        if len(aln) == 0:
            continue
        aln = aln[0]
        try:
            annotation = aln.sequences[2]
            coords = aln.getCoordPairs()
            print coords
            # Per-file offsets keep overlapping polylines distinguishable.
            x_shift = width / 2 + 25 + i
            y_shift = width / 2 + 25 + i * 2
            D.line([(x * width + x_shift, y * width + y_shift)
                    for x, y, _ in coords], fill=colors[i])
            if annotation != None:
                for x, y, ind in coords:
                    if annotation[ind] != 'R':
                        continue
                    D.rectangle([(x * width + x_shift - width / 4,
                                  y * width + y_shift - width / 4),
                                 (x * width + x_shift + width / 4,
                                  y * width + y_shift + width / 4)],
                                outline=colors[i])
        # NOTE(review): per-file failures are silently skipped.
        except IndexError:
            pass
        except IOError:
            pass
    del D
    I.save(output_file)
def main(input_file, output_file, trf):
    """Compute a repeat annotation track for *input_file* and dump it as
    JSON to *output_file*.

    ``trf`` is a list of candidate TRF executable paths; the first one
    that exists is used.  Raises RuntimeError when none exists
    (previously this fell through and crashed later with an obscure
    AttributeError on ``list.run``).
    """
    # THIS IS ONLY GENERATOR!!! — alignments are consumed lazily by
    # compute_annotation_track below.
    alns = (Alignment(a)
            for a in Fasta.load(input_file, '[.][0-9]+$', Alignment))
    # 1. run trf
    driver = None
    for trf_executable in trf:
        if os.path.exists(trf_executable):
            driver = TRFDriver(trf_executable)
            break
    if driver is None:
        raise RuntimeError(
            'no usable TRF executable found in {0}'.format(trf))
    repeats = driver.run(input_file)
    A = list(compute_annotation_track(alns, repeats))
    json.dump(A, Open(output_file, 'w'), indent=4)
def expectation_generator(args, model, alignment_filename, annotations):
    """Yield Baum-Welch expectation counts for every alignment in
    *alignment_filename*.

    For each alignment, builds repeat generators from the 'trf' and
    'original_repeats' annotation tracks (when present), attaches them
    to the model's 'Repeat' state (when present), and yields a dict with
    the alignment probability plus transition and emission counts.

    Raises ValueError when an alignment has fewer than two sequences.
    """
    for aln in Fasta.load(alignment_filename, args.alignment_regexp,
                          Alignment,
                          sequence_selectors=args.sequence_regexp):
        if len(aln.sequences) < 2:
            sys.stderr.write("ERROR: not enough sequences in file\n")
            # BUG FIX: the original raised a bare string, which is itself
            # a TypeError at runtime on Python >= 2.6; raise a real
            # exception instead.
            raise ValueError("ERROR: not enough sequences in file")
        seq1, seq2 = tuple(map(Fasta.alnToSeq, aln.sequences[:2]))
        # Restrict the dynamic programming to a beam around the input
        # alignment.
        positionGenerator = list(AlignmentBeamGenerator(aln,
                                                        args.beam_width))
        RX = RepeatGenerator(None, args.repeat_width)
        RY = RepeatGenerator(None, args.repeat_width)
        for rt in ['trf', 'original_repeats']:
            if rt in annotations:
                RX.addRepeats(annotations[rt][aln.names[0]])
                RY.addRepeats(annotations[rt][aln.names[1]])
        RX.buildRepeatDatabase()
        RY.buildRepeatDatabase()
        if 'Repeat' in model.statenameToID:
            model.states[model.statenameToID['Repeat']].addRepeatGenerator(
                RX, RY)
        (transitions, emissions), probability = model.getBaumWelchCounts(
            seq1, 0, len(seq1), seq2, 0, len(seq2),
            positionGenerator=positionGenerator)
        yield {
            "probability": probability,
            "transitions": transitions,
            "emissions": emissions,
        }
def main(input_file, realign_output, do_not_touch_output,
         list_of_files_output, max_length, wrap_length, min_seq_length):
    """Cut alignments into intervals around repeat blocks and partition
    the pieces into 'to realign' and 'do not touch' output files.

    The interval boundaries are chosen from the repeat-annotation row
    (row index 2) so that each piece is at most ``max_length`` long and
    repeat blocks are padded by ``wrap_length``.  The resulting file
    names (with '' separators between source alignments) are written as
    JSON to ``list_of_files_output``.
    """
    realign_counter = 0
    do_not_touch_counter = 0
    files = []
    for alignment in Fasta.load(input_file, '\.[0-9]*$'):
        if realign_counter % 100 == 0:
            print(realign_counter, do_not_touch_counter,
                  alignment[0][0])
        alignment_len = len(alignment[0][1])
        annotation = alignment[2][1]
        # !!! We expect that first block is not repeat
        # Column indices where the annotation changes, closed with the
        # total length; consecutive pairs are the annotation blocks.
        changes = [i for i in range(1, len(annotation))
                   if annotation[i - 1] != annotation[i]] + \
            [len(annotation)]
        Blocks = zip(changes, changes[1:]) + \
            [(len(annotation), len(annotation) + max_length + 10)]
        Blocks = [(0, Blocks[0][0])] + Blocks
        printed = 0
        block_start = 0  # None
        block_end = None
        intervals = []
        # Odd indices are (by the expectation above) the repeat blocks;
        # even ones the non-repeat blocks before them.
        for block_id in range(1, len(Blocks), 2):
            current = Blocks[block_id]
            previous = Blocks[block_id - 1]
            if block_start == None:
                startpp = max(printed, previous[0])
                if previous[1] - startpp > wrap_length:
                    intervals.append((printed, startpp))
                    printed = startpp
                    block_start = startpp
            else:
                # Add this block, or start a new one?
                if current[1] - block_start > max_length:
                    if previous[1] - previous[0] > wrap_length * 2:
                        # The gap is wide enough: keep wrap_length of
                        # context on each side and emit the middle part
                        # separately.
                        intervals.append(
                            (block_start, previous[0] + wrap_length))
                        intervals.append(
                            (previous[0] + wrap_length,
                             previous[1] - wrap_length))
                        printed = previous[1] - wrap_length
                        block_start = previous[1] - wrap_length
                    else:
                        # Narrow gap: just split it in the middle.
                        split = (previous[0] + previous[1]) / 2
                        intervals.append((block_start, split))
                        block_start = split
                        printed = split
        # Start a new one: flush the tail interval.
        intervals.append((printed, len(annotation)))
        # Intervals must exactly tile the annotation.
        assert (len(annotation) == sum([y - x for x, y in intervals]))
        for i in range(1, len(intervals)):
            assert (intervals[i - 1][1] == intervals[i][0])
        # t = list(range(0, alignment_len, max_length)) + [alignment_len]
        # intervals = zip(t, t[1:])
        for start, stop in intervals:
            if start >= len(annotation):
                continue
            if start == stop:
                continue
            assert (start < stop)
            ann = alignment[2][1][start:stop]
            output = None
            seq1 = alignment[0][1]
            seq2 = alignment[4][1]
            # Residue counts ignoring gap characters.
            seq1_len = len(seq1) - seq1.count('-') - seq1.count('.')
            seq2_len = len(seq2) - seq2.count('-') - seq2.count('.')
            # Pieces without repeats, all-repeat pieces, and pieces with
            # too-short sequences are left untouched.
            if ann.count('R') == 0 \
                    or min(seq1_len, seq2_len) < min_seq_length \
                    or ann.count('R') == len(ann):
                output = do_not_touch_output.format(
                    id=do_not_touch_counter)
                do_not_touch_counter += 1
            else:
                output = realign_output.format(id=realign_counter)
                realign_counter += 1
            files.append(output)
            aln = [
                (alignment[0][0], alignment[0][1][start:stop]),
                (alignment[2][0], alignment[2][1][start:stop]),
                (alignment[4][0], alignment[4][1][start:stop])
            ]
            # NOTE(review): the actual piece writing is disabled — only
            # the file names are recorded.
            # Fasta.save(aln, output, width=-1)
        files.append('');
    with Open(list_of_files_output, 'w') as f:
        json.dump(files, f, indent=4)
def main(inp, out, alignment_regexp, sequence_regexp, trf=trf_paths):
    """Run TRF on *inp*, project the repeats onto alignment columns and
    collect repeat-overlap statistics, dumped as JSON to *out*.

    Per repeat segment of the combined annotation, counts whether the
    repeat comes from X only ('RM'), Y only ('MR') or both ('RR').
    """
    # Use the first TRF executable that exists.
    for trf_executable in trf:
        if os.path.exists(trf_executable):
            trf = TRFDriver(trf_executable, mathType=float)
            break
    repeats = trf.run(inp)
    stats = defaultdict(int)
    for aln in Fasta.load(inp, alignment_regexp, Alignment,
                          sequence_selectors=sequence_regexp):
        X_index = 0
        Y_index = 1
        # Repeats translated from sequence coordinates to alignment
        # columns.
        X_trf = list(
            translate_repeat_to_annotation(repeats[aln.names[X_index]],
                                           aln.seq_to_aln[X_index]))
        Y_trf = list(
            translate_repeat_to_annotation(repeats[aln.names[Y_index]],
                                           aln.seq_to_aln[Y_index]))
        # Per-column annotations: per-sequence (X_ann/Y_ann) and the
        # union of both (B_ann); 'M' = no repeat, 'R' = repeat.
        X_ann = list("M" * len(aln.sequences[X_index]))
        Y_ann = list("M" * len(aln.sequences[Y_index]))
        B_ann = list("M" * len(aln.sequences[Y_index]))
        for repeat in X_trf:
            # Clamp repeats that run past the alignment end.
            if repeat.end >= len(X_ann):
                repeat.end = len(X_ann) - 1
            rlen = 1 + repeat.end - repeat.start
            X_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
            B_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
        for repeat in Y_trf:
            if repeat.end >= len(Y_ann):
                repeat.end = len(Y_ann) - 1
            rlen = 1 + repeat.end - repeat.start
            Y_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
            B_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
        assert (len(X_ann) == len(Y_ann) and len(B_ann) == len(Y_ann))
        M_count = len([x for x in B_ann if x == 'M'])
        R_count = len([x for x in B_ann if x == 'R'])
        # Count M->R transitions (padding with 'M' at both ends) =
        # number of contiguous repeat segments.
        R_segments_count = len([
            x for x in zip('M' + ''.join(B_ann), ''.join(B_ann) + 'M')
            if x[0] != 'R' and x[1] == 'R'
        ])
        stats['M_count'] += M_count
        stats['R_count'] += R_count
        stats['R_segment_count'] += R_segments_count
        # Indices where the padded annotation changes; consecutive pairs
        # delimit the repeat segments.
        changes = [
            i for i, x in zip(range(len(B_ann) + 1),
                              zip('M' + ''.join(B_ann),
                                  ''.join(B_ann) + 'M'))
            if x[0] != x[1]
        ]
        R_segments = [(changes[i], changes[i + 1])
                      for i in range(0,
                                     len(changes) - (len(changes) % 2),
                                     2)]
        assert (R_segments_count == len(R_segments))
        for start, stop in R_segments:
            # Classify each segment by which sequence(s) contribute
            # repeat columns to it.
            XX = 'M'
            YY = 'M'
            for i in range(start, stop):
                if X_ann[i] == 'R':
                    XX = 'R'
                if Y_ann[i] == 'R':
                    YY = 'R'
                assert (B_ann[i] == 'R')
            stats[XX + YY] += 1
    with Open(out, 'w') as f:
        json.dump(stats, f, indent=4)
def select_sequences(inp_filename, out_filename, sequences):
    """Copy only the rows selected by *sequences* from the first
    alignment in *inp_filename* into *out_filename* (FASTA).
    """
    loaded = Fasta.load(inp_filename, '', Alignment,
                        sequence_selectors=sequences)
    first = list(loaded)[0]
    pairs = zip(first.names, first.sequences)
    Fasta.save(pairs, out_filename)
def main(correct_file, aln_file, output_file, interval=None):
    """Compare predicted alignments against reference ("correct")
    alignments and write per-task JSON accuracy reports.

    Filenames are templates with an ``{id}`` placeholder; task ids come
    from the SGE array-job environment, or from ``interval``, or default
    to a single untemplated run.  Each task is evaluated three ways:
    as-is ('standard'), with repeats expanded and with repeats removed.
    """
    task_ids = [None]
    if os.environ.has_key('SGE_TASK_ID'):
        if os.environ['SGE_TASK_ID'] != 'undefined':
            sge_task_id = int(os.environ['SGE_TASK_ID'])
            if not os.environ.has_key('SGE_STEP_SIZE'):
                sge_step_size = 1
            else:
                sge_step_size = int(os.environ['SGE_STEP_SIZE'])
            sge_task_last = int(os.environ['SGE_TASK_LAST'])
            # This run covers [task_id, task_id + step), clipped to the
            # last task of the array job.
            task_ids = range(
                sge_task_id,
                min(sge_task_id + sge_step_size, sge_task_last + 1))
    if interval != None:
        task_ids = range(interval[0], interval[1] + 1)
    for task_id in task_ids:
        separator = ''
        output = {}
        for fun, tp in [(identity, 'standard'),
                        (expand_repeats, 'expanded_repeats'),
                        (remove_repeats, 'removed_repeats')]:
            try:
                for correct, alignment in zip(
                        Fasta.load(correct_file.format(id=task_id - 1),
                                   separator, Alignment),
                        Fasta.load(aln_file.format(id=task_id - 1),
                                   separator, Alignment)):
                    correct_len = len(correct.getCoordPairs(False))
                    # Total number of aligned residues (both sequences),
                    # i.e. twice the columns minus the gaps.
                    total_len = correct_len * 2 - \
                        correct.sequences[0].count('-') - \
                        correct.sequences[2].count('-')
                    ccc = fun(correct.getCoordPairs(False), correct)
                    if tp == 'removed_repeats':
                        # Recompute lengths over the filtered pairs only.
                        correct_len = len(ccc)
                        total_len = 0
                        for v1, _, v2 in ccc:
                            if v1 >= 0:
                                total_len += 1
                            if v2 >= 0:
                                total_len += 1
                    acc = alignment.getCoordPairs(False)
                    # Reduce (x, annotation, y) triples to (x, y) pairs.
                    cc = map(lambda x: (x[0], x[2]), ccc)
                    if len(acc[0]) == 3:
                        ac = map(lambda x: (x[0], x[2]), acc)
                    elif len(acc[0]) == 2:
                        ac = acc
                    else:
                        ac = None
                    c = set(cc)
                    a = set(ac)
                    intersect = c.intersection(a)
                    not_in_c = c.difference(a)
                    not_in_a = a.difference(c)
                    symm_diff = c.symmetric_difference(a)
                    # Score: residues (not columns) that are correctly
                    # aligned.
                    score = 0
                    for v1, v2 in intersect:
                        if v1 >= 0:
                            score += 1
                        if v2 >= 0:
                            score += 1
                    # Accuracy as a function of distance to the nearest
                    # repeat ('R') column of the reference annotation.
                    dists_correct = defaultdict(int)
                    dists_total = defaultdict(int)
                    position = dict()
                    dists = [99999999] * len(correct.sequences[1])
                    dst = 9999999
                    # NOTE(review): this loop rebinds 'a' (previously the
                    # predicted-pairs set); the set is no longer needed at
                    # this point, but the shadowing is fragile.
                    for x, a, y in ccc:
                        position[(x, y)] = a
                    # Forward then backward sweep gives the distance to
                    # the nearest 'R' on either side.
                    for i in range(len(correct.sequences[1])):
                        if correct.sequences[1][i] == 'R':
                            dst = 0
                        else:
                            dst += 1
                        dists[i] = min(dists[i], dst)
                    for i in reversed(range(len(correct.sequences[1]))):
                        if correct.sequences[1][i] == 'R':
                            dst = 0
                        else:
                            dst += 1
                        dists[i] = min(dists[i], dst)
                    for pos in c:
                        d = dists[position[pos]]
                        if d == 0:
                            continue
                        dists_total[d] += 1
                        if pos in ac:
                            dists_correct[d] += 1

                    def getRepeatAnnotation(coord, annotation):
                        # Residues lying in repeat columns, tagged by
                        # which sequence they belong to.
                        if len(coord[0]) != 3:
                            return set()
                        ret = set()
                        for x, a, y in coord:
                            if annotation[a] == 'R':
                                if x >= 0:
                                    ret.add((x, -1))
                                if y >= 0:
                                    ret.add((-1, y))
                        return ret

                    crann = getRepeatAnnotation(
                        correct.getCoordPairs(False),
                        correct.sequences[1])
                    arann = getRepeatAnnotation(
                        alignment.getCoordPairs(False),
                        alignment.sequences[1])

                    def getRepeatBlocks(coord, annotation):
                        # Maximal runs of repeat columns as
                        # ((x_start, x_end), (y_start, y_end)) ranges;
                        # -1 marks a side with no residues in the block.
                        if len(coord[0]) != 3:
                            return set()
                        ret = set()
                        x = set()
                        y = set()
                        for _x, a, _y in coord:
                            if annotation[a] == 'R':
                                if _x >= 0:
                                    x.add(_x)
                                if _y >= 0:
                                    y.add(_y)
                            else:
                                if len(x) + len(y) > 0:
                                    if len(x) == 0:
                                        x.add(-1)
                                    if len(y) == 0:
                                        y.add(-1)
                                    ret.add(((min(x), max(x) + 1),
                                             (min(y), max(y) + 1)))
                                    x = set()
                                    y = set()
                        # Flush a block that runs to the end.
                        if len(x) + len(y) > 0:
                            if len(x) == 0:
                                x.add(-1)
                            if len(y) == 0:
                                y.add(-1)
                            ret.add(((min(x), max(x) + 1),
                                     (min(y), max(y) + 1)))
                            x = set()
                            y = set()
                        return ret

                    cbann = getRepeatBlocks(correct.getCoordPairs(False),
                                            correct.sequences[1])
                    abann = getRepeatBlocks(alignment.getCoordPairs(False),
                                            alignment.sequences[1])

                    def dst(x1, x2):
                        # Length of one side of a block; -1 = no residues.
                        if x1 == -1:
                            return 0
                        return x2 - x1

                    def getPoints(s):
                        return sum([
                            dst(x1, x2) + dst(y1, y2)
                            for ((x1, x2), (y1, y2)) in s
                        ])

                    # Find long segments that are correctly aligned
                    cseg = [1 if x in c else 0 for x in ac]
                    seg_len = []
                    length = 0
                    segment_length_histogram = defaultdict(int)
                    for x in cseg:
                        if x == 0 and length != 0:
                            segment_length_histogram[length] += 1
                        # Running length of the current run of 1s.
                        length = length * x + x
                        seg_len.append(length)
                    if length > 0:
                        segment_length_histogram[length] += 1
                    # NOTE(review): getPoints is deliberately(?) rebound
                    # to len here, so the +Blk* values below count blocks
                    # rather than residues — confirm intent.
                    getPoints = len
                    # NOTE(review): key typos ('corect', 'c-lenght') are
                    # kept as-is; downstream consumers read these keys.
                    output[tp] = {
                        'corect': correct_file,
                        'alignment': aln_file,
                        'c-lenght': len(cc),
                        'a-length': len(ac),
                        'intersect': len(intersect),
                        '%correct': 100.0 - float(len(intersect) * 100) /
                        correct_len if correct_len > 0 else 100,
                        '+mistakes': len(intersect),
                        '+len': correct_len,
                        '+RepTP': len(crann & arann),
                        '+RepTN': total_len - len(crann | arann),
                        '+RepFP': len(arann - crann),
                        '+RepFN': len(crann - arann),
                        '+BlkTP': getPoints(cbann & abann),
                        '+BlkTN': 0,
                        '+BlkFP': getPoints(abann - cbann),
                        '+BlkFN': getPoints(cbann - abann),
                        '%score': float(score) * 100 / total_len
                        if total_len > 0 else 0,
                        'c-a': len(not_in_c),
                        'a-c': len(not_in_a),
                        'symmetric_difference': len(symm_diff),
                        'correct_len_histogram': segment_length_histogram,
                        '@+dists_correct': dists_correct,
                        '@+dists_total': dists_total,
                    }
                    # Drop percentages that would be meaningless.
                    if correct_len == 0:
                        del output[tp]['%correct']
                    if total_len == 0:
                        del output[tp]['%score']
            except IOError:
                # Missing input for this variant: skip it silently.
                pass
        with Open(output_file.format(id=task_id - 1), 'w') as f:
            json.dump(output, f, indent=4)
def main(input_file, output_file): for trf_executable in trf_paths: if os.path.exists(trf_executable): trf = TRFDriver(trf_executable) #break if not trf: raise "No trf found" repeats = trf.run(input_file) with open(output_file, 'w') as f: for alignment in Fasta.load(input_file, '\.[0-9]*$', Alignment): if len(alignment.sequences) != 2: print 'error' continue #print alignment.names annotation = list('.' * len(alignment.sequences[0])) annotationX = list('.' * len(alignment.sequences[0])) annotationY = list('.' * len(alignment.sequences[0])) trf = None for seq_name in alignment.names: index = None for i in range(len(alignment.names)): if seq_name == alignment.names[i]: index = i translator = alignment.seq_to_aln[index] revtranslator = alignment.aln_to_seq[index] for repeat in repeats[seq_name]: for i in range(translator[repeat.start], translator[repeat.end]): annotation[i] = 'R' j = i - translator[repeat.start] if index == 0: annotationX[i] = repeat.consensus[ revtranslator[j] % len(repeat.consensus)] else: annotationY[i] = repeat.consensus[ revtranslator[j] % len(repeat.consensus)] d = defaultdict(int) ll = 0 for v in annotation: if v != 'R': if ll > 0: d[ll] += 1 ll = 0 else: ll += 1 #for x, y in sorted(d.iteritems(), key=lambda x: x[1]): # print '{}: {}'.format(x, y) #if len(d.keys()) > 0: # print('Number of repeats: {}, average length: {}, maximum length: {}, minimum length: {}'.format( # sum(d.values()), # sum([x * y for x, y in d.iteritems()])/ max(sum(d.values()), 1), # max(d.keys()), # min(d.keys()) # )) seqX = alignment.sequences nm = alignment.names[0] aln = [ (alignment.names[0], alignment.sequences[0].replace('.', '-')), ('consensusX' + nm, ''.join(annotationX)), ('annotation' + nm, ''.join(annotation)), ('consensusY' + nm, ''.join(annotationY)), (alignment.names[1], alignment.sequences[1].replace('.', '-')) ] Fasta.saveAlignmentPiece(aln, f, -1)
def simulate(
    n,
    datadir='data/sequences/train_sequences/',
    fname='simulated_alignment',
):
    """Simulate three aligned sequences of length *n* with gene
    annotations and write them to *datadir*.

    A master gene state (Markov chain) is mutated independently into
    human/mouse/horse gene states; DNA characters are sampled, mutated
    with a gene-state-dependent coin and randomly deleted.  Outputs:
    a FASTA alignment, three BED annotation files and a JSON config.
    """
    s1name = "sequence1"
    s2name = "sequence2"
    s3name = "sequence3"
    annotation_name = 'gene'
    alignment_extension = ".fa"
    annotations_extension = ".bed"
    config_extension = ".js"
    # if len(sys.argv) > 1:
    #     n = int(sys.argv[1])
    # if len(sys.argv) > 2:
    #     fname = sys.argv[2]
    master_gene_sequence = MarkovChain(P_START_GENE, P_STOP_GENE)
    human_delete_sequence = MarkovChain(P_START_DELETE, P_STOP_DELETE)
    mouse_delete_sequence = MarkovChain(P_START_DELETE, P_STOP_DELETE)
    horse_delete_sequence = MarkovChain(P_START_DELETE, P_STOP_DELETE)
    mutator_coin = BiasedCoin(P_NOT_MUTATE_GENE)
    master_gene = list()
    human_gene = list()
    mouse_gene = list()
    horse_gene = list()
    human_dna = list()
    mouse_dna = list()
    horse_dna = list()
    for i in range(n):
        # create master_gene item
        g = g2 = g3 = g4 = master_gene_sequence.get_state()
        # mutate master_gene item
        if g:
            g2 = mutator_coin.flip()
            g3 = mutator_coin.flip()
            g4 = mutator_coin.flip()
        # Mutation rate depends on how many of the two gene states are on.
        dna_mutation_coin = create_dna_mutation_coin(g2 + g3)
        dna_mutation_coin2 = create_dna_mutation_coin(g2 + g4)
        # create DNA item
        c = c1 = c2 = c3 = DNA_CHARS[random.randint(0, 3)]
        # On mutation, pick a random character different from the
        # current one (the index-3 fallback avoids re-drawing).
        if not dna_mutation_coin.flip():
            char_index = random.randint(0, 2)
            if DNA_CHARS[char_index] == c1:
                char_index = 3
            c1 = DNA_CHARS[char_index]
        if not dna_mutation_coin.flip():
            char_index = random.randint(0, 2)
            if DNA_CHARS[char_index] == c2:
                char_index = 3
            c2 = DNA_CHARS[char_index]
        if not dna_mutation_coin2.flip():
            char_index = random.randint(0, 2)
            if DNA_CHARS[char_index] == c3:
                char_index = 3
            c3 = DNA_CHARS[char_index]
        # delete DNA item
        if human_delete_sequence.get_state():
            c1 = '-'
        if mouse_delete_sequence.get_state():
            c2 = '-'
        if horse_delete_sequence.get_state():
            c3 = '-'
        # add items to sequence
        master_gene.append(g)
        human_gene.append(g2)
        mouse_gene.append(g3)
        horse_gene.append(g4)
        human_dna.append(c1)
        mouse_dna.append(c2)
        horse_dna.append(c3)
    # output
    s1fname = os.path.join(
        datadir,
        fname + '_' + s1name + '_' + annotation_name + annotations_extension
    )
    if os.path.isfile(s1fname):
        os.remove(s1fname)
    s2fname = os.path.join(
        datadir,
        fname + '_' + s2name + '_' + annotation_name + annotations_extension
    )
    if os.path.isfile(s2fname):
        os.remove(s2fname)
    s3fname = os.path.join(
        datadir,
        fname + '_' + s3name + '_' + annotation_name + annotations_extension
    )
    if os.path.isfile(s3fname):
        os.remove(s3fname)
    intervals1 = sequence_to_intervals(
        get_sequence(human_gene, human_dna), annotation_name
    )
    intervals2 = sequence_to_intervals(
        get_sequence(mouse_gene, mouse_dna), annotation_name
    )
    intervals3 = sequence_to_intervals(
        get_sequence(horse_gene, horse_dna), annotation_name
    )
    annotations = Annotations()
    annotations.setAnnotations([annotation_name])
    annotations.addSequences([s1name, s2name, s3name])
    annotations.addAnnotationFile(s1name, annotation_name, s1fname)
    annotations.addAnnotationFile(s2name, annotation_name, s2fname)
    annotations.addAnnotationFile(s3name, annotation_name, s3fname)
    Fasta.save(
        [
            (s1name, ''.join(human_dna)),
            (s2name, ''.join(mouse_dna)),
            (s3name, ''.join(horse_dna))
        ],
        os.path.join(datadir, fname + alignment_extension)
    )
    with track.new(s1fname, 'bed') as t:
        t.fields = ['start', 'end', 'name']
        t.write("chr1", intervals1)
    with track.new(s2fname, 'bed') as t:
        t.fields = ['start', 'end', 'name']
        t.write("chr1", intervals2)
    with track.new(s3fname, 'bed') as t:
        t.fields = ['start', 'end', 'name']
        t.write("chr1", intervals3)
    with Open(os.path.join(datadir, fname + config_extension), "w") as f:
        json.dump(annotations.toJSON(), f)
def main(correct_file, aln_file, output_file, interval=None):
    """Compare predicted alignments against reference alignments and dump
    per-task JSON statistics.

    For every task id (taken from SGE array-job environment variables, or
    from ``interval``), loads the pair of FASTA alignment files
    ``correct_file.format(id=task_id - 1)`` / ``aln_file.format(...)`` and
    computes accuracy metrics for three variants of the reference
    ('standard', 'expanded_repeats', 'removed_repeats'), writing the result
    to ``output_file.format(id=task_id - 1)`` as JSON.

    NOTE(review): when neither SGE variables nor ``interval`` are set,
    task_ids is ``[None]`` and ``task_id - 1`` below would raise TypeError
    — presumably this entry point is always run under SGE or with an
    explicit interval; confirm with callers.
    """
    task_ids = [None]
    # os.environ.has_key: Python 2 idiom (removed in Python 3).
    if os.environ.has_key('SGE_TASK_ID'):
        if os.environ['SGE_TASK_ID'] != 'undefined':
            sge_task_id = int(os.environ['SGE_TASK_ID'])
            if not os.environ.has_key('SGE_STEP_SIZE'):
                sge_step_size = 1
            else:
                sge_step_size = int(os.environ['SGE_STEP_SIZE'])
            sge_task_last = int(os.environ['SGE_TASK_LAST'])
            # This process handles [task_id, task_id + step) clipped to the
            # last task of the array job.
            task_ids = range(
                sge_task_id,
                min(sge_task_id + sge_step_size, sge_task_last + 1)
            )
    # Explicit interval overrides anything derived from SGE.
    if interval != None:
        task_ids = range(interval[0], interval[1] + 1)
    for task_id in task_ids:
        separator = ''
        output = {}
        # Each transform produces one named metric set in the output JSON.
        for fun, tp in [(identity, 'standard'),
                        (expand_repeats, 'expanded_repeats'),
                        (remove_repeats, 'removed_repeats')]:
            try:
                for correct, alignment in zip(
                        Fasta.load(correct_file.format(id=task_id - 1),
                                   separator, Alignment),
                        Fasta.load(aln_file.format(id=task_id - 1),
                                   separator, Alignment)):
                    # correct_len: number of aligned coordinate pairs;
                    # total_len: residues on both sequences (gaps excluded).
                    correct_len = len(correct.getCoordPairs(False))
                    total_len = correct_len * 2 - correct.sequences[0].count('-') - correct.sequences[2].count('-')
                    ccc = fun(correct.getCoordPairs(False), correct)
                    if tp == 'removed_repeats':
                        # Repeat columns were dropped, so recount lengths
                        # from the filtered coordinate triples.
                        correct_len = len(ccc)
                        total_len = 0
                        for v1, _, v2 in ccc:
                            if v1 >= 0:
                                total_len += 1
                            if v2 >= 0:
                                total_len += 1
                    acc = alignment.getCoordPairs(False)
                    # Reduce (x, annotation, y) triples to (x, y) pairs.
                    cc = map(lambda x: (x[0], x[2]), ccc)
                    if len(acc[0]) == 3:
                        ac = map(lambda x: (x[0], x[2]), acc)
                    elif len(acc[0]) ==2:
                        ac = acc
                    else:
                        ac = None
                    c = set(cc)
                    a = set(ac)
                    intersect = c.intersection(a)
                    not_in_c = c.difference(a)
                    not_in_a = a.difference(c)
                    symm_diff = c.symmetric_difference(a)
                    # Score counts residues (not pairs) that are correctly
                    # aligned; -1 coordinates mark gap positions.
                    score = 0
                    for v1, v2 in intersect:
                        if v1 >= 0:
                            score += 1
                        if v2 >= 0:
                            score += 1
                    # Accuracy bucketed by distance to the nearest repeat
                    # ('R') column of the reference annotation row.
                    dists_correct = defaultdict(int)
                    dists_total = defaultdict(int)
                    position = dict()
                    dists = [99999999] * len(correct.sequences[1])
                    dst = 9999999
                    # NOTE(review): this loop rebinds ``a`` (previously the
                    # set of predicted pairs) to the annotation index; the
                    # set is no longer needed at this point.
                    for x, a, y in ccc:
                        position[(x,y)] = a
                    # Forward then backward sweep -> distance to nearest 'R'.
                    # NOTE(review): ``dst`` is not reset between the sweeps.
                    for i in range(len(correct.sequences[1])):
                        if correct.sequences[1][i] == 'R':
                            dst = 0
                        else:
                            dst += 1
                        dists[i] = min(dists[i], dst)
                    for i in reversed(range(len(correct.sequences[1]))):
                        if correct.sequences[1][i] == 'R':
                            dst = 0
                        else:
                            dst += 1
                        dists[i] = min(dists[i], dst)
                    for pos in c:
                        d = dists[position[pos]]
                        if d == 0:
                            continue
                        dists_total[d] += 1
                        if pos in ac:
                            dists_correct[d] += 1

                    def getRepeatAnnotation(coord, annotation):
                        # Residues lying in 'R' (repeat) columns, tagged by
                        # which sequence they belong to via the -1 slot.
                        if len(coord[0]) != 3:
                            return set()
                        ret = set()
                        for x, a, y in coord:
                            if annotation[a] == 'R':
                                if x >= 0:
                                    ret.add((x, -1))
                                if y >= 0:
                                    ret.add((-1, y))
                        return ret
                    crann = getRepeatAnnotation(correct.getCoordPairs(False),
                                                correct.sequences[1])
                    arann = getRepeatAnnotation(alignment.getCoordPairs(False),
                                                alignment.sequences[1])

                    def getRepeatBlocks(coord, annotation):
                        # Maximal runs of 'R' columns, reported as
                        # ((x_start, x_end), (y_start, y_end)) half-open
                        # ranges; -1 marks a side with no residues.
                        if len(coord[0]) != 3:
                            return set()
                        ret = set()
                        x = set()
                        y = set()
                        for _x, a, _y in coord:
                            if annotation[a] == 'R':
                                if _x >= 0:
                                    x.add(_x)
                                if _y >= 0:
                                    y.add(_y)
                            else:
                                if len(x) + len(y) > 0:
                                    if len(x) == 0:
                                        x.add(-1)
                                    if len(y) == 0:
                                        y.add(-1)
                                    ret.add(((min(x), max(x) + 1),
                                             (min(y), max(y) + 1)))
                                    x = set()
                                    y = set()
                        # Flush a block that runs to the end of the alignment.
                        if len(x) + len(y) > 0:
                            if len(x) == 0:
                                x.add(-1)
                            if len(y) == 0:
                                y.add(-1)
                            ret.add(((min(x), max(x) + 1),
                                     (min(y), max(y) + 1)))
                            x = set()
                            y = set()
                        return ret
                    cbann = getRepeatBlocks(correct.getCoordPairs(False),
                                            correct.sequences[1])
                    abann = getRepeatBlocks(alignment.getCoordPairs(False),
                                            alignment.sequences[1])

                    def dst(x1, x2):
                        # Length of one side of a block; -1 means empty side.
                        if x1 == -1:
                            return 0
                        return x2 - x1

                    def getPoints(s):
                        # Total residues covered by a set of blocks.
                        return sum([dst(x1,x2) + dst(y1,y2)
                                    for ((x1, x2), (y1, y2)) in s])

                    # Find long segments that are correctly aligned
                    cseg = [1 if x in c else 0 for x in ac]
                    seg_len = []
                    length = 0
                    segment_length_histogram = defaultdict(int)
                    for x in cseg:
                        if x == 0 and length != 0:
                            segment_length_histogram[length] += 1
                        # length * x + x: resets to 0 on a miss, increments
                        # on a hit.
                        length = length * x + x
                        seg_len.append(length)
                    if length > 0:
                        segment_length_histogram[length] += 1
                    # NOTE(review): this rebinding makes the +Blk* metrics
                    # below count whole blocks via len() instead of residue
                    # points — it looks like a deliberate override of the
                    # getPoints helper defined above; confirm intent.
                    getPoints = len
                    # NOTE(review): 'corect' and 'c-lenght' are misspelled
                    # but are runtime JSON keys consumed downstream — left
                    # untouched on purpose.
                    output[tp] = {
                        'corect': correct_file,
                        'alignment': aln_file,
                        'c-lenght': len(cc),
                        'a-length': len(ac),
                        'intersect': len(intersect),
                        '%correct': 100.0 - float(len(intersect) * 100) / correct_len if correct_len > 0 else 100,
                        '+mistakes': len(intersect),
                        '+len': correct_len,
                        '+RepTP': len(crann & arann),
                        '+RepTN': total_len - len(crann | arann),
                        '+RepFP': len(arann - crann),
                        '+RepFN': len(crann - arann),
                        '+BlkTP': getPoints(cbann & abann),
                        '+BlkTN': 0,
                        '+BlkFP': getPoints(abann - cbann),
                        '+BlkFN': getPoints(cbann - abann),
                        '%score': float(score) * 100 / total_len if total_len > 0 else 0,
                        'c-a': len(not_in_c),
                        'a-c': len(not_in_a),
                        'symmetric_difference': len(symm_diff),
                        'correct_len_histogram': segment_length_histogram,
                        '@+dists_correct': dists_correct,
                        '@+dists_total': dists_total,
                    }
                    # Percentages are meaningless for empty references.
                    if correct_len == 0:
                        del output[tp]['%correct']
                    if total_len == 0:
                        del output[tp]['%score']
            except IOError:
                # Missing input file for this task/variant: skip silently
                # (best-effort over an array job's partial outputs).
                pass
        with Open(output_file.format(id=task_id - 1), 'w') as f:
            json.dump(output, f, indent=4)
def main(n, datadir='data/train_sequences/', fname='simulated_alignment'):
    """Simulate a pairwise alignment (third species generated but not saved)
    and write FASTA + BED + JSON-config outputs under ``datadir``.

    Differs from the three-species ``simulate`` variant: the horse sequence
    is still simulated, but its annotation file and FASTA record are
    commented out, so only sequence1/sequence2 are persisted.

    NOTE(review): ``n`` and ``fname`` are overridden from ``sys.argv`` when
    extra CLI arguments are present — the function parameters act only as
    defaults for script use.
    """
    s1name = "sequence1"
    s2name = "sequence2"
    s3name = "sequence3"
    annotation_name = 'gene'
    alignment_extension = ".fa"
    annotations_extension = ".bed"
    config_extension = ".js"
    if len(sys.argv) > 1:
        n = int(sys.argv[1])
    if len(sys.argv) > 2:
        fname = sys.argv[2]
    master_gene_sequence = MarkovChain(P_START_GENE, P_STOP_GENE)
    human_delete_sequence = MarkovChain(P_START_DELETE, P_STOP_DELETE)
    mouse_delete_sequence = MarkovChain(P_START_DELETE, P_STOP_DELETE)
    horse_delete_sequence = MarkovChain(P_START_DELETE, P_STOP_DELETE)
    mutator_coin = BiasedCoin(P_NOT_MUTATE_GENE)
    master_gene = list()
    human_gene = list()
    mouse_gene = list()
    horse_gene = list()
    human_dna = list()
    mouse_dna = list()
    horse_dna = list()
    for i in range(n):
        # create master_gene item
        g = g2 = g3 = g4 = master_gene_sequence.get_state()
        # mutate master_gene item: each species keeps the gene state with
        # probability P_NOT_MUTATE_GENE
        if g:
            g2 = mutator_coin.flip()
            g3 = mutator_coin.flip()
            g4 = mutator_coin.flip()
        dna_mutation_coin = create_dna_mutation_coin(g2 + g3)
        dna_mutation_coin2 = create_dna_mutation_coin(g2 + g4)
        # create DNA item (as indices into DNA_CHARS, then converted)
        c = c2 = c3 = random.randint(0, 3)
        c2 = mutate(c2, g2 + g3)
        # NOTE(review): the comprehension variable ``i`` shadows the loop
        # index; harmless in Python 2 because the loop reassigns ``i`` on
        # the next iteration, but worth renaming eventually.
        c, c2, c3 = [DNA_CHARS[i] for i in (c, c2, c3)]
        # On a failed coin flip, substitute a different character
        # (char_index 3 is the fallback when the draw equals the current
        # character).
        if not dna_mutation_coin.flip():
            char_index = random.randint(0, 2)
            if DNA_CHARS[char_index] == c2:
                char_index = 3
            c2 = DNA_CHARS[char_index]
        if not dna_mutation_coin2.flip():
            char_index = random.randint(0, 2)
            if DNA_CHARS[char_index] == c3:
                char_index = 3
            c3 = DNA_CHARS[char_index]
        # delete DNA item: independent deletion chains per species
        if human_delete_sequence.get_state():
            c = '-'
        if mouse_delete_sequence.get_state():
            c2 = '-'
        if horse_delete_sequence.get_state():
            c3 = '-'
        # add items to sequence
        master_gene.append(g)
        human_gene.append(g2)
        mouse_gene.append(g3)
        horse_gene.append(g4)
        human_dna.append(c)
        mouse_dna.append(c2)
        horse_dna.append(c3)
    # output: remove stale BED files first, since track.new creates anew
    s1fname = os.path.join(
        datadir,
        fname + '_' + s1name + '_' + annotation_name + annotations_extension)
    if os.path.isfile(s1fname):
        os.remove(s1fname)
    s2fname = os.path.join(
        datadir,
        fname + '_' + s2name + '_' + annotation_name + annotations_extension)
    if os.path.isfile(s2fname):
        os.remove(s2fname)
    s3fname = os.path.join(
        datadir,
        fname + '_' + s3name + '_' + annotation_name + annotations_extension)
    if os.path.isfile(s3fname):
        os.remove(s3fname)
    # Convert gene 0/1 tracks into (start, end, name) intervals
    intervals1 = sequence_to_intervals(get_sequence(human_gene, human_dna),
                                       annotation_name)
    intervals2 = sequence_to_intervals(get_sequence(mouse_gene, mouse_dna),
                                       annotation_name)
    intervals3 = sequence_to_intervals(get_sequence(horse_gene, horse_dna),
                                       annotation_name)
    annotations = Annotations()
    annotations.setAnnotations([annotation_name])
    annotations.addSequences([s1name, s2name, s3name])
    annotations.addAnnotationFile(s1name, annotation_name, s1fname)
    annotations.addAnnotationFile(s2name, annotation_name, s2fname)
    # Third species intentionally disabled (pairwise output only):
    # annotations.addAnnotationFile(s3name, annotation_name, s3fname)
    Fasta.save(
        [
            (s1name, ''.join(human_dna)),
            (s2name, ''.join(mouse_dna)),
            # (s3name, ''.join(horse_dna))
        ],
        os.path.join(datadir, fname + alignment_extension))
    with track.new(s1fname, 'bed') as t:
        t.fields = ['start', 'end', 'name']
        t.write("chr1", intervals1)
    with track.new(s2fname, 'bed') as t:
        t.fields = ['start', 'end', 'name']
        t.write("chr1", intervals2)
    # with track.new(s3fname, 'bed') as t:
    #     t.fields = ['start', 'end', 'name']
    #     t.write("chr1", intervals3)
    with Open(os.path.join(datadir, fname + config_extension), "w") as f:
        json.dump(annotations.toJSON(), f)
def select_sequences(inp_filename, out_filename, sequences):
    """Copy selected sequences of the first alignment into a new FASTA file.

    Loads alignments from *inp_filename*, keeping only the sequences whose
    names match the *sequences* selectors, takes the first alignment, and
    writes its (name, sequence) pairs to *out_filename*.
    """
    loaded = Fasta.load(
        inp_filename, '', Alignment, sequence_selectors=sequences)
    first_alignment = list(loaded)[0]
    name_seq_pairs = zip(first_alignment.names, first_alignment.sequences)
    Fasta.save(name_seq_pairs, out_filename)
def prepare_training_data(
    self,
    sequence_x,
    annotations_x,
    sequence_y,
    annotations_y,
):
    """Build classifier training data from one aligned sequence pair.

    Walks the alignment column by column, collecting a positive example
    (label 1) for every position where the reference has a base but the
    other ("space") sequence has a gap, then samples an equal number of
    match positions as negatives (label 0).  Returns a 3-tuple of parallel
    lists: (feature vectors, labels, weights).

    Which input plays the reference role is chosen by
    ``self.insert_sequence`` (0 -> x is the reference, otherwise y).
    Feature extraction is delegated to ``self.prepare_data``.
    """
    train_data = (list(), list(), list())
    if self.insert_sequence == 0:
        reference = sequence_x
        annotations_r = annotations_x
        space = sequence_y
        annotations_s = annotations_y
    else:
        reference = sequence_y
        annotations_r = annotations_y
        space = sequence_x
        annotations_s = annotations_x
    # Ungapped versions for position-based feature extraction.
    sequence_rs = Fasta.alnToSeq(reference)
    sequence_ss = Fasta.alnToSeq(space)
    pos_s, pos_r = 0, 0
    match_pos = set()
    for i in range(len(space)):
        br, bs = reference[i], space[i]
        if bs != '-':
            # Space sequence has a base here; record a match position when
            # the reference does too, and advance the space coordinate.
            if br != '-':
                match_pos.add((pos_r, pos_s))
                pos_r += 1
            pos_s += 1
            continue
        if br == '-':
            # Gap in both rows: nothing to learn from this column.
            continue
        # Reference base aligned against a gap -> positive example.
        d = self.prepare_data(
            sequence_rs, pos_r, annotations_r,
            sequence_ss, pos_s, annotations_s,
            0,
        )
        if d is not None:
            train_data[0].append(d)
            train_data[1].append(1)
            train_data[2].append(1.0)
        pos_r += 1
    # Balance the set: as many sampled match positions (label 0) as
    # positives collected above.
    matches = sample(match_pos, len(train_data[0]))
    for x, y in matches:
        d = self.prepare_data(
            sequence_rs, x, annotations_r,
            sequence_ss, y, annotations_s,
            0,
        )
        train_data[0].append(d)
        train_data[1].append(0)
        train_data[2].append(1.0)
    return train_data
def main(inp, out, alignment_regexp, sequence_regexp, trf=trf_paths):
    """Annotate alignments with TRF tandem repeats and dump repeat stats.

    Runs the first existing TRF executable from ``trf`` (a list of candidate
    paths, defaulting to the module-level ``trf_paths``) on ``inp``, marks
    repeat columns in both sequences of each alignment, and accumulates
    counts (M/R column totals, repeat-segment counts, and per-segment
    X/Y-side classification) into a JSON file at ``out``.

    Raises RuntimeError when none of the candidate executables exists
    (previously this fell through and crashed with AttributeError on
    ``list.run``).
    """
    driver = None
    for trf_executable in trf:
        if os.path.exists(trf_executable):
            driver = TRFDriver(trf_executable, mathType=float)
            break
    if driver is None:
        # Fail loudly instead of calling .run() on the path list.
        raise RuntimeError('No TRF executable found among: %s' % (trf,))
    repeats = driver.run(inp)
    stats = defaultdict(int)
    for aln in Fasta.load(
            inp, alignment_regexp, Alignment,
            sequence_selectors=sequence_regexp):
        X_index = 0
        Y_index = 1
        # Map repeat coordinates from sequence space to alignment columns.
        X_trf = list(translate_repeat_to_annotation(
            repeats[aln.names[X_index]], aln.seq_to_aln[X_index]))
        Y_trf = list(translate_repeat_to_annotation(
            repeats[aln.names[Y_index]], aln.seq_to_aln[Y_index]))
        # Per-column annotations: 'M' = match/non-repeat, 'R' = repeat.
        # B_ann is the union of both sides.
        X_ann = list("M" * len(aln.sequences[X_index]))
        Y_ann = list("M" * len(aln.sequences[Y_index]))
        B_ann = list("M" * len(aln.sequences[Y_index]))
        for repeat in X_trf:
            # Clamp repeats that run past the alignment end.
            if repeat.end >= len(X_ann):
                repeat.end = len(X_ann) - 1
            rlen = 1 + repeat.end - repeat.start
            X_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
            B_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
        for repeat in Y_trf:
            if repeat.end >= len(Y_ann):
                repeat.end = len(Y_ann) - 1
            rlen = 1 + repeat.end - repeat.start
            Y_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
            B_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
        assert(len(X_ann) == len(Y_ann) and len(B_ann) == len(Y_ann))
        # Hoisted: the joined union annotation is reused several times.
        b_str = ''.join(B_ann)
        M_count = len([x for x in B_ann if x == 'M'])
        R_count = len([x for x in B_ann if x == 'R'])
        # Count non-R -> R transitions (pad with 'M' at both ends).
        R_segments_count = len([x for x in zip('M' + b_str, b_str + 'M')
                                if x[0] != 'R' and x[1] == 'R'])
        stats['M_count'] += M_count
        stats['R_count'] += R_count
        stats['R_segment_count'] += R_segments_count
        # Boundaries where the annotation changes; consecutive pairs of
        # boundaries delimit maximal runs of 'R'.
        changes = [i for i, x in zip(
            range(len(B_ann) + 1),
            zip('M' + b_str, b_str + 'M')) if x[0] != x[1]]
        R_segments = [(changes[i], changes[i + 1])
                      for i in range(0, len(changes) - (len(changes) % 2), 2)]
        assert(R_segments_count == len(R_segments))
        # Classify each repeat segment by which side(s) carry the repeat:
        # 'RR', 'RM', 'MR' counters in stats.
        for start, stop in R_segments:
            XX = 'M'
            YY = 'M'
            for i in range(start, stop):
                if X_ann[i] == 'R':
                    XX = 'R'
                if Y_ann[i] == 'R':
                    YY = 'R'
                assert(B_ann[i] == 'R')
            stats[XX + YY] += 1
    with Open(out, 'w') as f:
        json.dump(stats, f, indent=4)
def main():
    """Sample pairwise alignments (with tandem-repeat annotations) from a
    pair HMM and write them as FASTA + ``.repeats`` JSON files.

    Returns 0 on success, 1 on argument-validation failure.

    Fixes over the previous revision:
      - the ``--bind_file`` loop iterated ``range(0, len(bind_constant), 2)``
        while indexing ``bind_file`` — bindings were silently dropped or
        raised IndexError; it now uses ``len(bind_file)``;
      - the template-validation error message referred to ``"%d"`` although
        the placeholder actually checked is ``"{id}"``;
      - ``os.makedirs('')`` no longer crashes when the output template has
        no directory component.
    """
    parser = argparse.ArgumentParser(description='Sample alignments.')
    parser.add_argument('output_file_template', type=str,
                        help="Template for output file. Have to contain " +
                        "string '{id}' as placeholder for sequence number.")
    parser.add_argument('--output_files', type=str,
                        help="File where the " +
                        'list of output files will be written.', default='-')
    parser.add_argument('--model', type=str,
                        default='data/models/repeatHMM.js', help="Model file")
    parser.add_argument('--bind_file', nargs='*', help='Replace filenames in ' +
                        'the input_file model.', default=[])
    parser.add_argument('--bind_constant', nargs='*', help='Replace constants' +
                        ' in the input_file model.', default=[])
    parser.add_argument('--bind_constant_file', nargs='*', help='Replace' +
                        ' constants in the input_file model.', default=[])
    parser.add_argument('n_samples', type=int, help='Number of samples.')
    parser.add_argument('seq1_length', type=int,
                        help='Length of first sequence.')
    parser.add_argument('seq2_length', type=int,
                        help='Length of second sequence.')
    parsed_arg = parser.parse_args()

    # ====== Validate input parameters =======================================
    if parsed_arg.output_file_template.count("{id}") < 1:
        sys.stderr.write('ERROR: If sampling, output_file filename has to ' +
                         'contain at least one "{id}".\n')
        return 1
    # Bind options are flat key/value lists, so they must pair up.
    if len(parsed_arg.bind_file) % 2 != 0:
        sys.stderr.write('ERROR: If binding files, the number of arguments has' +
                         'to be divisible by 2\n')
        return 1
    if len(parsed_arg.bind_constant_file) % 2 != 0:
        sys.stderr.write('ERROR: If binding constants (as files), the number of' +
                         ' arguments has to be divisible by 2\n')
        return 1
    if len(parsed_arg.bind_constant) % 2 != 0:
        sys.stderr.write('ERROR: If binding constants, the number of' +
                         ' arguments has to be divisible by 2\n')
        return 1

    # ====== Parse parameters ================================================
    output_filename = parsed_arg.output_file_template
    output_files_filename = parsed_arg.output_files
    output_files = list()

    # ====== Load model ======================================================
    loader = HMMLoader()
    # BUGFIX: iterate over bind_file's own length (was bind_constant's).
    for i in range(0, len(parsed_arg.bind_file), 2):
        loader.addFile(parsed_arg.bind_file[i], parsed_arg.bind_file[i + 1])
    for i in range(0, len(parsed_arg.bind_constant_file), 2):
        loader.addConstant(
            parsed_arg.bind_constant_file[i],
            loader.load(parsed_arg.bind_constant_file[i + 1])
        )
    for i in range(0, len(parsed_arg.bind_constant), 2):
        loader.addConstant(
            parsed_arg.bind_constant[i],
            loader.loads(parsed_arg.bind_constant[i + 1]),
        )
    model_filename = parsed_arg.model
    PHMM = loader.load(model_filename)["model"]

    # ====== Sample ==========================================================
    PHMM.buildSampleTransitions()
    n_samples = parsed_arg.n_samples
    X_len = parsed_arg.seq1_length
    Y_len = parsed_arg.seq2_length
    dirname = os.path.dirname(output_filename)
    # Guard: dirname is '' for bare filenames; makedirs('') would raise.
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)
    for i in range(n_samples):
        done = False
        # Resample until at least one tandem repeat was emitted.
        while not done:
            tandemRepeats = {'sequence1': [], 'sequence2': []}
            sampled = PHMM.generateSequence((X_len, Y_len))
            X = ""
            Y = ""
            A = ""
            # Each item is (emission, state); emission is (x, y) or
            # (x, y, annotation_data) for repeat states.
            for (emission, state) in sampled:
                ann_data = None
                if len(emission) == 2:
                    x, y = emission
                else:
                    x, y, ann_data = emission
                dx, dy = len(x), len(y)
                if ann_data is not None:
                    xlen = len(X.replace('-', ''))
                    ylen = len(Y.replace('-', ''))
                    # Record (start, end, repetitions, consensus, text) per
                    # sequence.  NOTE: dx / ann_data[1] relies on the
                    # original (integer) division semantics.
                    if dx > 0:
                        tandemRepeats['sequence1'].append((
                            xlen, xlen + dx, dx / ann_data[1], ann_data[0], x
                        ))
                        done = True
                    if dy > 0:
                        tandemRepeats['sequence2'].append((
                            ylen, ylen + dy, dy / ann_data[2], ann_data[0], y
                        ))
                        done = True
                # Pad the shorter emission with gaps so rows stay aligned.
                A += PHMM.states[state].getChar() * max(dx, dy)
                X += x + ('-' * (dy - dx))
                Y += y + ('-' * (dx - dy))
            #if len(X) - X.count('-') > 2 * X_len:
            #    done = False
            #if len(Y) - Y.count('-') > 2 * Y_len:
            #    done = False
        aln = [("sequence1", X), ("alignment", A), ("sequence2", Y)]
        json.dump(tandemRepeats,
                  Open(output_filename.format(id=i) + '.repeats', 'w'),
                  indent=4)
        Fasta.save(aln, output_filename.format(id=i))
        output_files.append(output_filename.format(id=i))
    with Open(output_files_filename, 'w') as output_file_object:
        json.dump(output_files, output_file_object, indent=4)
    return 0
def main(input_file, output_file):
    """Run TRF on an alignment file and write repeat-annotated alignments.

    For each two-sequence alignment in ``input_file``, emits a five-row
    record to ``output_file``: sequence X, its repeat consensus row, the
    combined repeat annotation ('R' marks repeat columns), sequence Y's
    consensus row, and sequence Y.

    Fixes over the previous revision:
      - ``trf`` was unbound (NameError) when no TRF executable existed, and
        ``raise "No trf found"`` is itself a TypeError on Python >= 2.6;
        it now raises RuntimeError;
      - ``break`` after finding an executable was commented out, so the
        *last* existing path won; restored to take the first, consistent
        with the sibling TRF-stats entry point;
      - removed the dead ``trf = None`` rebind inside the loop and the
        unused ``seqX`` local.
    """
    trf = None
    for trf_executable in trf_paths:
        if os.path.exists(trf_executable):
            trf = TRFDriver(trf_executable)
            break
    if not trf:
        raise RuntimeError("No trf found")
    repeats = trf.run(input_file)
    with open(output_file, 'w') as f:
        for alignment in Fasta.load(input_file, '\.[0-9]*$', Alignment):
            if len(alignment.sequences) != 2:
                print('error')
                continue
            annotation = list('.' * len(alignment.sequences[0]))
            annotationX = list('.' * len(alignment.sequences[0]))
            annotationY = list('.' * len(alignment.sequences[0]))
            for seq_name in alignment.names:
                # Index of this sequence in the alignment (last match wins,
                # matching the original lookup loop).
                index = None
                for i, name in enumerate(alignment.names):
                    if seq_name == name:
                        index = i
                translator = alignment.seq_to_aln[index]
                revtranslator = alignment.aln_to_seq[index]
                for repeat in repeats[seq_name]:
                    for i in range(translator[repeat.start],
                                   translator[repeat.end]):
                        annotation[i] = 'R'
                        # NOTE(review): j is the offset within the repeat,
                        # yet it is passed through revtranslator (alignment
                        # -> sequence map); confirm this indexing is
                        # intentional.
                        j = i - translator[repeat.start]
                        if index == 0:
                            annotationX[i] = repeat.consensus[
                                revtranslator[j] % len(repeat.consensus)]
                        else:
                            annotationY[i] = repeat.consensus[
                                revtranslator[j] % len(repeat.consensus)]
            # Repeat-run length histogram; currently only consumed by the
            # commented-out reporting below.
            d = defaultdict(int)
            ll = 0
            for v in annotation:
                if v != 'R':
                    if ll > 0:
                        d[ll] += 1
                    ll = 0
                else:
                    ll += 1
            #for x, y in sorted(d.iteritems(), key=lambda x: x[1]):
            #    print '{}: {}'.format(x, y)
            #if len(d.keys()) > 0:
            #    print('Number of repeats: {}, average length: {}, maximum length: {}, minimum length: {}'.format(
            #        sum(d.values()),
            #        sum([x * y for x, y in d.iteritems()]) / max(sum(d.values()), 1),
            #        max(d.keys()),
            #        min(d.keys())
            #    ))
            nm = alignment.names[0]
            aln = [(alignment.names[0],
                    alignment.sequences[0].replace('.', '-')),
                   ('consensusX' + nm, ''.join(annotationX)),
                   ('annotation' + nm, ''.join(annotation)),
                   ('consensusY' + nm, ''.join(annotationY)),
                   (alignment.names[1],
                    alignment.sequences[1].replace('.', '-'))]
            Fasta.saveAlignmentPiece(aln, f, -1)