def main(input_file, index1, index2, emissionOutput, transitionOutput):
    """Count emission and transition statistics from an alignment file.

    Loads alignments from input_file, selects the sequences at positions
    index1 and index2 within each alignment, and tallies column-pair
    emissions and column-type transitions into two JSON output files.
    """
    emissions = defaultdict(int)
    transitions = defaultdict(int)
    X = None
    Y = None

    def aggregate(X, Y):
        # Tally per-column pair emissions; skip(), upper(), getType() and
        # sortTypes() are project helpers not visible in this file.
        pairs = zip(X, Y)
        for p in pairs:
            if skip(p):
                continue
            emissions[str(upper(p))] += 1
        # Transitions between consecutive column types, ignoring
        # all-gap columns.
        Types = list(sortTypes([getType(x) for x in pairs if x != ('-', '-')]))
        for p in zip(Types, Types[1:]):
            transitions[str(p)] += 1

    for aln in Fasta.load(input_file, '[.][0-9]+$'):
        count = 0
        # NOTE(review): if index1/index2 exceed the sequence count, X or Y
        # keeps its previous value (or None on the first alignment) and
        # aggregate() would fail — presumably inputs always match; confirm.
        for _, sequence in aln:
            if count == index1:
                X = sequence
            if count == index2:
                Y = sequence
            count += 1
        aggregate(X, Y)
    with Open(emissionOutput, 'w') as f:
        json.dump(emissions, f, indent=4)
    with Open(transitionOutput, 'w') as f:
        json.dump(transitions, f, indent=4)
def do_find_repeats(alignment_file, paramSeq, model, mathType, _stats, separator): modelParam = { "mathType": mathType, "modelFactory": model, } driver = TRFDriver() trf_repeats = driver.run(alignment_file, paramSeq) alignments = Fasta.load(alignment_file, separator, Alignment) D = dict() stats = defaultdict(int) count = 1; for alignment in alignments: print ("Annotating alignment {0}".format(count)) count += 1 consensus_list = list(set([ x.consensus for x in itertools.chain(*[ trf_repeats[name] for name in alignment.names ]) ])) repeats = find_repeats_in_alignment(alignment, consensus_list, modelParam) print repeats D.update(repeats) if _stats != None: s = compute_statistics(repeats) for key, value in s.iteritems(): stats[key] += value return D, stats
def main(arg):
    """Split pairwise alignments into chunks of bounded size.

    For each alignment, picks a chunk size in
    [arg.min_split_size, arg.max_split_size] that maximizes the length of
    the leftover (final) chunk, then writes each chunk's two sequences to
    arg.output, skipping chunks where either sequence is all gaps.
    """
    with open(arg.output, 'w') as f:
        for alignment_file in arg.alignment:
            alns = Fasta.load(alignment_file, arg.alignment_regexp,
                              Alignment,
                              sequence_selectors=arg.sequence_regexp)
            for aln in alns:
                l = len(aln.sequences[0])
                # Candidate sizes scored by (negated) remainder length, so
                # poss[0] after sorting is the size with the largest
                # acceptable final chunk.
                poss = []
                for i in range(arg.min_split_size, arg.max_split_size + 1):
                    mod = l % i
                    rest = None
                    if mod >= arg.min_split_size:
                        rest = mod
                    if (mod + i <= arg.max_split_size
                            and mod + i >= arg.min_split_size):
                        rest = mod + i
                    if rest is None:  # fixed: identity comparison with None
                        continue
                    rest = min(rest, i)
                    poss.append((-rest, i))
                poss.sort()
                # NOTE(review): poss may be empty for pathological
                # min/max settings, which would raise IndexError here.
                _, best = poss[0]
                splits = [i for i in range(0, l, best)]
                # Fold the remainder into the last chunk when it fits.
                if best + l % best < arg.max_split_size:
                    splits.pop()
                splits.append(l)
                for fr, to in zip(splits, splits[1:]):
                    s1 = aln.sequences[0][fr:to]
                    s2 = aln.sequences[1][fr:to]
                    # Skip chunks where either side is entirely gaps.
                    if min(map(len, [s1.strip('-'), s2.strip('-')])) > 0:
                        f.write('{}.{}-{} {}\n'.format(
                            aln.names[0], fr, to, s1))
                        f.write('{}.{}-{} {}\n'.format(
                            aln.names[1], fr, to, s2))
def main(input_filename, output_filename): task_ids = [1] if os.environ.has_key('SGE_TASK_ID'): sge_task_id = int(os.environ['SGE_TASK_ID']) if not os.environ.has_key('SGE_STEP_SIZE'): sge_step_size = 1 else: sge_step_size = int(os.environ['SGE_STEP_SIZE']) sge_task_last = int(os.environ['SGE_TASK_LAST']) task_ids = range( sge_task_id, min(sge_task_id + sge_step_size, sge_task_last + 1) ) print task_ids, sge_task_id, sge_step_size, sge_task_last for task_id in task_ids: inp_filename = input_filename.format(id=task_id - 1) out_filename = output_filename.format(id=task_id - 1) aln = list(Fasta.load( inp_filename, '', Alignment, sequence_selectors=['sequence1', 'sequence2']))[0] tmp_filename = get_temp_filename() Fasta.save(zip(aln.names, aln.sequences), tmp_filename) os.system("muscle -in {inp} -out {out} 2> /dev/null".format( inp=tmp_filename, out=out_filename, )) os.system("cp {inp}.repeats {out}.repeats".format( inp=inp_filename, out=out_filename, ))
def realign_file(args, model, output_filename, alignment_filename):
    """Realign every alignment in a file and write the results.

    Computes annotations (optionally via a separate annotation model),
    realigns each alignment with args.algorithm, optionally renders a
    drawing, and appends each realigned alignment to output_filename.
    Returns 1 when an alignment has fewer than two sequences; otherwise
    returns None.
    """
    # begin of HACK
    # Temporarily add the 'trf_cons' track so the computed annotations
    # include the TRF consensus data needed when expanding the model.
    if args.expand_model:
        old_tracks = args.tracks
        args.tracks.add('trf_cons')
    m = model
    if args.annotation_model:
        m = args.annotation_model
    annotations = compute_annotations(args, alignment_filename, m)
    if args.expand_model:
        consensuses = annotations['trf_cons']
        args.tracks = old_tracks
        # NOTE(review): args.tracks is used set-like above (.add) but
        # deleted by key here — confirm the container supports both.
        if 'trf_cons' not in old_tracks:
            del args.tracks['trf_cons']
    # end of HACK
    with Open(output_filename, 'w') as output_file_object:
        for aln in Fasta.load(
                alignment_filename, args.alignment_regexp, Alignment,
                sequence_selectors=args.sequence_regexp):
            if len(aln.sequences) < 2:
                sys.stderr.write("ERROR: not enough sequences in file\n")
                return 1
            # Use a no-op ("brainwashed") drawer unless drawing was
            # requested via args.draw.
            if len(args.draw) == 0:
                drawer = brainwash(AlignmentCanvas)()
            else:
                drawer = AlignmentCanvas()
            drawer.add_original_alignment(aln)
            aln, unmask_repeats = args.mask_repeats(aln, annotations)
            seq1, seq2 = tuple(map(Fasta.alnToSeq, aln.sequences[:2]))
            perf.msg("Data loaded in {time} seconds.")
            perf.replace()
            if args.expand_model:
                # We need the consensi of both sequences to expand the
                # model with consensus-specific states.
                A = consensuses[aln.names[0]]
                B = consensuses[aln.names[1]]
                cons = list(A.union(B))
                real_model = model.expandModel({'consensus': cons})
            else:
                real_model = model
            realigner = args.algorithm()
            realigner.setDrawer(drawer)
            realigner.prepareData(seq1, aln.names[0], seq2, aln.names[1],
                                  aln, real_model, annotations, args)
            aln = realigner.realign(0, len(seq1), 0, len(seq2))
            aln = unmask_repeats(aln)
            perf.msg("Sequence was realigned in {time} seconds.")
            perf.replace()
            if len(args.draw) > 0:
                drawer.add_sequence('X', seq1)
                drawer.add_sequence('Y', seq2)
                drawer.add_alignment_line(
                    101, (255, 0, 255, 255), 2,
                    AlignmentPositionGenerator(
                        Alignment([aln[0], aln[2]])))
                drawer.draw(args.draw, 2000, 2000)
                perf.msg("Image was drawn in {time} seconds.")
            # Save output_file
            Fasta.saveAlignmentPiece(aln, output_file_object)
def main(input_file, index1, index2, emissionOutput, transitionOutput):
    """Count emission and transition statistics from an alignment file.

    NOTE(review): this appears to be a duplicate of an identical main()
    defined elsewhere in this file.

    Loads alignments from input_file, selects the sequences at positions
    index1 and index2 within each alignment, and tallies column-pair
    emissions and column-type transitions into two JSON output files.
    """
    emissions = defaultdict(int)
    transitions = defaultdict(int)
    X = None
    Y = None

    def aggregate(X, Y):
        # Tally per-column pair emissions; skip(), upper(), getType() and
        # sortTypes() are project helpers not visible in this file.
        pairs = zip(X, Y)
        for p in pairs:
            if skip(p):
                continue
            emissions[str(upper(p))] += 1
        # Transitions between consecutive column types, ignoring
        # all-gap columns.
        Types = list(sortTypes([getType(x) for x in pairs if x != ('-', '-')]))
        for p in zip(Types, Types[1:]):
            transitions[str(p)] += 1

    for aln in Fasta.load(input_file, '[.][0-9]+$'):
        count = 0
        # NOTE(review): if index1/index2 exceed the sequence count, X or Y
        # keeps its previous value (or None on the first alignment) and
        # aggregate() would fail — presumably inputs always match; confirm.
        for _, sequence in aln:
            if count == index1:
                X = sequence
            if count == index2:
                Y = sequence
            count += 1
        aggregate(X, Y)
    with Open(emissionOutput, 'w') as f:
        json.dump(emissions, f, indent=4)
    with Open(transitionOutput, 'w') as f:
        json.dump(transitions, f, indent=4)
def getSequences(self, fname, sequence_regexp=None):
    """Load the first alignment from fname and return its first two sequences.

    Args:
        fname: FASTA-like input file name.
        sequence_regexp: selectors for the sequences to load; defaults to
            ["^sequence1$", "^sequence2$"]. Stored on self for later use.

    Returns:
        (seq1, seq2): the first two aligned sequences.

    Raises:
        ParseException: when the file contains no alignment or the first
            alignment has fewer than two sequences.
    """
    alignment_regexp = ''
    if sequence_regexp is None:
        sequence_regexp = ["^sequence1$", "^sequence2$"]
    self.sequence_regexp = sequence_regexp
    # fixed: next() without a default raises StopIteration on empty input,
    # which made the `aln is None` branch dead code; supplying None makes
    # the intended ParseException fire instead.
    aln = next(
        Fasta.load(fname, alignment_regexp, Alignment, sequence_regexp),
        None)
    if aln is None or len(aln.sequences) < 2:
        raise ParseException('Not enough sequences in file\n')
    seq1 = aln.sequences[0]
    seq2 = aln.sequences[1]
    return seq1, seq2
def getSequences(self, fname, sequence_regexp=None):
    """Load the first alignment from fname and return its first two sequences.

    Args:
        fname: FASTA-like input file name.
        sequence_regexp: selectors for the sequences to load; defaults to
            ["^sequence1$", "^sequence2$"]. Stored on self for later use.

    Returns:
        (seq1, seq2): the first two aligned sequences.

    Raises:
        ParseException: when the file contains no alignment or the first
            alignment has fewer than two sequences.
    """
    alignment_regexp = ''
    if sequence_regexp is None:
        sequence_regexp = ["^sequence1$", "^sequence2$"]
    self.sequence_regexp = sequence_regexp
    # fixed: next() without a default raises StopIteration on empty input,
    # which made the `aln is None` branch dead code; supplying None makes
    # the intended ParseException fire instead.
    aln = next(
        Fasta.load(fname, alignment_regexp, Alignment, sequence_regexp),
        None)
    if aln is None or len(aln.sequences) < 2:
        raise ParseException('Not enough sequences in file\n')
    seq1 = aln.sequences[0]
    seq2 = aln.sequences[1]
    return seq1, seq2
def main(files_filename, output_filename, suffix, base_dir):
    """Concatenate per-chunk alignment files back into full alignments.

    files_filename is a JSON list of chunk file names; an empty string
    marks the end of one alignment, at which point the accumulated
    X / annotation / Y rows are flushed to output_filename. Chunks whose
    name contains 'keep' — or whose suffix-substituted replacement is
    missing or empty — are read from the original file and get their
    annotation row replaced by dots.
    """
    X = ""
    Y = ""
    A = ""
    with Open(output_filename, 'w') as ff:
        files = json.load(Open(files_filename))
        total = len(files)
        done = 0
        for filename in files:
            # Progress report every 100 processed chunks.
            if done % 100 == 0:
                print '{}/{} {:.2}%'.format(done, total,
                                            100.0 * done / total)
            if filename == "":
                # End-of-alignment marker: flush what we assembled.
                # NOTE(review): X_name/Y_name/A_name are only assigned
                # after a chunk is read, so a marker before any chunk
                # would raise NameError — presumably inputs never start
                # with ""; confirm against the producer.
                Fasta.saveAlignmentPiece(
                    [(X_name, X), (Y_name, Y), (A_name, A)], ff)
                X = ""
                Y = ""
                A = ""
                continue
            done += 1
            old_filename = filename
            keep = False
            if filename.count('keep') == 0:
                # Swap the 2-character extension for the realigned suffix
                # and optionally redirect into base_dir; fall back to the
                # original file when the replacement is missing/empty.
                filename = filename[:-2] + suffix
                if base_dir != None:
                    filename = base_dir + '/' + filename.split('/')[-1]
                try:
                    with Open(filename, 'r') as f:
                        l = len(''.join(f).strip())
                        if l == 0:
                            filename = old_filename
                            keep = True
                except IOError:
                    filename = old_filename
                    keep = True
            if filename.count('keep') > 0:
                keep = True
            aln = list(Fasta.load(filename, ''))[0]
            # Each chunk must be a 3-row alignment with equal-length rows.
            assert(len(aln) == 3)
            assert(len(aln[0][1]) == len(aln[1][1]) == len(aln[2][1]))
            X += aln[0][1]
            if keep:
                # Kept chunks contribute no annotation — pad with dots.
                A += '.' * len(aln[0][1])
            else:
                A += aln[1][1]
            Y += aln[2][1]
            X_name = aln[0][0]
            A_name = aln[1][0]
            Y_name = aln[2][0]
def main(files_filename, output_filename, suffix, base_dir):
    """Concatenate per-chunk alignment files back into full alignments.

    NOTE(review): appears to be a reformatted duplicate of another main()
    with the same body in this file.

    files_filename is a JSON list of chunk file names; an empty string
    marks the end of one alignment, at which point the accumulated
    X / annotation / Y rows are flushed to output_filename. Chunks whose
    name contains 'keep' — or whose suffix-substituted replacement is
    missing or empty — are read from the original file and get their
    annotation row replaced by dots.
    """
    X = ""
    Y = ""
    A = ""
    with Open(output_filename, "w") as ff:
        files = json.load(Open(files_filename))
        total = len(files)
        done = 0
        for filename in files:
            # Progress report every 100 processed chunks.
            if done % 100 == 0:
                print "{}/{} {:.2}%".format(done, total,
                                            100.0 * done / total)
            if filename == "":
                # End-of-alignment marker: flush what we assembled.
                # NOTE(review): X_name/Y_name/A_name are only assigned
                # after a chunk is read — a marker before any chunk would
                # raise NameError; confirm inputs never start with "".
                Fasta.saveAlignmentPiece([(X_name, X), (Y_name, Y),
                                          (A_name, A)], ff)
                X = ""
                Y = ""
                A = ""
                continue
            done += 1
            old_filename = filename
            keep = False
            if filename.count("keep") == 0:
                # Swap the 2-character extension for the realigned suffix
                # and optionally redirect into base_dir; fall back to the
                # original file when the replacement is missing/empty.
                filename = filename[:-2] + suffix
                if base_dir != None:
                    filename = base_dir + "/" + filename.split("/")[-1]
                try:
                    with Open(filename, "r") as f:
                        l = len("".join(f).strip())
                        if l == 0:
                            filename = old_filename
                            keep = True
                except IOError:
                    filename = old_filename
                    keep = True
            if filename.count("keep") > 0:
                keep = True
            aln = list(Fasta.load(filename, ""))[0]
            # Each chunk must be a 3-row alignment with equal-length rows.
            assert len(aln) == 3
            assert len(aln[0][1]) == len(aln[1][1]) == len(aln[2][1])
            X += aln[0][1]
            if keep:
                # Kept chunks contribute no annotation — pad with dots.
                A += "." * len(aln[0][1])
            else:
                A += aln[1][1]
            Y += aln[2][1]
            X_name = aln[0][0]
            A_name = aln[1][0]
            Y_name = aln[2][0]
def main(input_files, output_file):
    """Draw several alignments as colored polylines in a single image.

    Each existing input file contributes one alignment path (distinct
    color, small per-file offset so lines don't overlap); positions whose
    annotation row is 'R' (repeats) are marked with small rectangles.
    Missing files and alignments without an annotation row are skipped.
    """
    global width  # pixels per alignment position, defined at module level
    alignments = [
        list(
            Fasta.load(name, '', Alignment,
                       ['^sequence1', '^sequence2', '^[av].*']))
        if os.path.exists(name) else None for name in input_files
    ]
    # Canvas size comes from the first alignment's ungapped sequences.
    x_len = len(Fasta.alnToSeq(alignments[0][0].sequences[0]))
    y_len = len(Fasta.alnToSeq(alignments[0][0].sequences[1]))
    I = Image.new('RGB', (x_len * width + 50, y_len * width + 50),
                  (255, 255, 255))
    D = ImageDraw.Draw(I)
    colors = [(0, 0, 0), (255, 0, 0), (0, 255, 0), (0, 0, 255),
              (255, 0, 255), (0, 255, 255)]
    i = -1
    for aln in alignments:
        i += 1
        if aln == None:
            continue
        aln = list(aln)
        if len(aln) == 0:
            continue
        aln = aln[0]
        try:
            # IndexError from sequences[2] (no annotation row) is caught
            # below and skips drawing for this file.
            annotation = aln.sequences[2]
            coords = aln.getCoordPairs()
            print coords
            # Small per-file offsets keep overlapping paths visible.
            x_shift = width / 2 + 25 + i
            y_shift = width / 2 + 25 + i * 2
            D.line([(x * width + x_shift, y * width + y_shift)
                    for x, y, _ in coords], fill=colors[i])
            if annotation != None:
                for x, y, ind in coords:
                    if annotation[ind] != 'R':
                        continue
                    D.rectangle([(x * width + x_shift - width / 4,
                                  y * width + y_shift - width / 4),
                                 (x * width + x_shift + width / 4,
                                  y * width + y_shift + width / 4)],
                                outline=colors[i])
        except IndexError:
            pass
        except IOError:
            pass
    del D
    I.save(output_file)
def main(input_file, output_file, trf):
    """Compute a TRF-based annotation track for alignments in input_file.

    Args:
        input_file: alignment file to annotate.
        output_file: destination for the JSON annotation track.
        trf: list of candidate paths to the trf executable; the first
            existing one is used.

    Raises:
        RuntimeError: when none of the candidate trf paths exists.
    """
    # THIS IS ONLY GENERATOR!!! (consumed lazily, exactly once)
    alns = (Alignment(a)
            for a in Fasta.load(input_file, '[.][0-9]+$', Alignment))
    # 1. run trf,
    for trf_executable in trf:
        if os.path.exists(trf_executable):
            trf = TRFDriver(trf_executable)
            break
    else:
        # fixed: without this, `trf.run` below raised a confusing
        # AttributeError on the path list when no executable was found
        raise RuntimeError('No trf executable found')
    repeats = trf.run(input_file)
    A = list(compute_annotation_track(alns, repeats))
    json.dump(A, Open(output_file, 'w'), indent=4)
def main(input_files, output_file):
    """Draw several alignments as colored polylines in a single image.

    NOTE(review): appears to be a reformatted duplicate of another main()
    with the same body in this file.

    Each existing input file contributes one alignment path (distinct
    color, small per-file offset so lines don't overlap); positions whose
    annotation row is 'R' (repeats) are marked with small rectangles.
    Missing files and alignments without an annotation row are skipped.
    """
    global width  # pixels per alignment position, defined at module level
    alignments = [list(Fasta.load(name, '', Alignment,
                                  ['^sequence1', '^sequence2', '^[av].*']))
                  if os.path.exists(name) else None for name in input_files]
    # Canvas size comes from the first alignment's ungapped sequences.
    x_len = len(Fasta.alnToSeq(alignments[0][0].sequences[0]))
    y_len = len(Fasta.alnToSeq(alignments[0][0].sequences[1]))
    I = Image.new('RGB', (x_len * width + 50, y_len * width + 50),
                  (255, 255, 255))
    D = ImageDraw.Draw(I)
    colors = [(0, 0, 0), (255, 0, 0), (0, 255, 0), (0, 0, 255),
              (255, 0, 255), (0, 255, 255)]
    i = -1
    for aln in alignments:
        i += 1
        if aln == None:
            continue
        aln = list(aln)
        if len(aln) == 0:
            continue
        aln = aln[0]
        try:
            # IndexError from sequences[2] (no annotation row) is caught
            # below and skips drawing for this file.
            annotation = aln.sequences[2]
            coords = aln.getCoordPairs()
            print coords
            # Small per-file offsets keep overlapping paths visible.
            x_shift = width / 2 + 25 + i
            y_shift = width / 2 + 25 + i * 2
            D.line([(x * width + x_shift, y * width + y_shift)
                    for x, y, _ in coords], fill=colors[i])
            if annotation != None:
                for x, y, ind in coords:
                    if annotation[ind] != 'R':
                        continue
                    D.rectangle([(x * width + x_shift - width / 4,
                                  y * width + y_shift - width / 4),
                                 (x * width + x_shift + width / 4,
                                  y * width + y_shift + width / 4)],
                                outline=colors[i])
        except IndexError:
            pass
        except IOError:
            pass
    del D
    I.save(output_file)
def expectation_generator(args, model, alignment_filename, annotations):
    """Yield Baum-Welch expectation counts for each alignment in a file.

    For every alignment: builds repeat generators from the 'trf' and
    'original_repeats' annotation tracks, attaches them to the model's
    'Repeat' state when that state exists, and yields a dict with the
    alignment probability plus transition and emission counts.

    Raises:
        Exception: when an alignment has fewer than two sequences.
    """
    for aln in Fasta.load(alignment_filename, args.alignment_regexp,
                          Alignment,
                          sequence_selectors=args.sequence_regexp):
        if len(aln.sequences) < 2:
            sys.stderr.write("ERROR: not enough sequences in file\n")
            # fixed: raising a plain string is a TypeError at runtime;
            # raise a real exception object instead
            raise Exception("ERROR: not enough sequences in file")
        seq1, seq2 = tuple(map(Fasta.alnToSeq, aln.sequences[:2]))
        positionGenerator = list(
            AlignmentBeamGenerator(aln, args.beam_width))
        RX = RepeatGenerator(None, args.repeat_width)
        RY = RepeatGenerator(None, args.repeat_width)
        for rt in ['trf', 'original_repeats']:
            if rt in annotations:
                RX.addRepeats(annotations[rt][aln.names[0]])
                RY.addRepeats(annotations[rt][aln.names[1]])
        RX.buildRepeatDatabase()
        RY.buildRepeatDatabase()
        if 'Repeat' in model.statenameToID:
            model.states[model.statenameToID['Repeat']].addRepeatGenerator(
                RX, RY)
        (transitions, emissions), probability = model.getBaumWelchCounts(
            seq1, 0, len(seq1), seq2, 0, len(seq2),
            positionGenerator=positionGenerator)
        yield {
            "probability": probability,
            "transitions": transitions,
            "emissions": emissions,
        }
def main(input_file, output_file): for trf_executable in trf_paths: if os.path.exists(trf_executable): trf = TRFDriver(trf_executable) #break if not trf: raise "No trf found" repeats = trf.run(input_file) with open(output_file, 'w') as f: for alignment in Fasta.load(input_file, '\.[0-9]*$', Alignment): if len(alignment.sequences) != 2: print 'error' continue #print alignment.names annotation = list('.' * len(alignment.sequences[0])) annotationX = list('.' * len(alignment.sequences[0])) annotationY = list('.' * len(alignment.sequences[0])) trf = None for seq_name in alignment.names: index = None for i in range(len(alignment.names)): if seq_name == alignment.names[i]: index = i translator = alignment.seq_to_aln[index] revtranslator = alignment.aln_to_seq[index] for repeat in repeats[seq_name]: for i in range(translator[repeat.start], translator[repeat.end]): annotation[i] = 'R' j = i - translator[repeat.start] if index == 0: annotationX[i] = repeat.consensus[revtranslator[j] % len(repeat.consensus)] else: annotationY[i] = repeat.consensus[revtranslator[j] % len(repeat.consensus)] d = defaultdict(int) ll = 0 for v in annotation: if v != 'R': if ll > 0: d[ll] += 1 ll = 0 else: ll += 1 #for x, y in sorted(d.iteritems(), key=lambda x: x[1]): # print '{}: {}'.format(x, y) #if len(d.keys()) > 0: # print('Number of repeats: {}, average length: {}, maximum length: {}, minimum length: {}'.format( # sum(d.values()), # sum([x * y for x, y in d.iteritems()])/ max(sum(d.values()), 1), # max(d.keys()), # min(d.keys()) # )) seqX = alignment.sequences nm = alignment.names[0] aln = [(alignment.names[0], alignment.sequences[0].replace('.', '-')), ('consensusX' + nm, ''.join(annotationX)), ('annotation' + nm, ''.join(annotation)), ('consensusY' + nm, ''.join(annotationY)), (alignment.names[1], alignment.sequences[1].replace('.','-'))] Fasta.saveAlignmentPiece(aln, f, -1)
def main(input_file, realign_output, do_not_touch_output, list_of_files_output, max_length, wrap_length, min_seq_length):
    """Split alignments at repeat-block boundaries into bounded intervals.

    Walks annotated alignments (rows: X, annotation, Y), computes a list
    of intervals no longer than roughly max_length that respect repeat
    block boundaries, and routes each interval either to a 'realign'
    output (mixed repeat/non-repeat content of sufficient length) or a
    'do not touch' output. The list of produced file names, with '' as
    an end-of-alignment marker, is dumped to list_of_files_output.
    """
    realign_counter = 0
    do_not_touch_counter = 0
    files = []
    for alignment in Fasta.load(input_file, '\.[0-9]*$'):
        if realign_counter % 100 == 0:
            print(realign_counter, do_not_touch_counter, alignment[0][0])
        alignment_len = len(alignment[0][1])
        annotation = alignment[2][1]
        # !!! We expect that first block is not repeat
        # Positions where the annotation changes, plus a sentinel block
        # past the end so the loop below always sees a trailing block.
        changes = [i for i in range(1, len(annotation))
                   if annotation[i - 1] != annotation[i]] + [len(annotation)]
        Blocks = zip(changes, changes[1:]) + [(len(annotation),
                                              len(annotation) + max_length + 10)]
        Blocks = [(0, Blocks[0][0])] + Blocks
        printed = 0
        block_start = 0  #None
        block_end = None  # NOTE(review): unused
        intervals = []
        # Odd indices are one block type, even the other (alternating
        # repeat / non-repeat runs).
        for block_id in range(1, len(Blocks), 2):
            current = Blocks[block_id]
            previous = Blocks[block_id - 1]
            # NOTE(review): block_start starts at 0 (the None initializer
            # is commented out), so this branch looks unreachable —
            # confirm whether that is intended.
            if block_start == None:
                startpp = max(printed, previous[0])
                if previous[1] - startpp > wrap_length:
                    intervals.append((printed, startpp))
                    printed = startpp
                    block_start = startpp
            else:
                # Add this block, or start a new one?
                if current[1] - block_start > max_length:
                    if previous[1] - previous[0] > wrap_length * 2:
                        # Long gap block: keep wrap_length of context on
                        # each side and emit the middle separately.
                        intervals.append((block_start,
                                          previous[0] + wrap_length))
                        intervals.append((previous[0] + wrap_length,
                                          previous[1] - wrap_length))
                        printed = previous[1] - wrap_length
                        block_start = previous[1] - wrap_length
                    else:
                        # Short gap block: split it down the middle.
                        split = (previous[0] + previous[1]) / 2
                        intervals.append((block_start, split))
                        block_start = split
                        printed = split
        # Start a new one
        intervals.append((printed, len(annotation)))
        # Intervals must tile the annotation exactly and be contiguous.
        assert(len(annotation) == sum([y - x for x, y in intervals]))
        for i in range(1, len(intervals)):
            assert(intervals[i - 1][1] == intervals[i][0])
        #t = list(range(0, alignment_len, max_length)) + [alignment_len]
        #intervals = zip(t, t[1:])
        for start, stop in intervals:
            if start >= len(annotation):
                continue
            if start == stop:
                continue
            assert(start < stop)
            ann = alignment[2][1][start:stop]
            output = None
            seq1 = alignment[0][1]
            seq2 = alignment[4][1]
            # NOTE(review): lengths are computed from the FULL sequences,
            # not the [start:stop] slice — confirm this is intended for
            # the min_seq_length test below.
            seq1_len = len(seq1) - seq1.count('-') - seq1.count('.')
            seq2_len = len(seq2) - seq2.count('-') - seq2.count('.')
            # All-repeat, repeat-free, or too-short pieces are not
            # realigned.
            if (ann.count('R') == 0
                    or min(seq1_len, seq2_len) < min_seq_length
                    or ann.count('R') == len(ann)):
                output = do_not_touch_output.format(id=do_not_touch_counter)
                do_not_touch_counter += 1
            else:
                output = realign_output.format(id=realign_counter)
                realign_counter += 1
            files.append(output)
            aln = [
                (alignment[0][0], alignment[0][1][start:stop]),
                (alignment[2][0], alignment[2][1][start:stop]),
                (alignment[4][0], alignment[4][1][start:stop])
            ]
            #Fasta.save(aln, output, width=-1)
        # '' marks the end of this alignment's chunk list for consumers.
        files.append('');
    with Open(list_of_files_output, 'w') as f:
        json.dump(files, f, indent=4)
def main(inp, out, alignment_regexp, sequence_regexp, trf=trf_paths):
    """Collect repeat-overlap statistics between two aligned sequences.

    Runs TRF on the input, projects each sequence's repeats into
    alignment coordinates, and counts per-column M/R totals plus, for
    each merged repeat segment, whether the repeat came from X, Y, or
    both ('RR', 'RM', 'MR'). Results are dumped as JSON to `out`.
    """
    # Use the first existing trf executable from the candidate paths.
    for trf_executable in trf:
        if os.path.exists(trf_executable):
            trf = TRFDriver(trf_executable, mathType=float)
            break
    repeats = trf.run(inp)
    stats = defaultdict(int)
    for aln in Fasta.load(inp, alignment_regexp, Alignment,
                          sequence_selectors=sequence_regexp):
        X_index = 0
        Y_index = 1
        X_trf = list(
            translate_repeat_to_annotation(repeats[aln.names[X_index]],
                                           aln.seq_to_aln[X_index]))
        Y_trf = list(
            translate_repeat_to_annotation(repeats[aln.names[Y_index]],
                                           aln.seq_to_aln[Y_index]))
        # Per-column annotations: 'M' = match/non-repeat, 'R' = repeat.
        # B_ann is the union of both sequences' repeat columns.
        X_ann = list("M" * len(aln.sequences[X_index]))
        Y_ann = list("M" * len(aln.sequences[Y_index]))
        B_ann = list("M" * len(aln.sequences[Y_index]))
        for repeat in X_trf:
            # Clamp repeats that run past the alignment end.
            # NOTE(review): mutates the repeat objects in place — confirm
            # they are not shared with other consumers.
            if repeat.end >= len(X_ann):
                repeat.end = len(X_ann) - 1
            rlen = 1 + repeat.end - repeat.start
            X_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
            B_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
        for repeat in Y_trf:
            if repeat.end >= len(Y_ann):
                repeat.end = len(Y_ann) - 1
            rlen = 1 + repeat.end - repeat.start
            Y_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
            B_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
        assert (len(X_ann) == len(Y_ann) and len(B_ann) == len(Y_ann))
        M_count = len([x for x in B_ann if x == 'M'])
        R_count = len([x for x in B_ann if x == 'R'])
        # Count M->R transitions (with a leading sentinel 'M') = number
        # of contiguous repeat segments.
        R_segments_count = len([
            x for x in zip('M' + ''.join(B_ann), ''.join(B_ann) + 'M')
            if x[0] != 'R' and x[1] == 'R'
        ])
        stats['M_count'] += M_count
        stats['R_count'] += R_count
        stats['R_segment_count'] += R_segments_count
        # Indices where the annotation flips; consecutive pairs delimit
        # the R segments.
        changes = [
            i for i, x in zip(range(len(B_ann) + 1),
                              zip('M' + ''.join(B_ann),
                                  ''.join(B_ann) + 'M'))
            if x[0] != x[1]
        ]
        R_segments = [(changes[i], changes[i + 1])
                      for i in range(0, len(changes) - (len(changes) % 2),
                                     2)]
        assert (R_segments_count == len(R_segments))
        # Classify each merged repeat segment by which side(s) carry the
        # repeat: 'RR', 'RM', or 'MR'.
        for start, stop in R_segments:
            XX = 'M'
            YY = 'M'
            for i in range(start, stop):
                if X_ann[i] == 'R':
                    XX = 'R'
                if Y_ann[i] == 'R':
                    YY = 'R'
                assert (B_ann[i] == 'R')
            stats[XX + YY] += 1
    with Open(out, 'w') as f:
        json.dump(stats, f, indent=4)
def select_sequences(inp_filename, out_filename, sequences):
    """Extract the selected sequences of the first alignment into a new file."""
    loaded = Fasta.load(inp_filename, '', Alignment,
                        sequence_selectors=sequences)
    first_alignment = list(loaded)[0]
    name_seq_pairs = zip(first_alignment.names, first_alignment.sequences)
    Fasta.save(name_seq_pairs, out_filename)
def main(correct_file, aln_file, output_file, interval=None):
    """Score computed alignments against reference ('correct') alignments.

    Task ids come from SGE environment variables, or from `interval`
    when given. For each task, alignments are compared under three
    coordinate transforms (identity / expand_repeats / remove_repeats)
    and a JSON report of accuracy statistics is written per task.
    """
    task_ids = [None]
    if os.environ.has_key('SGE_TASK_ID'):
        if os.environ['SGE_TASK_ID'] != 'undefined':
            sge_task_id = int(os.environ['SGE_TASK_ID'])
            if not os.environ.has_key('SGE_STEP_SIZE'):
                sge_step_size = 1
            else:
                sge_step_size = int(os.environ['SGE_STEP_SIZE'])
            sge_task_last = int(os.environ['SGE_TASK_LAST'])
            task_ids = range(
                sge_task_id,
                min(sge_task_id + sge_step_size, sge_task_last + 1))
    if interval != None:
        task_ids = range(interval[0], interval[1] + 1)
    for task_id in task_ids:
        separator = ''
        output = {}
        for fun, tp in [(identity, 'standard'),
                        (expand_repeats, 'expanded_repeats'),
                        (remove_repeats, 'removed_repeats')]:
            try:
                for correct, alignment in zip(
                        Fasta.load(correct_file.format(id=task_id - 1),
                                   separator, Alignment),
                        Fasta.load(aln_file.format(id=task_id - 1),
                                   separator, Alignment)):
                    correct_len = len(correct.getCoordPairs(False))
                    total_len = correct_len * 2 - correct.sequences[0].count(
                        '-') - correct.sequences[2].count('-')
                    # ccc: transformed (x, annotation, y) coordinate
                    # triples of the reference alignment.
                    ccc = fun(correct.getCoordPairs(False), correct)
                    if tp == 'removed_repeats':
                        # Recompute lengths over the filtered coordinates.
                        correct_len = len(ccc)
                        total_len = 0
                        for v1, _, v2 in ccc:
                            if v1 >= 0:
                                total_len += 1
                            if v2 >= 0:
                                total_len += 1
                    acc = alignment.getCoordPairs(False)
                    # Drop the annotation column: keep (x, y) pairs only.
                    cc = map(lambda x: (x[0], x[2]), ccc)
                    if len(acc[0]) == 3:
                        ac = map(lambda x: (x[0], x[2]), acc)
                    elif len(acc[0]) == 2:
                        ac = acc
                    else:
                        ac = None
                    c = set(cc)
                    a = set(ac)
                    intersect = c.intersection(a)
                    not_in_c = c.difference(a)
                    not_in_a = a.difference(c)
                    symm_diff = c.symmetric_difference(a)
                    # Score counts non-gap positions in agreeing pairs.
                    score = 0
                    for v1, v2 in intersect:
                        if v1 >= 0:
                            score += 1
                        if v2 >= 0:
                            score += 1
                    # Accuracy bucketed by distance to the nearest repeat
                    # column in the reference annotation.
                    dists_correct = defaultdict(int)
                    dists_total = defaultdict(int)
                    position = dict()
                    dists = [99999999] * len(correct.sequences[1])
                    dst = 9999999
                    # NOTE(review): `a` (the set above) is shadowed by
                    # the annotation index in this loop.
                    for x, a, y in ccc:
                        position[(x, y)] = a
                    # Forward then backward pass: min distance to an 'R'.
                    for i in range(len(correct.sequences[1])):
                        if correct.sequences[1][i] == 'R':
                            dst = 0
                        else:
                            dst += 1
                        dists[i] = min(dists[i], dst)
                    for i in reversed(range(len(correct.sequences[1]))):
                        if correct.sequences[1][i] == 'R':
                            dst = 0
                        else:
                            dst += 1
                        dists[i] = min(dists[i], dst)
                    for pos in c:
                        d = dists[position[pos]]
                        if d == 0:
                            continue
                        dists_total[d] += 1
                        if pos in ac:
                            dists_correct[d] += 1

                    def getRepeatAnnotation(coord, annotation):
                        # Non-gap positions lying in 'R' columns, marked
                        # separately per side: (x, -1) and (-1, y).
                        if len(coord[0]) != 3:
                            return set()
                        ret = set()
                        for x, a, y in coord:
                            if annotation[a] == 'R':
                                if x >= 0:
                                    ret.add((x, -1))
                                if y >= 0:
                                    ret.add((-1, y))
                        return ret

                    crann = getRepeatAnnotation(correct.getCoordPairs(False),
                                                correct.sequences[1])
                    arann = getRepeatAnnotation(
                        alignment.getCoordPairs(False),
                        alignment.sequences[1])

                    def getRepeatBlocks(coord, annotation):
                        # Maximal runs of 'R' columns as
                        # ((x_min, x_max+1), (y_min, y_max+1)) ranges;
                        # -1 stands in for a side absent from the run.
                        if len(coord[0]) != 3:
                            return set()
                        ret = set()
                        x = set()
                        y = set()
                        for _x, a, _y in coord:
                            if annotation[a] == 'R':
                                if _x >= 0:
                                    x.add(_x)
                                if _y >= 0:
                                    y.add(_y)
                            else:
                                if len(x) + len(y) > 0:
                                    if len(x) == 0:
                                        x.add(-1)
                                    if len(y) == 0:
                                        y.add(-1)
                                    ret.add(((min(x), max(x) + 1),
                                             (min(y), max(y) + 1)))
                                    x = set()
                                    y = set()
                        if len(x) + len(y) > 0:
                            if len(x) == 0:
                                x.add(-1)
                            if len(y) == 0:
                                y.add(-1)
                            ret.add(
                                ((min(x), max(x) + 1), (min(y), max(y) + 1)))
                            x = set()
                            y = set()
                        return ret

                    cbann = getRepeatBlocks(correct.getCoordPairs(False),
                                            correct.sequences[1])
                    abann = getRepeatBlocks(alignment.getCoordPairs(False),
                                            alignment.sequences[1])

                    def dst(x1, x2):
                        # NOTE(review): shadows the integer `dst` used in
                        # the distance passes above.
                        if x1 == -1:
                            return 0
                        return x2 - x1

                    def getPoints(s):
                        return sum([
                            dst(x1, x2) + dst(y1, y2)
                            for ((x1, x2), (y1, y2)) in s
                        ])

                    # Find long segments that are correctly aligned
                    cseg = [1 if x in c else 0 for x in ac]
                    seg_len = []
                    length = 0
                    segment_length_histogram = defaultdict(int)
                    for x in cseg:
                        if x == 0 and length != 0:
                            segment_length_histogram[length] += 1
                        length = length * x + x
                        seg_len.append(length)
                    if length > 0:
                        segment_length_histogram[length] += 1
                    # NOTE(review): this overrides the getPoints function
                    # above, so the +Blk* metrics below count block set
                    # sizes rather than summed block lengths — confirm
                    # this is intentional.
                    getPoints = len
                    # NOTE: 'corect' / 'c-lenght' are misspelled but are
                    # runtime JSON keys consumed downstream — do not fix
                    # without updating consumers.
                    output[tp] = {
                        'corect': correct_file,
                        'alignment': aln_file,
                        'c-lenght': len(cc),
                        'a-length': len(ac),
                        'intersect': len(intersect),
                        '%correct':
                        100.0 - float(len(intersect) * 100) / correct_len
                        if correct_len > 0 else 100,
                        '+mistakes': len(intersect),
                        '+len': correct_len,
                        '+RepTP': len(crann & arann),
                        '+RepTN': total_len - len(crann | arann),
                        '+RepFP': len(arann - crann),
                        '+RepFN': len(crann - arann),
                        '+BlkTP': getPoints(cbann & abann),
                        '+BlkTN': 0,
                        '+BlkFP': getPoints(abann - cbann),
                        '+BlkFN': getPoints(cbann - abann),
                        '%score':
                        float(score) * 100 / total_len
                        if total_len > 0 else 0,
                        'c-a': len(not_in_c),
                        'a-c': len(not_in_a),
                        'symmetric_difference': len(symm_diff),
                        'correct_len_histogram': segment_length_histogram,
                        '@+dists_correct': dists_correct,
                        '@+dists_total': dists_total,
                    }
                    if correct_len == 0:
                        del output[tp]['%correct']
                    if total_len == 0:
                        del output[tp]['%score']
            except IOError:
                pass
        with Open(output_file.format(id=task_id - 1), 'w') as f:
            json.dump(output, f, indent=4)
def main(input_file, output_file): for trf_executable in trf_paths: if os.path.exists(trf_executable): trf = TRFDriver(trf_executable) #break if not trf: raise "No trf found" repeats = trf.run(input_file) with open(output_file, 'w') as f: for alignment in Fasta.load(input_file, '\.[0-9]*$', Alignment): if len(alignment.sequences) != 2: print 'error' continue #print alignment.names annotation = list('.' * len(alignment.sequences[0])) annotationX = list('.' * len(alignment.sequences[0])) annotationY = list('.' * len(alignment.sequences[0])) trf = None for seq_name in alignment.names: index = None for i in range(len(alignment.names)): if seq_name == alignment.names[i]: index = i translator = alignment.seq_to_aln[index] revtranslator = alignment.aln_to_seq[index] for repeat in repeats[seq_name]: for i in range(translator[repeat.start], translator[repeat.end]): annotation[i] = 'R' j = i - translator[repeat.start] if index == 0: annotationX[i] = repeat.consensus[ revtranslator[j] % len(repeat.consensus)] else: annotationY[i] = repeat.consensus[ revtranslator[j] % len(repeat.consensus)] d = defaultdict(int) ll = 0 for v in annotation: if v != 'R': if ll > 0: d[ll] += 1 ll = 0 else: ll += 1 #for x, y in sorted(d.iteritems(), key=lambda x: x[1]): # print '{}: {}'.format(x, y) #if len(d.keys()) > 0: # print('Number of repeats: {}, average length: {}, maximum length: {}, minimum length: {}'.format( # sum(d.values()), # sum([x * y for x, y in d.iteritems()])/ max(sum(d.values()), 1), # max(d.keys()), # min(d.keys()) # )) seqX = alignment.sequences nm = alignment.names[0] aln = [ (alignment.names[0], alignment.sequences[0].replace('.', '-')), ('consensusX' + nm, ''.join(annotationX)), ('annotation' + nm, ''.join(annotation)), ('consensusY' + nm, ''.join(annotationY)), (alignment.names[1], alignment.sequences[1].replace('.', '-')) ] Fasta.saveAlignmentPiece(aln, f, -1)
def main(correct_file, aln_file, output_file, interval=None):
    """Score computed alignments against reference ('correct') alignments.

    NOTE(review): appears to be a reformatted duplicate of another main()
    with the same body in this file.

    Task ids come from SGE environment variables, or from `interval`
    when given. For each task, alignments are compared under three
    coordinate transforms (identity / expand_repeats / remove_repeats)
    and a JSON report of accuracy statistics is written per task.
    """
    task_ids = [None]
    if os.environ.has_key('SGE_TASK_ID'):
        if os.environ['SGE_TASK_ID'] != 'undefined':
            sge_task_id = int(os.environ['SGE_TASK_ID'])
            if not os.environ.has_key('SGE_STEP_SIZE'):
                sge_step_size = 1
            else:
                sge_step_size = int(os.environ['SGE_STEP_SIZE'])
            sge_task_last = int(os.environ['SGE_TASK_LAST'])
            task_ids = range(
                sge_task_id,
                min(sge_task_id + sge_step_size, sge_task_last + 1)
            )
    if interval != None:
        task_ids = range(interval[0], interval[1] + 1)
    for task_id in task_ids:
        separator = ''
        output = {}
        for fun, tp in [(identity, 'standard'),
                        (expand_repeats, 'expanded_repeats'),
                        (remove_repeats, 'removed_repeats')]:
            try:
                for correct, alignment in zip(
                    Fasta.load(correct_file.format(id=task_id - 1),
                               separator, Alignment),
                    Fasta.load(aln_file.format(id=task_id - 1),
                               separator, Alignment)
                ):
                    correct_len = len(correct.getCoordPairs(False))
                    total_len = (correct_len * 2
                                 - correct.sequences[0].count('-')
                                 - correct.sequences[2].count('-'))
                    # ccc: transformed (x, annotation, y) coordinate
                    # triples of the reference alignment.
                    ccc = fun(correct.getCoordPairs(False), correct)
                    if tp == 'removed_repeats':
                        # Recompute lengths over the filtered coordinates.
                        correct_len = len(ccc)
                        total_len = 0
                        for v1, _, v2 in ccc:
                            if v1 >= 0:
                                total_len += 1
                            if v2 >= 0:
                                total_len += 1
                    acc = alignment.getCoordPairs(False)
                    # Drop the annotation column: keep (x, y) pairs only.
                    cc = map(lambda x: (x[0], x[2]), ccc)
                    if len(acc[0]) == 3:
                        ac = map(lambda x: (x[0], x[2]), acc)
                    elif len(acc[0]) == 2:
                        ac = acc
                    else:
                        ac = None
                    c = set(cc)
                    a = set(ac)
                    intersect = c.intersection(a)
                    not_in_c = c.difference(a)
                    not_in_a = a.difference(c)
                    symm_diff = c.symmetric_difference(a)
                    # Score counts non-gap positions in agreeing pairs.
                    score = 0
                    for v1, v2 in intersect:
                        if v1 >= 0:
                            score += 1
                        if v2 >= 0:
                            score += 1
                    # Accuracy bucketed by distance to the nearest repeat
                    # column in the reference annotation.
                    dists_correct = defaultdict(int)
                    dists_total = defaultdict(int)
                    position = dict()
                    dists = [99999999] * len(correct.sequences[1])
                    dst = 9999999
                    # NOTE(review): `a` (the set above) is shadowed by
                    # the annotation index in this loop.
                    for x, a, y in ccc:
                        position[(x, y)] = a
                    # Forward then backward pass: min distance to an 'R'.
                    for i in range(len(correct.sequences[1])):
                        if correct.sequences[1][i] == 'R':
                            dst = 0
                        else:
                            dst += 1
                        dists[i] = min(dists[i], dst)
                    for i in reversed(range(len(correct.sequences[1]))):
                        if correct.sequences[1][i] == 'R':
                            dst = 0
                        else:
                            dst += 1
                        dists[i] = min(dists[i], dst)
                    for pos in c:
                        d = dists[position[pos]]
                        if d == 0:
                            continue
                        dists_total[d] += 1
                        if pos in ac:
                            dists_correct[d] += 1

                    def getRepeatAnnotation(coord, annotation):
                        # Non-gap positions lying in 'R' columns, marked
                        # separately per side: (x, -1) and (-1, y).
                        if len(coord[0]) != 3:
                            return set()
                        ret = set()
                        for x, a, y in coord:
                            if annotation[a] == 'R':
                                if x >= 0:
                                    ret.add((x, -1))
                                if y >= 0:
                                    ret.add((-1, y))
                        return ret

                    crann = getRepeatAnnotation(correct.getCoordPairs(False),
                                                correct.sequences[1])
                    arann = getRepeatAnnotation(
                        alignment.getCoordPairs(False),
                        alignment.sequences[1])

                    def getRepeatBlocks(coord, annotation):
                        # Maximal runs of 'R' columns as
                        # ((x_min, x_max+1), (y_min, y_max+1)) ranges;
                        # -1 stands in for a side absent from the run.
                        if len(coord[0]) != 3:
                            return set()
                        ret = set()
                        x = set()
                        y = set()
                        for _x, a, _y in coord:
                            if annotation[a] == 'R':
                                if _x >= 0:
                                    x.add(_x)
                                if _y >= 0:
                                    y.add(_y)
                            else:
                                if len(x) + len(y) > 0:
                                    if len(x) == 0:
                                        x.add(-1)
                                    if len(y) == 0:
                                        y.add(-1)
                                    ret.add(((min(x), max(x) + 1),
                                             (min(y), max(y) + 1)))
                                    x = set()
                                    y = set()
                        if len(x) + len(y) > 0:
                            if len(x) == 0:
                                x.add(-1)
                            if len(y) == 0:
                                y.add(-1)
                            ret.add(((min(x), max(x) + 1),
                                     (min(y), max(y) + 1)))
                            x = set()
                            y = set()
                        return ret

                    cbann = getRepeatBlocks(correct.getCoordPairs(False),
                                            correct.sequences[1])
                    abann = getRepeatBlocks(alignment.getCoordPairs(False),
                                            alignment.sequences[1])

                    def dst(x1, x2):
                        # NOTE(review): shadows the integer `dst` used in
                        # the distance passes above.
                        if x1 == -1:
                            return 0
                        return x2 - x1

                    def getPoints(s):
                        return sum([dst(x1, x2) + dst(y1, y2)
                                    for ((x1, x2), (y1, y2)) in s])

                    # Find long segments that are correctly aligned
                    cseg = [1 if x in c else 0 for x in ac]
                    seg_len = []
                    length = 0
                    segment_length_histogram = defaultdict(int)
                    for x in cseg:
                        if x == 0 and length != 0:
                            segment_length_histogram[length] += 1
                        length = length * x + x
                        seg_len.append(length)
                    if length > 0:
                        segment_length_histogram[length] += 1
                    # NOTE(review): this overrides the getPoints function
                    # above, so the +Blk* metrics below count block set
                    # sizes rather than summed block lengths — confirm
                    # this is intentional.
                    getPoints = len
                    # NOTE: 'corect' / 'c-lenght' are misspelled but are
                    # runtime JSON keys consumed downstream — do not fix
                    # without updating consumers.
                    output[tp] = {
                        'corect': correct_file,
                        'alignment': aln_file,
                        'c-lenght': len(cc),
                        'a-length': len(ac),
                        'intersect': len(intersect),
                        '%correct':
                        100.0 - float(len(intersect) * 100) / correct_len
                        if correct_len > 0 else 100,
                        '+mistakes': len(intersect),
                        '+len': correct_len,
                        '+RepTP': len(crann & arann),
                        '+RepTN': total_len - len(crann | arann),
                        '+RepFP': len(arann - crann),
                        '+RepFN': len(crann - arann),
                        '+BlkTP': getPoints(cbann & abann),
                        '+BlkTN': 0,
                        '+BlkFP': getPoints(abann - cbann),
                        '+BlkFN': getPoints(cbann - abann),
                        '%score':
                        float(score) * 100 / total_len
                        if total_len > 0 else 0,
                        'c-a': len(not_in_c),
                        'a-c': len(not_in_a),
                        'symmetric_difference': len(symm_diff),
                        'correct_len_histogram': segment_length_histogram,
                        '@+dists_correct': dists_correct,
                        '@+dists_total': dists_total,
                    }
                    if correct_len == 0:
                        del output[tp]['%correct']
                    if total_len == 0:
                        del output[tp]['%score']
            except IOError:
                pass
        with Open(output_file.format(id=task_id - 1), 'w') as f:
            json.dump(output, f, indent=4)
def select_sequences(inp_filename, out_filename, sequences):
    """Extract the selected sequences of the first alignment into a new file."""
    loaded = Fasta.load(inp_filename, '', Alignment,
                        sequence_selectors=sequences)
    first_alignment = list(loaded)[0]
    name_seq_pairs = zip(first_alignment.names, first_alignment.sequences)
    Fasta.save(name_seq_pairs, out_filename)
def main(inp, out, alignment_regexp, sequence_regexp, trf=trf_paths):
    """Collect repeat-overlap statistics between two aligned sequences.

    NOTE(review): appears to be a reformatted duplicate of another main()
    with the same body in this file.

    Runs TRF on the input, projects each sequence's repeats into
    alignment coordinates, and counts per-column M/R totals plus, for
    each merged repeat segment, whether the repeat came from X, Y, or
    both ('RR', 'RM', 'MR'). Results are dumped as JSON to `out`.
    """
    # Use the first existing trf executable from the candidate paths.
    for trf_executable in trf:
        if os.path.exists(trf_executable):
            trf = TRFDriver(trf_executable, mathType=float)
            break
    repeats = trf.run(inp)
    stats = defaultdict(int)
    for aln in Fasta.load(
        inp, alignment_regexp, Alignment,
        sequence_selectors=sequence_regexp
    ):
        X_index = 0
        Y_index = 1
        X_trf = list(translate_repeat_to_annotation(
            repeats[aln.names[X_index]], aln.seq_to_aln[X_index]))
        Y_trf = list(translate_repeat_to_annotation(
            repeats[aln.names[Y_index]], aln.seq_to_aln[Y_index]))
        # Per-column annotations: 'M' = match/non-repeat, 'R' = repeat.
        # B_ann is the union of both sequences' repeat columns.
        X_ann = list("M" * len(aln.sequences[X_index]))
        Y_ann = list("M" * len(aln.sequences[Y_index]))
        B_ann = list("M" * len(aln.sequences[Y_index]))
        for repeat in X_trf:
            # Clamp repeats that run past the alignment end.
            # NOTE(review): mutates the repeat objects in place — confirm
            # they are not shared with other consumers.
            if repeat.end >= len(X_ann):
                repeat.end = len(X_ann) - 1
            rlen = 1 + repeat.end - repeat.start
            X_ann[repeat.start : repeat.end + 1] = list("R" * rlen)
            B_ann[repeat.start : repeat.end + 1] = list("R" * rlen)
        for repeat in Y_trf:
            if repeat.end >= len(Y_ann):
                repeat.end = len(Y_ann) - 1
            rlen = 1 + repeat.end - repeat.start
            Y_ann[repeat.start : repeat.end + 1] = list("R" * rlen)
            B_ann[repeat.start : repeat.end + 1] = list("R" * rlen)
        assert(len(X_ann) == len(Y_ann) and len(B_ann) == len(Y_ann))
        M_count = len([x for x in B_ann if x == 'M'])
        R_count = len([x for x in B_ann if x == 'R'])
        # Count M->R transitions (with a leading sentinel 'M') = number
        # of contiguous repeat segments.
        R_segments_count = len([x for x in zip('M' + ''.join(B_ann),
                                               ''.join(B_ann) + 'M')
                                if x[0] != 'R' and x[1] == 'R'])
        stats['M_count'] += M_count
        stats['R_count'] += R_count
        stats['R_segment_count'] += R_segments_count
        # Indices where the annotation flips; consecutive pairs delimit
        # the R segments.
        changes = [i for i, x in zip(
            range(len(B_ann) + 1),
            zip('M' + ''.join(B_ann), ''.join(B_ann) + 'M'))
            if x[0] != x[1]]
        R_segments = [(changes[i], changes[i+1])
                      for i in range(0, len(changes) - (len(changes) % 2),
                                     2)]
        assert(R_segments_count == len(R_segments))
        # Classify each merged repeat segment by which side(s) carry the
        # repeat: 'RR', 'RM', or 'MR'.
        for start, stop in R_segments:
            XX = 'M'
            YY = 'M'
            for i in range(start, stop):
                if X_ann[i] == 'R':
                    XX = 'R'
                if Y_ann[i] == 'R':
                    YY = 'R'
                assert(B_ann[i] == 'R')
            stats[XX + YY] += 1
    with Open(out, 'w') as f:
        json.dump(stats, f, indent=4);