def realign(self, x, dx, y, dy):
    if 'viterbi_path' not in self.io_files['input']:
        path = self.model.getViterbiPath(self.table)
        if 'viterbi_path' in self.io_files['output']:
            with Open(self.io_files['output']['viterbi_path'], 'w') as f:
                json.dump(jsonize(path), f, indent=4)
    else:
        path = dejsonize_struct(
            json.load(Open(self.io_files['input']['viterbi_path'])),
            (list, (tuple, int, (tuple, int), (tuple, int),
                    lambda x: LogNum(x, False))))
    X = ""
    Y = ""
    A = ""
    for (state, (_x, _y), (_dx, _dy), _) in path:
        X += self.X[_x - _dx:_x] + ('-' * (max(_dx, _dy) - _dx))
        Y += self.Y[_y - _dy:_y] + ('-' * (max(_dx, _dy) - _dy))
        A += self.model.states[state].getChar() * max(_dx, _dy)
    return [
        (self.X_name, X),
        ("viterbi annotation of " + self.X_name + " and " + self.Y_name, A),
        (self.Y_name, Y),
    ]
def main(input_file, index1, index2, emissionOutput, transitionOutput):
    emissions = defaultdict(int)
    transitions = defaultdict(int)
    X = None
    Y = None

    def aggregate(X, Y):
        pairs = zip(X, Y)
        for p in pairs:
            if skip(p):
                continue
            emissions[str(upper(p))] += 1
        Types = list(sortTypes([getType(x) for x in pairs
                                if x != ('-', '-')]))
        for p in zip(Types, Types[1:]):
            transitions[str(p)] += 1

    for aln in Fasta.load(input_file, '[.][0-9]+$'):
        count = 0
        for _, sequence in aln:
            if count == index1:
                X = sequence
            if count == index2:
                Y = sequence
            count += 1
        aggregate(X, Y)
    with Open(emissionOutput, 'w') as f:
        json.dump(emissions, f, indent=4)
    with Open(transitionOutput, 'w') as f:
        json.dump(transitions, f, indent=4)
def computeViterbiTable(self):
    if 'viterbi' not in self.io_files['input']:
        for state in self.model.states:
            if isinstance(state, ClassifierState):
                emission_table = SequenceTablePrecompute(
                    state.clf, self.positionGenerator,
                    self.X, self.Y, state.ann_x, state.ann_y)
                emission_table.compute()
                state.set_emission_table(emission_table)
        self.table = self.model.getViterbiTable(
            self.X, 0, len(self.X),
            self.Y, 0, len(self.Y),
            positionGenerator=self.positionGenerator)
        x = jsonize(self.table)
        if 'viterbi' in self.io_files['output']:
            with Open(self.io_files['output']['viterbi'], 'w') as f:
                json.dump(x, f, indent=4)
    else:
        self.table = dejsonize_struct(
            json.load(Open(self.io_files['input']['viterbi'])),
            (list, (dict, int,
                    (dict, (tuple, int),
                     (tuple, lambda x: LogNum(x, False), int)))))
def main(alignment, working_directory, split_count, output_file,
         seq_selectors):
    seq_selectors = map(re.compile, seq_selectors)
    # TODO: check whether the directory already exists, and if it does,
    # do not create it again
    if not os.path.exists(working_directory):
        os.makedirs(working_directory)

    # Prepare the alignment in the right format
    filename = os.path.basename(alignment)
    extension = filename.split('.')[-1].lower()
    base = '.'.join(filename.split('.')[:-2])
    if extension == 'gz':
        extension = filename.split('.')[-2].lower()
        base = '.'.join(filename.split('.')[:-3])
    if extension == 'fa':
        # fasta_generator = alignment
        assert False
    elif extension == 'maf':
        fasta_generator = Maf2FastaGen(alignment)
    else:
        assert False
    parallel_dir = '{dir}/{base}_parallel'.format(
        dir=working_directory,
        base=base,
    )
    if not os.path.exists(parallel_dir):
        os.makedirs(parallel_dir)
    filenames = [
        '{dir}/alignment_{index:04d}.fa'.format(dir=parallel_dir, index=i + 1)
        for i in range(split_count)
    ]
    files = [Open(name, 'w') for name in filenames]
    for aln in fasta_generator:
        new_aln = []
        for src, aln_count, text in aln:
            add = False
            for selector in seq_selectors:
                if selector.match(src) != None:
                    add = True
            if add:
                new_aln.append('>{0}.{1}\n{2}\n'.format(src, aln_count, text))
        if len(new_aln) == 2:
            new_aln.sort()  # order the pair of records by sequence name
            files[aln_count % split_count].writelines(new_aln)
    for file_object in files:
        file_object.close()
    with Open(output_file, 'w') as f:
        json.dump(filenames, f, indent=4)
def main(config_file, output_file):
    with Open(config_file, 'r') as f:
        config = json.load(f)
    graph = dict()
    for name, item in config.iteritems():
        graph[name] = [] if "depends" not in item else item['depends']
    with Open(output_file, 'w') as f:
        f.write('#!/bin/bash\n\n')
        for job in toposort(graph):
            item = config[job]
            param = ['-terse', '-cwd']
            if "depends" in item:
                param.append('-hold_jid')
                param.append(','.join(['$' + x for x in item['depends']]))
            if "array" in item:
                param.append('-t')
                assert len(item['array']) > 0 and len(item['array']) < 4
                param.append(''.join([
                    ''.join(x)
                    for x in zip(['', '-', ':'], map(str, item['array']))
                ]))
            if 'stdout' in item:
                param.append('-o')
                param.append("'{}'".format(item['stdout']))
            if 'stderr' in item:
                param.append('-e')
                param.append("'{}'".format(item['stderr']))
            if "resources" in item:
                assert len(item['resources']) > 0
                param.append('-l')
                param.append(','.join(
                    ['='.join(x) for x in item['resources'].iteritems()]))
            if "params" in item:
                assert len(item['params']) > 0
                param.append(' '.join(item['params']))
            # -terse makes qsub print just the job id; the sed strips the
            # array suffix (e.g. "123.1-100:1" -> "123") so the id can be
            # used in later -hold_jid references.
            query = ("{jobname}=`qsub -N '{name}' {parameters} {command} "
                     "| sed -e 's/[.].*$//'`").format(
                name=job,
                jobname=job,
                parameters=' '.join(param),
                command=cmd_to_string(item['cmd']))
            f.write(query + '\n')
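# A minimal example of the config this generator consumes, inferred from the
# keys read above ('cmd', 'depends', 'array', 'resources', 'stdout',
# 'stderr', 'params'); the job names and commands are hypothetical:
#
# {
#     "split": {"cmd": ["python", "split.py"]},
#     "align": {
#         "cmd": ["python", "align.py"],
#         "depends": ["split"],
#         "array": [1, 100],
#         "resources": {"h_vmem": "4G"}
#     }
# }
#
# For "align" the generated line would look roughly like:
#   align=`qsub -N 'align' -terse -cwd -hold_jid $split -t 1-100 \
#          -l h_vmem=4G python align.py | sed -e 's/[.].*$//'`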
def main(args_input, args_output, interval, ignore):
    aggr = dict()
    for task_id in range(interval[0] - 1, interval[1]):
        # Hard-coded skip of a known-bad task in the '0002' inputs.
        if task_id == 68 and any(fn.count('0002') > 0 for fn in args_input):
            print 'removing task_id 68'
            continue
        if task_id in ignore:
            print 'removing task {}'.format(task_id)
            continue
        for filename in args_input:
            with Open(filename.format(id=task_id), 'r') as f:
                data = json.load(f)
            add_dictionaries(aggr, data)
    compute_stats(aggr)
    json.dump(aggr, Open(args_output, 'w'), indent=4)
def main(files_filename, output_filename, suffix, base_dir):
    X = ""
    Y = ""
    A = ""
    with Open(output_filename, 'w') as ff:
        files = json.load(Open(files_filename))
        total = len(files)
        done = 0
        for filename in files:
            if done % 100 == 0:
                print '{}/{} {:.2f}%'.format(done, total,
                                             100.0 * done / total)
            # An empty entry marks the end of one stitched alignment.
            if filename == "":
                Fasta.saveAlignmentPiece([(X_name, X), (Y_name, Y),
                                          (A_name, A)], ff)
                X = ""
                Y = ""
                A = ""
                continue
            done += 1
            old_filename = filename
            keep = False
            if filename.count('keep') == 0:
                filename = filename[:-2] + suffix
            if base_dir != None:
                filename = base_dir + '/' + filename.split('/')[-1]
            # Fall back to the original piece if the realigned file is
            # missing or empty.
            try:
                with Open(filename, 'r') as f:
                    l = len(''.join(f).strip())
                if l == 0:
                    filename = old_filename
                    keep = True
            except IOError:
                filename = old_filename
                keep = True
            if filename.count('keep') > 0:
                keep = True
            aln = list(Fasta.load(filename, ''))[0]
            assert len(aln) == 3
            assert len(aln[0][1]) == len(aln[1][1]) == len(aln[2][1])
            X += aln[0][1]
            if keep:
                A += '.' * len(aln[0][1])
            else:
                A += aln[1][1]
            Y += aln[2][1]
            X_name = aln[0][0]
            A_name = aln[1][0]
            Y_name = aln[2][0]
def realign_file(args, model, output_filename, alignment_filename):
    # begin of HACK
    if args.expand_model:
        old_tracks = args.tracks
        args.tracks.add('trf_cons')
    m = model
    if args.annotation_model:
        m = args.annotation_model
    annotations = compute_annotations(args, alignment_filename, m)
    if args.expand_model:
        consensuses = annotations['trf_cons']
        args.tracks = old_tracks
        if 'trf_cons' not in old_tracks:
            args.tracks.remove('trf_cons')
    # end of HACK
    with Open(output_filename, 'w') as output_file_object:
        for aln in Fasta.load(alignment_filename,
                              args.alignment_regexp,
                              Alignment,
                              sequence_selectors=args.sequence_regexp):
            if len(aln.sequences) < 2:
                sys.stderr.write("ERROR: not enough sequences in file\n")
                return 1
            if len(args.draw) == 0:
                drawer = brainwash(AlignmentCanvas)()
            else:
                drawer = AlignmentCanvas()
                drawer.add_original_alignment(aln)
            aln, unmask_repeats = args.mask_repeats(aln, annotations)
            seq1, seq2 = tuple(map(Fasta.alnToSeq, aln.sequences[:2]))
            perf.msg("Data loaded in {time} seconds.")
            perf.replace()
            if args.expand_model:
                # Need to determine the consensuses
                A = consensuses[aln.names[0]]
                B = consensuses[aln.names[1]]
                cons = list(A.union(B))
                real_model = model.expandModel({'consensus': cons})
            else:
                real_model = model
            realigner = args.algorithm()
            realigner.setDrawer(drawer)
            realigner.prepareData(seq1, aln.names[0], seq2, aln.names[1],
                                  aln, real_model, annotations, args)
            aln = realigner.realign(0, len(seq1), 0, len(seq2))
            aln = unmask_repeats(aln)
            perf.msg("Sequence was realigned in {time} seconds.")
            perf.replace()
            if len(args.draw) > 0:
                drawer.add_sequence('X', seq1)
                drawer.add_sequence('Y', seq2)
                drawer.add_alignment_line(
                    101, (255, 0, 255, 255), 2,
                    AlignmentPositionGenerator(
                        Alignment([aln[0], aln[2]])))
                drawer.draw(args.draw, 2000, 2000)
                perf.msg("Image was drawn in {time} seconds.")
            # Save the output
            Fasta.saveAlignmentPiece(aln, output_file_object)
def Maf2FastaGen(input_file, sequences, min_size=0):
    regs = map(re.compile, sequences)
    with Open(input_file, 'r') as inp:
        aln_count = 0
        output = []
        for line in inp:
            line = line.strip()
            if len(line) == 0:
                continue
            if line[0] not in ['a', 's']:
                continue
            if line[0] == 'a':
                # An 'a' line starts a new block; flush the previous one if
                # every requested sequence was found in it.
                if len(output) > 0 and (len(regs) == 0
                                        or len(output) == len(regs)):
                    aln_count += 1
                    yield output
                output = []
                continue
            line = tuple(re.split(r'\s+', line))
            if len(line) != 7:
                continue
            s, src, start, size, strand, srcSize, text = line
            # if strand == '-':
            #     text = reverseStrand(text)
            if matched(regs, src) and int(size) >= min_size:
                output.append(
                    (src, aln_count, text, [start, size, strand, srcSize]))
        if len(output) > 0 and (len(regs) == 0 or len(output) == len(regs)):
            yield output
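# Usage sketch for Maf2FastaGen: stream MAF alignment blocks in which all
# selected sequences are present (the MAF path and name patterns are
# hypothetical).
#
# for block in Maf2FastaGen('alignments.maf', ['^hg19', '^mm10'],
#                           min_size=50):
#     for src, aln_count, text, (start, size, strand, srcSize) in block:
#         print src, aln_count, len(text)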
def main(files, columns, headers, ignore):
    r = re.compile("^.*/([^/.]*)[.]evaluated.js$")
    out = []
    x = ['type']
    if headers == None:
        x.extend(columns)
    else:
        x.extend(headers)
        x.extend(columns[len(headers):])
    out.append(x)
    columns = map(lambda x: x.split(':'), columns)
    for filename in files:
        with Open(filename, 'r') as f:
            data = json.load(f)
        rr = r.match(filename)
        if rr == None:
            row = [filename]
        else:
            row = [rr.group(1)]
        if row[0] in ignore:
            continue
        for column in columns:
            sel = data
            for key in column:
                if isinstance(sel, list):
                    key = int(key)
                sel = sel[key]
            row.append(sel)
        out.append(row)
    return out
def jbug(structure, text=None, filename=None):
    dump = json.dumps(jsonize(structure), sort_keys=True, indent=4)
    if filename:
        with Open(filename, 'w') as f:
            f.write(dump)
    else:
        prefix = text + ': ' if text is not None else ''
        print prefix + dump
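# Usage sketch for jbug: dump any jsonizable structure either as labeled
# debug output or into a file (the arguments are illustrative).
#
# jbug(table, text='viterbi table')        # print a labeled dump
# jbug(table, filename='debug/table.js')   # write the dump to a file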
def main(filelist_filenames, output_filebase, filelist_output):
    filelist = defaultdict(list)
    for filelist_filename in filelist_filenames:
        with Open(filelist_filename, 'r') as f:
            files = json.load(f)
        for key, value in files.iteritems():
            filelist[key].extend(value)
    files = list()
    for key, stat in aggregate(filelist).iteritems():
        output_filename = '{base}.{type}.stat'.format(base=output_filebase,
                                                      type=key)
        with Open(output_filename, 'w') as f:
            json.dump(stat, f, indent=4)
        files.append(output_filename)
    with Open(filelist_output, 'w') as f:
        json.dump(files, f, indent=4)
def jcpoint(
    structure_generator,
    file_type,
    io_files,
    mathType=float,
    serializer=jsonize,
    deserializer=dejsonize,
):
    if file_type in io_files['input']:
        with Open(io_files['input'][file_type], 'r') as f:
            return deserializer(json.load(f), mathType)
    structure = structure_generator()
    if file_type in io_files['output']:
        if inspect.isgenerator(structure):
            structure = list(structure)
        with Open(io_files['output'][file_type], 'w') as f:
            json.dump(serializer(structure), f, sort_keys=True, indent=4)
    return structure
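# Usage sketch for the jcpoint ("JSON checkpoint") helper above. The io_files
# layout and the compute_table callable are hypothetical: if file_type is
# listed under 'input', the cached JSON is loaded instead of recomputing; if
# it is listed under 'output', the freshly computed structure is dumped there.
#
# io_files = {
#     'input': {},                               # nothing cached yet
#     'output': {'viterbi': 'cache/viterbi.js'},
# }
# table = jcpoint(lambda: compute_table(X, Y), 'viterbi', io_files,
#                 mathType=LogNum)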
def main(input_file, output_file, sequences, output_type):
    with Open(output_file, 'w') as out:
        for alignment in Maf2FastaGen(input_file, sequences):
            for src, aln_count, text, rest in alignment:
                if output_type == "normal":
                    out.write('>{0}.{1}\n{2}\n'.format(src, aln_count, text))
                elif output_type == "params":
                    out.write('>{0}.{1} {2}\n'.format(src, aln_count,
                                                      ' '.join(rest)))
def compute_expectations(args, model, output_filename, alignment_filename):
    annotations = compute_annotations(args, alignment_filename, model)
    with Open(output_filename, 'w') as fp:
        json.dump(
            jsonize_to_list(list(expectation_generator(
                args, model, alignment_filename, annotations))),
            fp,
            indent=4)
def compute_annotations(args, alignment_filename, model):
    annotations = dict()
    if 'trf' in args.tracks:
        trf = None
        for trf_executable in args.trf:
            if os.path.exists(trf_executable):
                trf = TRFDriver(trf_executable, mathType=args.mathType)
        if trf:
            repeats = trf.run(alignment_filename)
            annotations['trf'] = repeats
    if 'original_repeats' in args.tracks:
        repeats = json.load(Open(alignment_filename + '.repeats', 'r'))
        for k, v in repeats.iteritems():
            repeats[k] = [Repeat(_v[0], _v[1], _v[2], _v[3], _v[4])
                          for _v in v]
        annotations['original_repeats'] = repeats
    if 'trf_cons' in args.tracks:
        trf = None
        for trf_executable in args.trf:
            if os.path.exists(trf_executable):
                trf = TRFDriver(trf_executable, mathType=args.mathType)
        if trf:
            repeats = trf.run(alignment_filename)
            annotations['trf_cons'] = {}
            for seq_name in repeats:
                cons = set([repeat.consensus
                            for repeat in repeats[seq_name]])
                annotations['trf_cons'][seq_name] = cons
    if 'hmm' in args.tracks:
        paths = None
        if args.trf != None and len(args.trf) > 0:
            paths = args.trf
        driver = HMMDriver(paths, args.mathType, model)
        if driver:
            repeats = driver.run(alignment_filename)
            annotations['hmm'] = repeats
    perf.msg("Hints computed in {time} seconds.")
    perf.replace()
    return annotations
def main(input_file, length_output, consensus_output, full_length_output):
    statLen = defaultdict(int)
    statStr = defaultdict(int)
    statFull = defaultdict(int)
    with Open(input_file, 'r') as f:
        lines = (listConverter(line.strip().split(' '), (int, 0, 2))
                 for line in f if len(line.split(' ')) >= 15)
        for line in lines:
            if line == None:
                continue
            statLen[round(10 * (1 + line[1] - line[0])
                          / len(line[-2])) / 10.0] += 1
            statStr[line[-2]] += 1
            statFull[1 + line[1] - line[0]] += 1
    with Open(length_output, 'w') as f:
        json.dump(statLen, f, indent=4)
    with Open(consensus_output, 'w') as f:
        json.dump(statStr, f, indent=4)
    with Open(full_length_output, 'w') as f:
        json.dump(statFull, f, indent=4)
def main(input_file, output_file, trf):
    # NOTE: alns is a generator; alignments are only read on demand
    alns = (Alignment(a)
            for a in Fasta.load(input_file, '[.][0-9]+$', Alignment))
    # 1. run TRF
    for trf_executable in trf:
        if os.path.exists(trf_executable):
            trf = TRFDriver(trf_executable)
            break
    repeats = trf.run(input_file)
    A = list(compute_annotation_track(alns, repeats))
    json.dump(A, Open(output_file, 'w'), indent=4)
def loadGenerator(filename):
    with Open(filename, 'r') as f:
        seq_name = ""
        sequence = ""
        for line in f:
            line = line.strip()
            if len(line) == 0:
                continue
            if line[0] == '>':
                if len(sequence) > 0:
                    yield (seq_name, sequence)
                seq_name = line[1:]
                sequence = ""
            else:
                sequence += line
        if len(sequence) > 0:
            yield (seq_name, sequence)
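# Usage sketch for loadGenerator: stream (name, sequence) pairs from a FASTA
# file without holding the whole file in memory ('input.fa' is a
# hypothetical path).
#
# for seq_name, sequence in loadGenerator('input.fa'):
#     print seq_name, len(sequence)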
def main():
    parser = argparse.ArgumentParser(description='Sample alignments.')
    parser.add_argument('output_file_template', type=str,
                        help="Template for output file. Has to contain the "
                             "string '{id}' as a placeholder for the "
                             "sequence number.")
    parser.add_argument('--output_files', type=str,
                        help='File where the list of output files will be '
                             'written.',
                        default='-')
    parser.add_argument('--model', type=str,
                        default='data/models/repeatHMM.js',
                        help="Model file")
    parser.add_argument('--bind_file', nargs='*',
                        help='Replace filenames in the input_file model.',
                        default=[])
    parser.add_argument('--bind_constant', nargs='*',
                        help='Replace constants in the input_file model.',
                        default=[])
    parser.add_argument('--bind_constant_file', nargs='*',
                        help='Replace constants in the input_file model.',
                        default=[])
    parser.add_argument('n_samples', type=int, help='Number of samples.')
    parser.add_argument('seq1_length', type=int,
                        help='Length of first sequence.')
    parser.add_argument('seq2_length', type=int,
                        help='Length of second sequence.')
    parsed_arg = parser.parse_args()

    # ====== Validate input parameters ======================================
    if parsed_arg.output_file_template.count("{id}") < 1:
        sys.stderr.write('ERROR: If sampling, the output filename has to '
                         'contain at least one "{id}".\n')
        return 1
    if len(parsed_arg.bind_file) % 2 != 0:
        sys.stderr.write('ERROR: If binding files, the number of arguments '
                         'has to be divisible by 2\n')
        return 1
    if len(parsed_arg.bind_constant_file) % 2 != 0:
        sys.stderr.write('ERROR: If binding constants (as files), the '
                         'number of arguments has to be divisible by 2\n')
        return 1
    if len(parsed_arg.bind_constant) % 2 != 0:
        sys.stderr.write('ERROR: If binding constants, the number of '
                         'arguments has to be divisible by 2\n')
        return 1

    # ====== Parse parameters ===============================================
    output_filename = parsed_arg.output_file_template
    output_files_filename = parsed_arg.output_files
    output_files = list()

    # ====== Load model =====================================================
    loader = HMMLoader()
    for i in range(0, len(parsed_arg.bind_file), 2):
        loader.addFile(parsed_arg.bind_file[i], parsed_arg.bind_file[i + 1])
    for i in range(0, len(parsed_arg.bind_constant_file), 2):
        loader.addConstant(
            parsed_arg.bind_constant_file[i],
            loader.load(parsed_arg.bind_constant_file[i + 1]))
    for i in range(0, len(parsed_arg.bind_constant), 2):
        loader.addConstant(
            parsed_arg.bind_constant[i],
            loader.loads(parsed_arg.bind_constant[i + 1]))
    model_filename = parsed_arg.model
    PHMM = loader.load(model_filename)["model"]

    # ====== Sample =========================================================
    PHMM.buildSampleTransitions()
    n_samples = parsed_arg.n_samples
    X_len = parsed_arg.seq1_length
    Y_len = parsed_arg.seq2_length
    dirname = os.path.dirname(output_filename)
    if not os.path.exists(dirname):
        os.makedirs(dirname)
    for i in range(n_samples):
        done = False
        # Resample until the generated pair contains at least one repeat.
        while not done:
            tandemRepeats = {'sequence1': [], 'sequence2': []}
            seq = PHMM.generateSequence((X_len, Y_len))
            X = ""
            Y = ""
            A = ""
            for (emission, state) in seq:
                ann_data = None
                if len(emission) == 2:
                    x, y = emission
                else:
                    x, y, ann_data = emission
                dx, dy = len(x), len(y)
                if ann_data != None:
                    xlen = len(X.replace('-', ''))
                    ylen = len(Y.replace('-', ''))
                    if dx > 0:
                        tandemRepeats['sequence1'].append(
                            (xlen, xlen + dx, dx / ann_data[1],
                             ann_data[0], x))
                        done = True
                    if dy > 0:
                        tandemRepeats['sequence2'].append(
                            (ylen, ylen + dy, dy / ann_data[2],
                             ann_data[0], y))
                        done = True
                A += PHMM.states[state].getChar() * max(dx, dy)
                X += x + ('-' * (dy - dx))
                Y += y + ('-' * (dx - dy))
            # if len(X) - X.count('-') > 2 * X_len:
            #     done = False
            # if len(Y) - Y.count('-') > 2 * Y_len:
            #     done = False
        aln = [("sequence1", X), ("alignment", A), ("sequence2", Y)]
        json.dump(tandemRepeats,
                  Open(output_filename.format(id=i) + '.repeats', 'w'),
                  indent=4)
        Fasta.save(aln, output_filename.format(id=i))
        output_files.append(output_filename.format(id=i))
    with Open(output_files_filename, 'w') as output_file_object:
        json.dump(output_files, output_file_object, indent=4)
    return 0
def main(input_file, realign_output, do_not_touch_output,
         list_of_files_output, max_length, wrap_length, min_seq_length):
    realign_counter = 0
    do_not_touch_counter = 0
    files = []
    for alignment in Fasta.load(input_file, '\.[0-9]*$'):
        if realign_counter % 100 == 0:
            print realign_counter, do_not_touch_counter, alignment[0][0]
        alignment_len = len(alignment[0][1])
        annotation = alignment[2][1]
        # NOTE: we expect that the first block is not a repeat
        changes = [i for i in range(1, len(annotation))
                   if annotation[i - 1] != annotation[i]] + [len(annotation)]
        Blocks = zip(changes, changes[1:]) + \
            [(len(annotation), len(annotation) + max_length + 10)]
        Blocks = [(0, Blocks[0][0])] + Blocks
        printed = 0
        block_start = 0  # was None
        block_end = None
        intervals = []
        for block_id in range(1, len(Blocks), 2):
            current = Blocks[block_id]
            previous = Blocks[block_id - 1]
            if block_start == None:
                startpp = max(printed, previous[0])
                if previous[1] - startpp > wrap_length:
                    intervals.append((printed, startpp))
                    printed = startpp
                    block_start = startpp
            else:
                # Do we add this block, or start a new one?
                if current[1] - block_start > max_length:
                    if previous[1] - previous[0] > wrap_length * 2:
                        intervals.append((block_start,
                                          previous[0] + wrap_length))
                        intervals.append((previous[0] + wrap_length,
                                          previous[1] - wrap_length))
                        printed = previous[1] - wrap_length
                        block_start = previous[1] - wrap_length
                    else:
                        split = (previous[0] + previous[1]) / 2
                        intervals.append((block_start, split))
                        block_start = split
                        printed = split
                        # Start a new one
        intervals.append((printed, len(annotation)))
        assert len(annotation) == sum(y - x for x, y in intervals)
        for i in range(1, len(intervals)):
            assert intervals[i - 1][1] == intervals[i][0]
        # t = list(range(0, alignment_len, max_length)) + [alignment_len]
        # intervals = zip(t, t[1:])
        for start, stop in intervals:
            if start >= len(annotation):
                continue
            if start == stop:
                continue
            assert start < stop
            ann = alignment[2][1][start:stop]
            output = None
            seq1 = alignment[0][1]
            seq2 = alignment[4][1]
            seq1_len = len(seq1) - seq1.count('-') - seq1.count('.')
            seq2_len = len(seq2) - seq2.count('-') - seq2.count('.')
            if (ann.count('R') == 0
                    or min(seq1_len, seq2_len) < min_seq_length
                    or ann.count('R') == len(ann)):
                output = do_not_touch_output.format(id=do_not_touch_counter)
                do_not_touch_counter += 1
            else:
                output = realign_output.format(id=realign_counter)
                realign_counter += 1
            files.append(output)
            aln = [
                (alignment[0][0], alignment[0][1][start:stop]),
                (alignment[2][0], alignment[2][1][start:stop]),
                (alignment[4][0], alignment[4][1][start:stop]),
            ]
            # write the alignment piece for this interval
            Fasta.save(aln, output, width=-1)
        files.append('')
    with Open(list_of_files_output, 'w') as f:
        json.dump(files, f, indent=4)
def main(inp, out, alignment_regexp, sequence_regexp, trf=trf_paths):
    for trf_executable in trf:
        if os.path.exists(trf_executable):
            trf = TRFDriver(trf_executable, mathType=float)
            break
    repeats = trf.run(inp)
    stats = defaultdict(int)
    for aln in Fasta.load(inp, alignment_regexp, Alignment,
                          sequence_selectors=sequence_regexp):
        X_index = 0
        Y_index = 1
        X_trf = list(translate_repeat_to_annotation(
            repeats[aln.names[X_index]], aln.seq_to_aln[X_index]))
        Y_trf = list(translate_repeat_to_annotation(
            repeats[aln.names[Y_index]], aln.seq_to_aln[Y_index]))
        X_ann = list("M" * len(aln.sequences[X_index]))
        Y_ann = list("M" * len(aln.sequences[Y_index]))
        B_ann = list("M" * len(aln.sequences[Y_index]))
        for repeat in X_trf:
            if repeat.end >= len(X_ann):
                repeat.end = len(X_ann) - 1
            rlen = 1 + repeat.end - repeat.start
            X_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
            B_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
        for repeat in Y_trf:
            if repeat.end >= len(Y_ann):
                repeat.end = len(Y_ann) - 1
            rlen = 1 + repeat.end - repeat.start
            Y_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
            B_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
        assert len(X_ann) == len(Y_ann) and len(B_ann) == len(Y_ann)
        M_count = len([x for x in B_ann if x == 'M'])
        R_count = len([x for x in B_ann if x == 'R'])
        R_segments_count = len([
            x for x in zip('M' + ''.join(B_ann), ''.join(B_ann) + 'M')
            if x[0] != 'R' and x[1] == 'R'
        ])
        stats['M_count'] += M_count
        stats['R_count'] += R_count
        stats['R_segment_count'] += R_segments_count
        changes = [
            i for i, x in zip(range(len(B_ann) + 1),
                              zip('M' + ''.join(B_ann),
                                  ''.join(B_ann) + 'M'))
            if x[0] != x[1]
        ]
        R_segments = [(changes[i], changes[i + 1])
                      for i in range(0, len(changes) - (len(changes) % 2), 2)]
        assert R_segments_count == len(R_segments)
        for start, stop in R_segments:
            XX = 'M'
            YY = 'M'
            for i in range(start, stop):
                if X_ann[i] == 'R':
                    XX = 'R'
                if Y_ann[i] == 'R':
                    YY = 'R'
                assert B_ann[i] == 'R'
            stats[XX + YY] += 1
    with Open(out, 'w') as f:
        json.dump(stats, f, indent=4)
parser = \
    argparse.ArgumentParser(description='Create specific model from stats')
parser.add_argument('model', type=str,
                    help='File containing the skeleton of the model')
parser.add_argument('filenames', type=str,
                    help='File containing the needed list of files (json '
                         'containing the names of files for emissions, '
                         'transitions, and statistics from TRF)')
parser.add_argument('output', type=str,
                    help='Output file for the resulting model')
parser.add_argument('--parameters', type=str, default='{}',
                    help='Additional parameters (in json as dictionary).')
parser.add_argument('--simple_model', type=bool, default=False,
                    help='Whether it is a simple model or a repeat model')
parsed_arg = parser.parse_args()
with Open(parsed_arg.filenames, 'r') as f:
    files = dict([(x.split('.')[-2], x) for x in json.load(f)])
main(
    parsed_arg.model,
    json.loads(parsed_arg.parameters),
    files['emission'],
    files['transition'],
    files['trf_consensus'],
    files['trf_length'],
    files['trf_cover'],
    parsed_arg.output,
    parsed_arg.simple_model,
)
perf.printAll()
def main(correct_file, aln_file, output_file, interval=None):
    task_ids = [None]
    if os.environ.has_key('SGE_TASK_ID'):
        if os.environ['SGE_TASK_ID'] != 'undefined':
            sge_task_id = int(os.environ['SGE_TASK_ID'])
            if not os.environ.has_key('SGE_STEP_SIZE'):
                sge_step_size = 1
            else:
                sge_step_size = int(os.environ['SGE_STEP_SIZE'])
            sge_task_last = int(os.environ['SGE_TASK_LAST'])
            task_ids = range(sge_task_id,
                             min(sge_task_id + sge_step_size,
                                 sge_task_last + 1))
    if interval != None:
        task_ids = range(interval[0], interval[1] + 1)
    for task_id in task_ids:
        separator = ''
        output = {}
        for fun, tp in [(identity, 'standard'),
                        (expand_repeats, 'expanded_repeats'),
                        (remove_repeats, 'removed_repeats')]:
            try:
                for correct, alignment in zip(
                        Fasta.load(correct_file.format(id=task_id - 1),
                                   separator, Alignment),
                        Fasta.load(aln_file.format(id=task_id - 1),
                                   separator, Alignment)):
                    correct_len = len(correct.getCoordPairs(False))
                    total_len = (correct_len * 2
                                 - correct.sequences[0].count('-')
                                 - correct.sequences[2].count('-'))
                    ccc = fun(correct.getCoordPairs(False), correct)
                    if tp == 'removed_repeats':
                        correct_len = len(ccc)
                        total_len = 0
                        for v1, _, v2 in ccc:
                            if v1 >= 0:
                                total_len += 1
                            if v2 >= 0:
                                total_len += 1
                    acc = alignment.getCoordPairs(False)
                    cc = map(lambda x: (x[0], x[2]), ccc)
                    if len(acc[0]) == 3:
                        ac = map(lambda x: (x[0], x[2]), acc)
                    elif len(acc[0]) == 2:
                        ac = acc
                    else:
                        ac = None
                    c = set(cc)
                    a = set(ac)
                    intersect = c.intersection(a)
                    not_in_c = c.difference(a)
                    not_in_a = a.difference(c)
                    symm_diff = c.symmetric_difference(a)
                    score = 0
                    for v1, v2 in intersect:
                        if v1 >= 0:
                            score += 1
                        if v2 >= 0:
                            score += 1
                    # Distance (in the correct alignment) of every aligned
                    # pair from the nearest repeat column.
                    dists_correct = defaultdict(int)
                    dists_total = defaultdict(int)
                    position = dict()
                    dists = [99999999] * len(correct.sequences[1])
                    dst = 9999999
                    for x, ann, y in ccc:
                        position[(x, y)] = ann
                    for i in range(len(correct.sequences[1])):
                        if correct.sequences[1][i] == 'R':
                            dst = 0
                        else:
                            dst += 1
                        dists[i] = min(dists[i], dst)
                    for i in reversed(range(len(correct.sequences[1]))):
                        if correct.sequences[1][i] == 'R':
                            dst = 0
                        else:
                            dst += 1
                        dists[i] = min(dists[i], dst)
                    for pos in c:
                        d = dists[position[pos]]
                        if d == 0:
                            continue
                        dists_total[d] += 1
                        if pos in ac:
                            dists_correct[d] += 1

                    def getRepeatAnnotation(coord, annotation):
                        if len(coord[0]) != 3:
                            return set()
                        ret = set()
                        for x, a, y in coord:
                            if annotation[a] == 'R':
                                if x >= 0:
                                    ret.add((x, -1))
                                if y >= 0:
                                    ret.add((-1, y))
                        return ret

                    crann = getRepeatAnnotation(
                        correct.getCoordPairs(False), correct.sequences[1])
                    arann = getRepeatAnnotation(
                        alignment.getCoordPairs(False),
                        alignment.sequences[1])

                    def getRepeatBlocks(coord, annotation):
                        if len(coord[0]) != 3:
                            return set()
                        ret = set()
                        x = set()
                        y = set()
                        for _x, a, _y in coord:
                            if annotation[a] == 'R':
                                if _x >= 0:
                                    x.add(_x)
                                if _y >= 0:
                                    y.add(_y)
                            else:
                                if len(x) + len(y) > 0:
                                    if len(x) == 0:
                                        x.add(-1)
                                    if len(y) == 0:
                                        y.add(-1)
                                    ret.add(((min(x), max(x) + 1),
                                             (min(y), max(y) + 1)))
                                    x = set()
                                    y = set()
                        if len(x) + len(y) > 0:
                            if len(x) == 0:
                                x.add(-1)
                            if len(y) == 0:
                                y.add(-1)
                            ret.add(((min(x), max(x) + 1),
                                     (min(y), max(y) + 1)))
                            x = set()
                            y = set()
                        return ret

                    cbann = getRepeatBlocks(correct.getCoordPairs(False),
                                            correct.sequences[1])
                    abann = getRepeatBlocks(alignment.getCoordPairs(False),
                                            alignment.sequences[1])

                    def dst(x1, x2):
                        if x1 == -1:
                            return 0
                        return x2 - x1

                    def getPoints(s):
                        return sum([dst(x1, x2) + dst(y1, y2)
                                    for ((x1, x2), (y1, y2)) in s])

                    # Find long segments that are correctly aligned
                    cseg = [1 if x in c else 0 for x in ac]
                    seg_len = []
                    length = 0
                    segment_length_histogram = defaultdict(int)
                    for x in cseg:
                        if x == 0 and length != 0:
                            segment_length_histogram[length] += 1
                        length = length * x + x
                        seg_len.append(length)
                    if length > 0:
                        segment_length_histogram[length] += 1
                    # NOTE: overrides the point-based metric above, so the
                    # +Blk* values below count blocks instead of positions.
                    getPoints = len
                    output[tp] = {
                        'correct': correct_file,
                        'alignment': aln_file,
                        'c-length': len(cc),
                        'a-length': len(ac),
                        'intersect': len(intersect),
                        '%correct': (100.0 - float(len(intersect) * 100)
                                     / correct_len)
                                    if correct_len > 0 else 100,
                        '+mistakes': len(intersect),
                        '+len': correct_len,
                        '+RepTP': len(crann & arann),
                        '+RepTN': total_len - len(crann | arann),
                        '+RepFP': len(arann - crann),
                        '+RepFN': len(crann - arann),
                        '+BlkTP': getPoints(cbann & abann),
                        '+BlkTN': 0,
                        '+BlkFP': getPoints(abann - cbann),
                        '+BlkFN': getPoints(cbann - abann),
                        '%score': float(score) * 100 / total_len
                                  if total_len > 0 else 0,
                        'c-a': len(not_in_c),
                        'a-c': len(not_in_a),
                        'symmetric_difference': len(symm_diff),
                        'correct_len_histogram': segment_length_histogram,
                        '@+dists_correct': dists_correct,
                        '@+dists_total': dists_total,
                    }
                    if correct_len == 0:
                        del output[tp]['%correct']
                    if total_len == 0:
                        del output[tp]['%score']
            except IOError:
                pass
        with Open(output_file.format(id=task_id - 1), 'w') as f:
            json.dump(output, f, indent=4)
def main(model_file, additional_parameters, emissions_file, transitions_file,
         repeat_consensus_file, repeat_length_file, trf_cover_file,
         output_file, simple_model):
    loader = HMMLoader()
    with Open(trf_cover_file, 'r') as f:
        trf_cover = json.load(f)
    if not simple_model:
        repeat_probability = (float(trf_cover['R_segment_count'])
                              / (trf_cover['R_segment_count']
                                 + trf_cover['M_count']))
        repeat_count = sum([trf_cover[x] for x in ['RR', 'RM', 'MR']])
        repeat_repeat_probability = float(trf_cover['RR']) / repeat_count
        nothing_repeat_probability = float(trf_cover['MR']) / repeat_count
        repeat_nothing_probability = float(trf_cover['RM']) / repeat_count
        loader.addDictionary('trackemi', {"value": {
            'RR': repeat_repeat_probability,
            'RM': repeat_nothing_probability,
            'MR': nothing_repeat_probability,
        }})
    for k, v in additional_parameters.iteritems():
        loader.addDictionary(k, v)

    # Parse emissions
    with Open(emissions_file, 'r') as f:
        emm = normalize_dict(json.load(f))
    emm = [(ast.literal_eval(k), v) for k, v in emm.iteritems()]
    loader.addDictionary('MatchStateEmissions', {'value': emm})
    background_prob = defaultdict(int)
    for ((r1, r2), v) in emm:
        background_prob[r1] += v
        background_prob[r2] += v
    background_prob = \
        {'value': list(normalize_dict(background_prob).iteritems())}
    loader.addDictionary('background-probability', background_prob)

    # Parse transitions
    with Open(transitions_file, 'r') as f:
        __trans = json.load(f)
    trans = dict()
    for k, v in __trans.iteritems():
        trans[''.join(ast.literal_eval(k))] = v
    trans = normalize_tuple_dict(trans)
    if not simple_model:
        for k in trans:
            trans[k] *= (1 - repeat_probability)
        trans['MR'] = repeat_probability
        trans['XR'] = repeat_probability
        trans['YR'] = repeat_probability
        trans['RR'] = repeat_probability
        trans['RX'] = (1 - repeat_probability) / 3
        trans['RY'] = (1 - repeat_probability) / 3
        trans['RM'] = (1 - repeat_probability) / 3
    loader.addDictionary('trans', trans)

    # Parse emissions from TRF
    if not simple_model:
        loader.addFile(
            'consensus.js',
            os.path.relpath(os.path.abspath(repeat_consensus_file),
                            os.path.dirname(model_file)))
        loader.addFile('repeatlength.js',
                       os.path.abspath(repeat_length_file))
    model = loader.load(model_file)

    json_prep = {'model': model['model'].toJSON()}
    with Open(output_file, 'w') as f:
        json.dump(json_prep, f, indent=4)
    return output_file
def load(self, filename):
    self.filenameStack.append(filename)
    with Open(filename, "r") as f:
        r = json.load(f, object_hook=self.objectHook)
    self.filenameStack.pop()
    return r
def __loadJSON(filename):
    with Open(filename, 'r') as f:
        return json.load(f)
parser.add_argument('output_files', type=str, help='Output file')
parser.add_argument('--start', type=int, default=0,
                    help='Which files to select')
parser.add_argument('--step', type=int, default=-1,
                    help='How many files to select (-1 for all)')
parser.add_argument('--trf', type=toList, default=trf_paths,
                    help='Location of the tandem repeat finder binary')
parser.add_argument('--sequence_regexp', nargs='+', default=None,
                    help='Regular expressions used to select sequences.')
parser.add_argument('--alignment_regexp', default='',
                    help='Regular expression used to separate alignments '
                         'in the input file')
parsed_arg = parser.parse_args()

with Open(parsed_arg.files, 'r') as f:
    files = json.load(f)
start = parsed_arg.start
step = parsed_arg.step
if step < 0:
    step = len(files)
# The grid engine can always override these parameters
if os.environ.has_key('SGE_TASK_ID'):
    start = int(os.environ['SGE_TASK_ID'])
    if os.environ.has_key('SGE_STEP_SIZE'):
        step = int(os.environ['SGE_STEP_SIZE'])
output_files = main(files[start:start + step], parsed_arg.trf,
                    parsed_arg.alignment_regexp,
def main(n, datadir='data/train_sequences/', fname='simulated_alignment'):
    s1name = "sequence1"
    s2name = "sequence2"
    s3name = "sequence3"
    annotation_name = 'gene'
    alignment_extension = ".fa"
    annotations_extension = ".bed"
    config_extension = ".js"
    if len(sys.argv) > 1:
        n = int(sys.argv[1])
    if len(sys.argv) > 2:
        fname = sys.argv[2]
    master_gene_sequence = MarkovChain(P_START_GENE, P_STOP_GENE)
    human_delete_sequence = MarkovChain(P_START_DELETE, P_STOP_DELETE)
    mouse_delete_sequence = MarkovChain(P_START_DELETE, P_STOP_DELETE)
    horse_delete_sequence = MarkovChain(P_START_DELETE, P_STOP_DELETE)
    mutator_coin = BiasedCoin(P_NOT_MUTATE_GENE)
    master_gene = list()
    human_gene = list()
    mouse_gene = list()
    horse_gene = list()
    human_dna = list()
    mouse_dna = list()
    horse_dna = list()
    for i in range(n):
        # create master_gene item
        g = g2 = g3 = g4 = master_gene_sequence.get_state()
        # mutate master_gene item
        if g:
            g2 = mutator_coin.flip()
            g3 = mutator_coin.flip()
            g4 = mutator_coin.flip()
        dna_mutation_coin = create_dna_mutation_coin(g2 + g3)
        dna_mutation_coin2 = create_dna_mutation_coin(g2 + g4)
        # create DNA item
        c = c2 = c3 = random.randint(0, 3)
        c2 = mutate(c2, g2 + g3)
        c, c2, c3 = [DNA_CHARS[i] for i in (c, c2, c3)]
        if not dna_mutation_coin.flip():
            char_index = random.randint(0, 2)
            if DNA_CHARS[char_index] == c2:
                char_index = 3
            c2 = DNA_CHARS[char_index]
        if not dna_mutation_coin2.flip():
            char_index = random.randint(0, 2)
            if DNA_CHARS[char_index] == c3:
                char_index = 3
            c3 = DNA_CHARS[char_index]
        # delete DNA item
        if human_delete_sequence.get_state():
            c = '-'
        if mouse_delete_sequence.get_state():
            c2 = '-'
        if horse_delete_sequence.get_state():
            c3 = '-'
        # add items to the sequences
        master_gene.append(g)
        human_gene.append(g2)
        mouse_gene.append(g3)
        horse_gene.append(g4)
        human_dna.append(c)
        mouse_dna.append(c2)
        horse_dna.append(c3)

    # output
    s1fname = os.path.join(
        datadir,
        fname + '_' + s1name + '_' + annotation_name + annotations_extension)
    if os.path.isfile(s1fname):
        os.remove(s1fname)
    s2fname = os.path.join(
        datadir,
        fname + '_' + s2name + '_' + annotation_name + annotations_extension)
    if os.path.isfile(s2fname):
        os.remove(s2fname)
    s3fname = os.path.join(
        datadir,
        fname + '_' + s3name + '_' + annotation_name + annotations_extension)
    if os.path.isfile(s3fname):
        os.remove(s3fname)
    intervals1 = sequence_to_intervals(get_sequence(human_gene, human_dna),
                                       annotation_name)
    intervals2 = sequence_to_intervals(get_sequence(mouse_gene, mouse_dna),
                                       annotation_name)
    intervals3 = sequence_to_intervals(get_sequence(horse_gene, horse_dna),
                                       annotation_name)
    annotations = Annotations()
    annotations.setAnnotations([annotation_name])
    annotations.addSequences([s1name, s2name, s3name])
    annotations.addAnnotationFile(s1name, annotation_name, s1fname)
    annotations.addAnnotationFile(s2name, annotation_name, s2fname)
    # annotations.addAnnotationFile(s3name, annotation_name, s3fname)
    Fasta.save(
        [
            (s1name, ''.join(human_dna)),
            (s2name, ''.join(mouse_dna)),
            # (s3name, ''.join(horse_dna)),
        ],
        os.path.join(datadir, fname + alignment_extension))
    with track.new(s1fname, 'bed') as t:
        t.fields = ['start', 'end', 'name']
        t.write("chr1", intervals1)
    with track.new(s2fname, 'bed') as t:
        t.fields = ['start', 'end', 'name']
        t.write("chr1", intervals2)
    # with track.new(s3fname, 'bed') as t:
    #     t.fields = ['start', 'end', 'name']
    #     t.write("chr1", intervals3)
    with Open(os.path.join(datadir, fname + config_extension), "w") as f:
        json.dump(annotations.toJSON(), f)
def createKRepeatHMM(
    mathType,
    maxK,
    time,
    backgroundProb,
    indelProb,
    indelExtProb,
    repeatProb,
    endProb,
    initEndProb=None,
    silEndProb=None,
):
    if initEndProb == None:
        initEndProb = endProb
    if silEndProb == None:
        silEndProb = endProb
    tp = type(backgroundProb)
    if tp in [dict, defaultdict]:
        backgroundProb = list(backgroundProb.iteritems())
    probabilities = list(backgroundProb)
    alphabet = [x for x, _ in backgroundProb]
    for a in alphabet:
        for b in alphabet:
            probabilities.append((a + b, JCModelDist(a, b, time)))
    states = list()
    transitions = list()
    end_state = GeneralizedState(mathType)
    end_state.load({
        '__name__': 'GeneralizedState',
        'name': 'End',
        'startprob': mathType(0.0),
        'endprob': mathType(1.0),
        'emission': [('', mathType(1.0))],
        'durations': [(0, mathType(1.0))],
    })
    states.append(end_state)
    initTemplate = {
        '__name__': 'GeneralizedState',
        'name': 'I{}',
        'startprob': mathType(0.0),
        'endprob': mathType(0.0),
        'emission': backgroundProb,
        'durations': [(1, mathType(1.0))],
    }
    for order in range(1, maxK + 1):
        if order == 1:
            initTemplate['startprob'] = mathType(1.0)
        transitions.append({
            'from': 'I{}'.format(order),
            'to': 'R{}'.format(order),
            'prob': repeatProb,
        })
        transitions.append({
            'from': 'I{}'.format(order),
            'to': 'End',
            'prob': initEndProb,
        })
        self_prob = mathType(1.0)
        self_prob -= repeatProb + initEndProb
        if order < maxK:
            transitions.append({
                'from': 'I{}'.format(order),
                'to': 'I{}'.format(order + 1),
                'prob': self_prob,
            })
        initTemplate['name'] = 'I{}'.format(order)
        state = GeneralizedState(mathType)
        state.load(initTemplate)
        states.append(state)
    silentTemplate = {
        '__name__': 'GeneralizedState',
        'name': 'S{}{}',
        'startprob': mathType(0.0),
        'endprob': mathType(0.0),
        'emission': [('', mathType(1.0))],
        'durations': [(0, mathType(1.0))],
    }
    insertTemplate = {
        '__name__': 'GeneralizedState',
        'name': 'S{}{}',
        'startprob': mathType(0.0),
        'endprob': mathType(0.0),
        'emission': backgroundProb,
        'durations': [(1, mathType(1.0))],
    }
    for order in range(1, maxK):
        insertTemplate['name'] = 'SI{}'.format(order)
        state = GeneralizedState(mathType)
        state.load(insertTemplate)
        states.append(state)
        end_p = mathType(1.0)
        if order < maxK - 1:
            transitions.append({
                'from': 'SI{}'.format(order),
                'to': 'SI{}'.format(order + 1),
                'prob': indelExtProb,
            })
            end_p -= indelExtProb
        transitions.append({
            'from': 'SI{}'.format(order),
            'to': 'End',
            'prob': silEndProb,
        })
        end_p -= silEndProb
        transitions.append({
            'from': 'SI{}'.format(order),
            'to': 'R{}'.format(order + 1),
            'prob': end_p,
        })
        silentTemplate['name'] = 'SD{}'.format(order)
        state = GeneralizedState(mathType)
        state.load(silentTemplate)
        states.append(state)
        end_p = mathType(1.0)
        transitions.append({
            'from': 'SD{}'.format(order),
            'to': 'End',
            'prob': silEndProb,
        })
        end_p -= silEndProb
        if order < maxK - 1:
            transitions.append({
                'from': 'SD{}'.format(order + 1),
                'to': 'SD{}'.format(order),
                'prob': indelExtProb,
            })
        if order > 1:
            end_p -= indelExtProb
        transitions.append({
            'from': 'SD{}'.format(order),
            'to': 'R{}'.format(order),
            'prob': end_p,
        })
    repeatTemplate = {
        '__name__': 'HighOrderState',
        'name': 'R{}',
        'startprob': mathType(0.0),
        'endprob': mathType(0.0),
        'emission': probabilities,
        'durations': [(1, mathType(1.0))],
        'order': 0,
    }
    for order in range(1, maxK + 1):
        repeatTemplate['name'] = 'R{}'.format(order)
        repeatTemplate['order'] = order
        state = HighOrderState(mathType)
        state.load(repeatTemplate)
        states.append(state)
        stayprob = mathType(1.0)
        transitions.append({
            'from': 'R{}'.format(order),
            'to': 'End',
            'prob': endProb,
        })
        stayprob -= endProb
        if order > 1:
            transitions.append({
                'from': 'R{}'.format(order),
                'to': 'SD{}'.format(order - 1),
                'prob': indelProb,
            })
            stayprob -= indelProb
        if order < maxK:
            transitions.append({
                'from': 'R{}'.format(order),
                'to': 'SI{}'.format(order),
                'prob': indelProb,
            })
            stayprob -= indelProb
        transitions.append({
            'from': 'R{}'.format(order),
            'to': 'R{}'.format(order),
            'prob': stayprob,
        })
    hmm = GeneralizedHMM(mathType)
    hmm.load({
        '__name__': 'GeneralizedHMM',
        'states': states,
        'transitions': transitions,
    })
    for i in range(len(hmm.states)):
        hmm.states[i].normalizeTransitions()
    hmm.reorderStatesTopologically()

    def LogNumToJson(obj):
        if isinstance(obj, LogNum):
            return '{0}'.format(str(float(obj)))
        raise TypeError

    with Open('submodels/newK-{}-{}-{}-{}.js'.format(
            maxK, time, indelProb, repeatProb), 'w') as f:
        json.dump(hmm.toJSON(), f, indent=4, sort_keys=True,
                  default=LogNumToJson)
    return hmm
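# A hedged usage sketch for createKRepeatHMM above; the parameter values are
# illustrative assumptions, not calibrated settings. LogNum is the math type
# used elsewhere in this codebase. Note the function also writes the model
# JSON under submodels/ as a side effect.
#
# background = {'A': LogNum(0.25), 'C': LogNum(0.25),
#               'G': LogNum(0.25), 'T': LogNum(0.25)}
# hmm = createKRepeatHMM(
#     mathType=LogNum,
#     maxK=5,                    # longest repeat period modeled
#     time=0.1,                  # Jukes-Cantor evolutionary time
#     backgroundProb=background,
#     indelProb=LogNum(0.01),
#     indelExtProb=LogNum(0.1),
#     repeatProb=LogNum(0.01),
#     endProb=LogNum(0.001),
# )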