def main(input_filename, output_filename):
    """Realign the alignments belonging to this task with ``muscle``.

    Both filename arguments are templates that are expanded with
    ``str.format(id=task_id - 1)``.  When the ``SGE_TASK_ID`` environment
    variable is set, the task ids processed are
    ``SGE_TASK_ID .. SGE_TASK_ID + SGE_STEP_SIZE - 1`` capped at
    ``SGE_TASK_LAST`` (``SGE_STEP_SIZE`` defaults to 1); otherwise only task
    id 1 is processed.

    For every task id the first alignment loaded from the input file is
    written (names and sequences only) to a temporary file, ``muscle`` is run
    on that file with its stderr discarded, and the ``<input>.repeats`` file
    is copied to ``<output>.repeats``.

    The exit statuses of ``muscle`` and ``cp`` are ignored, as before.  The
    commands are now started without a shell, so filenames containing spaces
    or shell metacharacters are passed through verbatim.

    Raises:
        OSError: if the ``muscle`` or ``cp`` executable cannot be started.
    """
    # Local import: only this function needs it.
    import subprocess

    task_ids = [1]
    if 'SGE_TASK_ID' in os.environ:
        sge_task_id = int(os.environ['SGE_TASK_ID'])
        sge_step_size = int(os.environ.get('SGE_STEP_SIZE', 1))
        sge_task_last = int(os.environ['SGE_TASK_LAST'])
        task_ids = range(
            sge_task_id,
            min(sge_task_id + sge_step_size, sge_task_last + 1)
        )
        # The SGE variables only exist in this branch, so the debug output
        # has to live here as well.
        print('%s %s %s %s' % (
            task_ids, sge_task_id, sge_step_size, sge_task_last
        ))

    for task_id in task_ids:
        # Task ids are 1-based, the file templates are 0-based.
        inp_filename = input_filename.format(id=task_id - 1)
        out_filename = output_filename.format(id=task_id - 1)
        aln = list(Fasta.load(
            inp_filename,
            '',
            Alignment,
            sequence_selectors=['sequence1', 'sequence2']))[0]
        tmp_filename = get_temp_filename()
        Fasta.save(zip(aln.names, aln.sequences), tmp_filename)
        # Argument lists instead of a formatted shell string: no quoting
        # problems and no shell injection through filenames.
        with open(os.devnull, 'w') as devnull:
            subprocess.call(
                ['muscle', '-in', tmp_filename, '-out', out_filename],
                stderr=devnull,
            )
        subprocess.call(
            ['cp', inp_filename + '.repeats', out_filename + '.repeats']
        )
def create_fasta(fname, seq_basename, sequences, rename=False):
    """Save ``sequences`` through ``Fasta.save`` and return the names used.

    Args:
        fname: target passed on to ``Fasta.save``.
        seq_basename: prefix for generated names (only used if ``rename``).
        sequences: sized collection of mappings with a ``"sequence"`` entry
            and, unless ``rename`` is set, a ``"name"`` entry.
        rename: when true, name the records ``seq_basename + "1"``,
            ``seq_basename + "2"``, ... instead of using their own names.

    Returns:
        The list of names, in the order of ``sequences``.
    """
    if not rename:
        names = [record["name"] for record in sequences]
    else:
        names = [
            seq_basename + str(number)
            for number in range(1, len(sequences) + 1)
        ]
    assert len(names) == len(sequences)

    # Sequences are stored upper-cased.
    residues = [record["sequence"].upper() for record in sequences]
    Fasta.save(zip(names, residues), fname)
    return names
def create_fasta(fname, seq_basename, sequences, rename=False):
    """Hand upper-cased ``sequences`` to ``Fasta.save``; return their names.

    With ``rename`` set, records are named by appending their 1-based
    position to ``seq_basename``; otherwise each record's ``'name'`` entry
    is used.  Every record must provide a ``'sequence'`` entry.
    """
    count = len(sequences)
    if rename:
        names = [seq_basename + str(position + 1) for position in range(count)]
    else:
        names = [entry['name'] for entry in sequences]
    assert len(names) == count

    uppercased = [entry['sequence'].upper() for entry in sequences]
    named_sequences = zip(names, uppercased)
    Fasta.save(named_sequences, fname)
    return names
def main():
    """Sample alignments from a model and write them to disk.

    Command-line arguments are parsed with ``argparse``.  For each of the
    ``n_samples`` samples the function writes

    * the alignment (``sequence1``, ``alignment``, ``sequence2``) via
      ``Fasta.save`` to ``output_file_template.format(id=i)``, and
    * the collected repeat annotations as JSON to the same path with a
      ``.repeats`` suffix.

    Finally the list of generated alignment paths is dumped as JSON to the
    ``--output_files`` target.

    Returns:
        0 on success, 1 when the command-line arguments are invalid (an
        error message is written to stderr in that case).
    """
    parser = argparse.ArgumentParser(description='Sample alignments.')
    parser.add_argument('output_file_template', type=str,
                        help="Template for output file. Have to contain "
                             "string '{id}' as placeholder for sequence "
                             "number.")
    parser.add_argument('--output_files', type=str, default='-',
                        help='File where the list of output files will be '
                             'written.')
    parser.add_argument('--model', type=str,
                        default='data/models/repeatHMM.js',
                        help="Model file")
    parser.add_argument('--bind_file', nargs='*', default=[],
                        help='Replace filenames in the input_file model.')
    parser.add_argument('--bind_constant', nargs='*', default=[],
                        help='Replace constants in the input_file model.')
    parser.add_argument('--bind_constant_file', nargs='*', default=[],
                        help='Replace constants in the input_file model.')
    parser.add_argument('n_samples', type=int, help='Number of samples.')
    parser.add_argument('seq1_length', type=int,
                        help='Length of first sequence.')
    parser.add_argument('seq2_length', type=int,
                        help='Length of second sequence.')
    parsed_arg = parser.parse_args()

    # ====== Validate input parameters =========================================
    if parsed_arg.output_file_template.count("{id}") < 1:
        # The placeholder that is actually checked (and later formatted) is
        # "{id}", so that is what the message has to name.
        sys.stderr.write('ERROR: If sampling, output_file filename has to '
                         'contain at least one "{id}".\n')
        return 1
    if len(parsed_arg.bind_file) % 2 != 0:
        sys.stderr.write('ERROR: If binding files, the number of arguments '
                         'has to be divisible by 2\n')
        return 1
    if len(parsed_arg.bind_constant_file) % 2 != 0:
        sys.stderr.write('ERROR: If binding constants (as files), the number '
                         'of arguments has to be divisible by 2\n')
        return 1
    if len(parsed_arg.bind_constant) % 2 != 0:
        sys.stderr.write('ERROR: If binding constants, the number of'
                         ' arguments has to be divisible by 2\n')
        return 1

    # ====== Parse parameters ==================================================
    output_filename = parsed_arg.output_file_template
    output_files_filename = parsed_arg.output_files
    output_files = list()

    # ====== Load model ========================================================
    # The bind_* options are flat lists of (key, value) pairs.
    loader = HMMLoader()
    # Iterate over bind_file itself; using the length of bind_constant here
    # skipped file bindings or raised IndexError.
    for i in range(0, len(parsed_arg.bind_file), 2):
        loader.addFile(parsed_arg.bind_file[i], parsed_arg.bind_file[i + 1])
    for i in range(0, len(parsed_arg.bind_constant_file), 2):
        loader.addConstant(
            parsed_arg.bind_constant_file[i],
            loader.load(parsed_arg.bind_constant_file[i + 1])
        )
    for i in range(0, len(parsed_arg.bind_constant), 2):
        loader.addConstant(
            parsed_arg.bind_constant[i],
            loader.loads(parsed_arg.bind_constant[i + 1]),
        )
    model_filename = parsed_arg.model
    PHMM = loader.load(model_filename)["model"]

    # ====== Sample ============================================================
    PHMM.buildSampleTransitions()
    n_samples = parsed_arg.n_samples
    X_len = parsed_arg.seq1_length
    Y_len = parsed_arg.seq2_length
    dirname = os.path.dirname(output_filename)
    # dirname is '' for a template without a directory part; os.makedirs('')
    # would raise, so only create a directory when there is one to create.
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)

    for i in range(n_samples):
        # Resample until at least one annotated, non-empty emission was seen.
        done = False
        while not done:
            tandemRepeats = {'sequence1': [], 'sequence2': []}
            generated = PHMM.generateSequence((X_len, Y_len))
            X = ""
            Y = ""
            A = ""
            for (emission, state) in generated:
                # An emission is (x, y) or (x, y, annotation data).
                ann_data = None
                if len(emission) == 2:
                    x, y = emission
                else:
                    x, y, ann_data = emission
                dx, dy = len(x), len(y)
                if ann_data is not None:
                    # Coordinates are positions in the ungapped sequences.
                    xlen = len(X.replace('-', ''))
                    ylen = len(Y.replace('-', ''))
                    # NOTE(review): the meaning of the ann_data fields is not
                    # visible here; `/` is left untouched so the stored
                    # value is exactly what it was before.
                    if dx > 0:
                        tandemRepeats['sequence1'].append((
                            xlen, xlen + dx, dx / ann_data[1], ann_data[0], x
                        ))
                        done = True
                    if dy > 0:
                        tandemRepeats['sequence2'].append((
                            ylen, ylen + dy, dy / ann_data[2], ann_data[0], y
                        ))
                        done = True
                A += PHMM.states[state].getChar() * max(dx, dy)
                # Pad the shorter emission with gaps so all rows stay aligned
                # (a negative repeat count yields an empty string).
                X += x + ('-' * (dy - dx))
                Y += y + ('-' * (dx - dy))
            #if len(X) - X.count('-') > 2 * X_len:
            #    done = False
            #if len(Y) - Y.count('-') > 2 * Y_len:
            #    done = False
        aln = [("sequence1", X), ("alignment", A), ("sequence2", Y)]
        sample_filename = output_filename.format(id=i)
        # Context manager so the handle is flushed and closed immediately.
        with Open(sample_filename + '.repeats', 'w') as repeats_file:
            json.dump(tandemRepeats, repeats_file, indent=4)
        Fasta.save(aln, sample_filename)
        output_files.append(sample_filename)

    with Open(output_files_filename, 'w') as output_file_object:
        json.dump(output_files, output_file_object, indent=4)
    return 0
def select_sequences(inp_filename, out_filename, sequences):
    """Copy the selected sequences of the first alignment to a new file.

    All alignments are loaded from ``inp_filename`` with ``sequences`` as the
    sequence selectors; the names and sequences of the first one are then
    passed to ``Fasta.save`` for ``out_filename``.
    """
    loaded = list(
        Fasta.load(inp_filename, '', Alignment, sequence_selectors=sequences)
    )
    first_alignment = loaded[0]
    named_sequences = zip(first_alignment.names, first_alignment.sequences)
    Fasta.save(named_sequences, out_filename)
def main(n, datadir='data/train_sequences/', fname='simulated_alignment'):
    """Generate a simulated alignment with 'gene' annotations.

    ``n`` columns are generated for three sequences (the variables call them
    human, mouse and horse).  Only the first two sequences are written: the
    alignment goes to ``<datadir>/<fname>.fa``, one BED track per sequence to
    ``<datadir>/<fname>_<sequence>_gene.bed`` and the annotation
    configuration as JSON to ``<datadir>/<fname>.js``.

    ``sys.argv[1]`` and ``sys.argv[2]``, when present, override ``n`` and
    ``fname``.
    """
    s1name, s2name, s3name = "sequence1", "sequence2", "sequence3"
    annotation_name = 'gene'
    alignment_extension = ".fa"
    annotations_extension = ".bed"
    config_extension = ".js"

    # Command-line values take precedence over the arguments.
    argv = sys.argv
    if len(argv) > 1:
        n = int(argv[1])
    if len(argv) > 2:
        fname = argv[2]

    def substitute(base):
        # Draw one of the first three characters; if it equals the current
        # base, fall back to the fourth one.
        pick = random.randint(0, 2)
        if DNA_CHARS[pick] == base:
            pick = 3
        return DNA_CHARS[pick]

    def annotation_path(seq_name):
        return os.path.join(
            datadir,
            fname + '_' + seq_name + '_' + annotation_name +
            annotations_extension)

    def write_track(path, intervals):
        with track.new(path, 'bed') as t:
            t.fields = ['start', 'end', 'name']
            t.write("chr1", intervals)

    master_gene_sequence = MarkovChain(P_START_GENE, P_STOP_GENE)
    human_delete_sequence = MarkovChain(P_START_DELETE, P_STOP_DELETE)
    mouse_delete_sequence = MarkovChain(P_START_DELETE, P_STOP_DELETE)
    horse_delete_sequence = MarkovChain(P_START_DELETE, P_STOP_DELETE)
    mutator_coin = BiasedCoin(P_NOT_MUTATE_GENE)

    master_gene = []
    human_gene, mouse_gene, horse_gene = [], [], []
    human_dna, mouse_dna, horse_dna = [], [], []

    for _ in range(n):
        # Gene state of the master sequence; the species inherit it unless
        # the master is in a gene, in which case each one flips the coin.
        gene = master_gene_sequence.get_state()
        human_state = mouse_state = horse_state = gene
        if gene:
            human_state = mutator_coin.flip()
            mouse_state = mutator_coin.flip()
            horse_state = mutator_coin.flip()

        mouse_coin = create_dna_mutation_coin(human_state + mouse_state)
        horse_coin = create_dna_mutation_coin(human_state + horse_state)

        # One common base index; the mouse index additionally goes through
        # mutate() before being translated to a character.
        base_index = random.randint(0, 3)
        mouse_index = mutate(base_index, human_state + mouse_state)
        human_base = DNA_CHARS[base_index]
        mouse_base = DNA_CHARS[mouse_index]
        horse_base = DNA_CHARS[base_index]

        if not mouse_coin.flip():
            mouse_base = substitute(mouse_base)
        if not horse_coin.flip():
            horse_base = substitute(horse_base)

        # Deletions: every chain is advanced once per column.
        if human_delete_sequence.get_state():
            human_base = '-'
        if mouse_delete_sequence.get_state():
            mouse_base = '-'
        if horse_delete_sequence.get_state():
            horse_base = '-'

        master_gene.append(gene)
        human_gene.append(human_state)
        mouse_gene.append(mouse_state)
        horse_gene.append(horse_state)
        human_dna.append(human_base)
        mouse_dna.append(mouse_base)
        horse_dna.append(horse_base)

    # ---- output ----
    s1fname = annotation_path(s1name)
    s2fname = annotation_path(s2name)
    s3fname = annotation_path(s3name)
    # Remove annotation files left over from a previous run.
    for stale in (s1fname, s2fname, s3fname):
        if os.path.isfile(stale):
            os.remove(stale)

    intervals1 = sequence_to_intervals(
        get_sequence(human_gene, human_dna), annotation_name)
    intervals2 = sequence_to_intervals(
        get_sequence(mouse_gene, mouse_dna), annotation_name)
    # Computed but not written: the third sequence is left out of the output.
    intervals3 = sequence_to_intervals(
        get_sequence(horse_gene, horse_dna), annotation_name)

    annotations = Annotations()
    annotations.setAnnotations([annotation_name])
    annotations.addSequences([s1name, s2name, s3name])
    annotations.addAnnotationFile(s1name, annotation_name, s1fname)
    annotations.addAnnotationFile(s2name, annotation_name, s2fname)
    # annotations.addAnnotationFile(s3name, annotation_name, s3fname)

    alignment_rows = [
        (s1name, ''.join(human_dna)),
        (s2name, ''.join(mouse_dna)),
        # (s3name, ''.join(horse_dna))
    ]
    Fasta.save(alignment_rows, os.path.join(datadir, fname + alignment_extension))

    write_track(s1fname, intervals1)
    write_track(s2fname, intervals2)
    # write_track(s3fname, intervals3)

    config_path = os.path.join(datadir, fname + config_extension)
    with Open(config_path, "w") as f:
        json.dump(annotations.toJSON(), f)
def simulate(
    n,
    datadir='data/sequences/train_sequences/',
    fname='simulated_alignment',
):
    """Generate a simulated three-sequence alignment with 'gene' annotations.

    ``n`` columns are generated for three sequences (the variables call them
    human, mouse and horse).  The alignment is written to
    ``<datadir>/<fname>.fa``, one BED track per sequence to
    ``<datadir>/<fname>_<sequence>_gene.bed`` and the annotation
    configuration as JSON to ``<datadir>/<fname>.js``.
    """
    s1name, s2name, s3name = "sequence1", "sequence2", "sequence3"
    annotation_name = 'gene'
    alignment_extension = ".fa"
    annotations_extension = ".bed"
    config_extension = ".js"
    # (Overriding n / fname from sys.argv is disabled in this function.)

    def substitute(base):
        # Draw one of the first three characters; if it equals the current
        # base, fall back to the fourth one.
        pick = random.randint(0, 2)
        if DNA_CHARS[pick] == base:
            pick = 3
        return DNA_CHARS[pick]

    def annotation_path(seq_name):
        return os.path.join(
            datadir,
            fname+'_'+seq_name+'_'+annotation_name+annotations_extension
        )

    def write_track(path, intervals):
        with track.new(path, 'bed') as t:
            t.fields = ['start', 'end', 'name']
            t.write("chr1", intervals)

    master_gene_sequence = MarkovChain(P_START_GENE, P_STOP_GENE)
    human_delete_sequence = MarkovChain(P_START_DELETE, P_STOP_DELETE)
    mouse_delete_sequence = MarkovChain(P_START_DELETE, P_STOP_DELETE)
    horse_delete_sequence = MarkovChain(P_START_DELETE, P_STOP_DELETE)
    mutator_coin = BiasedCoin(P_NOT_MUTATE_GENE)

    master_gene = []
    human_gene, mouse_gene, horse_gene = [], [], []
    human_dna, mouse_dna, horse_dna = [], [], []

    for _ in range(n):
        # Gene state of the master sequence; the species inherit it unless
        # the master is in a gene, in which case each one flips the coin.
        gene = master_gene_sequence.get_state()
        human_state = mouse_state = horse_state = gene
        if gene:
            human_state = mutator_coin.flip()
            mouse_state = mutator_coin.flip()
            horse_state = mutator_coin.flip()

        first_coin = create_dna_mutation_coin(human_state + mouse_state)
        second_coin = create_dna_mutation_coin(human_state + horse_state)

        # All three sequences start from the same randomly drawn base.
        human_base = mouse_base = horse_base = DNA_CHARS[random.randint(0, 3)]

        # The first coin decides for human and mouse, the second for horse.
        if not first_coin.flip():
            human_base = substitute(human_base)
        if not first_coin.flip():
            mouse_base = substitute(mouse_base)
        if not second_coin.flip():
            horse_base = substitute(horse_base)

        # Deletions: every chain is advanced once per column.
        if human_delete_sequence.get_state():
            human_base = '-'
        if mouse_delete_sequence.get_state():
            mouse_base = '-'
        if horse_delete_sequence.get_state():
            horse_base = '-'

        master_gene.append(gene)
        human_gene.append(human_state)
        mouse_gene.append(mouse_state)
        horse_gene.append(horse_state)
        human_dna.append(human_base)
        mouse_dna.append(mouse_base)
        horse_dna.append(horse_base)

    # ---- output ----
    s1fname = annotation_path(s1name)
    s2fname = annotation_path(s2name)
    s3fname = annotation_path(s3name)
    # Remove annotation files left over from a previous run.
    for stale in (s1fname, s2fname, s3fname):
        if os.path.isfile(stale):
            os.remove(stale)

    intervals1 = sequence_to_intervals(
        get_sequence(human_gene, human_dna), annotation_name
    )
    intervals2 = sequence_to_intervals(
        get_sequence(mouse_gene, mouse_dna), annotation_name
    )
    intervals3 = sequence_to_intervals(
        get_sequence(horse_gene, horse_dna), annotation_name
    )

    annotations = Annotations()
    annotations.setAnnotations([annotation_name])
    annotations.addSequences([s1name, s2name, s3name])
    for seq_name, path in (
        (s1name, s1fname), (s2name, s2fname), (s3name, s3fname)
    ):
        annotations.addAnnotationFile(seq_name, annotation_name, path)

    alignment_rows = [
        (s1name, ''.join(human_dna)),
        (s2name, ''.join(mouse_dna)),
        (s3name, ''.join(horse_dna)),
    ]
    Fasta.save(alignment_rows, os.path.join(datadir, fname+alignment_extension))

    write_track(s1fname, intervals1)
    write_track(s2fname, intervals2)
    write_track(s3fname, intervals3)

    config_path = os.path.join(datadir, fname+config_extension)
    with Open(config_path, "w") as f:
        json.dump(annotations.toJSON(), f)
def select_sequences(inp_filename, out_filename, sequences):
    """Write the chosen sequences of the first alignment to ``out_filename``.

    ``sequences`` is forwarded to ``Fasta.load`` as ``sequence_selectors``.
    Every alignment in ``inp_filename`` is loaded, but only the first one is
    saved (as name/sequence pairs) through ``Fasta.save``.
    """
    alignments = Fasta.load(
        inp_filename, '', Alignment, sequence_selectors=sequences)
    aln = list(alignments)[0]
    pairs = zip(aln.names, aln.sequences)
    Fasta.save(pairs, out_filename)