Esempio n. 1
0
def main(input_filename, output_filename):
    """Re-align sampled alignments with MUSCLE, one file per task id.

    When the ``SGE_TASK_ID`` environment variable is set, the task ids to
    process are ``SGE_TASK_ID .. SGE_TASK_ID + SGE_STEP_SIZE - 1``, capped at
    ``SGE_TASK_LAST`` (``SGE_STEP_SIZE`` defaults to 1).  Otherwise only task
    id 1 is processed.

    Args:
        input_filename: Template of the input alignment file; formatted with
            ``id=task_id - 1``.
        output_filename: Template of the output file; formatted with
            ``id=task_id - 1``.

    Raises:
        KeyError: If ``SGE_TASK_ID`` is set but ``SGE_TASK_LAST`` is not.
    """
    task_ids = [1]
    # `in` replaces dict.has_key(), which no longer exists in Python 3.
    if 'SGE_TASK_ID' in os.environ:
        sge_task_id = int(os.environ['SGE_TASK_ID'])
        sge_step_size = int(os.environ.get('SGE_STEP_SIZE', 1))
        sge_task_last = int(os.environ['SGE_TASK_LAST'])
        task_ids = list(range(
            sge_task_id,
            min(sge_task_id + sge_step_size, sge_task_last + 1)
        ))
        # Single-argument print() behaves the same as a statement (Python 2)
        # and as a function (Python 3).
        print('%s %s %s %s' % (
            task_ids, sge_task_id, sge_step_size, sge_task_last
        ))
    for task_id in task_ids:
        # Task ids are 1-based, file ids are 0-based.
        inp_filename = input_filename.format(id=task_id - 1)
        out_filename = output_filename.format(id=task_id - 1)
        # Only the first alignment yielded by the loader is used.
        aln = list(Fasta.load(
            inp_filename,
            '',
            Alignment,
            sequence_selectors=['sequence1', 'sequence2']))[0]
        # NOTE(review): get_temp_filename() is defined elsewhere; whether the
        # temporary file is cleaned up is not visible from here -- confirm.
        tmp_filename = get_temp_filename()
        Fasta.save(zip(aln.names, aln.sequences), tmp_filename)
        # NOTE(review): filenames are interpolated into a shell command
        # unquoted, so paths containing spaces or shell metacharacters will
        # break these calls.  Exit statuses are ignored (best effort).
        os.system("muscle -in {inp} -out {out} 2> /dev/null".format(
            inp=tmp_filename,
            out=out_filename,
        ))
        # Carry the ".repeats" side file of the input over to the output.
        os.system("cp {inp}.repeats {out}.repeats".format(
            inp=inp_filename,
            out=out_filename,
        ))
Esempio n. 2
0
def create_fasta(fname, seq_basename, sequences, rename=False):
    """Save *sequences* to the FASTA file *fname* and return the names used.

    Args:
        fname: Destination passed to ``Fasta.save``.
        seq_basename: Prefix for generated names; only used when *rename*
            is true.
        sequences: Sized collection of mappings with a ``"sequence"`` key
            (and a ``"name"`` key when *rename* is false).
        rename: When true, records are named ``seq_basename + "1"``,
            ``seq_basename + "2"``, ... instead of by their own ``"name"``.

    Returns:
        The list of names written, in input order.
    """
    if not rename:
        names = [record["name"] for record in sequences]
    else:
        total = len(sequences)
        names = [seq_basename + str(number) for number in range(1, total + 1)]
    assert len(names) == len(sequences)
    # Sequences are stored upper-cased.
    upper_sequences = [record["sequence"].upper() for record in sequences]
    Fasta.save(zip(names, upper_sequences), fname)
    return names
Esempio n. 3
0
def create_fasta(fname, seq_basename, sequences, rename=False):
    """Write the given records to a FASTA file; return the record names.

    With ``rename`` set, names are generated as ``seq_basename`` followed by
    the 1-based position of the record; otherwise each record's ``'name'``
    entry is used.  The ``'sequence'`` entry of every record is upper-cased
    before it is handed to ``Fasta.save`` together with its name.
    """
    if rename:
        names = []
        for position in range(len(sequences)):
            names.append(seq_basename + str(position + 1))
    else:
        names = [entry['name'] for entry in sequences]
    assert (len(names) == len(sequences))
    upper_cased = [entry['sequence'].upper() for entry in sequences]
    pairs = zip(names, upper_cased)
    Fasta.save(pairs, fname)
    return names
Esempio n. 4
0
def main():
    """Sample alignments from a pair HMM and write them as FASTA files.

    Command-line arguments are parsed with argparse (see the ``help`` strings
    below).  For each of ``n_samples`` samples, sequences are generated from
    the loaded model until the sample contains at least one annotated
    emission, which is recorded as a tandem repeat.  Each sample is written
    to ``output_file_template.format(id=i)`` together with a JSON side file
    carrying the extra suffix ``.repeats``; the list of FASTA files written
    is dumped as JSON to the ``--output_files`` destination.

    Returns:
        0 on success, 1 when the command-line arguments fail validation.
    """
    parser = argparse.ArgumentParser(description='Sample alignments.')
    parser.add_argument('output_file_template', type=str,
                        help="Template for output file. Have to contain " +
                        "string '{id}' as placeholder for sequence number.")
    parser.add_argument('--output_files', type=str, help="File where the " +
                        'list of output files will be written.', default='-')
    parser.add_argument('--model', type=str,
                        default='data/models/repeatHMM.js', help="Model file")
    parser.add_argument('--bind_file', nargs='*', help='Replace filenames in '
                        + 'the input_file model.', default=[])
    parser.add_argument('--bind_constant', nargs='*', help='Replace constants'
                        + ' in the input_file model.', default=[])
    parser.add_argument('--bind_constant_file', nargs='*', help='Replace' +
                        ' constants in the input_file model.', default=[])
    parser.add_argument('n_samples', type=int, help='Number of samples.')
    parser.add_argument('seq1_length', type=int,
                        help='Length of first sequence.')
    parser.add_argument('seq2_length', type=int,
                        help='Length of second sequence.')
    parsed_arg = parser.parse_args()

    # ====== Validate input parameters =========================================

    if parsed_arg.output_file_template.count("{id}") < 1:
        # The placeholder that is actually checked (and later formatted) is
        # "{id}", so the message names it.
        sys.stderr.write('ERROR: If sampling, output_file filename has to '
                         'contain at least one "{id}".\n')
        return 1
    # Every --bind_* option takes (name, value) pairs.
    if len(parsed_arg.bind_file) % 2 != 0:
        sys.stderr.write('ERROR: If binding files, the number of arguments has'
                         + ' to be divisible by 2\n')
        return 1
    if len(parsed_arg.bind_constant_file) % 2 != 0:
        sys.stderr.write('ERROR: If binding constants (as files), the number of'
                         + ' arguments has to be divisible by 2\n')
        return 1
    if len(parsed_arg.bind_constant) % 2 != 0:
        sys.stderr.write('ERROR: If binding constants, the number of'
                         + ' arguments has to be divisible by 2\n')
        return 1

    # ====== Parse parameters ==================================================

    output_filename = parsed_arg.output_file_template
    output_files_filename = parsed_arg.output_files
    output_files = list()

    # ====== Load model ========================================================
    loader = HMMLoader()
    # Iterate over the bind_file pairs themselves (the loop bound used to be
    # the length of bind_constant, which skipped or over-ran the list).
    for i in range(0, len(parsed_arg.bind_file), 2):
        loader.addFile(parsed_arg.bind_file[i], parsed_arg.bind_file[i + 1])
    for i in range(0, len(parsed_arg.bind_constant_file), 2):
        loader.addConstant(
            parsed_arg.bind_constant_file[i],
            loader.load(parsed_arg.bind_constant_file[i + 1])
        )
    for i in range(0, len(parsed_arg.bind_constant), 2):
        loader.addConstant(
            parsed_arg.bind_constant[i],
            loader.loads(parsed_arg.bind_constant[i + 1]),
        )
    model_filename = parsed_arg.model
    PHMM = loader.load(model_filename)["model"]

    # ====== Sample ============================================================
    PHMM.buildSampleTransitions()
    n_samples = parsed_arg.n_samples
    X_len = parsed_arg.seq1_length
    Y_len = parsed_arg.seq2_length
    dirname = os.path.dirname(output_filename)
    # dirname is '' for a template without a directory part; makedirs('')
    # would raise, and there is nothing to create in that case.
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)
    for i in range(n_samples):
        # Re-sample until at least one annotated emission has been recorded.
        done = False
        while not done:
            tandemRepeats = {'sequence1': [], 'sequence2': []}
            generated = PHMM.generateSequence((X_len, Y_len))
            X = ""
            Y = ""
            A = ""
            for (emission, state) in generated:
                # An emission is (x, y) or (x, y, annotation data).
                ann_data = None
                if len(emission) == 2:
                    x, y = emission
                else:
                    x, y, ann_data = emission
                dx, dy = len(x), len(y)
                if ann_data is not None:
                    # Coordinates are positions in the ungapped sequences.
                    xlen = len(X.replace('-', ''))
                    ylen = len(Y.replace('-', ''))
                    # NOTE(review): "/" is integer division under Python 2
                    # when both operands are ints -- confirm the intent.
                    if dx > 0:
                        tandemRepeats['sequence1'].append((
                            xlen, xlen + dx, dx / ann_data[1], ann_data[0], x
                        ))
                        done = True
                    if dy > 0:
                        tandemRepeats['sequence2'].append((
                            ylen, ylen + dy, dy / ann_data[2], ann_data[0], y
                        ))
                        done = True
                A += PHMM.states[state].getChar() * max(dx, dy)
                # Pad the shorter emission with gaps so that all three rows
                # keep the same length (a negative repeat count yields '').
                X += x + ('-' * (dy - dx))
                Y += y + ('-' * (dx - dy))
            # A filter rejecting samples whose ungapped length exceeds twice
            # the requested length used to be applied here; it is disabled.
        aln = [("sequence1", X), ("alignment", A), ("sequence2", Y)]
        sample_filename = output_filename.format(id=i)
        # Use a context manager so the side file is closed (and flushed)
        # right away instead of whenever the handle is garbage-collected.
        with Open(sample_filename + '.repeats', 'w') as repeats_file:
            json.dump(tandemRepeats, repeats_file, indent=4)
        Fasta.save(aln, sample_filename)
        output_files.append(sample_filename)
    with Open(output_files_filename, 'w') as output_file_object:
        json.dump(output_files, output_file_object, indent=4)
    return 0
Esempio n. 5
0
def select_sequences(inp_filename, out_filename, sequences):
    """Copy selected sequences of the first alignment to a new FASTA file.

    Args:
        inp_filename: File loaded through ``Fasta.load``.
        out_filename: File the (name, sequence) pairs are saved to.
        sequences: Passed to ``Fasta.load`` as ``sequence_selectors``.

    Raises:
        IndexError: If the loader yields no alignment.
    """
    loaded = Fasta.load(
        inp_filename, '', Alignment, sequence_selectors=sequences
    )
    # The loader is consumed completely; only its first alignment is kept.
    first_alignment = list(loaded)[0]
    named_sequences = zip(first_alignment.names, first_alignment.sequences)
    Fasta.save(named_sequences, out_filename)
Esempio n. 6
0
def main(n, datadir='data/train_sequences/', fname='simulated_alignment'):
    """Simulate a gene-annotated alignment and write it with its annotations.

    For each of ``n`` alignment columns one gene flag per species and one
    DNA character per species ("human", "mouse", "horse") are drawn.  The
    following files are then written into ``datadir``:

    * ``<fname>.fa`` -- FASTA with ``sequence1`` (human) and ``sequence2``
      (mouse); the third sequence is commented out.
    * ``<fname>_<sequence>_gene.bed`` -- one track for each of the two
      written sequences.
    * ``<fname>.js`` -- JSON dump of ``annotations.toJSON()``.

    Args:
        n: Number of columns to simulate; overridden by ``sys.argv[1]`` when
            that is present.
        datadir: Directory the output files are placed in.
        fname: Base name of the output files; overridden by ``sys.argv[2]``
            when that is present.

    MarkovChain, BiasedCoin, create_dna_mutation_coin, mutate, get_sequence,
    sequence_to_intervals and the P_* / DNA_CHARS constants are defined
    elsewhere; their exact semantics are not visible from this function.
    """
    s1name = "sequence1"
    s2name = "sequence2"
    s3name = "sequence3"
    annotation_name = 'gene'

    alignment_extension = ".fa"
    annotations_extension = ".bed"
    config_extension = ".js"

    # Command-line values, when given, take precedence over the arguments.
    if len(sys.argv) > 1:
        n = int(sys.argv[1])
    if len(sys.argv) > 2:
        fname = sys.argv[2]

    master_gene_sequence = MarkovChain(P_START_GENE, P_STOP_GENE)
    # One independent deletion chain per species, all with the same
    # parameters.
    human_delete_sequence = MarkovChain(P_START_DELETE, P_STOP_DELETE)
    mouse_delete_sequence = MarkovChain(P_START_DELETE, P_STOP_DELETE)
    horse_delete_sequence = MarkovChain(P_START_DELETE, P_STOP_DELETE)
    mutator_coin = BiasedCoin(P_NOT_MUTATE_GENE)

    master_gene = list()
    human_gene = list()
    mouse_gene = list()
    horse_gene = list()

    human_dna = list()
    mouse_dna = list()
    horse_dna = list()

    for i in range(n):
        # create master_gene item
        g = g2 = g3 = g4 = master_gene_sequence.get_state()

        # mutate master_gene item
        # (the per-species flags are only re-drawn when the master flag is
        # truthy; otherwise they keep the master value)
        if g:
            g2 = mutator_coin.flip()
            g3 = mutator_coin.flip()
            g4 = mutator_coin.flip()

        # The coins are parameterised by the sum of two gene flags.
        dna_mutation_coin = create_dna_mutation_coin(g2 + g3)
        dna_mutation_coin2 = create_dna_mutation_coin(g2 + g4)

        # create DNA item
        # All three species start from the same random index into DNA_CHARS;
        # only the mouse index (c2) is passed through mutate().
        c = c2 = c3 = random.randint(0, 3)
        c2 = mutate(c2, g2 + g3)
        # NOTE: under Python 2 the comprehension variable `i` leaks and
        # rebinds the loop index; `i` is not read afterwards in this loop.
        c, c2, c3 = [DNA_CHARS[i] for i in (c, c2, c3)]
        if not dna_mutation_coin.flip():
            # Pick one of the first three characters; if that is the current
            # one, use the fourth instead, so the character always changes
            # (assuming DNA_CHARS holds four distinct entries -- TODO confirm).
            char_index = random.randint(0, 2)
            if DNA_CHARS[char_index] == c2:
                char_index = 3
            c2 = DNA_CHARS[char_index]

        if not dna_mutation_coin2.flip():
            # Same substitution scheme for the horse character.
            char_index = random.randint(0, 2)
            if DNA_CHARS[char_index] == c3:
                char_index = 3
            c3 = DNA_CHARS[char_index]

        # delete DNA item
        # (a truthy deletion state turns the character into a gap; the human
        # character `c` is never substituted, only possibly replaced by a gap)
        if human_delete_sequence.get_state():
            c = '-'
        if mouse_delete_sequence.get_state():
            c2 = '-'
        if horse_delete_sequence.get_state():
            c3 = '-'

        # add items to sequence
        master_gene.append(g)
        human_gene.append(g2)
        mouse_gene.append(g3)
        horse_gene.append(g4)

        human_dna.append(c)
        mouse_dna.append(c2)
        horse_dna.append(c3)

    # output
    # Existing annotation files are removed before new ones are created.  The
    # sequence3 file is removed as well, although it is not rewritten below.
    s1fname = os.path.join(
        datadir,
        fname + '_' + s1name + '_' + annotation_name + annotations_extension)
    if os.path.isfile(s1fname):
        os.remove(s1fname)
    s2fname = os.path.join(
        datadir,
        fname + '_' + s2name + '_' + annotation_name + annotations_extension)
    if os.path.isfile(s2fname):
        os.remove(s2fname)
    s3fname = os.path.join(
        datadir,
        fname + '_' + s3name + '_' + annotation_name + annotations_extension)
    if os.path.isfile(s3fname):
        os.remove(s3fname)

    intervals1 = sequence_to_intervals(get_sequence(human_gene, human_dna),
                                       annotation_name)
    intervals2 = sequence_to_intervals(get_sequence(mouse_gene, mouse_dna),
                                       annotation_name)
    # intervals3 is computed but not written: the sequence3 output below is
    # commented out.
    intervals3 = sequence_to_intervals(get_sequence(horse_gene, horse_dna),
                                       annotation_name)

    # All three sequence names are registered, but annotation files are added
    # for the first two only.
    annotations = Annotations()
    annotations.setAnnotations([annotation_name])
    annotations.addSequences([s1name, s2name, s3name])
    annotations.addAnnotationFile(s1name, annotation_name, s1fname)
    annotations.addAnnotationFile(s2name, annotation_name, s2fname)
    # annotations.addAnnotationFile(s3name, annotation_name,  s3fname)

    Fasta.save(
        [
            (s1name, ''.join(human_dna)),
            (s2name, ''.join(mouse_dna)),
            # (s3name, ''.join(horse_dna))
        ],
        os.path.join(datadir, fname + alignment_extension))

    # Every track is written under the chromosome name "chr1".
    with track.new(s1fname, 'bed') as t:
        t.fields = ['start', 'end', 'name']
        t.write("chr1", intervals1)
    with track.new(s2fname, 'bed') as t:
        t.fields = ['start', 'end', 'name']
        t.write("chr1", intervals2)
    # with track.new(s3fname, 'bed') as t:
    #     t.fields = ['start', 'end', 'name']
    #     t.write("chr1", intervals3)

    with Open(os.path.join(datadir, fname + config_extension), "w") as f:
        json.dump(annotations.toJSON(), f)
Esempio n. 7
0
def simulate(
    n,
    datadir='data/sequences/train_sequences/',
    fname='simulated_alignment',
):
    """Simulate a three-sequence, gene-annotated alignment and write it out.

    For each of ``n`` alignment columns one gene flag per species and one
    DNA character per species ("human", "mouse", "horse") are drawn.  The
    following files are then written into ``datadir``:

    * ``<fname>.fa`` -- FASTA with ``sequence1``, ``sequence2`` and
      ``sequence3``.
    * ``<fname>_<sequence>_gene.bed`` -- one track per sequence.
    * ``<fname>.js`` -- JSON dump of ``annotations.toJSON()``.

    Args:
        n: Number of columns to simulate.
        datadir: Directory the output files are placed in.
        fname: Base name of the output files.

    MarkovChain, BiasedCoin, create_dna_mutation_coin, get_sequence,
    sequence_to_intervals and the P_* / DNA_CHARS constants are defined
    elsewhere; their exact semantics are not visible from this function.
    """
    s1name = "sequence1"
    s2name = "sequence2"
    s3name = "sequence3"
    annotation_name = 'gene'

    alignment_extension = ".fa"
    annotations_extension = ".bed"
    config_extension = ".js"

    # if len(sys.argv) > 1:
    #     n = int(sys.argv[1])
    # if len(sys.argv) > 2:
    #     fname = sys.argv[2]

    master_gene_sequence = MarkovChain(P_START_GENE, P_STOP_GENE)
    # One independent deletion chain per species, all with the same
    # parameters.
    human_delete_sequence = MarkovChain(P_START_DELETE, P_STOP_DELETE)
    mouse_delete_sequence = MarkovChain(P_START_DELETE, P_STOP_DELETE)
    horse_delete_sequence = MarkovChain(P_START_DELETE, P_STOP_DELETE)
    mutator_coin = BiasedCoin(P_NOT_MUTATE_GENE)

    master_gene = list()
    human_gene = list()
    mouse_gene = list()
    horse_gene = list()

    human_dna = list()
    mouse_dna = list()
    horse_dna = list()

    for i in range(n):
        # create master_gene item
        g = g2 = g3 = g4 = master_gene_sequence.get_state()

        # mutate master_gene item
        # (the per-species flags are only re-drawn when the master flag is
        # truthy; otherwise they keep the master value)
        if g:
            g2 = mutator_coin.flip()
            g3 = mutator_coin.flip()
            g4 = mutator_coin.flip()

        # The coins are parameterised by the sum of two gene flags.
        dna_mutation_coin = create_dna_mutation_coin(g2 + g3)
        dna_mutation_coin2 = create_dna_mutation_coin(g2 + g4)

        # create DNA item
        # All species start from the same random character; `c` itself is
        # not read again below.
        c = c1 = c2 = c3 = DNA_CHARS[random.randint(0, 3)]
        # Substitution scheme used three times below: pick one of the first
        # three characters; if that is the current one, use the fourth
        # instead, so the character always changes (assuming DNA_CHARS holds
        # four distinct entries -- TODO confirm).
        # NOTE(review): human (c1) and mouse (c2) both flip
        # dna_mutation_coin; only horse (c3) uses dna_mutation_coin2 --
        # confirm this is intended.
        if not dna_mutation_coin.flip():
            char_index = random.randint(0, 2)
            if DNA_CHARS[char_index] == c1:
                char_index = 3
            c1 = DNA_CHARS[char_index]

        if not dna_mutation_coin.flip():
            char_index = random.randint(0, 2)
            if DNA_CHARS[char_index] == c2:
                char_index = 3
            c2 = DNA_CHARS[char_index]

        if not dna_mutation_coin2.flip():
            char_index = random.randint(0, 2)
            if DNA_CHARS[char_index] == c3:
                char_index = 3
            c3 = DNA_CHARS[char_index]

        # delete DNA item
        # (a truthy deletion state turns the character into a gap)
        if human_delete_sequence.get_state():
            c1 = '-'
        if mouse_delete_sequence.get_state():
            c2 = '-'
        if horse_delete_sequence.get_state():
            c3 = '-'

        # add items to sequence
        master_gene.append(g)
        human_gene.append(g2)
        mouse_gene.append(g3)
        horse_gene.append(g4)

        human_dna.append(c1)
        mouse_dna.append(c2)
        horse_dna.append(c3)

    # output
    # Existing annotation files are removed before new ones are created.
    s1fname = os.path.join(
        datadir, fname+'_'+s1name+'_'+annotation_name+annotations_extension
    )
    if os.path.isfile(s1fname):
        os.remove(s1fname)
    s2fname = os.path.join(
        datadir, fname+'_'+s2name+'_'+annotation_name+annotations_extension
    )
    if os.path.isfile(s2fname):
        os.remove(s2fname)
    s3fname = os.path.join(
        datadir, fname+'_'+s3name+'_'+annotation_name+annotations_extension
    )
    if os.path.isfile(s3fname):
        os.remove(s3fname)

    intervals1 = sequence_to_intervals(
        get_sequence(human_gene, human_dna), annotation_name
    )
    intervals2 = sequence_to_intervals(
        get_sequence(mouse_gene, mouse_dna), annotation_name
    )
    intervals3 = sequence_to_intervals(
        get_sequence(horse_gene, horse_dna), annotation_name
    )

    # Register the three sequences and the annotation file of each.
    annotations = Annotations()
    annotations.setAnnotations([annotation_name])
    annotations.addSequences([s1name, s2name, s3name])
    annotations.addAnnotationFile(s1name, annotation_name,  s1fname)
    annotations.addAnnotationFile(s2name, annotation_name,  s2fname)
    annotations.addAnnotationFile(s3name, annotation_name,  s3fname)

    Fasta.save(
        [
            (s1name, ''.join(human_dna)),
            (s2name, ''.join(mouse_dna)),
            (s3name, ''.join(horse_dna))
        ],
        os.path.join(datadir, fname+alignment_extension)
    )

    # Every track is written under the chromosome name "chr1".
    with track.new(s1fname, 'bed') as t:
        t.fields = ['start', 'end', 'name']
        t.write("chr1", intervals1)
    with track.new(s2fname, 'bed') as t:
        t.fields = ['start', 'end', 'name']
        t.write("chr1", intervals2)
    with track.new(s3fname, 'bed') as t:
        t.fields = ['start', 'end', 'name']
        t.write("chr1", intervals3)

    with Open(os.path.join(datadir, fname+config_extension), "w") as f:
        json.dump(annotations.toJSON(), f)
0
def select_sequences(inp_filename, out_filename, sequences):
    """Save the chosen sequences of the first loaded alignment as FASTA.

    ``sequences`` is forwarded to ``Fasta.load`` as ``sequence_selectors``.
    Every alignment is loaded, the first one is kept, and its names are
    paired with its sequences and written to ``out_filename``.
    """
    all_alignments = list(
        Fasta.load(inp_filename, '', Alignment, sequence_selectors=sequences)
    )
    chosen = all_alignments[0]
    Fasta.save(zip(chosen.names, chosen.sequences), out_filename)