Ejemplo n.º 1
0
    def realign(self, x, dx, y, dy):
        """Build the pairwise alignment described by the Viterbi path.

        The path is loaded from the JSON file named by
        ``self.io_files['input']['viterbi_path']`` when that entry exists;
        otherwise it is computed by the model from ``self.table`` and, when
        ``self.io_files['output']`` has a ``'viterbi_path'`` entry, written
        to that file as JSON.

        The ``x``, ``dx``, ``y`` and ``dy`` arguments are accepted for
        interface compatibility but are not used here.

        Returns:
            A list of three ``(name, text)`` pairs: the gapped X sequence,
            the per-column state annotation and the gapped Y sequence.
        """
        io_files = self.io_files
        if 'viterbi_path' not in io_files['input']:
            path = self.model.getViterbiPath(self.table)
            if 'viterbi_path' in io_files['output']:
                # Use the key that was just tested; looking up
                # 'viterbi_path.js' here raised KeyError.
                with Open(io_files['output']['viterbi_path'], 'w') as f:
                    json.dump(jsonize(path), f, indent=4)
        else:
            # Close the input file deterministically instead of leaking it.
            with Open(io_files['input']['viterbi_path']) as f:
                raw_path = json.load(f)
            path = dejsonize_struct(
                raw_path,
                (list, (tuple, int, (tuple, int),
                        (tuple, int), lambda value: LogNum(value, False))))

        # Collect the pieces and join once instead of repeated concatenation.
        x_parts = []
        y_parts = []
        a_parts = []

        for (state, (_x, _y), (_dx, _dy), _) in path:
            # Each step consumes _dx symbols of X and _dy symbols of Y; the
            # shorter side is padded with gaps to the width of the step.
            width = max(_dx, _dy)
            x_parts.append(self.X[_x - _dx:_x] + ('-' * (width - _dx)))
            y_parts.append(self.Y[_y - _dy:_y] + ('-' * (width - _dy)))
            a_parts.append(self.model.states[state].getChar() * width)

        return [
            (self.X_name, "".join(x_parts)),
            ("viterbi annotation of " + self.X_name + " and " + self.Y_name,
             "".join(a_parts)),
            (self.Y_name, "".join(y_parts)),
        ]
Ejemplo n.º 2
0
def main(input_file, index1, index2, emissionOutput, transitionOutput):
    """Count emission and transition statistics over pairwise alignments.

    For every alignment loaded from ``input_file`` the sequences at
    positions ``index1`` and ``index2`` are paired column by column.  Each
    column that ``skip`` does not reject is counted under
    ``str(upper(column))``; consecutive column types (as produced by
    ``getType`` and ordered by ``sortTypes``) are counted as transitions.
    Both tables are written as JSON to ``emissionOutput`` and
    ``transitionOutput``.
    """
    emissions = defaultdict(int)
    transitions = defaultdict(int)

    # NOTE(review): X and Y deliberately persist across alignments, as in the
    # original; an alignment lacking one of the indices reuses the previous
    # sequence - confirm that this is intended.
    X = None
    Y = None

    def aggregate(X, Y):
        # The column pairs are traversed twice, so materialise them; a lazy
        # zip would be exhausted after the first loop.
        pairs = list(zip(X, Y))
        for p in pairs:
            if skip(p):
                continue
            emissions[str(upper(p))] += 1
        Types = list(sortTypes([getType(x) for x in pairs if x != ('-', '-')]))
        for p in zip(Types, Types[1:]):
            transitions[str(p)] += 1

    for aln in Fasta.load(input_file, '[.][0-9]+$'):
        for count, (_, sequence) in enumerate(aln):
            if count == index1:
                X = sequence
            if count == index2:
                Y = sequence
        aggregate(X, Y)

    with Open(emissionOutput, 'w') as f:
        json.dump(emissions, f, indent=4)

    with Open(transitionOutput, 'w') as f:
        json.dump(transitions, f, indent=4)
Ejemplo n.º 3
0
    def computeViterbiTable(self):
        """Populate ``self.table`` with the Viterbi table.

        When ``self.io_files['input']`` has a ``'viterbi'`` entry the table
        is loaded from that JSON file.  Otherwise emission tables are
        precomputed for every ``ClassifierState`` of the model, the table is
        computed over the full ``self.X`` and ``self.Y`` and, when
        ``self.io_files['output']`` has a ``'viterbi'`` entry, written to
        that file as JSON.
        """
        io_files = self.io_files
        if 'viterbi' not in io_files['input']:
            for state in self.model.states:
                if isinstance(state, ClassifierState):
                    emission_table = SequenceTablePrecompute(
                        state.clf, self.positionGenerator, self.X, self.Y,
                        state.ann_x, state.ann_y)
                    emission_table.compute()
                    state.set_emission_table(emission_table)

            self.table = self.model.getViterbiTable(
                self.X,
                0,
                len(self.X),
                self.Y,
                0,
                len(self.Y),
                positionGenerator=self.positionGenerator)
            if 'viterbi' in io_files['output']:
                # Serialise only when the result is actually written.
                with Open(io_files['output']['viterbi'], 'w') as f:
                    json.dump(jsonize(self.table), f, indent=4)
        else:
            # Close the input file deterministically instead of leaking it.
            with Open(io_files['input']['viterbi']) as f:
                raw_table = json.load(f)
            self.table = dejsonize_struct(
                raw_table,
                (list, (dict, int,
                        (dict, (tuple, int),
                         (tuple, lambda value: LogNum(value, False), int)))))
Ejemplo n.º 4
0
def main(alignment, working_directory, split_count, output_file,
         seq_selectors):
    """Split a MAF alignment into ``split_count`` FASTA files.

    Only sequences whose source name matches one of the ``seq_selectors``
    regular expressions are kept, and only alignments with exactly two such
    sequences are written.  The output files are created inside
    ``<working_directory>/<base>_parallel`` and the list of their names is
    written as JSON to ``output_file``.

    Raises:
        AssertionError: when the alignment file is not a ``.maf`` (optionally
            followed by ``.gz``); FASTA input is not supported yet.
    """
    # Compile into a real list: the selectors are reused for every sequence,
    # which a lazy map object would not allow.
    seq_selectors = [re.compile(selector) for selector in seq_selectors]
    # TODO: check whether the output already exists and, if so, do not
    # redo the work

    if not os.path.exists(working_directory):
        os.makedirs(working_directory)
    # Prepare alignment into right format

    filename = os.path.basename(alignment)
    name_parts = filename.split('.')
    extension = name_parts[-1].lower()
    # NOTE(review): [:-2] (and [:-3] below) drops one more component than
    # the extension itself, so "x.maf" yields an empty base - confirm that
    # this is the intended naming before changing it.
    base = '.'.join(name_parts[:-2])
    if extension == 'gz':
        extension = name_parts[-2].lower()
        base = '.'.join(name_parts[:-3])

    if extension == 'fa':
        # fasta_generator = alignment
        # Raised explicitly so the check survives "python -O".
        raise AssertionError('FASTA input is not supported')
    elif extension == 'maf':
        fasta_generator = Maf2FastaGen(alignment)
    else:
        raise AssertionError(
            'unsupported alignment extension: {0}'.format(extension))

    parallel_dir = '{dir}/{base}_parallel'.format(
        dir=working_directory,
        base=base,
    )

    if not os.path.exists(parallel_dir):
        os.makedirs(parallel_dir)

    filenames = [
        '{dir}/alignment_{index:04d}.fa'.format(dir=parallel_dir, index=i + 1)
        for i in range(split_count)
    ]

    files = []
    try:
        for name in filenames:
            files.append(Open(name, 'w'))

        for aln in fasta_generator:
            new_aln = []
            for src, aln_count, text in aln:
                if any(selector.match(src) is not None
                       for selector in seq_selectors):
                    new_aln.append(
                        '>{0}.{1}\n{2}\n'.format(src, aln_count, text))
            if len(new_aln) == 2:
                # NOTE(review): every record starts with '>', so this key is
                # constant and the (stable) sort keeps the original order.
                new_aln.sort(key=lambda x: x[0])
                # aln_count is the value left by the inner loop above.
                files[aln_count % split_count].writelines(new_aln)
    finally:
        # Close every file that was opened, even when writing fails.
        for handle in files:
            handle.close()

    with Open(output_file, 'w') as f:
        json.dump(filenames, f, indent=4)
Ejemplo n.º 5
0
def main(config_file, output_file):
    """Translate a JSON job description into a bash script of qsub calls.

    ``config_file`` maps job names to dictionaries with a mandatory ``cmd``
    and the optional keys ``depends``, ``array``, ``stdout``, ``stderr``,
    ``resources`` and ``params``.  Jobs are emitted in the order given by
    ``toposort`` over the ``depends`` graph; each line stores the submitted
    job id in a shell variable named after the job so that later jobs can
    reference it through ``-hold_jid``.

    NOTE: job names, paths and parameters are interpolated into the shell
    script without escaping, so the configuration must come from a trusted
    source.
    """
    with Open(config_file, 'r') as f:
        config = json.load(f)

    graph = dict()
    for name, item in config.items():
        graph[name] = item['depends'] if "depends" in item else []

    with Open(output_file, 'w') as f:

        f.write('#!/bin/bash\n\n')
        for job in toposort(graph):
            item = config[job]
            param = ['-terse', '-cwd']

            # An empty dependency list must not emit "-hold_jid" followed by
            # an empty argument, which would swallow the next option.
            if "depends" in item and item['depends']:
                param.append('-hold_jid')
                param.append(','.join(['$' + x for x in item['depends']]))

            if "array" in item:
                param.append('-t')
                assert(len(item['array']) > 0 and len(item['array']) < 4)
                # Up to three numbers are rendered as "first-last:step".
                param.append(
                    ''.join([
                        ''.join(x)
                        for x in zip(['', '-', ':'], map(str, item['array']))
                    ])
                )
            if 'stdout' in item:
                param.append('-o')
                param.append("'{}'".format(item['stdout']))
            if 'stderr' in item:
                param.append('-e')
                param.append("'{}'".format(item['stderr']))
            if "resources" in item:
                assert(len(item['resources']) > 0)
                param.append('-l')
                param.append(','.join([
                    '='.join(x) for x in item['resources'].items()
                ]))

            if "params" in item:
                assert(len(item['params']) > 0)
                param.append(' '.join(item['params']))
            query = ("{jobname}=`qsub -N '{name}' {parameters} {command} "
                     "| sed -e 's/[.].*$//'`").format(
                name=job,
                jobname=job,
                parameters=' '.join(param),
                command=cmd_to_string(item['cmd'])
            )
            f.write(query + '\n')
Ejemplo n.º 6
0
def main(args_input, args_output, interval, ignore):
    """Aggregate per-task JSON statistics into a single JSON file.

    For every task id in ``interval[0] - 1 .. interval[1] - 1`` that is not
    in ``ignore``, each template in ``args_input`` is formatted with
    ``id=task_id``, loaded as JSON and merged into the aggregate with
    ``add_dictionaries``.  ``compute_stats`` is then applied and the result
    written to ``args_output``.
    """
    aggr = dict()
    for task_id in range(interval[0] - 1, interval[1]):
        # HACK kept from the original: task 68 is dropped when the last input
        # template contains '0002'.  The original read the inner loop
        # variable ``filename`` here, which is unbound until the inner loop
        # has run once; the last template is the value it otherwise held.
        if (task_id == 68 and args_input
                and args_input[-1].count('0002') > 0):
            print('removing task_id 68')
            continue
        if task_id in ignore:
            print('removing task {}'.format(task_id))
            continue
        for filename in args_input:
            with Open(filename.format(id=task_id), 'r') as f:
                data = json.load(f)
                add_dictionaries(aggr, data)
    compute_stats(aggr)
    # Close the output file deterministically instead of leaking it.
    with Open(args_output, 'w') as f:
        json.dump(aggr, f, indent=4)
Ejemplo n.º 7
0
def main(files_filename, output_filename, suffix, base_dir):
    """Concatenate alignment pieces back into whole alignments.

    ``files_filename`` is a JSON list of piece filenames in which an empty
    string terminates one alignment.  For a piece whose name does not
    contain ``'keep'`` the last two characters of the name are replaced by
    ``suffix`` (and the directory by ``base_dir`` when given); if that file
    is missing or empty the original piece is used instead and its
    annotation row is replaced by dots.  Each completed alignment is
    appended to ``output_filename``.

    NOTE(review): pieces after the last empty-string terminator are never
    written - confirm that the list always ends with a terminator.
    """
    x_parts = []
    y_parts = []
    a_parts = []
    with Open(output_filename, 'w') as ff:
        # Close the list file deterministically instead of leaking it.
        with Open(files_filename) as list_file:
            files = json.load(list_file)
        total = len(files)
        done = 0
        for filename in files:
            if done % 100 == 0:
                # '.2f' prints a plain percentage; the former '.2' switched
                # to scientific notation for values of 10 and above.
                print('{}/{} {:.2f}%'.format(done, total,
                                             100.0 * done / total))
            if filename == "":
                Fasta.saveAlignmentPiece(
                    [(X_name, "".join(x_parts)),
                     (Y_name, "".join(y_parts)),
                     (A_name, "".join(a_parts))],
                    ff)
                x_parts = []
                y_parts = []
                a_parts = []
                continue
            done += 1
            old_filename = filename
            keep = False

            if filename.count('keep') == 0:
                filename = filename[:-2] + suffix
                if base_dir is not None:
                    filename = base_dir + '/' + filename.split('/')[-1]
                try:
                    with Open(filename, 'r') as f:
                        content_length = len(''.join(f).strip())
                    if content_length == 0:
                        # Empty replacement: fall back to the original piece.
                        filename = old_filename
                        keep = True
                except IOError:
                    # Missing replacement: fall back to the original piece.
                    filename = old_filename
                    keep = True
            if filename.count('keep') > 0:
                keep = True
            aln = list(Fasta.load(filename, ''))[0]
            assert(len(aln) == 3)
            assert(len(aln[0][1]) == len(aln[1][1]) == len(aln[2][1]))
            x_parts.append(aln[0][1])
            if keep:
                a_parts.append('.' * len(aln[0][1]))
            else:
                a_parts.append(aln[1][1])
            y_parts.append(aln[2][1])
            X_name = aln[0][0]
            A_name = aln[1][0]
            Y_name = aln[2][0]
Ejemplo n.º 8
0
def realign_file(args, model, output_filename, alignment_filename):
    """Realign every alignment of ``alignment_filename`` with ``model``.

    Annotations are computed once for the whole file (with
    ``args.annotation_model`` when set, otherwise with ``model``).  Each
    alignment is masked, realigned by a fresh ``args.algorithm()`` instance,
    unmasked and appended to ``output_filename``.  When ``args.draw`` is
    non-empty the alignment is also drawn.

    Returns:
        ``1`` when an alignment has fewer than two sequences, otherwise
        ``None``.
    """
    # begin of HACK
    # The 'trf_cons' track is needed only to obtain the consensuses for model
    # expansion.  Remember whether it was added here so that args.tracks can
    # be restored; the previous code aliased the set, so its restore was a
    # no-op and the track was left in place.
    added_trf_cons = False
    if args.expand_model:
        added_trf_cons = 'trf_cons' not in args.tracks
        args.tracks.add('trf_cons')
    m = model
    if args.annotation_model:
        m = args.annotation_model
    annotations = compute_annotations(args, alignment_filename, m)
    if args.expand_model:
        consensuses = annotations['trf_cons']
        if added_trf_cons:
            args.tracks.discard('trf_cons')
    # end of HACK
    with Open(output_filename, 'w') as output_file_object:
        for aln in Fasta.load(
            alignment_filename,
            args.alignment_regexp,
            Alignment,
            sequence_selectors=args.sequence_regexp):
            if len(aln.sequences) < 2:
                sys.stderr.write("ERROR: not enough sequences in file\n")
                return 1
            if len(args.draw) == 0:
                drawer = brainwash(AlignmentCanvas)()
            else:
                drawer = AlignmentCanvas()
                drawer.add_original_alignment(aln)
            aln, unmask_repeats = args.mask_repeats(aln, annotations)
            seq1, seq2 = tuple(map(Fasta.alnToSeq, aln.sequences[:2]))
            perf.msg("Data loaded in {time} seconds.")
            perf.replace()
            if args.expand_model:
                # The consensuses of both sequences are needed here
                A = consensuses[aln.names[0]]
                B = consensuses[aln.names[1]]
                cons = list(A.union(B))
                real_model = model.expandModel({'consensus': cons})
            else:
                real_model = model
            realigner = args.algorithm()
            realigner.setDrawer(drawer)
            realigner.prepareData(seq1, aln.names[0], seq2, aln.names[1], aln,
                                  real_model, annotations, args)

            aln = realigner.realign(0, len(seq1), 0, len(seq2))
            aln = unmask_repeats(aln)
            perf.msg("Sequence was realigned in {time} seconds.")
            perf.replace()
            if len(args.draw) > 0:
                drawer.add_sequence('X', seq1)
                drawer.add_sequence('Y', seq2)
                drawer.add_alignment_line(101, (255, 0, 255, 255), 2,
                                          AlignmentPositionGenerator(
                                              Alignment([aln[0], aln[2]])))
                drawer.draw(args.draw, 2000, 2000)
                perf.msg("Image was drawn in {time} seconds.")
            # Save output_file
            Fasta.saveAlignmentPiece(aln, output_file_object)
Ejemplo n.º 9
0
def Maf2FastaGen(input_file, sequences, min_size=0):
    """Yield the alignment blocks of a MAF file.

    Each yielded block is a list of
    ``(src, aln_count, text, [start, size, strand, srcSize])`` tuples, one
    per ``s`` line whose source matches ``sequences`` (as decided by
    ``matched``) and whose size is at least ``min_size``.  A block is
    yielded only when it is non-empty and, if ``sequences`` is non-empty,
    holds exactly as many records as there are patterns.

    Args:
        input_file: path of the MAF file.
        sequences: iterable of regular-expression patterns for source names.
        min_size: minimal value of the MAF ``size`` field of a kept record.

    Raises:
        ValueError: when the ``size`` field of a matching record is not an
            integer.
    """
    # A real list is required: its length is used below.
    regs = [re.compile(pattern) for pattern in sequences]

    def is_complete(block):
        return len(block) > 0 and (len(regs) == 0 or len(block) == len(regs))

    with Open(input_file, 'r') as inp:
        aln_count = 0
        output = []
        for line in inp:
            line = line.strip()
            if len(line) == 0:
                continue
            if line[0] not in ['a', 's']:
                continue
            if line[0] == 'a':
                # An 'a' line starts a new block: flush the previous one.
                if is_complete(output):
                    aln_count += 1
                    yield output
                output = []
                continue
            fields = tuple(re.split(r'\s+', line))
            if len(fields) != 7:
                continue
            _, src, start, size, strand, srcSize, text = fields
            #if strand == '-':
            #    text = reverseStrand(text)
            # The field is text; compare it numerically.  Comparing the raw
            # string with an int never filtered anything on Python 2 and
            # raises TypeError on Python 3.
            if matched(regs, src) and int(size) >= min_size:
                output.append(
                    (src, aln_count, text, [start, size, strand, srcSize]))
    if is_complete(output):
        yield output
Ejemplo n.º 10
0
def main(files, columns, headers, ignore):
    """Build a table of selected values from evaluated JSON files.

    Args:
        files: filenames of JSON files to read.
        columns: list of colon-separated key paths into each file's data;
            a path component is converted to ``int`` when it indexes a list.
        headers: optional list of column titles replacing the leading
            entries of ``columns`` in the header row, or ``None``.
        ignore: container of row labels to leave out.

    Returns:
        A list of rows.  The first row is the header starting with
        ``'type'``; every other row starts with the label of the file (the
        part of the basename before ``.evaluated.js`` or, if the name does
        not match, the whole filename) followed by the selected values.
    """
    r = re.compile("^.*/([^/.]*)[.]evaluated.js$")
    header_row = ['type']
    if headers is None:
        header_row.extend(columns)
    else:
        header_row.extend(headers)
        header_row.extend(columns[len(headers):])
    out = [header_row]
    # A real list: the paths are reused for every file.
    column_paths = [column.split(':') for column in columns]
    for filename in files:
        rr = r.match(filename)
        label = filename if rr is None else rr.group(1)
        if label in ignore:
            # Decide before loading, so ignored files are not parsed.
            continue
        with Open(filename, 'r') as f:
            data = json.load(f)
        row = [label]
        for path in column_paths:
            sel = data
            for key in path:
                if isinstance(sel, list):
                    key = int(key)
                sel = sel[key]
            row.append(sel)
        out.append(row)
    return out
Ejemplo n.º 11
0
def jbug(structure, text=None, filename=None):
    """Dump ``structure`` as pretty-printed JSON for debugging.

    The structure is converted with ``jsonize`` and serialised with sorted
    keys.  When ``filename`` is given the dump is written there; otherwise
    it is printed, prefixed by ``text`` and ``': '`` when ``text`` is given.
    """
    dump = json.dumps(jsonize(structure), sort_keys=True, indent=4)
    if filename:
        with Open(filename, 'w') as f:
            f.write(dump)
    elif text is None:
        # The default text=None used to raise TypeError on concatenation.
        print(dump)
    else:
        print(text + ': ' + dump)
Ejemplo n.º 12
0
def main(filelist_filenames, output_filebase, filelist_output):
    """Merge several JSON file lists and write one statistic file per key.

    Every file in ``filelist_filenames`` holds a JSON object mapping a key
    to a list; the lists of equal keys are concatenated.  The merged
    mapping is passed to ``aggregate`` and each resulting entry is written
    to ``<output_filebase>.<key>.stat``.  The names of the written files
    are stored as a JSON list in ``filelist_output``.
    """
    merged = defaultdict(list)
    for list_path in filelist_filenames:
        with Open(list_path, 'r') as source:
            loaded = json.load(source)
        for kind, entries in loaded.iteritems():
            merged[kind].extend(entries)

    written = []
    for kind, stat in aggregate(merged).iteritems():
        stat_path = '{base}.{type}.stat'.format(
            base=output_filebase, type=kind)
        with Open(stat_path, 'w') as sink:
            json.dump(stat, sink, indent=4)
        written.append(stat_path)

    with Open(filelist_output, 'w') as sink:
        json.dump(written, sink, indent=4)
Ejemplo n.º 13
0
def jcpoint(
    structure_generator,
    file_type,
    io_files,
    mathType=float,
    serializer=jsonize,
    deserializer=dejsonize,
):
    """Load a structure from a JSON checkpoint or compute and store it.

    When ``io_files['input']`` has an entry for ``file_type`` the structure
    is read from that file and passed through ``deserializer`` together
    with ``mathType``.  Otherwise ``structure_generator()`` is called; if
    ``io_files['output']`` has an entry for ``file_type`` the result
    (materialised into a list when it is a generator) is serialised and
    written there.  The structure is returned in both cases.
    """
    sources = io_files['input']
    if file_type not in sources:
        structure = structure_generator()
        sinks = io_files['output']
        if file_type in sinks:
            # A generator can be consumed only once, so turn it into a list
            # before it is both dumped and returned.
            if inspect.isgenerator(structure):
                structure = list(structure)
            with Open(sinks[file_type], 'w') as sink:
                json.dump(serializer(structure), sink,
                          sort_keys=True, indent=4)
        return structure

    with Open(sources[file_type], 'r') as source:
        return deserializer(json.load(source), mathType)
Ejemplo n.º 14
0
def main(input_file, output_file, sequences, output_type):
    """Convert a MAF file to FASTA-like records.

    With ``output_type == "normal"`` each record is a header
    ``>src.aln_count`` followed by the sequence text; with ``"params"``
    only a header line carrying the remaining MAF fields is written.  Any
    other ``output_type`` writes no records.
    """
    def as_normal(src, aln_count, text, rest):
        return '>{0}.{1}\n{2}\n'.format(src, aln_count, text)

    def as_params(src, aln_count, text, rest):
        return '>{0}.{1} {2}\n'.format(src, aln_count, ' '.join(rest))

    if output_type == "normal":
        render = as_normal
    elif output_type == "params":
        render = as_params
    else:
        render = None

    with Open(output_file, 'w') as out:
        for alignment in Maf2FastaGen(input_file, sequences):
            for src, aln_count, text, rest in alignment:
                if render is not None:
                    out.write(render(src, aln_count, text, rest))
Ejemplo n.º 15
0
def compute_expectations(args, model, output_filename, alignment_filename):
    """Compute expectations for an alignment file and store them as JSON.

    Annotations are computed first and handed to ``expectation_generator``;
    everything it yields is collected, converted with ``jsonize_to_list``
    and written to ``output_filename``.
    """
    # NOTE(review): compute_annotations is called with two arguments here;
    # confirm that this matches the signature in scope for this function.
    annotations = compute_annotations(args, alignment_filename)
    with Open(output_filename, 'w') as fp:
        produced = expectation_generator(
            args,
            model,
            alignment_filename,
            annotations,
        )
        payload = jsonize_to_list(list(produced))
        json.dump(payload, fp, indent=4)
Ejemplo n.º 16
0
def _find_trf_driver(args):
    """Return a TRFDriver for the last existing path in ``args.trf``.

    Returns ``None`` when none of the paths exists.
    """
    trf = None
    for trf_executable in args.trf:
        if os.path.exists(trf_executable):
            # No early exit, as in the original: when several executables
            # exist the last one wins.
            trf = TRFDriver(trf_executable, mathType=args.mathType)
    return trf


def compute_annotations(args, alignment_filename, model):
    """Compute the annotation tracks requested in ``args.tracks``.

    Supported tracks:
      * ``'trf'`` - repeats reported by the TRF driver,
      * ``'original_repeats'`` - repeats loaded from
        ``<alignment_filename>.repeats``,
      * ``'trf_cons'`` - per-sequence sets of TRF repeat consensuses,
      * ``'hmm'`` - repeats reported by the HMM driver for ``model``.

    The TRF tracks are left out when no executable from ``args.trf``
    exists.

    Returns:
        A dictionary mapping track names to their annotations.
    """
    annotations = dict()
    if 'trf' in args.tracks:
        trf = _find_trf_driver(args)
        if trf:
            repeats = trf.run(alignment_filename)
            annotations['trf'] = repeats

    if 'original_repeats' in args.tracks:
        # Close the file deterministically instead of leaking it.
        with Open(alignment_filename + '.repeats', 'r') as repeats_file:
            repeats = json.load(repeats_file)
        for k in list(repeats):
            repeats[k] = [Repeat(_v[0], _v[1], _v[2], _v[3], _v[4])
                          for _v in repeats[k]]

        annotations['original_repeats'] = repeats

    if 'trf_cons' in args.tracks:
        trf = _find_trf_driver(args)
        if trf:
            repeats = trf.run(alignment_filename)
            annotations['trf_cons'] = {}
            for seq_name in repeats:
                cons = set(repeat.consensus for repeat in repeats[seq_name])
                annotations['trf_cons'][seq_name] = cons

    if 'hmm' in args.tracks:
        paths = None
        if args.trf is not None and len(args.trf) > 0:
            paths = args.trf
        driver = HMMDriver(paths, args.mathType, model)
        if driver:
            repeats = driver.run(alignment_filename)
            annotations['hmm'] = repeats

    perf.msg("Hints computed in {time} seconds.")
    perf.replace()
    return annotations
Ejemplo n.º 17
0
def main(input_file, length_output, consensus_output, full_length_output):
    """Collect statistics about repeats listed in a TRF output file.

    Only lines with at least 15 space-separated fields are considered; the
    first two fields are converted to integers by ``listConverter`` and
    treated as the repeat's start and end.  Three histograms are written
    as JSON:
      * ``length_output`` - repeat length divided by the length of the
        second-to-last field, rounded to one decimal place,
      * ``consensus_output`` - occurrences of the second-to-last field,
      * ``full_length_output`` - repeat length.
    """
    statLen = defaultdict(int)
    statStr = defaultdict(int)
    statFull = defaultdict(int)

    with Open(input_file, 'r') as f:
        for raw_line in f:
            if len(raw_line.split(' ')) < 15:
                continue
            line = listConverter(raw_line.strip().split(' '), (int, 0, 2))
            if line is None:
                continue
            length = 1 + line[1] - line[0]
            # Divide as floats: integer division truncated the value before
            # round() could act on it, so nothing was ever rounded.
            copies = round(10.0 * length / len(line[-2])) / 10.0
            statLen[copies] += 1
            statStr[line[-2]] += 1
            statFull[length] += 1

    with Open(length_output, 'w') as f:
        json.dump(statLen, f, indent=4)
    with Open(consensus_output, 'w') as f:
        json.dump(statStr, f, indent=4)
    with Open(full_length_output, 'w') as f:
        json.dump(statFull, f, indent=4)
Ejemplo n.º 18
0
def main(input_file, output_file, trf):
    """Compute the repeat annotation track of an alignment file.

    The first existing executable from ``trf`` is used to run TRF on
    ``input_file``; the annotation produced by
    ``compute_annotation_track`` for the alignments and repeats is written
    as JSON to ``output_file``.

    Raises:
        RuntimeError: when none of the paths in ``trf`` exists.
    """
    # 1. run trf
    driver = None
    for trf_executable in trf:
        if os.path.exists(trf_executable):
            driver = TRFDriver(trf_executable)
            break
    if driver is None:
        # Previously the list of paths itself was used as the driver, which
        # failed with an unhelpful AttributeError on ".run".
        raise RuntimeError(
            'No TRF executable found among: {0}'.format(list(trf)))
    repeats = driver.run(input_file)

    # THIS IS ONLY GENERATOR!!!
    alns = (Alignment(a)
            for a in Fasta.load(input_file, '[.][0-9]+$', Alignment))

    A = list(compute_annotation_track(alns, repeats))
    # Close the output file deterministically instead of leaking it.
    with Open(output_file, 'w') as f:
        json.dump(A, f, indent=4)
Ejemplo n.º 19
0
def loadGenerator(filename):
    """Yield ``(name, sequence)`` pairs from a FASTA file.

    Blank lines are ignored and the lines of one record are concatenated
    after stripping surrounding whitespace.  A record is yielded only when
    its sequence is non-empty; sequence lines preceding the first header
    are reported under an empty name.
    """
    with Open(filename, 'r') as handle:
        name = ""
        chunks = []
        for raw in handle:
            text = raw.strip()
            if not text:
                continue
            if text[0] != '>':
                chunks.append(text)
                continue
            # A header line closes the record collected so far.
            if chunks:
                yield (name, "".join(chunks))
            name = text[1:]
            chunks = []
        if chunks:
            yield (name, "".join(chunks))
Ejemplo n.º 20
0
def main():
    """Sample alignments from a pair HMM and save them as FASTA files.

    Command-line arguments select the model, optional file/constant
    bindings for the model loader, the number of samples and the requested
    sequence lengths.  Every sample is regenerated until it contains at
    least one annotated (tandem repeat) emission; it is saved to the output
    template formatted with the sample index, together with a ``.repeats``
    JSON file describing the repeats.  The list of written files is stored
    as JSON in ``--output_files``.

    Returns:
        ``0`` on success, ``1`` when the arguments are invalid.
    """
    parser = argparse.ArgumentParser(description='Sample alignments.')
    parser.add_argument('output_file_template', type=str,
                        help="Template for output file. Have to contain " +
                        "string '{id}' as placeholder for sequence number.")
    parser.add_argument('--output_files', type=str, help="File where the " +
                        'list of output files will be written.', default='-')
    parser.add_argument('--model', type=str,
                        default='data/models/repeatHMM.js', help="Model file")
    parser.add_argument('--bind_file', nargs='*', help='Replace filenames in '
                        + 'the input_file model.', default=[])
    parser.add_argument('--bind_constant', nargs='*', help='Replace constants'
                        + ' in the input_file model.', default=[])
    parser.add_argument('--bind_constant_file', nargs='*', help='Replace' +
                        ' constants in the input_file model.', default=[])
    parser.add_argument('n_samples', type=int, help='Number of samples.')
    parser.add_argument('seq1_length', type=int,
                        help='Length of first sequence.')
    parser.add_argument('seq2_length', type=int,
                        help='Length of second sequence.')
    parsed_arg = parser.parse_args()

    # ====== Validate input parameters =======================================

    if parsed_arg.output_file_template.count("{id}") < 1:
        # The placeholder that is checked for is "{id}", not "%d".
        sys.stderr.write('ERROR: If sampling, output_file filename has to ' +
                         'contain at least one "{id}".\n')
        return 1
    if len(parsed_arg.bind_file) % 2 != 0:
        sys.stderr.write('ERROR: If binding files, the number of arguments has'
                         + ' to be divisible by 2\n')
        return 1
    if len(parsed_arg.bind_constant_file) % 2 != 0:
        sys.stderr.write('ERROR: If binding constants (as files), the number of'
                         + ' arguments has to be divisible by 2\n')
        return 1
    if len(parsed_arg.bind_constant) % 2 != 0:
        sys.stderr.write('ERROR: If binding constants, the number of'
                         + ' arguments has to be divisible by 2\n')
        return 1

    # ====== Parse parameters ================================================

    output_filename = parsed_arg.output_file_template
    output_files_filename = parsed_arg.output_files
    output_files = list()

    # ====== Load model ======================================================
    loader = HMMLoader()
    # Iterate over the bind_file pairs themselves; the loop used to be
    # bounded by the length of bind_constant.
    for i in range(0, len(parsed_arg.bind_file), 2):
        loader.addFile(parsed_arg.bind_file[i], parsed_arg.bind_file[i + 1])
    for i in range(0, len(parsed_arg.bind_constant_file), 2):
        loader.addConstant(
            parsed_arg.bind_constant_file[i],
            loader.load(parsed_arg.bind_constant_file[i + 1])
        )
    for i in range(0, len(parsed_arg.bind_constant), 2):
        loader.addConstant(
            parsed_arg.bind_constant[i],
            loader.loads(parsed_arg.bind_constant[i + 1]),
        )
    model_filename = parsed_arg.model
    PHMM = loader.load(model_filename)["model"]

    # ====== Sample ==========================================================
    PHMM.buildSampleTransitions()
    n_samples = parsed_arg.n_samples
    X_len = parsed_arg.seq1_length
    Y_len = parsed_arg.seq2_length
    dirname = os.path.dirname(output_filename)
    # A template without a directory part yields '', which makedirs rejects.
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)
    for i in range(n_samples):
        done = False
        # Resample until the sample contains at least one annotated emission.
        while not done:
            tandemRepeats = {'sequence1': [], 'sequence2': []}
            X = ""
            Y = ""
            A = ""
            for (emission, state) in PHMM.generateSequence((X_len, Y_len)):
                ann_data = None
                if len(emission) == 2:
                    x, y = emission
                else:
                    x, y, ann_data = emission
                dx, dy = len(x), len(y)
                if ann_data is not None:
                    # Positions are measured in ungapped coordinates.
                    xlen = len(X.replace('-', ''))
                    ylen = len(Y.replace('-', ''))
                    if dx > 0:
                        tandemRepeats['sequence1'].append((
                            xlen, xlen + dx, dx / ann_data[1], ann_data[0], x
                        ))
                        done = True
                    if dy > 0:
                        tandemRepeats['sequence2'].append((
                            ylen, ylen + dy, dy / ann_data[2], ann_data[0], y
                        ))
                        done = True
                A += PHMM.states[state].getChar() * max(dx, dy)
                # A negative repeat count yields '', so only the shorter
                # side is padded with gaps.
                X += x + ('-' * (dy - dx))
                Y += y + ('-' * (dx - dy))
            #if len(X) - X.count('-') > 2 * X_len:
            #    done = False
            #if len(Y) - Y.count('-') > 2 * Y_len:
            #    done = False
        aln = [("sequence1", X), ("alignment", A), ("sequence2", Y)]
        sample_filename = output_filename.format(id=i)
        # Close the repeats file deterministically instead of leaking it.
        with Open(sample_filename + '.repeats', 'w') as repeats_file:
            json.dump(tandemRepeats, repeats_file, indent=4)
        Fasta.save(aln, sample_filename)
        output_files.append(sample_filename)
    with Open(output_files_filename, 'w') as output_file_object:
        json.dump(output_files, output_file_object, indent=4)
    return 0
Ejemplo n.º 21
0
def main(input_file, realign_output, do_not_touch_output,
         list_of_files_output, max_length, wrap_length,
         min_seq_length):
    """Cut annotated alignments into intervals and list their output files.

    For every alignment loaded from ``input_file`` the annotation row
    (``alignment[2][1]``) is divided into consecutive intervals that cover
    it completely.  Each non-empty interval is assigned a filename built
    from ``do_not_touch_output`` or ``realign_output`` (both formatted with
    ``id=<counter>``); the names are collected in order, with an empty
    string appended after each alignment, and written as JSON to
    ``list_of_files_output``.

    NOTE(review): the call that would save each piece (``Fasta.save``) is
    commented out, so only the file list is produced here - confirm that
    this is intended.
    """
    realign_counter = 0
    do_not_touch_counter = 0
    files = []
    for alignment in Fasta.load(input_file, '\.[0-9]*$'):
        # Progress report every time the realign counter is a multiple of 100
        if realign_counter % 100 == 0:
            print(realign_counter, do_not_touch_counter,alignment[0][0])
        alignment_len = len(alignment[0][1])

        annotation = alignment[2][1]

        # !!! We expect that first block is not repeat
        # Positions where the annotation character changes, plus the end
        changes = [i for i in range(1, len(annotation))
                   if annotation[i-1] != annotation[i]] + [len(annotation)]
        # Blocks between consecutive changes, followed by one extra block
        # that starts at the end of the annotation and is longer than
        # max_length
        Blocks = zip(changes, changes[1:]) + [(len(annotation), len(annotation) + max_length + 10)]
        # Prepend the block from the start up to the first change
        Blocks = [(0, Blocks[0][0])] + Blocks
        printed = 0
        block_start = 0#None
        block_end = None
        intervals = []
        # Visit every second block, together with the block preceding it
        for block_id in range(1, len(Blocks), 2):
            current = Blocks[block_id]
            previous = Blocks[block_id - 1]
            # block_start is initialised to 0 and only ever assigned
            # numbers, so this branch is not taken
            if block_start == None:
                startpp = max(printed, previous[0])
                if previous[1] - startpp > wrap_length:
                    intervals.append((printed, startpp))
                    printed = startpp
                    block_start = startpp
            else:
                # Append this block, or start a new one?
                if current[1] - block_start > max_length:
                    if previous[1] - previous[0] > wrap_length * 2:
                        # The preceding block is long: keep wrap_length of
                        # it on either side and emit its middle separately
                        intervals.append((block_start, previous[0] + wrap_length))
                        intervals.append((previous[0] + wrap_length, previous[1] - wrap_length))
                        printed = previous[1] - wrap_length
                        block_start = previous[1] - wrap_length
                    else:
                        # Otherwise cut at the midpoint of the preceding block
                        split = (previous[0] + previous[1]) / 2
                        intervals.append((block_start, split))
                        block_start = split
                        printed = split
                    # Start a new one
        # The remainder up to the end of the annotation
        intervals.append((printed, len(annotation)))
        # The intervals must cover the annotation exactly and be contiguous
        assert(len(annotation) == sum([y - x for x, y in intervals]))
        for i in range(1, len(intervals)):
            assert(intervals[i - 1][1] == intervals[i][0])

        #t = list(range(0, alignment_len, max_length)) + [alignment_len]
        #intervals = zip(t, t[1:]) 

        for start, stop in intervals:
            if start >= len(annotation):
                continue
            if start == stop:
                continue
            assert(start < stop)
            ann = alignment[2][1][start:stop]
            output = None
            seq1 = alignment[0][1]
            seq2 = alignment[4][1]
            # Sequence lengths without '-' and '.' characters (measured on
            # the whole sequences, not on the interval)
            seq1_len = len(seq1) - seq1.count('-') - seq1.count('.')
            seq2_len = len(seq2) - seq2.count('-') - seq2.count('.')
            # Intervals with no 'R', with only 'R', or from alignments whose
            # shorter sequence is below min_seq_length are left untouched
            if ann.count('R') == 0 or min(seq1_len, seq2_len) < min_seq_length or ann.count('R') == len(ann):
                output = do_not_touch_output.format(id=do_not_touch_counter)
                do_not_touch_counter += 1
            else:   
                output = realign_output.format(id=realign_counter)
                realign_counter += 1
            files.append(output)
            aln = [
                (alignment[0][0], alignment[0][1][start:stop]),
                (alignment[2][0], alignment[2][1][start:stop]),
                (alignment[4][0], alignment[4][1][start:stop])
            ]
            #Fasta.save(aln, output, width=-1)
        # An empty name marks the end of this alignment's pieces
        files.append('');
        
    with Open(list_of_files_output, 'w') as f:
        json.dump(files, f, indent=4)
Ejemplo n.º 22
0
def main(inp, out, alignment_regexp, sequence_regexp, trf=trf_paths):
    """Collect tandem-repeat coverage statistics and write them as JSON.

    The first existing path in *trf* is used to build a ``TRFDriver`` which is
    run on *inp*.  For every alignment loaded from *inp* the repeats reported
    for the first two sequences are translated to positions of the aligned
    sequences and merged into one 'M'/'R' annotation.

    Accumulated counters (written to *out*):
      * ``M_count`` / ``R_count`` -- positions of the merged annotation marked
        'M' / 'R';
      * ``R_segment_count`` -- number of maximal runs of 'R';
      * two-letter keys such as ``RR``, ``RM``, ``MR`` -- one increment per
        run; the first letter is 'R' when the first sequence has a repeat
        inside the run, the second letter likewise for the second sequence.

    Args:
        inp: input file, passed both to TRF and to ``Fasta.load``.
        out: path of the JSON file to write.
        alignment_regexp: forwarded to ``Fasta.load``.
        sequence_regexp: forwarded to ``Fasta.load`` as ``sequence_selectors``.
        trf: iterable of candidate paths of the TRF executable.

    Raises:
        IOError: when none of the candidate TRF paths exists.
    """
    driver = None
    for trf_executable in trf:
        if os.path.exists(trf_executable):
            driver = TRFDriver(trf_executable, mathType=float)
            break
    if driver is None:
        # Previously this fell through to ``<list>.run`` and died with an
        # unrelated AttributeError.
        raise IOError(
            'No TRF executable found among: {0}'.format(list(trf)))
    repeats = driver.run(inp)

    stats = defaultdict(int)

    for aln in Fasta.load(inp,
                          alignment_regexp,
                          Alignment,
                          sequence_selectors=sequence_regexp):
        X_index = 0
        Y_index = 1

        X_trf = list(
            translate_repeat_to_annotation(repeats[aln.names[X_index]],
                                           aln.seq_to_aln[X_index]))
        Y_trf = list(
            translate_repeat_to_annotation(repeats[aln.names[Y_index]],
                                           aln.seq_to_aln[Y_index]))

        X_ann = list("M" * len(aln.sequences[X_index]))
        Y_ann = list("M" * len(aln.sequences[Y_index]))
        # Union of both tracks: 'R' wherever either sequence has a repeat.
        B_ann = list("M" * len(aln.sequences[Y_index]))
        for track_repeats, own_ann in ((X_trf, X_ann), (Y_trf, Y_ann)):
            for repeat in track_repeats:
                # Clip repeats that run past the end of the annotation.
                if repeat.end >= len(own_ann):
                    repeat.end = len(own_ann) - 1
                rlen = 1 + repeat.end - repeat.start
                own_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
                B_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
        assert (len(X_ann) == len(Y_ann) and len(B_ann) == len(Y_ann))

        # Pair every position with its predecessor; the 'M' padding on both
        # sides makes runs touching either end of the annotation count too.
        joined = ''.join(B_ann)
        neighbours = list(zip('M' + joined, joined + 'M'))

        R_segments_count = len(
            [1 for prev, cur in neighbours if prev != 'R' and cur == 'R'])
        stats['M_count'] += B_ann.count('M')
        stats['R_count'] += B_ann.count('R')
        stats['R_segment_count'] += R_segments_count

        # Indices where the annotation switches value; consecutive pairs
        # delimit the runs as half-open intervals [start, stop).
        changes = [
            i for i, (prev, cur) in enumerate(neighbours) if prev != cur
        ]
        R_segments = [(changes[i], changes[i + 1])
                      for i in range(0,
                                     len(changes) - (len(changes) % 2), 2)]

        assert (R_segments_count == len(R_segments))
        for start, stop in R_segments:
            XX = 'M'
            YY = 'M'
            for i in range(start, stop):
                if X_ann[i] == 'R':
                    XX = 'R'
                if Y_ann[i] == 'R':
                    YY = 'R'
                assert (B_ann[i] == 'R')
            stats[XX + YY] += 1

    with Open(out, 'w') as f:
        json.dump(stats, f, indent=4)
Ejemplo n.º 23
0
    # Command-line entry: parse the arguments, read the JSON list of input
    # files and hand everything to main().

    def _parse_bool(text):
        # argparse's type=bool calls bool() on the raw string, so every
        # non-empty value -- including "False" and "0" -- used to become True.
        # Only the explicit "false" spellings are treated as False here; any
        # other non-empty value stays True as before.
        return str(text).strip().lower() not in ('', '0', 'false', 'f',
                                                 'no', 'n')

    parser = \
        argparse.ArgumentParser(description='Create specific model from stats')
    parser.add_argument('model', type=str,
                        help='File containing the seleton of the model')
    parser.add_argument('filenames', type=str,
                        help='File containing needed list of files (json' + 
                        ' containing name of files for emissions,' + 
                        ' transition, and statistics from TRF')
    parser.add_argument('output', type=str,
                        help='Output file for resulting model')
    parser.add_argument('--parameters', type=str, default='{}',
                        help='Additional parameters (in json as dictionary).')
    parser.add_argument('--simple_model', type=_parse_bool, default=False,
                        help='Whether it is simple model or repeat model')
    parsed_arg = parser.parse_args()
    
    # Index the listed file names by the component before their extension
    # (e.g. "x.emission.js" -> "emission").
    with Open(parsed_arg.filenames, 'r') as f:
        files = dict([(x.split('.')[-2], x) for x in json.load(f)])
    main(
         parsed_arg.model,
         json.loads(parsed_arg.parameters),
         files['emission'],
         files['transition'],
         files['trf_consensus'],
         files['trf_length'],
         files['trf_cover'],
         parsed_arg.output,
         parsed_arg.simple_model,
    )
    perf.printAll()
Ejemplo n.º 24
0
def _span(values):
    """Return ``(min, max + 1)`` of *values*, or ``(-1, 0)`` when empty."""
    if not values:
        return (-1, 0)
    return (min(values), max(values) + 1)


def _repeat_annotation(coord, annotation):
    """Return the residues lying at positions annotated 'R'.

    *coord* holds ``(x, a, y)`` triples; ``a`` indexes *annotation*.  A residue
    of the first sequence is encoded as ``(x, -1)``, one of the second as
    ``(-1, y)``; negative coordinates are ignored.  An empty set is returned
    when the items of *coord* are not triples.
    """
    if len(coord[0]) != 3:
        return set()
    marked = set()
    for x, a, y in coord:
        if annotation[a] == 'R':
            if x >= 0:
                marked.add((x, -1))
            if y >= 0:
                marked.add((-1, y))
    return marked


def _repeat_blocks(coord, annotation):
    """Return the set of maximal 'R' runs as ``((x0, x1), (y0, y1))`` spans.

    Each run is described by the half-open coordinate span it covers in both
    sequences; a sequence without any residue in the run gets ``(-1, 0)``.
    An empty set is returned when the items of *coord* are not triples.
    """
    if len(coord[0]) != 3:
        return set()
    blocks = set()
    xs = set()
    ys = set()
    for x, a, y in coord:
        if annotation[a] == 'R':
            if x >= 0:
                xs.add(x)
            if y >= 0:
                ys.add(y)
        elif xs or ys:
            # A non-repeat position closes the run collected so far.
            blocks.add((_span(xs), _span(ys)))
            xs = set()
            ys = set()
    if xs or ys:
        blocks.add((_span(xs), _span(ys)))
    return blocks


def _score_alignment(correct, alignment, fun, tp, correct_file, aln_file):
    """Compare one computed alignment with its reference; return the stats.

    *fun* transforms the reference coordinate triples before the comparison
    and *tp* is the label of that transformation.  ``sequences[1]`` of both
    alignments is read as the 'R'-marked annotation track.
    """
    correct_len = len(correct.getCoordPairs(False))
    total_len = (correct_len * 2 - correct.sequences[0].count('-') -
                 correct.sequences[2].count('-'))
    ccc = fun(correct.getCoordPairs(False), correct)
    if tp == 'removed_repeats':
        # Lengths have to be recomputed from the transformed triples.
        correct_len = len(ccc)
        total_len = 0
        for v1, _, v2 in ccc:
            if v1 >= 0:
                total_len += 1
            if v2 >= 0:
                total_len += 1

    acc = alignment.getCoordPairs(False)
    cc = [(item[0], item[2]) for item in ccc]
    if len(acc[0]) == 3:
        ac = [(item[0], item[2]) for item in acc]
    elif len(acc[0]) == 2:
        ac = acc
    else:
        ac = None
    correct_set = set(cc)
    aligned_set = set(ac)

    intersect = correct_set.intersection(aligned_set)
    not_in_c = correct_set.difference(aligned_set)
    not_in_a = aligned_set.difference(correct_set)
    symm_diff = correct_set.symmetric_difference(aligned_set)

    # Every non-negative coordinate of a shared pair scores one point.
    score = 0
    for v1, v2 in intersect:
        if v1 >= 0:
            score += 1
        if v2 >= 0:
            score += 1

    # dists[i]: distance from annotation position i to the nearest 'R',
    # obtained by a forward and a backward sweep.  The running distance is
    # deliberately carried over from the first sweep into the second, exactly
    # as in the original code.
    annotation = correct.sequences[1]
    dists = [99999999] * len(annotation)
    gap = 9999999
    for i in range(len(annotation)):
        if annotation[i] == 'R':
            gap = 0
        else:
            gap += 1
        dists[i] = min(dists[i], gap)
    for i in reversed(range(len(annotation))):
        if annotation[i] == 'R':
            gap = 0
        else:
            gap += 1
        dists[i] = min(dists[i], gap)

    position = dict()
    for x, a, y in ccc:
        position[(x, y)] = a

    # Histogram of reference pairs (and of the recovered ones) by their
    # distance to the nearest repeat; pairs inside a repeat are skipped.
    dists_correct = defaultdict(int)
    dists_total = defaultdict(int)
    for pos in correct_set:
        d = dists[position[pos]]
        if d == 0:
            continue
        dists_total[d] += 1
        # Set lookup; the original searched the list ``ac`` linearly here.
        if pos in aligned_set:
            dists_correct[d] += 1

    crann = _repeat_annotation(correct.getCoordPairs(False),
                               correct.sequences[1])
    arann = _repeat_annotation(alignment.getCoordPairs(False),
                               alignment.sequences[1])
    cbann = _repeat_blocks(correct.getCoordPairs(False),
                           correct.sequences[1])
    abann = _repeat_blocks(alignment.getCoordPairs(False),
                           alignment.sequences[1])

    # Histogram of lengths of maximal runs of consecutive computed pairs that
    # also occur in the reference.
    segment_length_histogram = defaultdict(int)
    length = 0
    for pair in ac:
        if pair in correct_set:
            length += 1
        else:
            if length != 0:
                segment_length_histogram[length] += 1
            length = 0
    if length > 0:
        segment_length_histogram[length] += 1

    # NOTE(review): '%correct' is 100 minus the intersect percentage and
    # '+mistakes' reports len(intersect); both are kept exactly as they were
    # -- confirm this is the intended meaning.
    # Repeat blocks are scored by their number (the original overrode its
    # length-weighted scorer with ``len``).
    result = {
        'corect': correct_file,
        'alignment': aln_file,
        'c-lenght': len(cc),
        'a-length': len(ac),
        'intersect': len(intersect),
        '%correct': (100.0 - float(len(intersect) * 100) / correct_len
                     if correct_len > 0 else 100),
        '+mistakes': len(intersect),
        '+len': correct_len,
        '+RepTP': len(crann & arann),
        '+RepTN': total_len - len(crann | arann),
        '+RepFP': len(arann - crann),
        '+RepFN': len(crann - arann),
        '+BlkTP': len(cbann & abann),
        '+BlkTN': 0,
        '+BlkFP': len(abann - cbann),
        '+BlkFN': len(cbann - abann),
        '%score': float(score) * 100 / total_len if total_len > 0 else 0,
        'c-a': len(not_in_c),
        'a-c': len(not_in_a),
        'symmetric_difference': len(symm_diff),
        'correct_len_histogram': segment_length_histogram,
        '@+dists_correct': dists_correct,
        '@+dists_total': dists_total,
    }
    # Percentages are meaningless for empty inputs, so they are dropped.
    if correct_len == 0:
        del result['%correct']
    if total_len == 0:
        del result['%score']
    return result


def main(correct_file, aln_file, output_file, interval=None):
    """Score computed alignments against reference alignments.

    *correct_file*, *aln_file* and *output_file* are ``str.format`` templates
    that receive ``id=task_id - 1``.  Task ids come from *interval* (an
    inclusive ``(first, last)`` pair) when given, otherwise from the
    ``SGE_TASK_ID`` / ``SGE_STEP_SIZE`` / ``SGE_TASK_LAST`` environment
    variables.  For every task the alignments are compared three times
    ('standard', 'expanded_repeats', 'removed_repeats') and the statistics are
    written as JSON to the formatted *output_file*; when a file holds several
    alignments the statistics of the last pair win.

    NOTE(review): without *interval* and without a usable ``SGE_TASK_ID`` the
    only task id is ``None`` and ``task_id - 1`` raises TypeError -- confirm
    whether a different default was intended.
    """
    task_ids = [None]
    # ``in`` replaces the deprecated dict.has_key().
    if 'SGE_TASK_ID' in os.environ:
        if os.environ['SGE_TASK_ID'] != 'undefined':
            sge_task_id = int(os.environ['SGE_TASK_ID'])
            if 'SGE_STEP_SIZE' not in os.environ:
                sge_step_size = 1
            else:
                sge_step_size = int(os.environ['SGE_STEP_SIZE'])
            sge_task_last = int(os.environ['SGE_TASK_LAST'])
            task_ids = range(
                sge_task_id, min(sge_task_id + sge_step_size,
                                 sge_task_last + 1))
    if interval is not None:
        task_ids = range(interval[0], interval[1] + 1)

    for task_id in task_ids:
        separator = ''
        output = {}
        for fun, tp in [(identity, 'standard'),
                        (expand_repeats, 'expanded_repeats'),
                        (remove_repeats, 'removed_repeats')]:
            try:
                for correct, alignment in zip(
                        Fasta.load(correct_file.format(id=task_id - 1),
                                   separator, Alignment),
                        Fasta.load(aln_file.format(id=task_id - 1), separator,
                                   Alignment)):
                    output[tp] = _score_alignment(correct, alignment, fun, tp,
                                                  correct_file, aln_file)
            except IOError:
                # Unreadable inputs are skipped (behaviour kept from the
                # original); the affected entry is simply absent from the
                # output.
                pass

        with Open(output_file.format(id=task_id - 1), 'w') as f:
            json.dump(output, f, indent=4)
Ejemplo n.º 25
0
def main(model_file, additional_parameters,
         emmisions_file, transitions_file, repeat_consensus_file,
         repeat_length_file, trf_cover_file, output_file, simple_model):
    """Fill a model skeleton with estimated parameters and save it as JSON.

    Statistics are read from the emission, transition and TRF-cover JSON files
    and registered with an ``HMMLoader`` (together with the entries of
    *additional_parameters*) before *model_file* is loaded.  Unless
    *simple_model* is true, repeat-related probabilities derived from
    *trf_cover_file* and the two TRF files are registered as well.  The loaded
    model is serialised to *output_file*, whose name is returned.
    """
    loader = HMMLoader()
    with_repeats = not simple_model

    with Open(trf_cover_file, 'r') as cover_handle:
        trf_cover = json.load(cover_handle)

    if with_repeats:
        # Probability of entering a repeat, estimated from the cover counts.
        segment_count = trf_cover['R_segment_count']
        repeat_probability = (float(segment_count) /
                              (segment_count + trf_cover['M_count']))
        # How often a repeat occurs in both sequences or in only one of them.
        repeat_count = sum(trf_cover[key] for key in ('RR', 'RM', 'MR'))
        repeat_repeat_probability = float(trf_cover['RR']) / repeat_count
        nothing_repeat_probability = float(trf_cover['MR']) / repeat_count
        repeat_nothing_probability = float(trf_cover['RM']) / repeat_count
        track_emissions = {
            'RR': repeat_repeat_probability,
            'RM': repeat_nothing_probability,
            'MR': nothing_repeat_probability,
        }
        loader.addDictionary('trackemi', {"value": track_emissions})

    for name, value in additional_parameters.iteritems():
        loader.addDictionary(name, value)

    # Emissions: keys are stored as Python literals and are parsed back.
    with Open(emmisions_file, 'r') as emission_handle:
        raw_emissions = normalize_dict(json.load(emission_handle))
    emm = [(ast.literal_eval(key), weight)
           for key, weight in raw_emissions.iteritems()]
    loader.addDictionary('MatchStateEmissions', {'value': emm})

    # Background distribution: marginal over both members of every pair.
    marginal = defaultdict(int)
    for ((first, second), weight) in emm:
        marginal[first] += weight
        marginal[second] += weight
    loader.addDictionary(
        'background-probability',
        {'value': list(normalize_dict(marginal).iteritems())})

    # Transitions: the tuple keys are joined into strings.
    with Open(transitions_file, 'r') as transition_handle:
        raw_transitions = json.load(transition_handle)
    trans = dict()
    for key, weight in raw_transitions.iteritems():
        trans[''.join(ast.literal_eval(key))] = weight
    trans = normalize_tuple_dict(trans)
    if with_repeats:
        # Scale the existing transitions by the probability of not entering
        # a repeat, then add the transitions touching the repeat state.
        for key in trans:
            trans[key] *= (1 - repeat_probability)
        for key in ('MR', 'XR', 'YR', 'RR'):
            trans[key] = repeat_probability
        for key in ('RX', 'RY', 'RM'):
            trans[key] = (1 - repeat_probability) / 3

    loader.addDictionary('trans', trans)

    # Files with repeat statistics produced from TRF output.
    if with_repeats:
        consensus_path = os.path.relpath(
            os.path.abspath(repeat_consensus_file),
            os.path.dirname(model_file))
        loader.addFile('consensus.js', consensus_path)
        loader.addFile('repeatlength.js', os.path.abspath(repeat_length_file))

    model = loader.load(model_file)

    with Open(output_file, 'w') as output_handle:
        json.dump({'model': model['model'].toJSON()}, output_handle, indent=4)
    return output_file
Ejemplo n.º 26
0
 def load(self, filename):
     """Parse the JSON file *filename* using ``self.objectHook``.

     *filename* is kept on ``self.filenameStack`` for the duration of the
     parse (presumably so the hook can tell which file is being read --
     verify against ``objectHook``).  The file is closed and the stack
     restored even when parsing fails.
     """
     self.filenameStack.append(filename)
     try:
         with Open(filename, "r") as f:
             return json.load(f, object_hook=self.objectHook)
     finally:
         # Keep the stack balanced even if json.load raises.
         self.filenameStack.pop()
Ejemplo n.º 27
0
def __loadJSON(filename):
    """Return the decoded JSON content of the file *filename*."""
    with Open(filename, 'r') as handle:
        content = json.load(handle)
    return content
Ejemplo n.º 28
0
    # Remaining command-line options, followed by the selection of the slice
    # of input files this run should process.
    parser.add_argument('output_files', type=str, help='Output file')
    parser.add_argument('--start', type=int, default=0, 
                        help='Which files to select')
    parser.add_argument('--step', type=int, default=-1,
                        help='How many files to select (-1 to all)')
    parser.add_argument('--trf', type=toList, default=trf_paths
                        , help="Location of tandem repeat finder binary")
    parser.add_argument('--sequence_regexp', nargs='+', default=None,
                        help='Regular expressions used to select sequences.')
    # The leading space keeps the two help fragments from running together
    # ("alignmentin").
    parser.add_argument('--alignment_regexp', default='', 
                        help='Regular expression used to separate alignment' +
                        ' in input file')

    parsed_arg = parser.parse_args()
    
    with Open(parsed_arg.files, 'r') as f:
        files = json.load(f)
        
    start = parsed_arg.start
    step = parsed_arg.step
    
    # A negative step means "take every file".
    if step < 0:
        step = len(files)
    
    # Grid engine can always override parameters.  SGE_TASK_ID is the literal
    # string "undefined" for non-array jobs, which int() cannot parse, so
    # that value is ignored.
    if os.environ.get('SGE_TASK_ID', 'undefined') != 'undefined':
        start = int(os.environ['SGE_TASK_ID'])
    if 'SGE_STEP_SIZE' in os.environ:
        step = int(os.environ['SGE_STEP_SIZE'])
    output_files = main(files[start:start + step], parsed_arg.trf,
                        parsed_arg.alignment_regexp,
Ejemplo n.º 29
0
def main(n, datadir='data/train_sequences/', fname='simulated_alignment'):
    """Simulate an annotated alignment of length *n* and write it to disk.

    Three sequences (called human, mouse and horse in the code) are generated
    position by position together with a per-position gene flag.  Only the
    first two are written: a FASTA alignment, one BED annotation track per
    sequence and a JSON configuration describing them, all placed in *datadir*
    with names starting with *fname*.

    The first two command-line arguments, when present, override *n* and
    *fname*.
    """
    s1name, s2name, s3name = "sequence1", "sequence2", "sequence3"
    annotation_name = 'gene'

    alignment_extension = ".fa"
    annotations_extension = ".bed"
    config_extension = ".js"

    argv = sys.argv
    if len(argv) > 1:
        n = int(argv[1])
    if len(argv) > 2:
        fname = argv[2]

    # Random sources; created in the same order as before.
    master_gene_sequence = MarkovChain(P_START_GENE, P_STOP_GENE)
    human_delete_sequence = MarkovChain(P_START_DELETE, P_STOP_DELETE)
    mouse_delete_sequence = MarkovChain(P_START_DELETE, P_STOP_DELETE)
    horse_delete_sequence = MarkovChain(P_START_DELETE, P_STOP_DELETE)
    mutator_coin = BiasedCoin(P_NOT_MUTATE_GENE)

    master_gene, human_gene, mouse_gene, horse_gene = [], [], [], []
    human_dna, mouse_dna, horse_dna = [], [], []

    for _ in range(n):
        # Gene flag of the master sequence; each species starts with a copy
        # and redraws it only when the master flag is set.
        master_flag = master_gene_sequence.get_state()
        human_flag = mouse_flag = horse_flag = master_flag
        if master_flag:
            human_flag = mutator_coin.flip()
            mouse_flag = mutator_coin.flip()
            horse_flag = mutator_coin.flip()

        mouse_coin = create_dna_mutation_coin(human_flag + mouse_flag)
        horse_coin = create_dna_mutation_coin(human_flag + horse_flag)

        # One random base shared by all three; the mouse copy goes through
        # mutate() before being turned into a character.
        base = random.randint(0, 3)
        mouse_base = mutate(base, human_flag + mouse_flag)
        human_char = DNA_CHARS[base]
        mouse_char = DNA_CHARS[mouse_base]
        horse_char = DNA_CHARS[base]

        # A failed coin flip substitutes a different character: draw one of
        # the first three and fall back to the fourth on a collision.
        if not mouse_coin.flip():
            pick = random.randint(0, 2)
            if DNA_CHARS[pick] == mouse_char:
                pick = 3
            mouse_char = DNA_CHARS[pick]

        if not horse_coin.flip():
            pick = random.randint(0, 2)
            if DNA_CHARS[pick] == horse_char:
                pick = 3
            horse_char = DNA_CHARS[pick]

        # Deletions turn the character into a gap.
        if human_delete_sequence.get_state():
            human_char = '-'
        if mouse_delete_sequence.get_state():
            mouse_char = '-'
        if horse_delete_sequence.get_state():
            horse_char = '-'

        master_gene.append(master_flag)
        human_gene.append(human_flag)
        mouse_gene.append(mouse_flag)
        horse_gene.append(horse_flag)

        human_dna.append(human_char)
        mouse_dna.append(mouse_char)
        horse_dna.append(horse_char)

    # Annotation track paths; stale files from earlier runs are removed.
    track_paths = []
    for sequence_name in (s1name, s2name, s3name):
        track_path = os.path.join(
            datadir,
            fname + '_' + sequence_name + '_' + annotation_name +
            annotations_extension)
        if os.path.isfile(track_path):
            os.remove(track_path)
        track_paths.append(track_path)
    s1fname, s2fname, s3fname = track_paths

    intervals1 = sequence_to_intervals(get_sequence(human_gene, human_dna),
                                       annotation_name)
    intervals2 = sequence_to_intervals(get_sequence(mouse_gene, mouse_dna),
                                       annotation_name)
    # Still computed for the third sequence although nothing is written for
    # it.
    intervals3 = sequence_to_intervals(get_sequence(horse_gene, horse_dna),
                                       annotation_name)

    annotations = Annotations()
    annotations.setAnnotations([annotation_name])
    annotations.addSequences([s1name, s2name, s3name])
    for sequence_name, track_path in ((s1name, s1fname), (s2name, s2fname)):
        annotations.addAnnotationFile(sequence_name, annotation_name,
                                      track_path)

    alignment_rows = [
        (s1name, ''.join(human_dna)),
        (s2name, ''.join(mouse_dna)),
    ]
    Fasta.save(alignment_rows,
               os.path.join(datadir, fname + alignment_extension))

    for track_path, intervals in ((s1fname, intervals1),
                                  (s2fname, intervals2)):
        with track.new(track_path, 'bed') as t:
            t.fields = ['start', 'end', 'name']
            t.write("chr1", intervals)

    config_path = os.path.join(datadir, fname + config_extension)
    with Open(config_path, "w") as f:
        json.dump(annotations.toJSON(), f)
Ejemplo n.º 30
0
def createKRepeatHMM(
    mathType,
    maxK,
    time,
    backgroundProb,
    indelProb,
    indelExtProb,
    repeatProb,
    endProb,
    initEndProb=None,
    silEndProb=None,
):
    """Build, save and return a generalized HMM for repeats up to *maxK*.

    States created (in this order): ``End``; ``I1..I<maxK>``; for every
    order below *maxK* an emitting ``SI<k>`` and a silent ``SD<k>``;
    ``R1..R<maxK>`` (``HighOrderState`` of the given order).  After loading,
    the transitions of every state are normalised and the states reordered
    topologically.  The model is also dumped as JSON to
    ``submodels/newK-<maxK>-<time>-<indelProb>-<repeatProb>.js``.

    Args:
        mathType: numeric type used for all probabilities.
        maxK: highest repeat order.
        time: passed to ``JCModelDist`` for the pair emissions.
        backgroundProb: mapping, or sequence of ``(symbol, prob)`` pairs.
        indelProb: probability of R -> SI / R -> SD.
        indelExtProb: probability of extending an insertion or deletion.
        repeatProb: probability of I<k> -> R<k>.
        endProb: probability of R<k> -> End.
        initEndProb: probability of I<k> -> End; defaults to *endProb*.
        silEndProb: probability of SI/SD -> End; defaults to *endProb*.

    Returns:
        The constructed ``GeneralizedHMM``.
    """
    # Identity test: the probabilities may be objects overloading ``==``.
    if initEndProb is None:
        initEndProb = endProb
    if silEndProb is None:
        silEndProb = endProb
    # isinstance also accepts other dict subclasses, not just dict and
    # defaultdict.
    if isinstance(backgroundProb, dict):
        backgroundProb = list(backgroundProb.items())
    probabilities = list(backgroundProb)
    alphabet = [x for x, _ in backgroundProb]
    # Emissions for pairs of symbols, appended after the single symbols.
    for a in alphabet:
        for b in alphabet:
            probabilities.append((a + b, JCModelDist(a, b, time)))
    states = list()
    transitions = list()

    def add_transition(source, target, prob):
        transitions.append({'from': source, 'to': target, 'prob': prob})

    def add_state(state_class, template):
        state = state_class(mathType)
        state.load(template)
        states.append(state)

    add_state(GeneralizedState, {
        '__name__': 'GeneralizedState',
        'name': 'End',
        'startprob': mathType(0.0),
        'endprob': mathType(1.0),
        'emission': [('', mathType(1.0))],
        'durations': [(0, mathType(1.0))],
    })

    initTemplate = {
        '__name__': 'GeneralizedState',
        'name': 'I{}',
        'startprob': mathType(0.0),
        'endprob': mathType(0.0),
        'emission': backgroundProb,
        'durations': [(1, mathType(1.0))],
    }

    for order in range(1, maxK + 1):
        init_name = 'I{}'.format(order)
        if order == 1:
            # NOTE(review): the template is reused, so this start
            # probability also stays in effect for every later I state --
            # confirm that is intended.
            initTemplate['startprob'] = mathType(1.0)
        add_transition(init_name, 'R{}'.format(order), repeatProb)
        add_transition(init_name, 'End', initEndProb)
        self_prob = mathType(1.0)
        self_prob -= repeatProb + initEndProb
        if order < maxK:
            add_transition(init_name, 'I{}'.format(order + 1), self_prob)
        initTemplate['name'] = init_name
        add_state(GeneralizedState, initTemplate)

    silentTemplate = {
        '__name__': 'GeneralizedState',
        'name': 'S{}{}',
        'startprob': mathType(0.0),
        'endprob': mathType(0.0),
        'emission': [('', mathType(1.0))],
        'durations': [(0, mathType(1.0))],
    }

    insertTemplate = {
        '__name__': 'GeneralizedState',
        'name': 'S{}{}',
        'startprob': mathType(0.0),
        'endprob': mathType(0.0),
        'emission': backgroundProb,
        'durations': [(1, mathType(1.0))],
    }

    for order in range(1, maxK):
        insert_name = 'SI{}'.format(order)
        delete_name = 'SD{}'.format(order)

        # Insertion state: moves towards a higher repeat order.
        insertTemplate['name'] = insert_name
        add_state(GeneralizedState, insertTemplate)
        end_p = mathType(1.0)
        if order < maxK - 1:
            add_transition(insert_name, 'SI{}'.format(order + 1),
                           indelExtProb)
            end_p -= indelExtProb
        add_transition(insert_name, 'End', silEndProb)
        end_p -= silEndProb
        add_transition(insert_name, 'R{}'.format(order + 1), end_p)

        # Silent deletion state: moves towards a lower repeat order.
        silentTemplate['name'] = delete_name
        add_state(GeneralizedState, silentTemplate)
        end_p = mathType(1.0)
        add_transition(delete_name, 'End', silEndProb)
        end_p -= silEndProb
        if order < maxK - 1:
            add_transition('SD{}'.format(order + 1), delete_name,
                           indelExtProb)
        if order > 1:
            # Accounts for the SD<order> -> SD<order - 1> transition added
            # in the previous iteration.
            end_p -= indelExtProb
        add_transition(delete_name, 'R{}'.format(order), end_p)

    repeatTemplate = {
        '__name__': 'HighOrderState',
        'name': 'R{}',
        'startprob': mathType(0.0),
        'endprob': mathType(0.0),
        'emission': probabilities,
        'durations': [(1, mathType(1.0))],
        'order': 0
    }
    for order in range(1, maxK + 1):
        repeat_name = 'R{}'.format(order)
        repeatTemplate['name'] = repeat_name
        repeatTemplate['order'] = order
        add_state(HighOrderState, repeatTemplate)
        stayprob = mathType(1.0)
        add_transition(repeat_name, 'End', endProb)
        stayprob -= endProb
        if order > 1:
            add_transition(repeat_name, 'SD{}'.format(order - 1), indelProb)
            stayprob -= indelProb
        if order < maxK:
            add_transition(repeat_name, 'SI{}'.format(order), indelProb)
            stayprob -= indelProb
        # Whatever probability is left keeps the model in the same state.
        add_transition(repeat_name, repeat_name, stayprob)

    hmm = GeneralizedHMM(mathType)
    hmm.load({
        '__name__': 'GeneralizedHMM',
        'states': states,
        'transitions': transitions,
    })
    for i in range(len(hmm.states)):
        hmm.states[i].normalizeTransitions()
    hmm.reorderStatesTopologically()
    with Open(
            'submodels/newK-{}-{}-{}-{}.js'.format(maxK, time, indelProb,
                                                   repeatProb), 'w') as f:
        # Trace of the output handle, kept from the original; written as a
        # call so it parses under both print syntaxes.
        print(f)

        def LogNumToJson(obj):
            # json cannot serialise LogNum; store its float value as text.
            if isinstance(obj, LogNum):
                return '{0}'.format(str(float(obj)))
            raise TypeError

        json.dump(hmm.toJSON(),
                  f,
                  indent=4,
                  sort_keys=True,
                  default=LogNumToJson)
    return hmm