Code example #1
Score: 0
File: ArgumentParser.py — Project: mhozza/realigner
def get_model(args, filename, allow_mask=True):
    """Load an HMM model from *filename*, applying command-line bindings.

    Parameters:
        args: argparse namespace providing ``mathType``, ``bind_file``,
            ``bind_constant_file``, ``bind_constant`` and
            ``add_masked_to_distribution``. Bind lists are flat
            ``[name1, value1, name2, value2, ...]`` sequences.
        filename: path of the model description to load.
        allow_mask: when False, soft masking is never added even if
            ``args.add_masked_to_distribution`` is set.

    Returns:
        The loaded model object (unwrapped from a ``{'model': ...}`` dict
        if the file stores it that way).
    """
    loader = HMMLoader(args.mathType)  # TODO: rename HMMLoader to ModelLoader
    register_classifier_states(loader)
    register_annotation_states(loader)
    register_cannotation_states(loader)
    register_annotations(loader)

    for i in range(0, len(args.bind_file), 2):
        loader.addFile(args.bind_file[i], args.bind_file[i + 1])
    # File-backed constants: the value is loaded from the named file.
    for i in range(0, len(args.bind_constant_file), 2):
        loader.addConstant(
            args.bind_constant_file[i],
            loader.load(args.bind_constant_file[i + 1])
        )
    # BUG FIX: this loop previously iterated args.bind_constant_file a
    # second time, re-binding the same names and feeding file *paths* to
    # loads(). Inline constants come from args.bind_constant (cf. the
    # identical pattern in Sample.py's main()).
    for i in range(0, len(args.bind_constant), 2):
        loader.addConstant(
            args.bind_constant[i],
            loader.loads(args.bind_constant[i + 1]),
        )

    model = loader.load(filename)
    if type(model) is dict and 'model' in model:
        model = model["model"]
    if args.add_masked_to_distribution and allow_mask:
        model.add_soft_masking_to_distribution()
    return model
Code example #2
Score: 0
def get_model(args, filename, allow_mask=True):
    """Load an HMM model from *filename*, applying command-line bindings.

    Parameters:
        args: argparse namespace providing ``mathType``, ``bind_file``,
            ``bind_constant_file``, ``bind_constant`` and
            ``add_masked_to_distribution``. Bind lists are flat
            ``[name1, value1, name2, value2, ...]`` sequences.
        filename: path of the model description to load.
        allow_mask: when False, soft masking is never added even if
            ``args.add_masked_to_distribution`` is set.

    Returns:
        The loaded model object (unwrapped from a ``{'model': ...}`` dict
        if the file stores it that way).
    """
    loader = HMMLoader(args.mathType)  # TODO: rename HMMLoader to ModelLoader
    register_classifier_states(loader)
    register_annotation_states(loader)
    register_cannotation_states(loader)
    register_annotations(loader)

    for i in range(0, len(args.bind_file), 2):
        loader.addFile(args.bind_file[i], args.bind_file[i + 1])
    # File-backed constants: the value is loaded from the named file.
    for i in range(0, len(args.bind_constant_file), 2):
        loader.addConstant(args.bind_constant_file[i],
                           loader.load(args.bind_constant_file[i + 1]))
    # BUG FIX: this loop previously iterated args.bind_constant_file a
    # second time, re-binding the same names and feeding file *paths* to
    # loads(). Inline constants come from args.bind_constant (cf. the
    # identical pattern in Sample.py's main()).
    for i in range(0, len(args.bind_constant), 2):
        loader.addConstant(
            args.bind_constant[i],
            loader.loads(args.bind_constant[i + 1]),
        )

    model = loader.load(filename)
    if type(model) is dict and 'model' in model:
        model = model["model"]
    if args.add_masked_to_distribution and allow_mask:
        model.add_soft_masking_to_distribution()
    return model
Code example #3
Score: 0
File: Sample.py — Project: pombredanne/realigner
def main():
    """Sample pairwise alignments from a pair HMM and write them to files.

    For each of ``n_samples`` samples, repeatedly generates an alignment
    until it contains at least one tandem repeat, then writes the
    alignment as FASTA (via ``output_file_template``, which must contain
    the ``{id}`` placeholder) plus a ``.repeats`` JSON side file with the
    sampled repeat annotations. The list of produced files is dumped as
    JSON to ``--output_files``.

    Returns:
        0 on success, 1 on invalid command-line arguments.
    """
    parser = argparse.ArgumentParser(description='Sample alignments.')
    parser.add_argument('output_file_template', type=str,
                        help="Template for output file. Have to contain " +
                        "string '{id}' as placeholder for sequence number.")
    parser.add_argument('--output_files', type=str, help="File where the " +
                        'list of output files will be written.', default='-')
    parser.add_argument('--model', type=str,
                        default='data/models/repeatHMM.js', help="Model file")
    parser.add_argument('--bind_file', nargs='*', help='Replace filenames in '
                        + 'the input_file model.', default=[])
    parser.add_argument('--bind_constant', nargs='*', help='Replace constants'
                        + ' in the input_file model.', default=[])
    parser.add_argument('--bind_constant_file', nargs='*', help='Replace' +
                        ' constants in the input_file model.', default=[])
    parser.add_argument('n_samples', type=int, help='Number of samples.')
    parser.add_argument('seq1_length', type=int,
                        help='Length of first sequence.')
    parser.add_argument('seq2_length', type=int,
                        help='Length of second sequence.')
    parsed_arg = parser.parse_args()

    # ====== Validate input parameters =====================================

    if parsed_arg.output_file_template.count("{id}") < 1:
        # BUG FIX: the message used to say '"%d"' although the check (and
        # the formatting below) uses the '{id}' placeholder.
        sys.stderr.write('ERROR: If sampling, output_file filename has to ' +
                         'contain at least one "{id}".\n')
        return 1
    if len(parsed_arg.bind_file) % 2 != 0:
        # BUG FIX: added the space missing at the concatenation point
        # (message previously read "...argumentshasto be divisible...").
        sys.stderr.write('ERROR: If binding files, the number of arguments '
                         + 'has to be divisible by 2\n')
        return 1
    if len(parsed_arg.bind_constant_file) % 2 != 0:
        sys.stderr.write('ERROR: If binding constants (as files), the number of'
                         + ' arguments has to be divisible by 2\n')
        return 1
    if len(parsed_arg.bind_constant) % 2 != 0:
        sys.stderr.write('ERROR: If binding constants, the number of'
                         + ' arguments has to be divisible by 2\n')
        return 1

    # ====== Parse parameters ==============================================

    output_filename = parsed_arg.output_file_template
    output_files_filename = parsed_arg.output_files
    output_files = list()

    # ====== Load model ====================================================
    loader = HMMLoader()
    # BUG FIX: this loop previously ranged over len(bind_constant) while
    # indexing bind_file — bindings were skipped or raised IndexError.
    for i in range(0, len(parsed_arg.bind_file), 2):
        loader.addFile(parsed_arg.bind_file[i], parsed_arg.bind_file[i + 1])
    for i in range(0, len(parsed_arg.bind_constant_file), 2):
        loader.addConstant(
            parsed_arg.bind_constant_file[i],
            loader.load(parsed_arg.bind_constant_file[i + 1])
        )
    for i in range(0, len(parsed_arg.bind_constant), 2):
        loader.addConstant(
            parsed_arg.bind_constant[i],
            loader.loads(parsed_arg.bind_constant[i + 1]),
        )
    model_filename = parsed_arg.model
    PHMM = loader.load(model_filename)["model"]

    # ====== Sample ========================================================
    PHMM.buildSampleTransitions()
    n_samples = parsed_arg.n_samples
    X_len = parsed_arg.seq1_length
    Y_len = parsed_arg.seq2_length
    dirname = os.path.dirname(output_filename)
    # BUG FIX: guard against an empty dirname — os.makedirs('') raises
    # when the template has no directory component.
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)
    for i in range(n_samples):
        done = False
        while not done:
            # Resample until the generated alignment contains at least one
            # tandem repeat (a state that emits annotation data).
            tandemRepeats = {'sequence1': [], 'sequence2': []}
            seq = PHMM.generateSequence((X_len, Y_len))
            X = ""
            Y = ""
            A = ""
            for (seq, state) in seq:
                ann_data = None
                if len(seq) == 2:
                    x, y = seq
                else:
                    # Three-element emissions carry repeat annotation data:
                    # presumably (consensus, x-copy-count, y-copy-count) —
                    # TODO confirm against the state implementation.
                    x, y, ann_data = seq
                dx, dy = len(x), len(y)
                if ann_data is not None:
                    # Repeat coordinates are in ungapped sequence space.
                    xlen = len(X.replace('-', ''))
                    ylen = len(Y.replace('-', ''))
                    if dx > 0:
                        tandemRepeats['sequence1'].append((
                            xlen, xlen + dx, dx / ann_data[1], ann_data[0], x
                        ))
                        done = True
                    if dy > 0:
                        tandemRepeats['sequence2'].append((
                            ylen, ylen + dy, dy / ann_data[2], ann_data[0], y
                        ))
                        done = True
                # Pad the shorter emission with gaps so X, Y, A stay aligned.
                A += PHMM.states[state].getChar() * max(dx, dy)
                X += x + ('-' * (dy - dx))
                Y += y + ('-' * (dx - dy))
        aln = [("sequence1", X), ("alignment", A), ("sequence2", Y)]
        json.dump(tandemRepeats, Open(output_filename.format(id=i) + '.repeats',
                                      'w'), indent=4)
        Fasta.save(aln, output_filename.format(id=i))
        output_files.append(output_filename.format(id=i))
    with Open(output_files_filename, 'w') as output_file_object:
        json.dump(output_files, output_file_object, indent=4)
    return 0
Code example #4
Score: 0
File: CreateBetterModel.py — Project: mhozza/realigner
def main(model_file, additional_parameters,
         emmisions_file, transitions_file, repeat_consensus_file,
         repeat_length_file, trf_cover_file, output_file, simple_model):
    """Assemble an HMM model JSON file from trained statistics.

    Reads emission counts (``emmisions_file``), transition counts
    (``transitions_file``) and tandem-repeat-finder coverage statistics
    (``trf_cover_file``), binds them into the model template
    ``model_file`` via HMMLoader, and dumps the resulting model JSON to
    ``output_file``. When ``simple_model`` is true, all repeat-state
    wiring is skipped.

    NOTE(review): uses ``dict.iteritems`` — Python 2 only.

    Returns the path of the written output file.
    """
    loader = HMMLoader()

    # Coverage statistics: counts keyed by annotation labels, where
    # R = repeat and M = match/non-repeat.
    with Open(trf_cover_file, 'r') as f:
        trf_cover = json.load(f)
    if not simple_model:
        # Fraction of segments annotated as repeats.
        repeat_probability = (float(trf_cover['R_segment_count']) / 
                              (trf_cover['R_segment_count'] +
                               trf_cover['M_count']))
        # Conditional label-pair frequencies among repeat-involving pairs.
        repeat_count = sum([trf_cover[x] for x in ['RR', 'RM', 'MR']])
        repeat_repeat_probability = float(trf_cover['RR']) / repeat_count
        nothing_repeat_probability = float(trf_cover['MR']) / repeat_count
        repeat_nothing_probability = float(trf_cover['RM']) / repeat_count

        # 'RR' is deliberately forced to 0.0 here; the computed
        # repeat_repeat_probability is intentionally left unused (this is
        # the difference from CreateModel.py's version of this function).
        loader.addDictionary('trackemi', {"value": {
            'RR': 0.0,#repeat_repeat_probability,
            'RM': repeat_nothing_probability,
            'MR': nothing_repeat_probability,
        }})

    for k, v in additional_parameters.iteritems():
        loader.addDictionary(k, v)
    
    # Parse emissions
    
    with Open(emmisions_file, 'r') as f:
        emm = normalize_dict(json.load(f))

    # JSON keys are string-encoded tuples (e.g. "('A', 'C')"); decode them
    # back into real tuples with ast.literal_eval.
    emm = [(ast.literal_eval(k), v) for k, v in emm.iteritems()]
    loader.addDictionary('MatchStateEmissions', {'value': emm})
    
    # Background single-residue probabilities are the marginals of the
    # pair-emission distribution, renormalized.
    background_prob = defaultdict(int)
    for ((r1, r2), v) in emm:
        background_prob[r1] += v
        background_prob[r2] += v
    background_prob = \
        {'value': list(normalize_dict(background_prob).iteritems())}
    loader.addDictionary('background-probability', background_prob)
    
    # Parse transitions
    with Open(transitions_file, 'r') as f:
        __trans = json.load(f)
    trans = dict()
    for k, v in __trans.iteritems():
        # Keys are string-encoded tuples; join them into 2-char state-pair
        # labels such as 'MX'.
        trans[''.join(ast.literal_eval(k))] = v
    trans = normalize_tuple_dict(trans)
    if not simple_model:
        # Rescale existing transitions to make room for the repeat state R,
        # then wire R in (entered with repeat_probability) and out (uniform
        # exits to X, Y and M).
        for k in trans:
            trans[k] *= (1 - repeat_probability)
        trans['MR'] = repeat_probability
        trans['XR'] = repeat_probability
        trans['YR'] = repeat_probability
        trans['RR'] = repeat_probability
        trans['RX'] = (1 - repeat_probability) / 3
        trans['RY'] = (1 - repeat_probability) / 3
        trans['RM'] = (1 - repeat_probability) / 3
       
    loader.addDictionary('trans', trans) 
        
    # Parse emissions from trf
    if not simple_model:
        # The consensus path is stored relative to the model file so the
        # model directory stays relocatable.
        loader.addFile('consensus.js', 
                       os.path.relpath(os.path.abspath(repeat_consensus_file), 
                                       os.path.dirname(model_file)))
        loader.addFile('repeatlength.js', os.path.abspath(repeat_length_file))

    model = loader.load(model_file)
    
    json_prep = {'model': model['model'].toJSON()}
    with Open(output_file, 'w') as f:
        json.dump(json_prep, f, indent=4)
    return output_file
Code example #5
Score: 0
File: CreateModel.py — Project: pombredanne/realigner
def main(model_file, additional_parameters,
         emmisions_file, transitions_file, repeat_consensus_file,
         repeat_length_file, trf_cover_file, output_file, simple_model):
    """Assemble an HMM model JSON file from trained statistics.

    Reads emission counts (``emmisions_file``), transition counts
    (``transitions_file``) and tandem-repeat-finder coverage statistics
    (``trf_cover_file``), binds them into the model template
    ``model_file`` via HMMLoader, and dumps the resulting model JSON to
    ``output_file``. When ``simple_model`` is true, all repeat-state
    wiring is skipped.

    NOTE(review): uses ``dict.iteritems`` — Python 2 only.

    Returns the path of the written output file.
    """
    loader = HMMLoader()

    # Coverage statistics: counts keyed by annotation labels, where
    # R = repeat and M = match/non-repeat.
    with Open(trf_cover_file, 'r') as f:
        trf_cover = json.load(f)
    if not simple_model:
        # Fraction of segments annotated as repeats.
        repeat_probability = (float(trf_cover['R_segment_count']) / 
                              (trf_cover['R_segment_count'] +
                               trf_cover['M_count']))
        # Conditional label-pair frequencies among repeat-involving pairs.
        repeat_count = sum([trf_cover[x] for x in ['RR', 'RM', 'MR']])
        repeat_repeat_probability = float(trf_cover['RR']) / repeat_count
        nothing_repeat_probability = float(trf_cover['MR']) / repeat_count
        repeat_nothing_probability = float(trf_cover['RM']) / repeat_count

        # Track-emission probabilities for the repeat track (unlike
        # CreateBetterModel.py, 'RR' keeps its measured value here).
        loader.addDictionary('trackemi', {"value": {
            'RR': repeat_repeat_probability,
            'RM': repeat_nothing_probability,
            'MR': nothing_repeat_probability,
        }})

    for k, v in additional_parameters.iteritems():
        loader.addDictionary(k, v)
    
    # Parse emissions
    
    with Open(emmisions_file, 'r') as f:
        emm = normalize_dict(json.load(f))

    # JSON keys are string-encoded tuples (e.g. "('A', 'C')"); decode them
    # back into real tuples with ast.literal_eval.
    emm = [(ast.literal_eval(k), v) for k, v in emm.iteritems()]
    loader.addDictionary('MatchStateEmissions', {'value': emm})
    
    # Background single-residue probabilities are the marginals of the
    # pair-emission distribution, renormalized.
    background_prob = defaultdict(int)
    for ((r1, r2), v) in emm:
        background_prob[r1] += v
        background_prob[r2] += v
    background_prob = \
        {'value': list(normalize_dict(background_prob).iteritems())}
    loader.addDictionary('background-probability', background_prob)
    
    # Parse transitions
    with Open(transitions_file, 'r') as f:
        __trans = json.load(f)
    trans = dict()
    for k, v in __trans.iteritems():
        # Keys are string-encoded tuples; join them into 2-char state-pair
        # labels such as 'MX'.
        trans[''.join(ast.literal_eval(k))] = v
    trans = normalize_tuple_dict(trans)
    if not simple_model:
        # Rescale existing transitions to make room for the repeat state R,
        # then wire R in (entered with repeat_probability) and out (uniform
        # exits to X, Y and M).
        for k in trans:
            trans[k] *= (1 - repeat_probability)
        trans['MR'] = repeat_probability
        trans['XR'] = repeat_probability
        trans['YR'] = repeat_probability
        trans['RR'] = repeat_probability
        trans['RX'] = (1 - repeat_probability) / 3
        trans['RY'] = (1 - repeat_probability) / 3
        trans['RM'] = (1 - repeat_probability) / 3
       
    loader.addDictionary('trans', trans) 
        
    # Parse emissions from trf
    if not simple_model:
        # The consensus path is stored relative to the model file so the
        # model directory stays relocatable.
        loader.addFile('consensus.js', 
                       os.path.relpath(os.path.abspath(repeat_consensus_file), 
                                       os.path.dirname(model_file)))
        loader.addFile('repeatlength.js', os.path.abspath(repeat_length_file))

    model = loader.load(model_file)
    
    json_prep = {'model': model['model'].toJSON()}
    with Open(output_file, 'w') as f:
        json.dump(json_prep, f, indent=4)
    return output_file