Esempio n. 1
0
def get_model(args, filename, allow_mask=True):
    """Load a model from *filename* with CLI-provided bindings applied.

    Registers all known state/annotation types on the loader, applies the
    file/constant bindings given on the command line, then loads the model.

    :param args: parsed CLI arguments; uses mathType, bind_file,
        bind_constant_file, bind_constant and add_masked_to_distribution.
    :param filename: model file to load.
    :param allow_mask: when False, skip the soft-masking extension even if
        args.add_masked_to_distribution is set.
    :return: the loaded model object.
    """
    loader = HMMLoader(args.mathType)  # TODO: rename HMMLoader to ModelLoader
    register_classifier_states(loader)
    register_annotation_states(loader)
    register_cannotation_states(loader)
    register_annotations(loader)

    # Bindings arrive as flat (name, value) pairs on the command line.
    for i in range(0, len(args.bind_file), 2):
        loader.addFile(args.bind_file[i], args.bind_file[i + 1])
    for i in range(0, len(args.bind_constant_file), 2):
        loader.addConstant(
            args.bind_constant_file[i],
            loader.load(args.bind_constant_file[i + 1])
        )
    # BUG FIX: this loop previously iterated bind_constant_file a second
    # time (re-parsing the same file names as inline JSON via loads());
    # inline constants live in args.bind_constant.
    for i in range(0, len(args.bind_constant), 2):
        loader.addConstant(
            args.bind_constant[i],
            loader.loads(args.bind_constant[i + 1]),
        )

    model = loader.load(filename)
    # Some model files wrap the model in a {'model': ...} envelope.
    if type(model) is dict and 'model' in model:
        model = model["model"]
    if args.add_masked_to_distribution and allow_mask:
        model.add_soft_masking_to_distribution()
    return model
Esempio n. 2
0
def main(args):
    """Find tandem repeats in args.fasta and write them to args.output.

    Optionally dumps search statistics to args.stats as JSON. Requires
    args.model (exits with status 1 otherwise).
    """
    # TODO: build model params
    if args.model is None:  # identity check, not `== None`
        print("You have to provide model")
        exit(1)
    loader = HMMLoader(LogNum)
    # Use the model's repeat ('R') state as the generator model.
    model = None
    for state in loader.load(args.model)['model'].states:
        if state.onechar == 'R':
            model = state
    if model is None:
        # BUG FIX: previously `model` stayed unbound and the call below
        # raised a confusing NameError.
        raise ValueError("model has no repeat ('R') state")
    #BEGIN COPY
    D, stats = do_find_repeats(
        args.fasta,
        None,
        model,
        LogNum,
        args.stats,
        '$',
    )
    #END COPY
    if args.stats is not None:
        # Stats keys may be non-string (tuples); JSON requires str keys.
        # items() replaces py2-only iteritems().
        out_stats = {str(k): v for k, v in stats.items()}
        with open(args.stats, 'w') as f:
            json.dump(out_stats, f, indent=4)
    # Flatten repeat objects into JSON-serializable tuples.
    for key in D:
        D[key] = [
            (x.start, x.end, x.repetitions, x.consensus, x.sequence)
            for x in D[key]
        ]
    with open(args.output, 'w') as f:
        json.dump(D, f, indent=4)
Esempio n. 3
0
def train(sequences, original_model, new_model):
    """Train the 'Repeat' state of a model and save the result as JSON.

    :param sequences: path to a JSON file with training sequences.
    :param original_model: path of the model file to start from.
    :param new_model: path where the trained model JSON is written.
    """
    loader = HMMLoader(LogNum)
    model = loader.load(original_model)['model']
    # Debug dump of the model structure in graphviz format.
    with open('mmm.dot', 'w') as f:
        f.write(model_to_dot(model))
    ID = model.statenameToID['Repeat']
    with open(sequences) as f:
        sequences = json.load(f)
    model.states[ID].trainModel(sequences)

    def ln_to_float(x):
        """Recursively convert LogNum values to plain floats in place."""
        if isinstance(x, (dict, defaultdict)):
            # items() works on both py2 and py3 (iteritems is py2-only).
            for k, v in x.items():
                x[k] = ln_to_float(v)
        elif isinstance(x, list):
            # BUG FIX: map() returns a lazy iterator on Python 3, which
            # json.dump cannot serialize; build a real list instead.
            x = [ln_to_float(v) for v in x]
        elif isinstance(x, tuple):
            x = tuple(ln_to_float(v) for v in x)
        elif isinstance(x, LogNum):
            return float(x)
        return x

    js = {"model": ln_to_float(model.toJSON())}
    with open(new_model, 'w') as f:
        json.dump(js, f, sort_keys=True, indent=4)
Esempio n. 4
0
    def load_model(self, fname):
        """Load the HMM from *fname* and index its states by one-char name.

        Sets self.fname, self.model and self.states_dict (onechar -> index).
        """
        loader = HMMLoader(float)
        register_classifier_states(loader)
        register_annotation_states(loader)
        register_cannotation_states(loader)

        self.fname = fname
        self.model = loader.load(fname)
        self.states_dict = {
            state.onechar: index
            for index, state in enumerate(self.model['model'].states)
        }
Esempio n. 5
0
 def setModel(self, model):
     """
     Set model or link to the model, so we have HMM generator.

     *model* may be a filename, an HMM, or a State; filenames and HMMs
     are resolved to their repeat ('R') state.

     :raises TypeError: if no State can be resolved from *model*.
     """
     if isinstance(model, str):
         loader = HMMLoader(self.mathType)
         for state in loader.load(model)['model'].states:
             if state.onechar == 'R':
                 model = state
     if isinstance(model, HMM):
         for state in model.states:
             if state.onechar == 'R':
                 model = state
     if not isinstance(model, State):
         # BUG FIX: `raise "TODO"` raised a plain string, which is a
         # TypeError itself on Python 3; raise a real exception.
         raise TypeError('model must be a filename, an HMM, or a State')
     self.model = model
Esempio n. 6
0
 def setModel(self, model):
     """
     Set model or link to the model, so we have HMM generator.

     *model* may be a filename, an HMM, or a State; filenames and HMMs
     are resolved to their repeat ('R') state.

     :raises TypeError: if no State can be resolved from *model*.
     """
     if isinstance(model, str):
         loader = HMMLoader(self.mathType)
         for state in loader.load(model)['model'].states:
             if state.onechar == 'R':
                 model = state
     if isinstance(model, HMM):
         for state in model.states:
             if state.onechar == 'R':
                 model = state
     if not isinstance(model, State):
         # BUG FIX: `raise "TODO"` raised a plain string, which is a
         # TypeError itself on Python 3; raise a real exception.
         raise TypeError('model must be a filename, an HMM, or a State')
     self.model = model
Esempio n. 7
0
 def __init__(
     self,
     preparer,
     state_class=SimpleMatchState,
     model='data/models/SimpleHMM2.js',
 ):
     """Remember *preparer* and pull emissions from the first state of
     type *state_class* found in the model file."""
     self._preparer = None
     self.preparer = preparer
     all_states = HMMLoader().load(model)['model'].states
     for candidate in all_states:
         if not isinstance(candidate, state_class):
             continue
         self.emissions = candidate.emissions
         break
def train(sequences, original_model, new_model):
    """Train the 'Repeat' state of a model and save the result as JSON.

    :param sequences: path to a JSON file with training sequences.
    :param original_model: path of the model file to start from.
    :param new_model: path where the trained model JSON is written.
    """
    loader = HMMLoader(LogNum)
    model = loader.load(original_model)['model']
    # Debug dump of the model structure in graphviz format.
    with open('mmm.dot', 'w') as f:
        f.write(model_to_dot(model))
    ID = model.statenameToID['Repeat']
    with open(sequences) as f:
        sequences = json.load(f)
    model.states[ID].trainModel(sequences)

    def ln_to_float(x):
        """Recursively convert LogNum values to plain floats in place."""
        if isinstance(x, (dict, defaultdict)):
            # items() works on both py2 and py3 (iteritems is py2-only).
            for k, v in x.items():
                x[k] = ln_to_float(v)
        elif isinstance(x, list):
            # BUG FIX: map() returns a lazy iterator on Python 3, which
            # json.dump cannot serialize; build a real list instead.
            x = [ln_to_float(v) for v in x]
        elif isinstance(x, tuple):
            x = tuple(ln_to_float(v) for v in x)
        elif isinstance(x, LogNum):
            return float(x)
        return x

    js = {"model": ln_to_float(model.toJSON())}
    with open(new_model, 'w') as f:
        json.dump(js, f, sort_keys=True, indent=4)
Esempio n. 9
0
def main():
    """Sample alignments from an HMM and write them to per-sample files.

    Returns 0 on success, 1 on invalid arguments.
    """
    parser = argparse.ArgumentParser(description='Sample alignments.')
    parser.add_argument('output_file_template', type=str,
                        help="Template for output file. Have to contain " +
                        "string '{id}' as placeholder for sequence number.")
    parser.add_argument('--output_files', type=str, help="File where the " +
                        'list of output files will be written.', default='-')
    parser.add_argument('--model', type=str,
                        default='data/models/repeatHMM.js', help="Model file")
    parser.add_argument('--bind_file', nargs='*', help='Replace filenames in '
                        + 'the input_file model.', default=[])
    parser.add_argument('--bind_constant', nargs='*', help='Replace constants'
                        + ' in the input_file model.', default=[])
    parser.add_argument('--bind_constant_file', nargs='*', help='Replace' +
                        ' constants in the input_file model.', default=[])
    parser.add_argument('n_samples', type=int, help='Number of samples.')
    parser.add_argument('seq1_length', type=int,
                        help='Length of first sequence.')
    parser.add_argument('seq2_length', type=int,
                        help='Length of second sequence.')
    parsed_arg = parser.parse_args()

    # ====== Validate input parameters =====================================

    if parsed_arg.output_file_template.count("{id}") < 1:
        # BUG FIX: message said '"%d"' although the check above looks for
        # the '{id}' placeholder.
        sys.stderr.write('ERROR: If sampling, output_file filename has to ' +
                         'contain at least one "{id}".\n')
        return 1
    if len(parsed_arg.bind_file) % 2 != 0:
        # BUG FIX: missing space produced "...hasto be divisible...".
        sys.stderr.write('ERROR: If binding files, the number of arguments '
                         + 'has to be divisible by 2\n')
        return 1
    if len(parsed_arg.bind_constant_file) % 2 != 0:
        sys.stderr.write('ERROR: If binding constants (as files), the number of'
                         + ' arguments has to be divisible by 2\n')
        return 1
    if len(parsed_arg.bind_constant) % 2 != 0:
        sys.stderr.write('ERROR: If binding constants, the number of'
                         + ' arguments has to be divisible by 2\n')
        return 1

    # ====== Parse parameters ==============================================

    output_filename = parsed_arg.output_file_template
    output_files_filename = parsed_arg.output_files
    output_files = list()

    # ====== Load model ====================================================
    loader = HMMLoader()
    # BUG FIX: the loop bound used len(bind_constant) while indexing
    # bind_file; use the length of the list actually being consumed.
    for i in range(0, len(parsed_arg.bind_file), 2):
        loader.addFile(parsed_arg.bind_file[i], parsed_arg.bind_file[i + 1])
    for i in range(0, len(parsed_arg.bind_constant_file), 2):
        loader.addConstant(
            parsed_arg.bind_constant_file[i],
            loader.load(parsed_arg.bind_constant_file[i + 1])
        )
    for i in range(0, len(parsed_arg.bind_constant), 2):
        loader.addConstant(
            parsed_arg.bind_constant[i],
            loader.loads(parsed_arg.bind_constant[i + 1]),
        )
    model_filename = parsed_arg.model
    PHMM = loader.load(model_filename)["model"]

    # ====== Sample ========================================================
    PHMM.buildSampleTransitions()
    n_samples = parsed_arg.n_samples
    X_len = parsed_arg.seq1_length
    Y_len = parsed_arg.seq2_length
    dirname = os.path.dirname(output_filename)
    # Guard: a template without a directory part yields dirname == ''
    # and os.makedirs('') would raise.
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)
    for i in range(n_samples):
        done = False
        # Resample until at least one tandem repeat was generated.
        while not done:
            tandemRepeats = {'sequence1': [], 'sequence2': []}
            generated = PHMM.generateSequence((X_len, Y_len))
            X = ""
            Y = ""
            A = ""
            for (emission, state) in generated:
                ann_data = None
                if len(emission) == 2:
                    x, y = emission
                else:
                    # Three-element emissions carry repeat annotation data:
                    # (consensus, x-period, y-period) — presumably; confirm
                    # against the generator.
                    x, y, ann_data = emission
                dx, dy = len(x), len(y)
                if ann_data is not None:
                    # Positions in the ungapped sequences emitted so far.
                    xlen = len(X.replace('-', ''))
                    ylen = len(Y.replace('-', ''))
                    if dx > 0:
                        tandemRepeats['sequence1'].append((
                            xlen, xlen + dx, dx / ann_data[1], ann_data[0], x
                        ))
                        done = True
                    if dy > 0:
                        tandemRepeats['sequence2'].append((
                            ylen, ylen + dy, dy / ann_data[2], ann_data[0], y
                        ))
                        done = True
                A += PHMM.states[state].getChar() * max(dx, dy)
                # Pad the shorter side with gaps to keep the alignment.
                X += x + ('-' * (dy - dx))
                Y += y + ('-' * (dx - dy))
        aln = [("sequence1", X), ("alignment", A), ("sequence2", Y)]
        json.dump(tandemRepeats, Open(output_filename.format(id=i) + '.repeats',
                                      'w'), indent=4)
        Fasta.save(aln, output_filename.format(id=i))
        output_files.append(output_filename.format(id=i))
    with Open(output_files_filename, 'w') as output_file_object:
        json.dump(output_files, output_file_object, indent=4)
    return 0
Esempio n. 10
0
 def __init__(self, sequence_regexp, loader=None):
     """
     :param sequence_regexp: pair (x_regexp, y_regexp) used to find the
         two sequence names in a model.
     :param loader: optional model loader; when None, an HMMLoader with
         annotations registered is created.
     """
     if loader is None:
         loader = HMMLoader()
         register_annotations(loader)
     # BUG FIX: a caller-supplied loader used to be dropped, leaving
     # self.loader unset and later get_annotations() crashing.
     self.loader = loader
     self.x_regexp = sequence_regexp[0]
     self.y_regexp = sequence_regexp[1]
Esempio n. 11
0
class AnnotationLoader:
    """Loads per-sequence annotations (BED interval tracks) for a model."""

    def __init__(self, sequence_regexp, loader=None):
        """
        :param sequence_regexp: pair (x_regexp, y_regexp) used to find the
            two sequence names in a model.
        :param loader: optional model loader; when None, an HMMLoader with
            annotations registered is created.
        """
        if loader is None:
            loader = HMMLoader()
            register_annotations(loader)
        # BUG FIX: a caller-supplied loader used to be dropped, leaving
        # self.loader unset and get_annotations() crashing.
        self.loader = loader
        self.x_regexp = sequence_regexp[0]
        self.y_regexp = sequence_regexp[1]

    @staticmethod
    def get_annotation_at(annotations, i):
        """
        Returns the annotation values at position i, keyed like
        *annotations*; empty dict when annotations is None.
        """
        base_annotation = dict()
        if annotations is not None:
            for key in annotations:
                base_annotation[key] = annotations[key][i]
        return base_annotation

    def _intervals_to_interval_map(self, intervals, offset):
        """
        Converts intervals from track to intervalmap, for searching.

        Currently supports binary annotations only (positions inside any
        interval map to 1, everything else to 0).
        """
        m = intervalmap()
        m[:] = 0
        for i in intervals:
            # i[1], i[2] are the start/end fields read from the track.
            m[i[1] + offset:i[2] + offset] = 1
        return m

    def _get_annotation_from_bed(self, fname, offset):
        """
        Reads intervals from a BED file; best-effort — an unreadable or
        missing file yields an empty interval map.
        """
        try:
            with track.load(fname) as ann:
                ann = ann.read(fields=['start', 'end'])
                intervals = self._intervals_to_interval_map(ann, offset)
        except Exception:
            intervals = self._intervals_to_interval_map([], 0)
        return intervals

    def _get_sequence_annotations(self, annotations,
                                  sequence_annotations_config):
        """
        Returns the interval maps for one sequence, keyed by annotation.
        """
        res = dict()
        for annotation in annotations:
            res[annotation] = self._get_annotation_from_bed(
                *sequence_annotations_config[annotation])
        return res

    def _get_seq_name(self, names, regexp):
        """Return the single name in *names* matching *regexp*.

        :raises RuntimeError: when zero or multiple names match.
        """
        r = re.compile(regexp)
        matches = [name for name in names if r.match(name)]
        if len(matches) != 1:
            raise RuntimeError('Cannot get name for regexp', regexp, '. Found',
                               len(matches), 'matches.')
        return matches[0]

    def get_annotations_from_model(self, model):
        """Return (annotations, annotations_x, annotations_y) for *model*,
        or (None, None, None) when annotations are disabled globally."""
        if not constants.annotations_enabled:
            return None, None, None
        if model is None:
            raise RuntimeError('No annotation model!')
        names = model.sequences.keys()
        x_name = self._get_seq_name(names, self.x_regexp)
        y_name = self._get_seq_name(names, self.y_regexp)
        annotations = model.annotations
        annotations_x = self._get_sequence_annotations(annotations,
                                                       model.sequences[x_name])
        annotations_y = self._get_sequence_annotations(annotations,
                                                       model.sequences[y_name])
        return annotations, annotations_x, annotations_y

    def get_annotations(self, fname):
        """Load a model file and return its annotations."""
        model = self.loader.load(fname)
        return self.get_annotations_from_model(model)
Esempio n. 12
0
def get_model(args, filename, allow_mask=True):
    """Load a model from *filename* with CLI-provided bindings applied.

    Registers all known state/annotation types on the loader, applies the
    file/constant bindings given on the command line, then loads the model.

    :param args: parsed CLI arguments; uses mathType, bind_file,
        bind_constant_file, bind_constant and add_masked_to_distribution.
    :param filename: model file to load.
    :param allow_mask: when False, skip the soft-masking extension even if
        args.add_masked_to_distribution is set.
    :return: the loaded model object.
    """
    loader = HMMLoader(args.mathType)  # TODO: rename HMMLoader to ModelLoader
    register_classifier_states(loader)
    register_annotation_states(loader)
    register_cannotation_states(loader)
    register_annotations(loader)

    # Bindings arrive as flat (name, value) pairs on the command line.
    for i in range(0, len(args.bind_file), 2):
        loader.addFile(args.bind_file[i], args.bind_file[i + 1])
    for i in range(0, len(args.bind_constant_file), 2):
        loader.addConstant(args.bind_constant_file[i],
                           loader.load(args.bind_constant_file[i + 1]))
    # BUG FIX: this loop previously iterated bind_constant_file a second
    # time (re-parsing the same file names as inline JSON via loads());
    # inline constants live in args.bind_constant.
    for i in range(0, len(args.bind_constant), 2):
        loader.addConstant(
            args.bind_constant[i],
            loader.loads(args.bind_constant[i + 1]),
        )

    model = loader.load(filename)
    # Some model files wrap the model in a {'model': ...} envelope.
    if type(model) is dict and 'model' in model:
        model = model["model"]
    if args.add_masked_to_distribution and allow_mask:
        model.add_soft_masking_to_distribution()
    return model
Esempio n. 13
0
def main(model_file, additional_parameters,
         emmisions_file, transitions_file, repeat_consensus_file,
         repeat_length_file, trf_cover_file, output_file, simple_model):
    """Assemble a repeat HMM from estimated parameter files.

    Reads emission/transition statistics and TRF coverage counts, binds
    them into *model_file* via the loader, and writes the finished model
    as JSON to *output_file*.

    :return: output_file path.
    """
    loader = HMMLoader()

    with Open(trf_cover_file, 'r') as f:
        trf_cover = json.load(f)
    if not simple_model:
        # Probability that a segment belongs to a repeat.
        repeat_probability = (float(trf_cover['R_segment_count']) /
                              (trf_cover['R_segment_count'] +
                               trf_cover['M_count']))
        repeat_count = sum([trf_cover[x] for x in ['RR', 'RM', 'MR']])
        repeat_repeat_probability = float(trf_cover['RR']) / repeat_count
        nothing_repeat_probability = float(trf_cover['MR']) / repeat_count
        repeat_nothing_probability = float(trf_cover['RM']) / repeat_count

        loader.addDictionary('trackemi', {"value": {
            # BUG FIX: 'RR' was hard-coded to 0.0 with the real value left
            # in a trailing comment (debug override); use the estimated
            # probability like the other two entries.
            'RR': repeat_repeat_probability,
            'RM': repeat_nothing_probability,
            'MR': nothing_repeat_probability,
        }})

    # items() replaces py2-only iteritems() throughout.
    for k, v in additional_parameters.items():
        loader.addDictionary(k, v)

    # Parse emissions

    with Open(emmisions_file, 'r') as f:
        emm = normalize_dict(json.load(f))

    emm = [(ast.literal_eval(k), v) for k, v in emm.items()]
    loader.addDictionary('MatchStateEmissions', {'value': emm})

    # Marginal background probability of each symbol.
    background_prob = defaultdict(int)
    for ((r1, r2), v) in emm:
        background_prob[r1] += v
        background_prob[r2] += v
    background_prob = \
        {'value': list(normalize_dict(background_prob).items())}
    loader.addDictionary('background-probability', background_prob)

    # Parse transitions
    with Open(transitions_file, 'r') as f:
        __trans = json.load(f)
    trans = dict()
    for k, v in __trans.items():
        trans[''.join(ast.literal_eval(k))] = v
    trans = normalize_tuple_dict(trans)
    if not simple_model:
        # Rescale base transitions and wire in the repeat state R.
        for k in trans:
            trans[k] *= (1 - repeat_probability)
        trans['MR'] = repeat_probability
        trans['XR'] = repeat_probability
        trans['YR'] = repeat_probability
        trans['RR'] = repeat_probability
        trans['RX'] = (1 - repeat_probability) / 3
        trans['RY'] = (1 - repeat_probability) / 3
        trans['RM'] = (1 - repeat_probability) / 3

    loader.addDictionary('trans', trans)

    # Parse emissions from trf
    if not simple_model:
        loader.addFile('consensus.js',
                       os.path.relpath(os.path.abspath(repeat_consensus_file),
                                       os.path.dirname(model_file)))
        loader.addFile('repeatlength.js', os.path.abspath(repeat_length_file))

    model = loader.load(model_file)

    json_prep = {'model': model['model'].toJSON()}
    with Open(output_file, 'w') as f:
        json.dump(json_prep, f, indent=4)
    return output_file
Esempio n. 14
0
def main(model_file, additional_parameters,
         emmisions_file, transitions_file, repeat_consensus_file,
         repeat_length_file, trf_cover_file, output_file, simple_model):
    """Assemble a repeat HMM from estimated parameter files.

    Reads emission/transition statistics and TRF coverage counts, binds
    them into *model_file* via the loader, and writes the finished model
    as JSON to *output_file*.

    :return: output_file path.
    """
    loader = HMMLoader()

    with Open(trf_cover_file, 'r') as f:
        trf_cover = json.load(f)
    if not simple_model:
        # Probability that a segment belongs to a repeat.
        repeat_probability = (float(trf_cover['R_segment_count']) /
                              (trf_cover['R_segment_count'] +
                               trf_cover['M_count']))
        repeat_count = sum([trf_cover[x] for x in ['RR', 'RM', 'MR']])
        repeat_repeat_probability = float(trf_cover['RR']) / repeat_count
        nothing_repeat_probability = float(trf_cover['MR']) / repeat_count
        repeat_nothing_probability = float(trf_cover['RM']) / repeat_count

        loader.addDictionary('trackemi', {"value": {
            'RR': repeat_repeat_probability,
            'RM': repeat_nothing_probability,
            'MR': nothing_repeat_probability,
        }})

    # items() replaces py2-only iteritems() throughout.
    for k, v in additional_parameters.items():
        loader.addDictionary(k, v)

    # Parse emissions

    with Open(emmisions_file, 'r') as f:
        emm = normalize_dict(json.load(f))

    emm = [(ast.literal_eval(k), v) for k, v in emm.items()]
    loader.addDictionary('MatchStateEmissions', {'value': emm})

    # Marginal background probability of each symbol.
    background_prob = defaultdict(int)
    for ((r1, r2), v) in emm:
        background_prob[r1] += v
        background_prob[r2] += v
    background_prob = \
        {'value': list(normalize_dict(background_prob).items())}
    loader.addDictionary('background-probability', background_prob)

    # Parse transitions
    with Open(transitions_file, 'r') as f:
        __trans = json.load(f)
    trans = dict()
    for k, v in __trans.items():
        trans[''.join(ast.literal_eval(k))] = v
    trans = normalize_tuple_dict(trans)
    if not simple_model:
        # Rescale base transitions and wire in the repeat state R.
        for k in trans:
            trans[k] *= (1 - repeat_probability)
        trans['MR'] = repeat_probability
        trans['XR'] = repeat_probability
        trans['YR'] = repeat_probability
        trans['RR'] = repeat_probability
        trans['RX'] = (1 - repeat_probability) / 3
        trans['RY'] = (1 - repeat_probability) / 3
        trans['RM'] = (1 - repeat_probability) / 3

    loader.addDictionary('trans', trans)

    # Parse emissions from trf
    if not simple_model:
        loader.addFile('consensus.js',
                       os.path.relpath(os.path.abspath(repeat_consensus_file),
                                       os.path.dirname(model_file)))
        loader.addFile('repeatlength.js', os.path.abspath(repeat_length_file))

    model = loader.load(model_file)

    json_prep = {'model': model['model'].toJSON()}
    with Open(output_file, 'w') as f:
        json.dump(json_prep, f, indent=4)
    return output_file
Esempio n. 15
0
 def __init__(self, sequence_regexp, loader=None):
     """
     :param sequence_regexp: pair (x_regexp, y_regexp) used to find the
         two sequence names in a model.
     :param loader: optional model loader; when None, an HMMLoader with
         annotations registered is created.
     """
     if loader is None:
         loader = HMMLoader()
         register_annotations(loader)
     # BUG FIX: a caller-supplied loader used to be dropped, leaving
     # self.loader unset and later get_annotations() crashing.
     self.loader = loader
     self.x_regexp = sequence_regexp[0]
     self.y_regexp = sequence_regexp[1]
Esempio n. 16
0
class AnnotationLoader:
    """Loads per-sequence annotations (BED interval tracks) for a model."""

    def __init__(self, sequence_regexp, loader=None):
        """
        :param sequence_regexp: pair (x_regexp, y_regexp) used to find the
            two sequence names in a model.
        :param loader: optional model loader; when None, an HMMLoader with
            annotations registered is created.
        """
        if loader is None:
            loader = HMMLoader()
            register_annotations(loader)
        # BUG FIX: a caller-supplied loader used to be dropped, leaving
        # self.loader unset and get_annotations() crashing.
        self.loader = loader
        self.x_regexp = sequence_regexp[0]
        self.y_regexp = sequence_regexp[1]

    @staticmethod
    def get_annotation_at(annotations, i):
        """
        Returns the annotation values at position i, keyed like
        *annotations*; empty dict when annotations is None.
        """
        base_annotation = dict()
        if annotations is not None:
            for key in annotations:
                base_annotation[key] = annotations[key][i]
        return base_annotation

    def _intervals_to_interval_map(self, intervals, offset):
        """
        Converts intervals from track to intervalmap, for searching.

        Currently supports binary annotations only (positions inside any
        interval map to 1, everything else to 0).
        """
        m = intervalmap()
        m[:] = 0
        for i in intervals:
            # i[1], i[2] are the start/end fields read from the track.
            m[i[1] + offset:i[2] + offset] = 1
        return m

    def _get_annotation_from_bed(self, fname, offset):
        """
        Reads intervals from a BED file; best-effort — an unreadable or
        missing file yields an empty interval map.
        """
        try:
            with track.load(fname) as ann:
                ann = ann.read(fields=['start', 'end'])
                intervals = self._intervals_to_interval_map(ann, offset)
        except Exception:
            intervals = self._intervals_to_interval_map([], 0)
        return intervals

    def _get_sequence_annotations(
        self,
        annotations,
        sequence_annotations_config
    ):
        """
        Returns the interval maps for one sequence, keyed by annotation.
        """
        res = dict()
        for annotation in annotations:
            res[annotation] = self._get_annotation_from_bed(
                *sequence_annotations_config[annotation]
            )
        return res

    def _get_seq_name(self, names, regexp):
        """Return the single name in *names* matching *regexp*.

        :raises RuntimeError: when zero or multiple names match.
        """
        r = re.compile(regexp)
        matches = [name for name in names if r.match(name)]
        if len(matches) != 1:
            raise RuntimeError(
                'Cannot get name for regexp', regexp, '. Found', len(matches), 'matches.'
            )
        return matches[0]

    def get_annotations_from_model(self, model):
        """Return (annotations, annotations_x, annotations_y) for *model*,
        or (None, None, None) when annotations are disabled globally."""
        if not constants.annotations_enabled:
            return None, None, None
        if model is None:
            raise RuntimeError('No annotation model!')
        names = model.sequences.keys()
        x_name = self._get_seq_name(names, self.x_regexp)
        y_name = self._get_seq_name(names, self.y_regexp)
        annotations = model.annotations
        annotations_x = self._get_sequence_annotations(
            annotations, model.sequences[x_name]
        )
        annotations_y = self._get_sequence_annotations(
            annotations, model.sequences[y_name]
        )
        return annotations, annotations_x, annotations_y

    def get_annotations(self, fname):
        """Load a model file and return its annotations."""
        model = self.loader.load(fname)
        return self.get_annotations_from_model(model)