Ejemplo n.º 1
0
def main(input_filename, output_filename):
    """Re-align the first alignment of each task's input file with ``muscle``.

    The task ids are taken from the ``SGE_TASK_ID`` / ``SGE_STEP_SIZE`` /
    ``SGE_TASK_LAST`` environment variables when ``SGE_TASK_ID`` is set;
    otherwise the single task id 1 is processed.  For every task id the
    ``{id}`` placeholder of both filename templates is filled with
    ``task_id - 1``.

    Args:
        input_filename: Template of the input alignment file name.
        output_filename: Template of the output file name.

    Raises:
        OSError: If the ``muscle`` or ``cp`` executable cannot be started.
    """
    # Local import: the block cannot rely on the file-level imports.
    import subprocess

    task_ids = [1]
    if 'SGE_TASK_ID' in os.environ:
        sge_task_id = int(os.environ['SGE_TASK_ID'])
        sge_step_size = int(os.environ.get('SGE_STEP_SIZE', 1))
        sge_task_last = int(os.environ['SGE_TASK_LAST'])
        task_ids = range(
            sge_task_id,
            min(sge_task_id + sge_step_size, sge_task_last + 1)
        )
        print('{0} {1} {2} {3}'.format(
            task_ids, sge_task_id, sge_step_size, sge_task_last))
    for task_id in task_ids:
        inp_filename = input_filename.format(id=task_id - 1)
        out_filename = output_filename.format(id=task_id - 1)
        aln = list(Fasta.load(
            inp_filename,
            '',
            Alignment,
            sequence_selectors=['sequence1', 'sequence2']))[0]
        tmp_filename = get_temp_filename()
        Fasta.save(zip(aln.names, aln.sequences), tmp_filename)
        # Argument lists instead of shell strings: file names containing
        # spaces or shell metacharacters are passed through unharmed.
        with open(os.devnull, 'w') as devnull:
            subprocess.call(
                ['muscle', '-in', tmp_filename, '-out', out_filename],
                stderr=devnull)
        # The exit status is ignored on purpose, as before: a missing
        # ``.repeats`` companion file does not abort the remaining tasks.
        subprocess.call(
            ['cp', inp_filename + '.repeats', out_filename + '.repeats'])
Ejemplo n.º 2
0
 def __init__(self, *args):
     """Initialise the test case and load the shared alignment fixture.

     Stores the two gapped sequences with their annotations and the
     ``Fasta.alnToSeq`` version of each sequence on the instance.
     """
     unittest.TestCase.__init__(self, *args)
     loader = DataLoader()
     loaded = loader.loadSequence("data/test_data/sequences/alignment.fa")
     # The first element of the loaded tuple is not used by the tests.
     _, self.seq_x, self.ann_x, self.seq_y, self.ann_y = loaded
     self.seq_xs = Fasta.alnToSeq(self.seq_x)
     self.seq_ys = Fasta.alnToSeq(self.seq_y)
Ejemplo n.º 3
0
def realign_file(args, model, output_filename, alignment_filename):
    """Realign every alignment found in ``alignment_filename``.

    Each alignment is loaded, passed through ``args.mask_repeats``,
    realigned with an instance of ``args.algorithm`` and appended to
    ``output_filename``.  When ``args.draw`` is non-empty the result is
    also drawn to ``args.draw``.

    Args:
        args: Parsed options.  This function reads ``expand_model``,
            ``tracks``, ``annotation_model``, ``alignment_regexp``,
            ``sequence_regexp``, ``draw``, ``mask_repeats`` and
            ``algorithm``.
        model: Model used for realignment, and for computing annotations
            unless ``args.annotation_model`` is set.
        output_filename: File that receives the realigned alignments.
        alignment_filename: Input alignment file.

    Returns:
        ``1`` if an alignment has fewer than two sequences (processing
        stops at that alignment), ``None`` otherwise.
    """
    # begin of HACK
    # 'trf_cons' is forced into args.tracks while the annotations are
    # computed (presumably so the consensus track gets produced) and is
    # removed again afterwards unless the caller had requested it.
    # Membership is recorded *before* the add: the previous code kept an
    # alias of the same set, so the restore step could never trigger.
    if args.expand_model:
        had_trf_cons = 'trf_cons' in args.tracks
        args.tracks.add('trf_cons')
    m = model
    if args.annotation_model:
        m = args.annotation_model
    annotations = compute_annotations(args, alignment_filename, m)
    if args.expand_model:
        consensuses = annotations['trf_cons']
        if not had_trf_cons:
            args.tracks.discard('trf_cons')
    # end of HACK
    with Open(output_filename, 'w') as output_file_object:
        for aln in Fasta.load(
                alignment_filename,
                args.alignment_regexp,
                Alignment,
                sequence_selectors=args.sequence_regexp):
            if len(aln.sequences) < 2:
                sys.stderr.write("ERROR: not enough sequences in file\n")
                return 1
            if len(args.draw) == 0:
                # No image requested: use the brainwash()-wrapped canvas.
                drawer = brainwash(AlignmentCanvas)()
            else:
                drawer = AlignmentCanvas()
                drawer.add_original_alignment(aln)
            aln, unmask_repeats = args.mask_repeats(aln, annotations)
            seq1, seq2 = tuple(map(Fasta.alnToSeq, aln.sequences[:2]))
            perf.msg("Data loaded in {time} seconds.")
            perf.replace()
            if args.expand_model:
                # The consensuses of both sequences are needed to expand
                # the model.
                A = consensuses[aln.names[0]]
                B = consensuses[aln.names[1]]
                cons = list(A.union(B))
                real_model = model.expandModel({'consensus': cons})
            else:
                real_model = model
            realigner = args.algorithm()
            realigner.setDrawer(drawer)
            realigner.prepareData(seq1, aln.names[0], seq2, aln.names[1], aln,
                                  real_model, annotations, args)

            aln = realigner.realign(0, len(seq1), 0, len(seq2))
            aln = unmask_repeats(aln)
            perf.msg("Sequence was realigned in {time} seconds.")
            perf.replace()
            if len(args.draw) > 0:
                drawer.add_sequence('X', seq1)
                drawer.add_sequence('Y', seq2)
                drawer.add_alignment_line(101, (255, 0, 255, 255), 2,
                                          AlignmentPositionGenerator(
                                              Alignment([aln[0], aln[2]])))
                drawer.draw(args.draw, 2000, 2000)
                perf.msg("Image was drawn in {time} seconds.")
            # Save output_file
            Fasta.saveAlignmentPiece(aln, output_file_object)
Ejemplo n.º 4
0
    def _classification(self, sequence_x, ann_x, sequence_y, ann_y):
        """Classify every column of a gapped pair of sequences.

        Columns in which exactly one sequence has a gap are handed to
        ``self.clf.multi_prepare_predict``; every other column contributes
        the constant ``0``.  The three result streams are combined with
        ``merge``, driven by the per-column kind (0 = no single-sided gap,
        1 = gap only in ``sequence_y``, 2 = gap only in ``sequence_x``).
        """
        assert len(sequence_y) == len(sequence_x)
        length = len(sequence_x)

        def column_kind(i):
            # Columns with a gap in both sequences are grouped with kind 0.
            gap_x = sequence_x[i] == '-'
            gap_y = sequence_y[i] == '-'
            if gap_x and not gap_y:
                return 2
            if gap_y and not gap_x:
                return 1
            return 0

        def ungapped_positions():
            # For each column: how many non-gap symbols of x and of y
            # precede it.  Double-gap columns advance neither counter.
            coords = []
            at_x, at_y = 0, 0
            for i in xrange(len(sequence_x)):
                coords.append((at_x, at_y))
                if sequence_x[i] != '-':
                    at_x += 1
                if sequence_y[i] != '-':
                    at_y += 1
            return coords

        sequence_xs = Fasta.alnToSeq(sequence_x)
        sequence_ys = Fasta.alnToSeq(sequence_y)

        positions = ungapped_positions()

        match_columns = [i for i in xrange(length) if column_kind(i) == 0]
        ret_match = (0 for _ in match_columns)

        insert_x_columns = [i for i in xrange(length) if column_kind(i) == 1]
        ret_insertX = self.clf.multi_prepare_predict(
            (sequence_xs, positions[col][0], ann_x,
             sequence_ys, positions[col][1], ann_y)
            for col in insert_x_columns
        )

        insert_y_columns = [i for i in xrange(length) if column_kind(i) == 2]
        ret_insertY = self.clf.multi_prepare_predict(
            (sequence_ys, positions[col][1], ann_y,
             sequence_xs, positions[col][0], ann_x)
            for col in insert_y_columns
        )

        kinds = (column_kind(i) for i in xrange(length))
        return merge(kinds, ret_match, ret_insertX, ret_insertY)
Ejemplo n.º 5
0
 def __init__(self, *args):
     """Run the ``TestCase`` initialiser and load the alignment fixture.

     The gapped sequences, their annotations and the ``Fasta.alnToSeq``
     form of both sequences are kept as instance attributes.
     """
     unittest.TestCase.__init__(self, *args)
     fixture_path = "data/test_data/sequences/alignment.fa"
     _, seq_x, ann_x, seq_y, ann_y = DataLoader().loadSequence(fixture_path)
     self.seq_x = seq_x
     self.ann_x = ann_x
     self.seq_y = seq_y
     self.ann_y = ann_y
     self.seq_xs = Fasta.alnToSeq(seq_x)
     self.seq_ys = Fasta.alnToSeq(seq_y)
Ejemplo n.º 6
0
    def _classification(self, sequence_x, ann_x, sequence_y, ann_y):
        """Produce one classification value per column of a gapped pair.

        Single-sided gap columns are scored through
        ``self.clf.multi_prepare_predict`` (with the gapped sequence given
        second); all other columns yield ``0``.  ``merge`` interleaves the
        three streams according to each column's kind.
        """
        assert len(sequence_y) == len(sequence_x)
        n_columns = len(sequence_x)

        def kind_of(i):
            # 0: both or neither symbol is a gap, 1: gap only in y,
            # 2: gap only in x.
            x_is_gap = sequence_x[i] == '-'
            y_is_gap = sequence_y[i] == '-'
            if x_is_gap == y_is_gap:
                return 0
            return 2 if x_is_gap else 1

        def columns_of(kind):
            return [i for i in xrange(n_columns) if kind_of(i) == kind]

        sequence_xs = Fasta.alnToSeq(sequence_x)
        sequence_ys = Fasta.alnToSeq(sequence_y)

        # positions[i] = (index into ungapped x, index into ungapped y) at
        # column i; a symbol advances its own counter, a gap does not.
        positions = []
        pos_x = pos_y = 0
        for i in xrange(n_columns):
            positions.append((pos_x, pos_y))
            pos_x += 0 if sequence_x[i] == '-' else 1
            pos_y += 0 if sequence_y[i] == '-' else 1

        match_cols = columns_of(0)
        ret_match = (0 for _ in match_cols)

        x_cols = columns_of(1)
        ret_insertX = self.clf.multi_prepare_predict(
            (sequence_xs, positions[c][0], ann_x,
             sequence_ys, positions[c][1], ann_y)
            for c in x_cols)

        y_cols = columns_of(2)
        ret_insertY = self.clf.multi_prepare_predict(
            (sequence_ys, positions[c][1], ann_y,
             sequence_xs, positions[c][0], ann_x)
            for c in y_cols)

        return merge((kind_of(i) for i in xrange(n_columns)),
                     ret_match, ret_insertX, ret_insertY)
Ejemplo n.º 7
0
def create_fasta(fname, seq_basename, sequences, rename=False):
    """Write ``sequences`` to the FASTA file ``fname`` and return the names.

    Each item of ``sequences`` is indexed by ``'name'`` and ``'sequence'``;
    the sequence text is upper-cased before saving.  With ``rename`` set,
    the names are ``seq_basename`` followed by 1, 2, 3, ... instead of the
    stored names.
    """
    if rename:
        names = [seq_basename + str(number)
                 for number in range(1, len(sequences) + 1)]
    else:
        names = [record['name'] for record in sequences]
    assert len(names) == len(sequences)
    residues = [record['sequence'].upper() for record in sequences]
    Fasta.save(zip(names, residues), fname)
    return names
Ejemplo n.º 8
0
def create_fasta(fname, seq_basename, sequences, rename=False):
    """Save ``sequences`` as FASTA into ``fname``; return the names used.

    Names are either taken from each item's ``"name"`` entry or, when
    ``rename`` is true, generated as ``seq_basename`` + running number
    starting at 1.  Sequence text is written in upper case.
    """
    names = []
    if rename:
        for offset in range(len(sequences)):
            names.append(seq_basename + str(offset + 1))
    else:
        for item in sequences:
            names.append(item["name"])
    assert len(names) == len(sequences)
    upper_cased = [item["sequence"].upper() for item in sequences]
    Fasta.save(zip(names, upper_cased), fname)
    return names
Ejemplo n.º 9
0
def main(input_files, output_file):
    """Draw the alignments from ``input_files`` into one image.

    Every existing input file contributes its first alignment as a line
    through its coordinate pairs, scaled by the module-level ``width``;
    columns annotated ``'R'`` in the third sequence get a small rectangle.
    Each input keeps its own colour and a small per-index offset.  The
    canvas size is derived from the first alignment of the first input
    file.  ``IndexError`` and ``IOError`` raised while drawing one
    alignment are ignored and that alignment is skipped.
    """
    global width
    alignments = []
    for name in input_files:
        if os.path.exists(name):
            alignments.append(list(
                Fasta.load(name, '', Alignment,
                           ['^sequence1', '^sequence2', '^[av].*'])))
        else:
            # Keep the slot so later inputs keep their colour index.
            alignments.append(None)

    first = alignments[0][0]
    x_len = len(Fasta.alnToSeq(first.sequences[0]))
    y_len = len(Fasta.alnToSeq(first.sequences[1]))
    image = Image.new('RGB', (x_len * width + 50, y_len * width + 50),
                      (255, 255, 255))
    pen = ImageDraw.Draw(image)
    palette = [(0, 0, 0), (255, 0, 0), (0, 255, 0), (0, 0, 255),
               (255, 0, 255), (0, 255, 255)]
    for idx, loaded in enumerate(alignments):
        if not loaded:
            # Missing file or file without any alignment.
            continue
        aln = loaded[0]
        try:
            annotation = aln.sequences[2]
            coords = aln.getCoordPairs()
            print(coords)
            x_shift = width / 2 + 25 + idx
            y_shift = width / 2 + 25 + idx * 2
            path = [(x * width + x_shift, y * width + y_shift)
                    for x, y, _ in coords]
            pen.line(path, fill=palette[idx])
            if annotation is not None:
                quarter = width / 4
                for x, y, ind in coords:
                    if annotation[ind] == 'R':
                        centre_x = x * width + x_shift
                        centre_y = y * width + y_shift
                        pen.rectangle(
                            [(centre_x - quarter, centre_y - quarter),
                             (centre_x + quarter, centre_y + quarter)],
                            outline=palette[idx])
        except (IndexError, IOError):
            pass
    del pen
    image.save(output_file)
Ejemplo n.º 10
0
def main(files_filename, output_filename, suffix, base_dir):
    """Concatenate per-piece alignment files into whole alignments.

    ``files_filename`` is a JSON list of piece file names in which an empty
    string marks the end of one alignment.  For every piece whose name does
    not contain ``'keep'`` the processed variant (last two characters of
    the name replaced by ``suffix``, optionally looked up in ``base_dir``)
    is used when it exists and is non-empty; otherwise the original piece
    is used and its middle row is replaced by dots.  The three rows of all
    pieces are concatenated and written to ``output_filename`` at every
    end-of-alignment marker.

    Args:
        files_filename: JSON file with the list of piece file names.
        output_filename: File receiving the joined alignments.
        suffix: Replacement for the last two characters of a piece name.
        base_dir: Directory of the processed pieces, or ``None`` to keep
            the directory of the original piece.
    """
    X = ""
    Y = ""
    A = ""
    with Open(output_filename, 'w') as ff:
        # Close the list file right after parsing instead of leaking it.
        with Open(files_filename) as files_file:
            files = json.load(files_file)
        total = len(files)
        done = 0
        for filename in files:
            if done % 100 == 0:
                # '.2f' prints e.g. 10.00; the former '.2' meant two
                # significant digits and rendered 10.0 as '1e+01'.
                print('{}/{} {:.2f}%'.format(done, total,
                                             100.0 * done / total))
            if filename == "":
                # End of one alignment: flush the accumulated rows.
                Fasta.saveAlignmentPiece(
                    [(X_name, X), (Y_name, Y), (A_name, A)], ff)
                X = ""
                Y = ""
                A = ""
                continue
            done += 1
            old_filename = filename
            keep = False

            if 'keep' not in filename:
                filename = filename[:-2] + suffix
                if base_dir is not None:
                    filename = base_dir + '/' + filename.split('/')[-1]
                try:
                    with Open(filename, 'r') as f:
                        l = len(''.join(f).strip())
                    if l == 0:
                        # Empty result: fall back to the original piece.
                        filename = old_filename
                        keep = True
                except IOError:
                    # Missing result: fall back to the original piece.
                    filename = old_filename
                    keep = True
            if 'keep' in filename:
                keep = True
            aln = list(Fasta.load(filename, ''))[0]
            assert len(aln) == 3
            assert len(aln[0][1]) == len(aln[1][1]) == len(aln[2][1])
            X += aln[0][1]
            if keep:
                A += '.' * len(aln[0][1])
            else:
                A += aln[1][1]
            Y += aln[2][1]
            X_name = aln[0][0]
            A_name = aln[1][0]
            Y_name = aln[2][0]
Ejemplo n.º 11
0
def main(files_filename, output_filename, suffix, base_dir):
    """Join alignment pieces listed in ``files_filename`` back together.

    The JSON list in ``files_filename`` names the pieces in order; an empty
    string closes one alignment.  A piece whose name lacks ``"keep"`` is
    replaced by its processed variant (name with the last two characters
    swapped for ``suffix``, optionally located in ``base_dir``) if that
    file exists and is non-empty.  Otherwise the original piece is used
    and its middle row is written as dots.  Accumulated rows are written
    to ``output_filename`` whenever an empty-string marker is reached.

    Args:
        files_filename: JSON file with the list of piece file names.
        output_filename: File receiving the joined alignments.
        suffix: Replacement for the last two characters of a piece name.
        base_dir: Directory of the processed pieces, or ``None`` to keep
            the directory of the original piece.
    """
    X = ""
    Y = ""
    A = ""
    with Open(output_filename, "w") as ff:
        # The list file is closed as soon as it has been parsed.
        with Open(files_filename) as files_file:
            files = json.load(files_file)
        total = len(files)
        done = 0
        for filename in files:
            if done % 100 == 0:
                # Fixed-point percentage; "{:.2}" produced output such as
                # "1e+01" because it limits significant digits.
                print("{}/{} {:.2f}%".format(done, total,
                                             100.0 * done / total))
            if filename == "":
                # Marker between alignments: write what was collected.
                Fasta.saveAlignmentPiece(
                    [(X_name, X), (Y_name, Y), (A_name, A)], ff)
                X = ""
                Y = ""
                A = ""
                continue
            done += 1
            old_filename = filename
            keep = False

            if "keep" not in filename:
                filename = filename[:-2] + suffix
                if base_dir is not None:
                    filename = base_dir + "/" + filename.split("/")[-1]
                try:
                    with Open(filename, "r") as f:
                        l = len("".join(f).strip())
                    if l == 0:
                        # Processed file is empty: use the original piece.
                        filename = old_filename
                        keep = True
                except IOError:
                    # Processed file is missing: use the original piece.
                    filename = old_filename
                    keep = True
            if "keep" in filename:
                keep = True
            aln = list(Fasta.load(filename, ""))[0]
            assert len(aln) == 3
            assert len(aln[0][1]) == len(aln[1][1]) == len(aln[2][1])
            X += aln[0][1]
            if keep:
                A += "." * len(aln[0][1])
            else:
                A += aln[1][1]
            Y += aln[2][1]
            X_name = aln[0][0]
            A_name = aln[1][0]
            Y_name = aln[2][0]
Ejemplo n.º 12
0
def main(input_file, index1, index2, emissionOutput, transitionOutput):
    """Count emissions and transitions over all alignments of a file.

    From every alignment the sequences at positions ``index1`` and
    ``index2`` are paired column by column.  Columns not rejected by
    ``skip`` are counted in the emission table; consecutive column types
    (``getType`` over all columns except double gaps, passed through
    ``sortTypes``) are counted in the transition table.  Both tables are
    written as JSON.
    """
    emissions = defaultdict(int)
    transitions = defaultdict(int)

    def aggregate(X, Y):
        columns = list(zip(X, Y))
        for column in columns:
            if not skip(column):
                emissions[str(upper(column))] += 1
        Types = list(sortTypes(
            [getType(column) for column in columns if column != ('-', '-')]))
        for step in zip(Types, Types[1:]):
            transitions[str(step)] += 1

    # The selection carries over between alignments: an alignment lacking
    # one of the indices reuses the sequence picked from an earlier one.
    X = Y = None
    for aln in Fasta.load(input_file, '[.][0-9]+$'):
        for position, (_, sequence) in enumerate(aln):
            if position == index1:
                X = sequence
            if position == index2:
                Y = sequence
        aggregate(X, Y)

    with Open(emissionOutput, 'w') as emission_file:
        json.dump(emissions, emission_file, indent=4)

    with Open(transitionOutput, 'w') as transition_file:
        json.dump(transitions, transition_file, indent=4)
def main(arg):
    """Cut every alignment into consecutive column ranges and write them.

    For each alignment a piece size between ``arg.min_split_size`` and
    ``arg.max_split_size`` is chosen: the one that leaves the largest
    final piece, with ties going to the smaller size.  For every range in
    which neither of the first two sequences consists solely of ``'-'``,
    two lines ``<name>.<from>-<to> <sequence slice>`` are written to
    ``arg.output``.
    """
    smallest = arg.min_split_size
    largest = arg.max_split_size
    with open(arg.output, 'w') as out:
        for alignment_file in arg.alignment:
            for aln in Fasta.load(alignment_file, arg.alignment_regexp,
                                  Alignment,
                                  sequence_selectors=arg.sequence_regexp):
                total = len(aln.sequences[0])
                candidates = []
                for size in range(smallest, largest + 1):
                    remainder = total % size
                    tail = None
                    if remainder >= smallest:
                        # The remainder can stand as a piece of its own.
                        tail = remainder
                    if smallest <= remainder + size <= largest:
                        # The remainder can be merged into the last piece.
                        tail = remainder + size
                    if tail is None:
                        continue
                    candidates.append((-min(tail, size), size))
                candidates.sort()
                best = candidates[0][1]
                cuts = list(range(0, total, best))
                if best + total % best < largest:
                    # Drop the last cut so the remainder joins the piece
                    # before it.
                    cuts.pop()
                cuts.append(total)
                first_name, second_name = aln.names[0], aln.names[1]
                for fr, to in zip(cuts, cuts[1:]):
                    s1 = aln.sequences[0][fr:to]
                    s2 = aln.sequences[1][fr:to]
                    if len(s1.strip('-')) > 0 and len(s2.strip('-')) > 0:
                        out.write(
                            '{}.{}-{} {}\n'.format(first_name, fr, to, s1))
                        out.write(
                            '{}.{}-{} {}\n'.format(second_name, fr, to, s2))
Ejemplo n.º 14
0
def do_find_repeats(alignment_file, paramSeq, model, mathType, _stats, separator):
    """Annotate the repeats of every alignment in ``alignment_file``.

    A ``TRFDriver`` run supplies repeats per sequence name; for each
    alignment the distinct consensuses of its sequences are passed to
    ``find_repeats_in_alignment``.  The per-alignment results are merged
    into one dict.  When ``_stats`` is not ``None`` the values returned by
    ``compute_statistics`` are summed per key.

    Returns:
        Tuple ``(repeats, stats)``; ``stats`` is a ``defaultdict(int)``.
    """
    modelParam = {"mathType": mathType, "modelFactory": model}
    trf_repeats = TRFDriver().run(alignment_file, paramSeq)
    found = dict()
    stats = defaultdict(int)
    loaded = Fasta.load(alignment_file, separator, Alignment)
    for number, alignment in enumerate(loaded, 1):
        print("Annotating alignment {0}".format(number))
        per_sequence = [trf_repeats[name] for name in alignment.names]
        consensus_list = list(set(
            repeat.consensus
            for repeat in itertools.chain.from_iterable(per_sequence)))
        repeats = find_repeats_in_alignment(
            alignment, consensus_list, modelParam)
        print(repeats)
        found.update(repeats)
        if _stats is not None:
            for key, value in compute_statistics(repeats).iteritems():
                stats[key] += value
    return found, stats
Ejemplo n.º 15
0
def main(input_file, index1, index2, emissionOutput, transitionOutput):
    """Collect emission and transition counts and dump them as JSON.

    The sequences at positions ``index1`` and ``index2`` of each alignment
    are zipped into columns.  Each column that ``skip`` does not reject
    adds one to its emission count; each pair of neighbouring column types
    adds one to its transition count.  Keys are the ``str()`` of the
    respective tuples.
    """
    tables = {
        'emissions': defaultdict(int),
        'transitions': defaultdict(int),
    }

    def aggregate(X, Y):
        pairs = list(zip(X, Y))
        for pair in pairs:
            if skip(pair):
                continue
            tables['emissions'][str(upper(pair))] += 1
        # Double-gap columns do not take part in the type sequence.
        typed = [getType(pair) for pair in pairs if pair != ('-', '-')]
        Types = list(sortTypes(typed))
        for neighbours in zip(Types, Types[1:]):
            tables['transitions'][str(neighbours)] += 1

    X = None
    Y = None
    for aln in Fasta.load(input_file, '[.][0-9]+$'):
        position = 0
        for entry in aln:
            sequence = entry[1]
            if position == index1:
                X = sequence
            if position == index2:
                Y = sequence
            position += 1
        aggregate(X, Y)

    for path, key in ((emissionOutput, 'emissions'),
                      (transitionOutput, 'transitions')):
        with Open(path, 'w') as f:
            json.dump(tables[key], f, indent=4)
Ejemplo n.º 16
0
 def getSequences(self, fname, sequence_regexp=None):
     """Return the first two sequences of the first alignment in ``fname``.

     Args:
         fname: Alignment file to read.
         sequence_regexp: Sequence selectors handed to ``Fasta.load``;
             defaults to ``["^sequence1$", "^sequence2$"]``.  The value
             used is remembered in ``self.sequence_regexp``.

     Returns:
         Tuple ``(seq1, seq2)``.

     Raises:
         ParseException: If the file holds no alignment or the first
             alignment has fewer than two selected sequences.
     """
     alignment_regexp = ''
     if sequence_regexp is None:
         sequence_regexp = ["^sequence1$", "^sequence2$"]
     self.sequence_regexp = sequence_regexp
     # The None default turns "no alignment at all" into the
     # ParseException below instead of an uncaught StopIteration.
     aln = next(
         Fasta.load(fname, alignment_regexp, Alignment, sequence_regexp),
         None)
     if aln is None or len(aln.sequences) < 2:
         raise ParseException('Not enough sequences in file\n')
     seq1 = aln.sequences[0]
     seq2 = aln.sequences[1]
     return seq1, seq2
Ejemplo n.º 17
0
 def getSequences(self, fname, sequence_regexp=None):
     """Load the first alignment of ``fname`` and return its first two rows.

     Args:
         fname: Alignment file to read.
         sequence_regexp: Sequence selectors for ``Fasta.load``; when
             ``None``, ``["^sequence1$", "^sequence2$"]`` is used.  The
             selectors in effect are stored in ``self.sequence_regexp``.

     Returns:
         Tuple ``(seq1, seq2)``.

     Raises:
         ParseException: If no alignment could be read or the alignment
             has fewer than two selected sequences.
     """
     alignment_regexp = ''
     if sequence_regexp is None:
         sequence_regexp = ["^sequence1$", "^sequence2$"]
     self.sequence_regexp = sequence_regexp
     # Without the default, an empty input escaped as StopIteration and
     # the ``aln is None`` check below could never fire.
     aln = next(
         Fasta.load(fname, alignment_regexp, Alignment, sequence_regexp),
         None
     )
     if aln is None or len(aln.sequences) < 2:
         raise ParseException('Not enough sequences in file\n')
     return aln.sequences[0], aln.sequences[1]
Ejemplo n.º 18
0
    def prepare_training_data(
        self,
        sequence_x,
        annotations_x,
        sequence_y,
        annotations_y,
    ):
        """Build classifier training data from two gapped sequences.

        Positive examples come from ``prepare_positive_data`` and negative
        ones from ``prepare_negative_data``; the first three components of
        both results are added pairwise and returned as a 3-tuple.
        """
        assert len(sequence_y) == len(sequence_x)

        sequence_xs = Fasta.alnToSeq(sequence_x)
        sequence_ys = Fasta.alnToSeq(sequence_y)

        # Arguments shared by the positive and the negative preparation.
        shared = (
            sequence_x,
            sequence_xs,
            annotations_x,
            sequence_y,
            sequence_ys,
            annotations_y,
        )

        positive, matched_pos, seq_size, weights_set = \
            self.prepare_positive_data(*shared)
        negative = self.prepare_negative_data(
            *(shared + (matched_pos, seq_size, weights_set)))

        return tuple(positive[part] + negative[part] for part in range(3))
Ejemplo n.º 19
0
    def prepare_training_data(
        self,
        sequence_x,
        annotations_x,
        sequence_y,
        annotations_y,
    ):
        """Prepare classifier training data from sequences that contain gaps.

        The result is a 3-tuple; each component is the positive-example
        part (``prepare_positive_data``) followed by the matching
        negative-example part (``prepare_negative_data``).
        """
        assert len(sequence_y) == len(sequence_x)

        ungapped_x = Fasta.alnToSeq(sequence_x)
        ungapped_y = Fasta.alnToSeq(sequence_y)

        positive_result = self.prepare_positive_data(
            sequence_x, ungapped_x, annotations_x,
            sequence_y, ungapped_y, annotations_y,
        )
        positive, matched_pos, seq_size, weights_set = positive_result

        negative = self.prepare_negative_data(
            sequence_x, ungapped_x, annotations_x,
            sequence_y, ungapped_y, annotations_y,
            matched_pos, seq_size, weights_set,
        )

        first = positive[0] + negative[0]
        second = positive[1] + negative[1]
        third = positive[2] + negative[2]
        return first, second, third
Ejemplo n.º 20
0
def main(input_files, output_file):
    """Render the alignments of ``input_files`` into a single image file.

    The first alignment of each existing input file is drawn as a line
    through its coordinate pairs (scaled by the module-level ``width``),
    with a rectangle on every column whose annotation symbol is ``'R'``.
    Inputs are told apart by colour and by a small per-index shift.
    Drawing of an alignment is abandoned silently on ``IndexError`` or
    ``IOError``.
    """
    global width

    def load_or_none(name):
        # A missing file still occupies a slot so colour indices stay put.
        if not os.path.exists(name):
            return None
        return list(Fasta.load(
            name, '', Alignment, ['^sequence1', '^sequence2', '^[av].*']))

    alignments = [load_or_none(name) for name in input_files]

    # The canvas is sized from the first alignment of the first input.
    reference = alignments[0][0]
    x_len = len(Fasta.alnToSeq(reference.sequences[0]))
    y_len = len(Fasta.alnToSeq(reference.sequences[1]))
    canvas_size = (x_len * width + 50, y_len * width + 50)
    I = Image.new('RGB', canvas_size, (255, 255, 255))
    D = ImageDraw.Draw(I)
    colors = [
        (0, 0, 0),
        (255, 0, 0),
        (0, 255, 0),
        (0, 0, 255),
        (255, 0, 255),
        (0, 255, 255),
    ]
    for i in range(len(alignments)):
        candidates = alignments[i]
        if candidates is None or len(candidates) == 0:
            continue
        aln = candidates[0]
        try:
            annotation = aln.sequences[2]
            coords = aln.getCoordPairs()
            print(coords)
            x_shift = width / 2 + 25 + i
            y_shift = width / 2 + 25 + i * 2
            points = []
            for x, y, _ in coords:
                points.append((x * width + x_shift, y * width + y_shift))
            D.line(points, fill=colors[i])
            if annotation is None:
                continue
            for x, y, ind in coords:
                if annotation[ind] != 'R':
                    continue
                left = x * width + x_shift - width / 4
                top = y * width + y_shift - width / 4
                right = x * width + x_shift + width / 4
                bottom = y * width + y_shift + width / 4
                D.rectangle([(left, top), (right, bottom)],
                            outline=colors[i])
        except IndexError:
            pass
        except IOError:
            pass
    del D
    I.save(output_file)
Ejemplo n.º 21
0
def main(input_file, output_file, trf):
    """Compute the repeat annotation track of ``input_file`` as JSON.

    Args:
        input_file: Alignment file to annotate.
        output_file: File receiving the JSON-encoded annotation track.
        trf: Candidate paths of the TRF executable; the first one that
            exists is used.

    Raises:
        IOError: If none of the candidate executables exists.
    """
    # THIS IS ONLY GENERATOR!!!
    # Nothing is read until compute_annotation_track consumes it below.
    alns = (Alignment(a)
            for a in Fasta.load(input_file, '[.][0-9]+$', Alignment))

    # 1. run trf
    driver = None
    for trf_executable in trf:
        if os.path.exists(trf_executable):
            driver = TRFDriver(trf_executable)
            break
    if driver is None:
        # Previously the path list itself was used as the driver and the
        # call failed with an unrelated AttributeError.
        raise IOError(
            'No TRF executable found among: {0}'.format(list(trf)))
    repeats = driver.run(input_file)

    A = list(compute_annotation_track(alns, repeats))
    # Context manager so the output is flushed and closed deterministically.
    with Open(output_file, 'w') as f:
        json.dump(A, f, indent=4)
Ejemplo n.º 22
0
def expectation_generator(args, model, alignment_filename, annotations):
    """Yield Baum-Welch counts for every alignment in ``alignment_filename``.

    For each alignment the first two sequences are stripped of gaps with
    ``Fasta.alnToSeq``, repeat generators are filled from the ``'trf'``
    and ``'original_repeats'`` annotations (when present) and
    ``model.getBaumWelchCounts`` is evaluated over the positions produced
    by ``AlignmentBeamGenerator``.

    Args:
        args: Options; ``alignment_regexp``, ``sequence_regexp``,
            ``beam_width`` and ``repeat_width`` are read.
        model: Model providing ``getBaumWelchCounts``.
        alignment_filename: Alignment file to read.
        annotations: Mapping of annotation type to per-sequence repeats.

    Yields:
        Dict with the keys ``"probability"``, ``"transitions"`` and
        ``"emissions"``.

    Raises:
        ValueError: If an alignment has fewer than two sequences.
    """
    for aln in Fasta.load(alignment_filename,
                          args.alignment_regexp,
                          Alignment,
                          sequence_selectors=args.sequence_regexp):
        if len(aln.sequences) < 2:
            sys.stderr.write("ERROR: not enough sequences in file\n")
            # A string cannot be raised; doing so produced a TypeError
            # that hid the real message.
            raise ValueError("ERROR: not enough sequences in file")
        seq1, seq2 = tuple(map(Fasta.alnToSeq, aln.sequences[:2]))
        positionGenerator = list(AlignmentBeamGenerator(aln, args.beam_width))

        RX = RepeatGenerator(None, args.repeat_width)
        RY = RepeatGenerator(None, args.repeat_width)
        for rt in ['trf', 'original_repeats']:
            if rt in annotations:
                RX.addRepeats(annotations[rt][aln.names[0]])
                RY.addRepeats(annotations[rt][aln.names[1]])
        RX.buildRepeatDatabase()
        RY.buildRepeatDatabase()
        if 'Repeat' in model.statenameToID:
            model.states[model.statenameToID['Repeat']].addRepeatGenerator(
                RX, RY)

        (transitions, emissions), probability = model.getBaumWelchCounts(
            seq1,
            0,
            len(seq1),
            seq2,
            0,
            len(seq2),
            positionGenerator=positionGenerator)
        yield {
            "probability": probability,
            "transitions": transitions,
            "emissions": emissions,
        }
Ejemplo n.º 23
0
def main(input_file, realign_output, do_not_touch_output,
         list_of_files_output, max_length, wrap_length,
         min_seq_length):
    """Split alignments into intervals and list the file names for them.

    Every alignment is cut into consecutive intervals; each interval gets
    a file name from the ``realign_output`` or the ``do_not_touch_output``
    template (``{id}`` is a running counter per template).  The names are
    collected in order, with an empty string after each alignment, and
    written as JSON to ``list_of_files_output``.

    NOTE(review): the pieces themselves are not written here -- the
    ``Fasta.save`` call is commented out, so ``aln`` is built but unused.

    Args:
        input_file: Alignment file; rows 0 and 4 of each alignment are
            used as the two sequences and row 2 as the annotation.
        realign_output: File name template for intervals to be realigned.
        do_not_touch_output: File name template for intervals left as is.
        list_of_files_output: JSON file receiving the list of names.
        max_length: Length beyond which a running interval is closed.
        wrap_length: Margin used when an interval is cut inside a block.
        min_seq_length: Minimum ungapped sequence length for realignment.
    """
    realign_counter = 0
    do_not_touch_counter = 0
    files = []
    for alignment in Fasta.load(input_file, '\.[0-9]*$'):
        if realign_counter % 100 == 0:
            print(realign_counter, do_not_touch_counter,alignment[0][0])
        # NOTE(review): alignment_len is only referenced by the
        # commented-out code further down.
        alignment_len = len(alignment[0][1])

        annotation = alignment[2][1]

        # !!! We expect that first block is not repeat
        # changes: every index where the annotation symbol differs from
        # the previous one, followed by the annotation length.
        changes = [i for i in range(1, len(annotation))
                   if annotation[i-1] != annotation[i]] + [len(annotation)]
        # Blocks: (start, end) pairs between consecutive changes, then a
        # sentinel block that lies past the end of the annotation, and
        # finally the leading block (0, first change) put in front.
        Blocks = zip(changes, changes[1:]) + [(len(annotation), len(annotation) + max_length + 10)]
        Blocks = [(0, Blocks[0][0])] + Blocks
        printed = 0
        # block_start starts at 0 (it used to be None), so the
        # ``block_start == None`` branch below is never entered.
        block_start = 0#None
        block_end = None
        intervals = []
        # Visit every second block starting with index 1; ``previous`` is
        # the block right before it.
        for block_id in range(1, len(Blocks), 2):
            current = Blocks[block_id]
            previous = Blocks[block_id - 1]
            if block_start == None:
                startpp = max(printed, previous[0])
                if previous[1] - startpp > wrap_length:
                    intervals.append((printed, startpp))
                    printed = startpp
                    block_start = startpp
            else:
                # Do I add this block, or do I start a new one?
                if current[1] - block_start > max_length:
                    if previous[1] - previous[0] > wrap_length * 2:
                        # The previous block is long: close the running
                        # interval wrap_length into it, emit its middle as
                        # an interval of its own and restart wrap_length
                        # before its end.
                        intervals.append((block_start, previous[0] + wrap_length))
                        intervals.append((previous[0] + wrap_length, previous[1] - wrap_length))
                        printed = previous[1] - wrap_length
                        block_start = previous[1] - wrap_length
                    else:
                        # The previous block is short: cut at its midpoint.
                        split = (previous[0] + previous[1]) / 2
                        intervals.append((block_start, split))
                        block_start = split
                        printed = split
                    # Start a new one
        # Whatever has not been emitted yet forms the last interval.
        intervals.append((printed, len(annotation)))
        # The intervals must cover the annotation exactly and be contiguous.
        assert(len(annotation) == sum([y - x for x, y in intervals]))
        for i in range(1, len(intervals)):
            assert(intervals[i - 1][1] == intervals[i][0])

        #t = list(range(0, alignment_len, max_length)) + [alignment_len]
        #intervals = zip(t, t[1:]) 

        for start, stop in intervals:
            if start >= len(annotation):
                continue
            if start == stop:
                continue
            assert(start < stop)
            ann = alignment[2][1][start:stop]
            output = None
            # NOTE(review): the lengths below are taken over the whole
            # sequences, not over the [start:stop] slice -- confirm that
            # this is intended.
            seq1 = alignment[0][1]
            seq2 = alignment[4][1]
            seq1_len = len(seq1) - seq1.count('-') - seq1.count('.')
            seq2_len = len(seq2) - seq2.count('-') - seq2.count('.')
            # Leave the interval untouched when it has no 'R' column, only
            # 'R' columns, or when one of the sequences is too short.
            if ann.count('R') == 0 or min(seq1_len, seq2_len) < min_seq_length or ann.count('R') == len(ann):
                output = do_not_touch_output.format(id=do_not_touch_counter)
                do_not_touch_counter += 1
            else:   
                output = realign_output.format(id=realign_counter)
                realign_counter += 1
            files.append(output)
            aln = [
                (alignment[0][0], alignment[0][1][start:stop]),
                (alignment[2][0], alignment[2][1][start:stop]),
                (alignment[4][0], alignment[4][1][start:stop])
            ]
            #Fasta.save(aln, output, width=-1)
        # An empty name marks the end of this alignment in the list.
        files.append('');
        
    with Open(list_of_files_output, 'w') as f:
        json.dump(files, f, indent=4)
Ejemplo n.º 24
0
def main(inp, out, alignment_regexp, sequence_regexp, trf=trf_paths):
    """Collect repeat statistics for the alignments in ``inp`` as JSON.

    Repeats reported by the TRF driver are projected onto the columns of
    the first two sequences of each alignment.  Counted are the columns
    without and with a repeat in either sequence (``M_count``,
    ``R_count``), the number of repeat segments (``R_segment_count``) and,
    per segment, a two-letter key built from ``'R'``/``'M'`` telling
    whether the first and the second sequence have a repeat inside it.

    Args:
        inp: Alignment file.
        out: File receiving the JSON-encoded statistics.
        alignment_regexp: Alignment selector passed to ``Fasta.load``.
        sequence_regexp: Sequence selectors passed to ``Fasta.load``.
        trf: Candidate paths of the TRF executable; the first existing
            one is used.
    """
    # NOTE(review): if none of the paths exists, ``trf`` is still the path
    # collection and the ``trf.run`` call below is made on it.
    for trf_executable in trf:
        if os.path.exists(trf_executable):
            trf = TRFDriver(trf_executable, mathType=float)
            break
    repeats = trf.run(inp)

    stats = defaultdict(int)

    for aln in Fasta.load(inp,
                          alignment_regexp,
                          Alignment,
                          sequence_selectors=sequence_regexp):
        X_index = 0
        Y_index = 1

        # Repeats of each sequence, converted with the sequence-to-alignment
        # mapping (presumably into alignment column coordinates -- confirm
        # against translate_repeat_to_annotation).
        X_trf = list(
            translate_repeat_to_annotation(repeats[aln.names[X_index]],
                                           aln.seq_to_aln[X_index]))
        Y_trf = list(
            translate_repeat_to_annotation(repeats[aln.names[Y_index]],
                                           aln.seq_to_aln[Y_index]))

        # One symbol per column: 'M' by default, 'R' inside a repeat.
        # B_ann receives the repeats of both sequences.
        X_ann = list("M" * len(aln.sequences[X_index]))
        Y_ann = list("M" * len(aln.sequences[Y_index]))
        B_ann = list("M" * len(aln.sequences[Y_index]))
        for repeat in X_trf:
            # Clamp the (inclusive) end to the last column; this modifies
            # the repeat object itself.
            if repeat.end >= len(X_ann):
                repeat.end = len(X_ann) - 1
            rlen = 1 + repeat.end - repeat.start
            X_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
            B_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
        for repeat in Y_trf:
            if repeat.end >= len(Y_ann):
                repeat.end = len(Y_ann) - 1
            rlen = 1 + repeat.end - repeat.start
            Y_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
            B_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
        # Checks that the three tracks still have the same length after
        # the slice assignments above.
        assert (len(X_ann) == len(Y_ann) and len(B_ann) == len(Y_ann))

        M_count = len([x for x in B_ann if x == 'M'])
        R_count = len([x for x in B_ann if x == 'R'])
        # Pair every symbol with its successor (track padded with 'M' at
        # both ends) and count the places where a run of 'R' begins.
        R_segments_count = len([
            x for x in zip('M' + ''.join(B_ann), ''.join(B_ann) + 'M')
            if x[0] != 'R' and x[1] == 'R'
        ])
        stats['M_count'] += M_count
        stats['R_count'] += R_count
        stats['R_segment_count'] += R_segments_count
        # Indices at which the padded track switches symbol.
        changes = [
            i for i, x in zip(range(len(B_ann) + 1),
                              zip('M' + ''.join(B_ann), ''.join(B_ann) + 'M'))
            if x[0] != x[1]
        ]
        # Consecutive pairs of switches are the (start, stop) bounds of
        # the 'R' segments, stop being exclusive.
        R_segments = [(changes[i], changes[i + 1])
                      for i in range(0,
                                     len(changes) - (len(changes) % 2), 2)]

        assert (R_segments_count == len(R_segments))
        for start, stop in R_segments:
            # XX / YY become 'R' if the respective sequence has a repeat
            # column anywhere inside the segment.
            XX = 'M'
            YY = 'M'
            for i in range(start, stop):
                if X_ann[i] == 'R':
                    XX = 'R'
                if Y_ann[i] == 'R':
                    YY = 'R'
                assert (B_ann[i] == 'R')
            stats[XX + YY] += 1

    with Open(out, 'w') as f:
        json.dump(stats, f, indent=4)
Ejemplo n.º 25
0
def select_sequences(inp_filename, out_filename, sequences):
    """Write the selected sequences of the first alignment to a new file.

    All alignments are loaded from ``inp_filename`` through ``Fasta.load``
    (empty separator, ``sequences`` passed as ``sequence_selectors``); only
    the first one is kept and saved to ``out_filename`` as name/sequence
    pairs.  An input without any alignment raises ``IndexError``.
    """
    loaded = list(
        Fasta.load(
            inp_filename,
            '',
            Alignment,
            sequence_selectors=sequences,
        )
    )
    first_alignment = loaded[0]
    named_sequences = zip(first_alignment.names, first_alignment.sequences)
    Fasta.save(named_sequences, out_filename)
Ejemplo n.º 26
0
def _repeat_annotation(coord, annotation):
    """Collect the residues that sit in columns annotated as repeat.

    ``coord`` is a sequence of ``(x, column, y)`` triples and ``annotation``
    is indexable by ``column``.  Each triple whose column is ``'R'`` adds
    ``(x, -1)`` and/or ``(-1, y)`` for its non-negative coordinates.  If the
    first entry of ``coord`` is not a triple, an empty set is returned.
    """
    if len(coord[0]) != 3:
        return set()
    residues = set()
    for x, column, y in coord:
        if annotation[column] == 'R':
            if x >= 0:
                residues.add((x, -1))
            if y >= 0:
                residues.add((-1, y))
    return residues


def _block_span(xs, ys):
    """Return ``((min_x, max_x + 1), (min_y, max_y + 1))`` for one block.

    An empty coordinate set is represented by ``-1`` (span ``(-1, 0)``).
    """
    xs = xs or {-1}
    ys = ys or {-1}
    return ((min(xs), max(xs) + 1), (min(ys), max(ys) + 1))


def _repeat_blocks(coord, annotation):
    """Collect the spans of maximal runs of columns annotated as repeat.

    Consecutive triples of ``coord`` whose column is ``'R'`` form one block;
    the block is described by the half-open ranges of the non-negative x and
    y coordinates seen in it.  If the first entry of ``coord`` is not a
    triple, an empty set is returned.
    """
    if len(coord[0]) != 3:
        return set()
    blocks = set()
    xs, ys = set(), set()
    for x, column, y in coord:
        if annotation[column] == 'R':
            if x >= 0:
                xs.add(x)
            if y >= 0:
                ys.add(y)
        elif xs or ys:
            blocks.add(_block_span(xs, ys))
            xs, ys = set(), set()
    if xs or ys:
        # The annotation ended inside a repeat block.
        blocks.add(_block_span(xs, ys))
    return blocks


def _distance_to_repeat(annotation):
    """Return, per column, the distance to the nearest ``'R'`` column.

    Two sweeps (left-to-right, then right-to-left) keep the smaller value.
    The running distance is deliberately NOT reset between the sweeps so the
    numbers stay identical to the historical output when the annotation
    contains no ``'R'`` at all.
    """
    dists = [99999999] * len(annotation)
    running = 9999999
    for i in range(len(annotation)):
        running = 0 if annotation[i] == 'R' else running + 1
        dists[i] = min(dists[i], running)
    for i in reversed(range(len(annotation))):
        running = 0 if annotation[i] == 'R' else running + 1
        dists[i] = min(dists[i], running)
    return dists


def _compare_alignments(correct, alignment, fun, tp, correct_file, aln_file):
    """Compute the statistics dictionary for one (correct, aligned) pair.

    ``fun`` transforms the coordinate pairs of ``correct`` before they are
    compared; ``tp`` is the label of that transformation.  ``correct_file``
    and ``aln_file`` are only echoed into the result.

    Raises:
        TypeError: if the aligned coordinate tuples have neither 2 nor 3
            entries.
    """
    correct_len = len(correct.getCoordPairs(False))
    total_len = (correct_len * 2 - correct.sequences[0].count('-') -
                 correct.sequences[2].count('-'))
    # getCoordPairs() is called again wherever the original did so: it is
    # not visible here whether `fun` modifies the list it receives.
    ccc = fun(correct.getCoordPairs(False), correct)
    if tp == 'removed_repeats':
        correct_len = len(ccc)
        total_len = 0
        for v1, _, v2 in ccc:
            if v1 >= 0:
                total_len += 1
            if v2 >= 0:
                total_len += 1

    acc = alignment.getCoordPairs(False)
    # Real lists (not map objects) so that len() also works on Python 3.
    cc = [(x[0], x[2]) for x in ccc]
    if len(acc[0]) == 3:
        ac = [(x[0], x[2]) for x in acc]
    elif len(acc[0]) == 2:
        ac = acc
    else:
        raise TypeError(
            'coordinate tuples must have 2 or 3 entries, got %d' %
            len(acc[0]))
    correct_set = set(cc)
    aligned_set = set(ac)

    intersect = correct_set.intersection(aligned_set)
    not_in_c = correct_set.difference(aligned_set)
    not_in_a = aligned_set.difference(correct_set)
    symm_diff = correct_set.symmetric_difference(aligned_set)

    # Every non-negative coordinate of a shared pair scores one point.
    score = 0
    for v1, v2 in intersect:
        if v1 >= 0:
            score += 1
        if v2 >= 0:
            score += 1

    # Histogram of correct pairs by distance from the nearest repeat column.
    position = dict()
    for x, column, y in ccc:
        position[(x, y)] = column
    dists = _distance_to_repeat(correct.sequences[1])
    dists_correct = defaultdict(int)
    dists_total = defaultdict(int)
    for pos in correct_set:
        d = dists[position[pos]]
        if d == 0:
            continue
        dists_total[d] += 1
        # Set lookup instead of scanning the `ac` list for every pair.
        if pos in aligned_set:
            dists_correct[d] += 1

    crann = _repeat_annotation(correct.getCoordPairs(False),
                               correct.sequences[1])
    arann = _repeat_annotation(alignment.getCoordPairs(False),
                               alignment.sequences[1])
    cbann = _repeat_blocks(correct.getCoordPairs(False),
                           correct.sequences[1])
    abann = _repeat_blocks(alignment.getCoordPairs(False),
                           alignment.sequences[1])

    # Histogram of the lengths of runs of correctly aligned pairs.
    segment_length_histogram = defaultdict(int)
    run = 0
    for pair in ac:
        if pair in correct_set:
            run += 1
        elif run:
            segment_length_histogram[run] += 1
            run = 0
    if run > 0:
        segment_length_histogram[run] += 1

    # NOTE(review): '+mistakes' carries the same value as 'intersect' and
    # '%correct' is 100 minus the matched percentage -- confirm that this is
    # what the consumers of the JSON expect.
    # Repeat blocks are counted (len), not weighted by their length.
    result = {
        'corect': correct_file,
        'alignment': aln_file,
        'c-lenght': len(cc),
        'a-length': len(ac),
        'intersect': len(intersect),
        '%correct': (100.0 - float(len(intersect) * 100) / correct_len
                     if correct_len > 0 else 100),
        '+mistakes': len(intersect),
        '+len': correct_len,
        '+RepTP': len(crann & arann),
        '+RepTN': total_len - len(crann | arann),
        '+RepFP': len(arann - crann),
        '+RepFN': len(crann - arann),
        '+BlkTP': len(cbann & abann),
        '+BlkTN': 0,
        '+BlkFP': len(abann - cbann),
        '+BlkFN': len(cbann - abann),
        '%score': float(score) * 100 / total_len if total_len > 0 else 0,
        'c-a': len(not_in_c),
        'a-c': len(not_in_a),
        'symmetric_difference': len(symm_diff),
        'correct_len_histogram': segment_length_histogram,
        '@+dists_correct': dists_correct,
        '@+dists_total': dists_total,
    }
    # Percentages are meaningless without a denominator: drop them.
    if correct_len == 0:
        del result['%correct']
    if total_len == 0:
        del result['%score']
    return result


def main(correct_file, aln_file, output_file, interval=None):
    """Compare produced alignments with the correct ones and dump JSON stats.

    For every task id the alignments loaded from ``correct_file`` and
    ``aln_file`` (both formatted with ``id=task_id - 1``) are compared under
    three transformations of the correct coordinate pairs (``identity``,
    ``expand_repeats``, ``remove_repeats``).  The statistics, keyed by
    transformation label, are written as JSON to ``output_file`` (formatted
    the same way).  When a file holds several alignments, only the
    statistics of the last pair are kept per transformation.

    Task ids are derived from the ``SGE_TASK_ID``, ``SGE_STEP_SIZE`` and
    ``SGE_TASK_LAST`` environment variables; ``interval``, an inclusive
    ``(first, last)`` pair, overrides them.  A transformation whose input
    cannot be read (``IOError``) is silently left out of the output.
    """
    # NOTE(review): with neither an SGE task id nor `interval` the only task
    # id is None and `task_id - 1` below raises TypeError -- confirm what
    # the stand-alone behaviour is supposed to be.
    task_ids = [None]
    if 'SGE_TASK_ID' in os.environ:
        if os.environ['SGE_TASK_ID'] != 'undefined':
            sge_task_id = int(os.environ['SGE_TASK_ID'])
            if 'SGE_STEP_SIZE' in os.environ:
                sge_step_size = int(os.environ['SGE_STEP_SIZE'])
            else:
                sge_step_size = 1
            sge_task_last = int(os.environ['SGE_TASK_LAST'])
            task_ids = range(
                sge_task_id,
                min(sge_task_id + sge_step_size, sge_task_last + 1))
    if interval is not None:
        task_ids = range(interval[0], interval[1] + 1)

    for task_id in task_ids:
        separator = ''
        output = {}
        for fun, tp in [(identity, 'standard'),
                        (expand_repeats, 'expanded_repeats'),
                        (remove_repeats, 'removed_repeats')]:
            try:
                for correct, alignment in zip(
                        Fasta.load(correct_file.format(id=task_id - 1),
                                   separator, Alignment),
                        Fasta.load(aln_file.format(id=task_id - 1), separator,
                                   Alignment)):
                    output[tp] = _compare_alignments(
                        correct, alignment, fun, tp, correct_file, aln_file)
            except IOError:
                # Best effort: a missing/unreadable input only skips this
                # transformation.
                pass

        with Open(output_file.format(id=task_id - 1), 'w') as f:
            json.dump(output, f, indent=4)
Ejemplo n.º 27
0
def main(input_file, output_file):
    """Annotate the tandem repeats of every pairwise alignment in a file.

    TRF is run once on ``input_file``.  For each alignment with exactly two
    sequences, five rows are written to ``output_file``: the first sequence,
    the repeat consensus projected on it, a combined ``'R'``/``'.'`` repeat
    annotation, the consensus projected on the second sequence, and the
    second sequence.  Alignments with a different number of sequences are
    reported with ``error`` on stdout and skipped.

    Raises:
        RuntimeError: if none of the paths in ``trf_paths`` exists.
    """
    trf = None
    for trf_executable in trf_paths:
        if os.path.exists(trf_executable):
            # No early exit (matches the original): the last existing path
            # in `trf_paths` is the one that gets used.
            trf = TRFDriver(trf_executable)
    if trf is None:
        raise RuntimeError("No trf found")
    repeats = trf.run(input_file)

    with open(output_file, 'w') as f:
        for alignment in Fasta.load(input_file, r'\.[0-9]*$', Alignment):
            if len(alignment.sequences) != 2:
                print('error')
                continue
            width = len(alignment.sequences[0])
            annotation = ['.'] * width
            annotationX = ['.'] * width
            annotationY = ['.'] * width
            for seq_name in alignment.names:
                # Index of the LAST sequence carrying this name.
                index = None
                for i, name in enumerate(alignment.names):
                    if seq_name == name:
                        index = i
                translator = alignment.seq_to_aln[index]
                revtranslator = alignment.aln_to_seq[index]
                consensus_row = annotationX if index == 0 else annotationY
                for repeat in repeats[seq_name]:
                    first_column = translator[repeat.start]
                    for i in range(first_column, translator[repeat.end]):
                        annotation[i] = 'R'
                        # NOTE(review): `revtranslator` is indexed by the
                        # offset inside the repeat, not by the alignment
                        # column `i` -- confirm this is intended.
                        j = i - first_column
                        consensus_row[i] = repeat.consensus[
                            revtranslator[j] % len(repeat.consensus)]

            nm = alignment.names[0]
            aln = [
                (alignment.names[0], alignment.sequences[0].replace('.', '-')),
                ('consensusX' + nm, ''.join(annotationX)),
                ('annotation' + nm, ''.join(annotation)),
                ('consensusY' + nm, ''.join(annotationY)),
                (alignment.names[1], alignment.sequences[1].replace('.', '-'))
            ]
            Fasta.saveAlignmentPiece(aln, f, -1)
Ejemplo n.º 28
0
def simulate(
    n,
    datadir='data/sequences/train_sequences/',
    fname='simulated_alignment',
):
    """Generate a random three-sequence alignment with gene annotations.

    ``n`` alignment columns are simulated.  The results are written into
    ``datadir`` using ``fname`` as the file-name stem: a FASTA alignment
    (``.fa``), one BED track of ``gene`` intervals per sequence (``.bed``;
    an existing track file is removed first) and a JSON description of the
    annotations (``.js``).
    """
    seq_names = ["sequence1", "sequence2", "sequence3"]
    annotation_name = 'gene'

    alignment_extension = ".fa"
    annotations_extension = ".bed"
    config_extension = ".js"

    # Construction order is kept: gene chain, three deletion chains, coin.
    gene_chain = MarkovChain(P_START_GENE, P_STOP_GENE)
    deletion_chains = [
        MarkovChain(P_START_DELETE, P_STOP_DELETE) for _ in range(3)
    ]
    gene_coin = BiasedCoin(P_NOT_MUTATE_GENE)

    master_gene = []
    gene_tracks = [[], [], []]
    dna_tracks = [[], [], []]

    for _column in range(n):
        # Gene state of the master sequence; inside a gene every sequence
        # draws its own state, otherwise all of them inherit the master's.
        ancestral = gene_chain.get_state()
        if ancestral:
            states = [gene_coin.flip(), gene_coin.flip(), gene_coin.flip()]
        else:
            states = [ancestral, ancestral, ancestral]

        coin_12 = create_dna_mutation_coin(states[0] + states[1])
        coin_13 = create_dna_mutation_coin(states[0] + states[2])

        # All sequences start from one random base; a failed coin flip
        # replaces the base with one of the three other characters.
        base = DNA_CHARS[random.randint(0, 3)]
        bases = [base, base, base]
        for k, coin in enumerate((coin_12, coin_12, coin_13)):
            if coin.flip():
                continue
            pick = random.randint(0, 2)
            if DNA_CHARS[pick] == bases[k]:
                pick = 3
            bases[k] = DNA_CHARS[pick]

        # Deletions turn the base into a gap.
        for k, chain in enumerate(deletion_chains):
            if chain.get_state():
                bases[k] = '-'

        master_gene.append(ancestral)
        for k in range(3):
            gene_tracks[k].append(states[k])
            dna_tracks[k].append(bases[k])

    # Output: stale annotation tracks are removed before new ones are made.
    bed_paths = [
        os.path.join(
            datadir,
            fname + '_' + name + '_' + annotation_name + annotations_extension
        )
        for name in seq_names
    ]
    for path in bed_paths:
        if os.path.isfile(path):
            os.remove(path)

    intervals = [
        sequence_to_intervals(get_sequence(genes, dna), annotation_name)
        for genes, dna in zip(gene_tracks, dna_tracks)
    ]

    annotations = Annotations()
    annotations.setAnnotations([annotation_name])
    annotations.addSequences(list(seq_names))
    for name, path in zip(seq_names, bed_paths):
        annotations.addAnnotationFile(name, annotation_name, path)

    Fasta.save(
        [(name, ''.join(dna)) for name, dna in zip(seq_names, dna_tracks)],
        os.path.join(datadir, fname + alignment_extension)
    )

    for path, sequence_intervals in zip(bed_paths, intervals):
        with track.new(path, 'bed') as t:
            t.fields = ['start', 'end', 'name']
            t.write("chr1", sequence_intervals)

    with Open(os.path.join(datadir, fname + config_extension), "w") as f:
        json.dump(annotations.toJSON(), f)
Ejemplo n.º 29
0
def _repeat_annotation(coord, annotation):
    """Collect the residues that sit in columns annotated as repeat.

    ``coord`` is a sequence of ``(x, column, y)`` triples and ``annotation``
    is indexable by ``column``.  Each triple whose column is ``'R'`` adds
    ``(x, -1)`` and/or ``(-1, y)`` for its non-negative coordinates.  If the
    first entry of ``coord`` is not a triple, an empty set is returned.
    """
    if len(coord[0]) != 3:
        return set()
    residues = set()
    for x, column, y in coord:
        if annotation[column] == 'R':
            if x >= 0:
                residues.add((x, -1))
            if y >= 0:
                residues.add((-1, y))
    return residues


def _block_span(xs, ys):
    """Return ``((min_x, max_x + 1), (min_y, max_y + 1))`` for one block.

    An empty coordinate set is represented by ``-1`` (span ``(-1, 0)``).
    """
    xs = xs or {-1}
    ys = ys or {-1}
    return ((min(xs), max(xs) + 1), (min(ys), max(ys) + 1))


def _repeat_blocks(coord, annotation):
    """Collect the spans of maximal runs of columns annotated as repeat.

    Consecutive triples of ``coord`` whose column is ``'R'`` form one block;
    the block is described by the half-open ranges of the non-negative x and
    y coordinates seen in it.  If the first entry of ``coord`` is not a
    triple, an empty set is returned.
    """
    if len(coord[0]) != 3:
        return set()
    blocks = set()
    xs, ys = set(), set()
    for x, column, y in coord:
        if annotation[column] == 'R':
            if x >= 0:
                xs.add(x)
            if y >= 0:
                ys.add(y)
        elif xs or ys:
            blocks.add(_block_span(xs, ys))
            xs, ys = set(), set()
    if xs or ys:
        # The annotation ended inside a repeat block.
        blocks.add(_block_span(xs, ys))
    return blocks


def _distance_to_repeat(annotation):
    """Return, per column, the distance to the nearest ``'R'`` column.

    Two sweeps (left-to-right, then right-to-left) keep the smaller value.
    The running distance is deliberately NOT reset between the sweeps so the
    numbers stay identical to the historical output when the annotation
    contains no ``'R'`` at all.
    """
    dists = [99999999] * len(annotation)
    running = 9999999
    for i in range(len(annotation)):
        running = 0 if annotation[i] == 'R' else running + 1
        dists[i] = min(dists[i], running)
    for i in reversed(range(len(annotation))):
        running = 0 if annotation[i] == 'R' else running + 1
        dists[i] = min(dists[i], running)
    return dists


def _compare_alignments(correct, alignment, fun, tp, correct_file, aln_file):
    """Compute the statistics dictionary for one (correct, aligned) pair.

    ``fun`` transforms the coordinate pairs of ``correct`` before they are
    compared; ``tp`` is the label of that transformation.  ``correct_file``
    and ``aln_file`` are only echoed into the result.

    Raises:
        TypeError: if the aligned coordinate tuples have neither 2 nor 3
            entries.
    """
    correct_len = len(correct.getCoordPairs(False))
    total_len = (correct_len * 2 - correct.sequences[0].count('-') -
                 correct.sequences[2].count('-'))
    # getCoordPairs() is called again wherever the original did so: it is
    # not visible here whether `fun` modifies the list it receives.
    ccc = fun(correct.getCoordPairs(False), correct)
    if tp == 'removed_repeats':
        correct_len = len(ccc)
        total_len = 0
        for v1, _, v2 in ccc:
            if v1 >= 0:
                total_len += 1
            if v2 >= 0:
                total_len += 1

    acc = alignment.getCoordPairs(False)
    # Real lists (not map objects) so that len() also works on Python 3.
    cc = [(x[0], x[2]) for x in ccc]
    if len(acc[0]) == 3:
        ac = [(x[0], x[2]) for x in acc]
    elif len(acc[0]) == 2:
        ac = acc
    else:
        raise TypeError(
            'coordinate tuples must have 2 or 3 entries, got %d' %
            len(acc[0]))
    correct_set = set(cc)
    aligned_set = set(ac)

    intersect = correct_set.intersection(aligned_set)
    not_in_c = correct_set.difference(aligned_set)
    not_in_a = aligned_set.difference(correct_set)
    symm_diff = correct_set.symmetric_difference(aligned_set)

    # Every non-negative coordinate of a shared pair scores one point.
    score = 0
    for v1, v2 in intersect:
        if v1 >= 0:
            score += 1
        if v2 >= 0:
            score += 1

    # Histogram of correct pairs by distance from the nearest repeat column.
    position = dict()
    for x, column, y in ccc:
        position[(x, y)] = column
    dists = _distance_to_repeat(correct.sequences[1])
    dists_correct = defaultdict(int)
    dists_total = defaultdict(int)
    for pos in correct_set:
        d = dists[position[pos]]
        if d == 0:
            continue
        dists_total[d] += 1
        # Set lookup instead of scanning the `ac` list for every pair.
        if pos in aligned_set:
            dists_correct[d] += 1

    crann = _repeat_annotation(correct.getCoordPairs(False),
                               correct.sequences[1])
    arann = _repeat_annotation(alignment.getCoordPairs(False),
                               alignment.sequences[1])
    cbann = _repeat_blocks(correct.getCoordPairs(False),
                           correct.sequences[1])
    abann = _repeat_blocks(alignment.getCoordPairs(False),
                           alignment.sequences[1])

    # Histogram of the lengths of runs of correctly aligned pairs.
    segment_length_histogram = defaultdict(int)
    run = 0
    for pair in ac:
        if pair in correct_set:
            run += 1
        elif run:
            segment_length_histogram[run] += 1
            run = 0
    if run > 0:
        segment_length_histogram[run] += 1

    # NOTE(review): '+mistakes' carries the same value as 'intersect' and
    # '%correct' is 100 minus the matched percentage -- confirm that this is
    # what the consumers of the JSON expect.
    # Repeat blocks are counted (len), not weighted by their length.
    result = {
        'corect': correct_file,
        'alignment': aln_file,
        'c-lenght': len(cc),
        'a-length': len(ac),
        'intersect': len(intersect),
        '%correct': (100.0 - float(len(intersect) * 100) / correct_len
                     if correct_len > 0 else 100),
        '+mistakes': len(intersect),
        '+len': correct_len,
        '+RepTP': len(crann & arann),
        '+RepTN': total_len - len(crann | arann),
        '+RepFP': len(arann - crann),
        '+RepFN': len(crann - arann),
        '+BlkTP': len(cbann & abann),
        '+BlkTN': 0,
        '+BlkFP': len(abann - cbann),
        '+BlkFN': len(cbann - abann),
        '%score': float(score) * 100 / total_len if total_len > 0 else 0,
        'c-a': len(not_in_c),
        'a-c': len(not_in_a),
        'symmetric_difference': len(symm_diff),
        'correct_len_histogram': segment_length_histogram,
        '@+dists_correct': dists_correct,
        '@+dists_total': dists_total,
    }
    # Percentages are meaningless without a denominator: drop them.
    if correct_len == 0:
        del result['%correct']
    if total_len == 0:
        del result['%score']
    return result


def main(correct_file, aln_file, output_file, interval=None):
    """Compare produced alignments with the correct ones and dump JSON stats.

    For every task id the alignments loaded from ``correct_file`` and
    ``aln_file`` (both formatted with ``id=task_id - 1``) are compared under
    three transformations of the correct coordinate pairs (``identity``,
    ``expand_repeats``, ``remove_repeats``).  The statistics, keyed by
    transformation label, are written as JSON to ``output_file`` (formatted
    the same way).  When a file holds several alignments, only the
    statistics of the last pair are kept per transformation.

    Task ids are derived from the ``SGE_TASK_ID``, ``SGE_STEP_SIZE`` and
    ``SGE_TASK_LAST`` environment variables; ``interval``, an inclusive
    ``(first, last)`` pair, overrides them.  A transformation whose input
    cannot be read (``IOError``) is silently left out of the output.
    """
    # NOTE(review): with neither an SGE task id nor `interval` the only task
    # id is None and `task_id - 1` below raises TypeError -- confirm what
    # the stand-alone behaviour is supposed to be.
    task_ids = [None]
    if 'SGE_TASK_ID' in os.environ:
        if os.environ['SGE_TASK_ID'] != 'undefined':
            sge_task_id = int(os.environ['SGE_TASK_ID'])
            if 'SGE_STEP_SIZE' in os.environ:
                sge_step_size = int(os.environ['SGE_STEP_SIZE'])
            else:
                sge_step_size = 1
            sge_task_last = int(os.environ['SGE_TASK_LAST'])
            task_ids = range(
                sge_task_id,
                min(sge_task_id + sge_step_size, sge_task_last + 1)
            )
    if interval is not None:
        task_ids = range(interval[0], interval[1] + 1)

    for task_id in task_ids:
        separator = ''
        output = {}
        for fun, tp in [(identity, 'standard'),
                        (expand_repeats, 'expanded_repeats'),
                        (remove_repeats, 'removed_repeats')]:
            try:
                for correct, alignment in zip(
                    Fasta.load(correct_file.format(id=task_id - 1),
                               separator, Alignment),
                    Fasta.load(aln_file.format(id=task_id - 1),
                               separator, Alignment)
                ):
                    output[tp] = _compare_alignments(
                        correct, alignment, fun, tp, correct_file, aln_file)
            except IOError:
                # Best effort: a missing/unreadable input only skips this
                # transformation.
                pass

        with Open(output_file.format(id=task_id - 1), 'w') as f:
            json.dump(output, f, indent=4)
Ejemplo n.º 30
0
def main(n, datadir='data/train_sequences/', fname='simulated_alignment'):
    """Generate a random alignment of two sequences with gene annotations.

    ``n`` columns are simulated; ``sys.argv[1]`` and ``sys.argv[2]``, when
    present, override ``n`` and ``fname``.  Three sequences are simulated,
    but only the first two are written: a FASTA alignment (``.fa``), one BED
    track of ``gene`` intervals per written sequence (``.bed``) and a JSON
    description of the annotations (``.js``), all inside ``datadir``.
    Existing track files of all three sequences are removed beforehand.
    """
    seq_names = ["sequence1", "sequence2", "sequence3"]
    annotation_name = 'gene'

    alignment_extension = ".fa"
    annotations_extension = ".bed"
    config_extension = ".js"

    # Command-line arguments take precedence over the call arguments.
    cli_args = sys.argv[1:]
    if len(cli_args) > 0:
        n = int(cli_args[0])
    if len(cli_args) > 1:
        fname = cli_args[1]

    # Construction order is kept: gene chain, three deletion chains, coin.
    gene_chain = MarkovChain(P_START_GENE, P_STOP_GENE)
    deletion_chains = [
        MarkovChain(P_START_DELETE, P_STOP_DELETE) for _ in range(3)
    ]
    gene_coin = BiasedCoin(P_NOT_MUTATE_GENE)

    master_gene = []
    gene_tracks = [[], [], []]
    dna_tracks = [[], [], []]

    for _column in range(n):
        # Gene state of the master sequence; inside a gene every sequence
        # draws its own state, otherwise all of them inherit the master's.
        ancestral = gene_chain.get_state()
        if ancestral:
            states = [gene_coin.flip(), gene_coin.flip(), gene_coin.flip()]
        else:
            states = [ancestral, ancestral, ancestral]

        coin_12 = create_dna_mutation_coin(states[0] + states[1])
        coin_13 = create_dna_mutation_coin(states[0] + states[2])

        # One random base index shared by all sequences; the second one is
        # additionally passed through mutate() before being translated.
        code = random.randint(0, 3)
        mutated_code = mutate(code, states[0] + states[1])
        bases = [DNA_CHARS[k] for k in (code, mutated_code, code)]

        # A failed coin flip replaces the base of the second / third
        # sequence with one of the three other characters.
        for k, coin in ((1, coin_12), (2, coin_13)):
            if coin.flip():
                continue
            pick = random.randint(0, 2)
            if DNA_CHARS[pick] == bases[k]:
                pick = 3
            bases[k] = DNA_CHARS[pick]

        # Deletions turn the base into a gap.
        for k, chain in enumerate(deletion_chains):
            if chain.get_state():
                bases[k] = '-'

        master_gene.append(ancestral)
        for k in range(3):
            gene_tracks[k].append(states[k])
            dna_tracks[k].append(bases[k])

    # Output: stale annotation tracks (all three) are removed first.
    bed_paths = [
        os.path.join(
            datadir,
            fname + '_' + name + '_' + annotation_name + annotations_extension)
        for name in seq_names
    ]
    for path in bed_paths:
        if os.path.isfile(path):
            os.remove(path)

    # Intervals are still computed for all three sequences, although the
    # third set is not written anywhere below.
    intervals = [
        sequence_to_intervals(get_sequence(genes, dna), annotation_name)
        for genes, dna in zip(gene_tracks, dna_tracks)
    ]

    written_names = seq_names[:2]
    written_paths = bed_paths[:2]

    annotations = Annotations()
    annotations.setAnnotations([annotation_name])
    annotations.addSequences(list(seq_names))
    for name, path in zip(written_names, written_paths):
        annotations.addAnnotationFile(name, annotation_name, path)

    Fasta.save(
        [(name, ''.join(dna)) for name, dna in zip(written_names, dna_tracks)],
        os.path.join(datadir, fname + alignment_extension))

    for path, sequence_intervals in zip(written_paths, intervals):
        with track.new(path, 'bed') as t:
            t.fields = ['start', 'end', 'name']
            t.write("chr1", sequence_intervals)

    with Open(os.path.join(datadir, fname + config_extension), "w") as f:
        json.dump(annotations.toJSON(), f)
Ejemplo n.º 31
0
def select_sequences(inp_filename, out_filename, sequences):
    """Write the selected sequences of the first alignment to a new file.

    All alignments are loaded from ``inp_filename`` through ``Fasta.load``
    (empty separator, ``sequences`` passed as ``sequence_selectors``); only
    the first one is kept and saved to ``out_filename`` as name/sequence
    pairs.  An input without any alignment raises ``IndexError``.
    """
    loaded = list(
        Fasta.load(
            inp_filename,
            '',
            Alignment,
            sequence_selectors=sequences,
        )
    )
    first_alignment = loaded[0]
    named_sequences = zip(first_alignment.names, first_alignment.sequences)
    Fasta.save(named_sequences, out_filename)
Ejemplo n.º 32
0
    def prepare_training_data(
        self,
        sequence_x,
        annotations_x,
        sequence_y,
        annotations_y,
    ):
        """Build labelled samples of gap and match positions.

        ``self.insert_sequence`` selects which aligned sequence is the
        reference (0: ``sequence_x``, otherwise ``sequence_y``); the other
        one is the "space" sequence.  Every column in which the space
        sequence has a gap and the reference has a residue yields a sample
        with label 1.  The same number of columns in which both sequences
        have a residue is then drawn at random (fewer if not enough exist)
        and added with label 0.

        Returns:
            A tuple of three parallel lists: the samples produced by
            ``self.prepare_data``, their labels (1 or 0) and the constant
            1.0 per sample (presumably a sample weight).
        """
        train_data = (list(), list(), list())

        if self.insert_sequence == 0:
            reference = sequence_x
            annotations_r = annotations_x
            space = sequence_y
            annotations_s = annotations_y
        else:
            reference = sequence_y
            annotations_r = annotations_y
            space = sequence_x
            annotations_s = annotations_x

        sequence_rs = Fasta.alnToSeq(reference)
        sequence_ss = Fasta.alnToSeq(space)

        # Positions inside the gap-free versions of both sequences.
        pos_s, pos_r = 0, 0

        match_pos = set()
        for i in range(len(space)):
            br, bs = reference[i], space[i]
            if bs != '-':
                if br != '-':
                    match_pos.add((pos_r, pos_s))
                    pos_r += 1
                pos_s += 1
                continue
            if br == '-':
                # Gap in both sequences: nothing to learn from this column.
                continue

            # Gap in the space sequence only: a positive sample.
            d = self.prepare_data(
                sequence_rs,
                pos_r,
                annotations_r,
                sequence_ss,
                pos_s,
                annotations_s,
                0,
            )
            if d is not None:
                train_data[0].append(d)
                train_data[1].append(1)
                train_data[2].append(1.0)
            pos_r += 1

        # sample() needs a sequence (sets are rejected since Python 3.11)
        # and raises ValueError when asked for more items than it is given,
        # so the request is capped at the number of match positions.
        candidates = sorted(match_pos)
        wanted = min(len(candidates), len(train_data[0]))
        matches = sample(candidates, wanted)
        for x, y in matches:
            d = self.prepare_data(
                sequence_rs,
                x,
                annotations_r,
                sequence_ss,
                y,
                annotations_s,
                0,
            )
            # Same guard as for the positive samples: prepare_data may
            # return None, which must not end up in the training data.
            if d is None:
                continue
            train_data[0].append(d)
            train_data[1].append(0)
            train_data[2].append(1.0)

        return train_data
Ejemplo n.º 33
0
def main(inp, out, alignment_regexp, sequence_regexp, trf=trf_paths):
    """Collect tandem-repeat statistics over alignments and dump them as JSON.

    TRF is run on ``inp`` and its repeats are projected onto every alignment
    loaded from the same file.  Each alignment column is marked 'R' when a
    repeat of either of the first two sequences covers it and 'M' otherwise.

    The accumulated counters are:
        M_count: number of 'M' columns.
        R_count: number of 'R' columns.
        R_segment_count: number of maximal runs of 'R' columns.
        'RM' / 'MR' / 'RR': number of runs in which only the first, only the
            second, or both sequences carry a repeat.

    Args:
        inp: input file, given both to TRF and to ``Fasta.load``.
        out: output file the JSON statistics are written to.
        alignment_regexp: passed to ``Fasta.load`` as its second argument.
        sequence_regexp: passed to ``Fasta.load`` as ``sequence_selectors``.
        trf: candidate paths of the TRF executable; the first one that
            exists is used.

    Raises:
        RuntimeError: if none of the candidate executables exists.
    """
    driver = None
    for trf_executable in trf:
        if os.path.exists(trf_executable):
            driver = TRFDriver(trf_executable, mathType=float)
            break
    if driver is None:
        # Previously this surfaced as an AttributeError on the path list.
        raise RuntimeError("No trf found")
    repeats = driver.run(inp)

    stats = defaultdict(int)

    for aln in Fasta.load(
        inp,
        alignment_regexp,
        Alignment,
        sequence_selectors=sequence_regexp
    ):
        X_index = 0
        Y_index = 1

        X_trf = list(translate_repeat_to_annotation(
            repeats[aln.names[X_index]], aln.seq_to_aln[X_index]))
        Y_trf = list(translate_repeat_to_annotation(
            repeats[aln.names[Y_index]], aln.seq_to_aln[Y_index]))

        X_ann = ['M'] * len(aln.sequences[X_index])
        Y_ann = ['M'] * len(aln.sequences[Y_index])
        # B_ann is the union: 'R' wherever either sequence has a repeat.
        B_ann = ['M'] * len(aln.sequences[Y_index])
        for own_ann, own_repeats in ((X_ann, X_trf), (Y_ann, Y_trf)):
            for repeat in own_repeats:
                # Clamp repeats that run past the last alignment column.
                if repeat.end >= len(own_ann):
                    repeat.end = len(own_ann) - 1
                rlen = 1 + repeat.end - repeat.start
                own_ann[repeat.start:repeat.end + 1] = ['R'] * rlen
                B_ann[repeat.start:repeat.end + 1] = ['R'] * rlen
        assert len(X_ann) == len(Y_ann) and len(B_ann) == len(Y_ann)

        # Sentinel 'M' on both sides so every run of 'R' has a start and an
        # end transition, including runs touching the alignment borders.
        padded = ['M'] + B_ann + ['M']
        boundaries = range(len(B_ann) + 1)

        M_count = B_ann.count('M')
        R_count = B_ann.count('R')
        R_segments_count = sum(
            1 for i in boundaries
            if padded[i] != 'R' and padded[i + 1] == 'R'
        )
        stats['M_count'] += M_count
        stats['R_count'] += R_count
        stats['R_segment_count'] += R_segments_count

        # Transitions alternate run start / run end; pair them up into
        # half-open [start, stop) column ranges.
        changes = [i for i in boundaries if padded[i] != padded[i + 1]]
        R_segments = list(zip(changes[0::2], changes[1::2]))

        assert R_segments_count == len(R_segments)
        for start, stop in R_segments:
            assert all(c == 'R' for c in B_ann[start:stop])
            XX = 'R' if 'R' in X_ann[start:stop] else 'M'
            YY = 'R' if 'R' in Y_ann[start:stop] else 'M'
            stats[XX + YY] += 1

    with Open(out, 'w') as f:
        json.dump(stats, f, indent=4)
Ejemplo n.º 34
0
def main():
    """Sample alignments from a pair HMM and save them as FASTA files.

    Command-line arguments are read with ``argparse``.  For every sample a
    FASTA file (rows "sequence1", "alignment", "sequence2") and a companion
    ``<file>.repeats`` JSON file with the generated tandem repeats are
    written; the list of produced FASTA file names is written as JSON to
    the ``--output_files`` target.  Sampling of one alignment is repeated
    until it contains at least one annotated (repeat) emission.

    Returns:
        0 on success, 1 if the command-line arguments fail validation.
    """
    parser = argparse.ArgumentParser(description='Sample alignments.')
    parser.add_argument('output_file_template', type=str,
                        help="Template for output file. Have to contain "
                        "string '{id}' as placeholder for sequence number.")
    parser.add_argument('--output_files', type=str, help="File where the "
                        'list of output files will be written.', default='-')
    parser.add_argument('--model', type=str,
                        default='data/models/repeatHMM.js', help="Model file")
    parser.add_argument('--bind_file', nargs='*', help='Replace filenames in '
                        'the input_file model.', default=[])
    parser.add_argument('--bind_constant', nargs='*', help='Replace constants'
                        ' in the input_file model.', default=[])
    parser.add_argument('--bind_constant_file', nargs='*', help='Replace'
                        ' constants in the input_file model.', default=[])
    parser.add_argument('n_samples', type=int, help='Number of samples.')
    parser.add_argument('seq1_length', type=int,
                        help='Length of first sequence.')
    parser.add_argument('seq2_length', type=int,
                        help='Length of second sequence.')
    parsed_arg = parser.parse_args()

    # ====== Validate input parameters =========================================

    if parsed_arg.output_file_template.count("{id}") < 1:
        sys.stderr.write('ERROR: If sampling, output_file filename has to '
                         'contain at least one "{id}".\n')
        return 1
    if len(parsed_arg.bind_file) % 2 != 0:
        sys.stderr.write('ERROR: If binding files, the number of arguments '
                         'has to be divisible by 2\n')
        return 1
    if len(parsed_arg.bind_constant_file) % 2 != 0:
        sys.stderr.write('ERROR: If binding constants (as files), the number '
                         'of arguments has to be divisible by 2\n')
        return 1
    if len(parsed_arg.bind_constant) % 2 != 0:
        sys.stderr.write('ERROR: If binding constants, the number of'
                         ' arguments has to be divisible by 2\n')
        return 1

    # ====== Parse parameters ==================================================

    output_filename = parsed_arg.output_file_template
    output_files_filename = parsed_arg.output_files
    output_files = list()

    # ====== Load model ========================================================
    # Every bind_* option is a flat list of (name, value) pairs.
    loader = HMMLoader()
    for i in range(0, len(parsed_arg.bind_file), 2):
        loader.addFile(parsed_arg.bind_file[i], parsed_arg.bind_file[i + 1])
    for i in range(0, len(parsed_arg.bind_constant_file), 2):
        loader.addConstant(
            parsed_arg.bind_constant_file[i],
            loader.load(parsed_arg.bind_constant_file[i + 1])
        )
    for i in range(0, len(parsed_arg.bind_constant), 2):
        loader.addConstant(
            parsed_arg.bind_constant[i],
            loader.loads(parsed_arg.bind_constant[i + 1]),
        )
    model_filename = parsed_arg.model
    PHMM = loader.load(model_filename)["model"]

    # ====== Sample ============================================================
    PHMM.buildSampleTransitions()
    n_samples = parsed_arg.n_samples
    X_len = parsed_arg.seq1_length
    Y_len = parsed_arg.seq2_length
    dirname = os.path.dirname(output_filename)
    # dirname is empty for a bare file name; os.makedirs('') would fail.
    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)
    for i in range(n_samples):
        done = False
        while not done:
            tandemRepeats = {'sequence1': [], 'sequence2': []}
            X_parts, Y_parts, A_parts = [], [], []
            # Number of non-gap characters emitted so far in each row, i.e.
            # the start coordinate of the next emission in that sequence.
            xlen, ylen = 0, 0
            for emission, state in PHMM.generateSequence((X_len, Y_len)):
                ann_data = None
                if len(emission) == 2:
                    x, y = emission
                else:
                    x, y, ann_data = emission
                dx, dy = len(x), len(y)
                if ann_data is not None:
                    if dx > 0:
                        tandemRepeats['sequence1'].append((
                            xlen, xlen + dx, dx / ann_data[1], ann_data[0], x
                        ))
                        done = True
                    if dy > 0:
                        tandemRepeats['sequence2'].append((
                            ylen, ylen + dy, dy / ann_data[2], ann_data[0], y
                        ))
                        done = True
                A_parts.append(PHMM.states[state].getChar() * max(dx, dy))
                # Pad the shorter emission with gaps so the rows stay aligned.
                X_parts.append(x + ('-' * (dy - dx)))
                Y_parts.append(y + ('-' * (dx - dy)))
                xlen += dx - x.count('-')
                ylen += dy - y.count('-')
            X = ''.join(X_parts)
            Y = ''.join(Y_parts)
            A = ''.join(A_parts)
        aln = [("sequence1", X), ("alignment", A), ("sequence2", Y)]
        sample_filename = output_filename.format(id=i)
        with Open(sample_filename + '.repeats', 'w') as repeats_file:
            json.dump(tandemRepeats, repeats_file, indent=4)
        Fasta.save(aln, sample_filename)
        output_files.append(sample_filename)
    with Open(output_files_filename, 'w') as output_file_object:
        json.dump(output_files, output_file_object, indent=4)
    return 0
Ejemplo n.º 35
0
def main(input_file, output_file):
    """Annotate tandem repeats in pairwise alignments and save the result.

    TRF is run on ``input_file``.  For every alignment with exactly two
    sequences, five rows are written to ``output_file``: the first sequence,
    a consensus row for it, a shared annotation row ('R' on repeat columns,
    '.' elsewhere), a consensus row for the second sequence, and the second
    sequence.  Alignments with a different number of sequences are reported
    with a printed 'error' and skipped.

    Args:
        input_file: file given both to TRF and to ``Fasta.load``.
        output_file: path of the annotated alignment file to create.

    Raises:
        RuntimeError: if none of the paths in ``trf_paths`` exists.
    """
    trf = None
    for trf_executable in trf_paths:
        if os.path.exists(trf_executable):
            # No break: when several candidates exist, the last one is used.
            trf = TRFDriver(trf_executable)
    if trf is None:
        raise RuntimeError("No trf found")
    repeats = trf.run(input_file)

    with open(output_file, 'w') as f:
        for alignment in Fasta.load(input_file, r'\.[0-9]*$', Alignment):
            if len(alignment.sequences) != 2:
                print('error')
                continue
            width = len(alignment.sequences[0])
            annotation = ['.'] * width
            annotationX = ['.'] * width
            annotationY = ['.'] * width
            for index, seq_name in enumerate(alignment.names):
                translator = alignment.seq_to_aln[index]
                revtranslator = alignment.aln_to_seq[index]
                # The first sequence fills the X consensus row, any other
                # sequence fills the Y one.
                consensus_row = annotationX if index == 0 else annotationY
                for repeat in repeats[seq_name]:
                    first_column = translator[repeat.start]
                    for i in range(first_column, translator[repeat.end]):
                        annotation[i] = 'R'
                        j = i - first_column
                        # NOTE(review): revtranslator is indexed with the
                        # offset inside the repeat (j), not with the
                        # alignment column (i) - confirm this is intended.
                        consensus_row[i] = repeat.consensus[
                            revtranslator[j] % len(repeat.consensus)]

            nm = alignment.names[0]
            aln = [
                (alignment.names[0],
                 alignment.sequences[0].replace('.', '-')),
                ('consensusX' + nm, ''.join(annotationX)),
                ('annotation' + nm, ''.join(annotation)),
                ('consensusY' + nm, ''.join(annotationY)),
                (alignment.names[1],
                 alignment.sequences[1].replace('.', '-')),
            ]
            Fasta.saveAlignmentPiece(aln, f, -1)
Ejemplo n.º 36
0
    def prepare_training_data(
        self,
        sequence_x,
        annotations_x,
        sequence_y,
        annotations_y,
    ):
        """Build labelled training examples from one pairwise alignment.

        Columns where the reference row holds a residue and the other
        ("space") row holds a gap become positive examples (label 1).
        Columns where both rows hold a residue are collected, and a random
        sample of them - at most as many as there are positive examples -
        supplies the negative examples (label 0).

        ``self.insert_sequence`` selects the reference row: 0 means
        ``sequence_x``, any other value means ``sequence_y``.

        Args:
            sequence_x: first aligned row, with '-' marking gaps.
            annotations_x: annotations of the first sequence; handed to
                ``self.prepare_data`` unchanged.
            sequence_y: second aligned row, with '-' marking gaps.
            annotations_y: annotations of the second sequence; handed to
                ``self.prepare_data`` unchanged.

        Returns:
            A tuple ``(data, labels, weights)`` of three parallel lists.
            Every weight is 1.0.
        """
        if self.insert_sequence == 0:
            reference, annotations_r = sequence_x, annotations_x
            space, annotations_s = sequence_y, annotations_y
        else:
            reference, annotations_r = sequence_y, annotations_y
            space, annotations_s = sequence_x, annotations_x

        sequence_rs = Fasta.alnToSeq(reference)
        sequence_ss = Fasta.alnToSeq(space)

        data, labels, weights = [], [], []

        def add_example(ref_pos, space_pos, label):
            # prepare_data may return None; such positions are skipped for
            # both classes so that no None ever reaches the training data.
            features = self.prepare_data(
                sequence_rs,
                ref_pos,
                annotations_r,
                sequence_ss,
                space_pos,
                annotations_s,
                0,
            )
            if features is None:
                return
            data.append(features)
            labels.append(label)
            weights.append(1.0)

        # pos_r / pos_s are positions in the gap-free sequences; they advance
        # only when the corresponding row holds a residue in the column.
        pos_r, pos_s = 0, 0
        match_pos = set()
        for i, bs in enumerate(space):
            br = reference[i]
            if bs != '-':
                if br != '-':
                    match_pos.add((pos_r, pos_s))
                    pos_r += 1
                pos_s += 1
                continue
            if br == '-':
                # Gap in both rows: nothing to learn from this column.
                continue

            add_example(pos_r, pos_s, 1)
            pos_r += 1

        # Never ask for more negatives than there are match columns
        # (random.sample raises ValueError otherwise), and sample from a
        # sequence because newer Python versions reject a set population.
        negatives_wanted = min(len(data), len(match_pos))
        for ref_pos, space_pos in sample(sorted(match_pos), negatives_wanted):
            add_example(ref_pos, space_pos, 0)

        return (data, labels, weights)