def main(input_file, index1, index2, emissionOutput, transitionOutput):
    """Count emission and transition statistics from an alignment file.

    Loads alignments from input_file, selects the sequences at positions
    index1 and index2 within each alignment, and tallies column-pair
    emissions and column-type transitions into two JSON output files.
    """
    emissions = defaultdict(int)
    transitions = defaultdict(int)
    X = None
    Y = None

    def aggregate(X, Y):
        # Tally per-column pair emissions; skip(), upper(), getType() and
        # sortTypes() are project helpers not visible in this file.
        pairs = zip(X, Y)
        for p in pairs:
            if skip(p):
                continue
            emissions[str(upper(p))] += 1
        # Transitions between consecutive column types, ignoring
        # all-gap columns.
        Types = list(sortTypes([getType(x) for x in pairs if x != ('-', '-')]))
        for p in zip(Types, Types[1:]):
            transitions[str(p)] += 1

    for aln in Fasta.load(input_file, '[.][0-9]+$'):
        count = 0
        # NOTE(review): if index1/index2 exceed the sequence count, X or Y
        # keeps its previous value (or None on the first alignment) and
        # aggregate() would fail — presumably inputs always match; confirm.
        for _, sequence in aln:
            if count == index1:
                X = sequence
            if count == index2:
                Y = sequence
            count += 1
        aggregate(X, Y)
    with Open(emissionOutput, 'w') as f:
        json.dump(emissions, f, indent=4)
    with Open(transitionOutput, 'w') as f:
        json.dump(transitions, f, indent=4)
def do_find_repeats(alignment_file, paramSeq, model, mathType, _stats, separator): modelParam = { "mathType": mathType, "modelFactory": model, } driver = TRFDriver() trf_repeats = driver.run(alignment_file, paramSeq) alignments = Fasta.load(alignment_file, separator, Alignment) D = dict() stats = defaultdict(int) count = 1; for alignment in alignments: print ("Annotating alignment {0}".format(count)) count += 1 consensus_list = list(set([ x.consensus for x in itertools.chain(*[ trf_repeats[name] for name in alignment.names ]) ])) repeats = find_repeats_in_alignment(alignment, consensus_list, modelParam) print repeats D.update(repeats) if _stats != None: s = compute_statistics(repeats) for key, value in s.iteritems(): stats[key] += value return D, stats
def main(arg):
    """Split pairwise alignments into chunks of bounded size.

    For each alignment, picks a chunk size in
    [arg.min_split_size, arg.max_split_size] that maximizes the length of
    the leftover (final) chunk, then writes each chunk's two sequences to
    arg.output, skipping chunks where either sequence is all gaps.
    """
    with open(arg.output, 'w') as f:
        for alignment_file in arg.alignment:
            alns = Fasta.load(alignment_file, arg.alignment_regexp,
                              Alignment,
                              sequence_selectors=arg.sequence_regexp)
            for aln in alns:
                l = len(aln.sequences[0])
                # Candidate sizes scored by (negated) remainder length, so
                # poss[0] after sorting is the size with the largest
                # acceptable final chunk.
                poss = []
                for i in range(arg.min_split_size, arg.max_split_size + 1):
                    mod = l % i
                    rest = None
                    if mod >= arg.min_split_size:
                        rest = mod
                    if (mod + i <= arg.max_split_size
                            and mod + i >= arg.min_split_size):
                        rest = mod + i
                    if rest is None:  # fixed: identity comparison with None
                        continue
                    rest = min(rest, i)
                    poss.append((-rest, i))
                poss.sort()
                # NOTE(review): poss may be empty for pathological
                # min/max settings, which would raise IndexError here.
                _, best = poss[0]
                splits = [i for i in range(0, l, best)]
                # Fold the remainder into the last chunk when it fits.
                if best + l % best < arg.max_split_size:
                    splits.pop()
                splits.append(l)
                for fr, to in zip(splits, splits[1:]):
                    s1 = aln.sequences[0][fr:to]
                    s2 = aln.sequences[1][fr:to]
                    # Skip chunks where either side is entirely gaps.
                    if min(map(len, [s1.strip('-'), s2.strip('-')])) > 0:
                        f.write('{}.{}-{} {}\n'.format(
                            aln.names[0], fr, to, s1))
                        f.write('{}.{}-{} {}\n'.format(
                            aln.names[1], fr, to, s2))
def main(input_filename, output_filename): task_ids = [1] if os.environ.has_key('SGE_TASK_ID'): sge_task_id = int(os.environ['SGE_TASK_ID']) if not os.environ.has_key('SGE_STEP_SIZE'): sge_step_size = 1 else: sge_step_size = int(os.environ['SGE_STEP_SIZE']) sge_task_last = int(os.environ['SGE_TASK_LAST']) task_ids = range( sge_task_id, min(sge_task_id + sge_step_size, sge_task_last + 1) ) print task_ids, sge_task_id, sge_step_size, sge_task_last for task_id in task_ids: inp_filename = input_filename.format(id=task_id - 1) out_filename = output_filename.format(id=task_id - 1) aln = list(Fasta.load( inp_filename, '', Alignment, sequence_selectors=['sequence1', 'sequence2']))[0] tmp_filename = get_temp_filename() Fasta.save(zip(aln.names, aln.sequences), tmp_filename) os.system("muscle -in {inp} -out {out} 2> /dev/null".format( inp=tmp_filename, out=out_filename, )) os.system("cp {inp}.repeats {out}.repeats".format( inp=inp_filename, out=out_filename, ))
def realign_file(args, model, output_filename, alignment_filename):
    """Realign every alignment in a file and write the results.

    Computes annotations (optionally via a separate annotation model),
    realigns each alignment with args.algorithm, optionally renders a
    drawing, and appends each realigned alignment to output_filename.
    Returns 1 when an alignment has fewer than two sequences; otherwise
    returns None.
    """
    # begin of HACK
    # Temporarily add the 'trf_cons' track so the computed annotations
    # include the TRF consensus data needed when expanding the model.
    if args.expand_model:
        old_tracks = args.tracks
        args.tracks.add('trf_cons')
    m = model
    if args.annotation_model:
        m = args.annotation_model
    annotations = compute_annotations(args, alignment_filename, m)
    if args.expand_model:
        consensuses = annotations['trf_cons']
        args.tracks = old_tracks
        # NOTE(review): args.tracks is used set-like above (.add) but
        # deleted by key here — confirm the container supports both.
        if 'trf_cons' not in old_tracks:
            del args.tracks['trf_cons']
    # end of HACK
    with Open(output_filename, 'w') as output_file_object:
        for aln in Fasta.load(
                alignment_filename, args.alignment_regexp, Alignment,
                sequence_selectors=args.sequence_regexp):
            if len(aln.sequences) < 2:
                sys.stderr.write("ERROR: not enough sequences in file\n")
                return 1
            # Use a no-op ("brainwashed") drawer unless drawing was
            # requested via args.draw.
            if len(args.draw) == 0:
                drawer = brainwash(AlignmentCanvas)()
            else:
                drawer = AlignmentCanvas()
            drawer.add_original_alignment(aln)
            aln, unmask_repeats = args.mask_repeats(aln, annotations)
            seq1, seq2 = tuple(map(Fasta.alnToSeq, aln.sequences[:2]))
            perf.msg("Data loaded in {time} seconds.")
            perf.replace()
            if args.expand_model:
                # We need the consensi of both sequences to expand the
                # model with consensus-specific states.
                A = consensuses[aln.names[0]]
                B = consensuses[aln.names[1]]
                cons = list(A.union(B))
                real_model = model.expandModel({'consensus': cons})
            else:
                real_model = model
            realigner = args.algorithm()
            realigner.setDrawer(drawer)
            realigner.prepareData(seq1, aln.names[0], seq2, aln.names[1],
                                  aln, real_model, annotations, args)
            aln = realigner.realign(0, len(seq1), 0, len(seq2))
            aln = unmask_repeats(aln)
            perf.msg("Sequence was realigned in {time} seconds.")
            perf.replace()
            if len(args.draw) > 0:
                drawer.add_sequence('X', seq1)
                drawer.add_sequence('Y', seq2)
                drawer.add_alignment_line(
                    101, (255, 0, 255, 255), 2,
                    AlignmentPositionGenerator(
                        Alignment([aln[0], aln[2]])))
                drawer.draw(args.draw, 2000, 2000)
                perf.msg("Image was drawn in {time} seconds.")
            # Save output_file
            Fasta.saveAlignmentPiece(aln, output_file_object)
def main(input_file, index1, index2, emissionOutput, transitionOutput):
    """Count emission and transition statistics from an alignment file.

    NOTE(review): this appears to be a duplicate of an identical main()
    defined elsewhere in this file.

    Loads alignments from input_file, selects the sequences at positions
    index1 and index2 within each alignment, and tallies column-pair
    emissions and column-type transitions into two JSON output files.
    """
    emissions = defaultdict(int)
    transitions = defaultdict(int)
    X = None
    Y = None

    def aggregate(X, Y):
        # Tally per-column pair emissions; skip(), upper(), getType() and
        # sortTypes() are project helpers not visible in this file.
        pairs = zip(X, Y)
        for p in pairs:
            if skip(p):
                continue
            emissions[str(upper(p))] += 1
        # Transitions between consecutive column types, ignoring
        # all-gap columns.
        Types = list(sortTypes([getType(x) for x in pairs if x != ('-', '-')]))
        for p in zip(Types, Types[1:]):
            transitions[str(p)] += 1

    for aln in Fasta.load(input_file, '[.][0-9]+$'):
        count = 0
        # NOTE(review): if index1/index2 exceed the sequence count, X or Y
        # keeps its previous value (or None on the first alignment) and
        # aggregate() would fail — presumably inputs always match; confirm.
        for _, sequence in aln:
            if count == index1:
                X = sequence
            if count == index2:
                Y = sequence
            count += 1
        aggregate(X, Y)
    with Open(emissionOutput, 'w') as f:
        json.dump(emissions, f, indent=4)
    with Open(transitionOutput, 'w') as f:
        json.dump(transitions, f, indent=4)
def getSequences(self, fname, sequence_regexp=None):
    """Load the first alignment from fname and return its first two sequences.

    Args:
        fname: FASTA-like input file name.
        sequence_regexp: selectors for the sequences to load; defaults to
            ["^sequence1$", "^sequence2$"]. Stored on self for later use.

    Returns:
        (seq1, seq2): the first two aligned sequences.

    Raises:
        ParseException: when the file contains no alignment or the first
            alignment has fewer than two sequences.
    """
    alignment_regexp = ''
    if sequence_regexp is None:
        sequence_regexp = ["^sequence1$", "^sequence2$"]
    self.sequence_regexp = sequence_regexp
    # fixed: next() without a default raises StopIteration on empty input,
    # which made the `aln is None` branch dead code; supplying None makes
    # the intended ParseException fire instead.
    aln = next(
        Fasta.load(fname, alignment_regexp, Alignment, sequence_regexp),
        None)
    if aln is None or len(aln.sequences) < 2:
        raise ParseException('Not enough sequences in file\n')
    seq1 = aln.sequences[0]
    seq2 = aln.sequences[1]
    return seq1, seq2
def getSequences(self, fname, sequence_regexp=None):
    """Load the first alignment from fname and return its first two sequences.

    Args:
        fname: FASTA-like input file name.
        sequence_regexp: selectors for the sequences to load; defaults to
            ["^sequence1$", "^sequence2$"]. Stored on self for later use.

    Returns:
        (seq1, seq2): the first two aligned sequences.

    Raises:
        ParseException: when the file contains no alignment or the first
            alignment has fewer than two sequences.
    """
    alignment_regexp = ''
    if sequence_regexp is None:
        sequence_regexp = ["^sequence1$", "^sequence2$"]
    self.sequence_regexp = sequence_regexp
    # fixed: next() without a default raises StopIteration on empty input,
    # which made the `aln is None` branch dead code; supplying None makes
    # the intended ParseException fire instead.
    aln = next(
        Fasta.load(fname, alignment_regexp, Alignment, sequence_regexp),
        None)
    if aln is None or len(aln.sequences) < 2:
        raise ParseException('Not enough sequences in file\n')
    seq1 = aln.sequences[0]
    seq2 = aln.sequences[1]
    return seq1, seq2
def main(files_filename, output_filename, suffix, base_dir):
    """Concatenate per-chunk alignment files back into full alignments.

    files_filename is a JSON list of chunk file names; an empty string
    marks the end of one alignment, at which point the accumulated
    X / annotation / Y rows are flushed to output_filename. Chunks whose
    name contains 'keep' — or whose suffix-substituted replacement is
    missing or empty — are read from the original file and get their
    annotation row replaced by dots.
    """
    X = ""
    Y = ""
    A = ""
    with Open(output_filename, 'w') as ff:
        files = json.load(Open(files_filename))
        total = len(files)
        done = 0
        for filename in files:
            # Progress report every 100 processed chunks.
            if done % 100 == 0:
                print '{}/{} {:.2}%'.format(done, total,
                                            100.0 * done / total)
            if filename == "":
                # End-of-alignment marker: flush what we assembled.
                # NOTE(review): X_name/Y_name/A_name are only assigned
                # after a chunk is read, so a marker before any chunk
                # would raise NameError — presumably inputs never start
                # with ""; confirm against the producer.
                Fasta.saveAlignmentPiece(
                    [(X_name, X), (Y_name, Y), (A_name, A)], ff)
                X = ""
                Y = ""
                A = ""
                continue
            done += 1
            old_filename = filename
            keep = False
            if filename.count('keep') == 0:
                # Swap the 2-character extension for the realigned suffix
                # and optionally redirect into base_dir; fall back to the
                # original file when the replacement is missing/empty.
                filename = filename[:-2] + suffix
                if base_dir != None:
                    filename = base_dir + '/' + filename.split('/')[-1]
                try:
                    with Open(filename, 'r') as f:
                        l = len(''.join(f).strip())
                        if l == 0:
                            filename = old_filename
                            keep = True
                except IOError:
                    filename = old_filename
                    keep = True
            if filename.count('keep') > 0:
                keep = True
            aln = list(Fasta.load(filename, ''))[0]
            # Each chunk must be a 3-row alignment with equal-length rows.
            assert(len(aln) == 3)
            assert(len(aln[0][1]) == len(aln[1][1]) == len(aln[2][1]))
            X += aln[0][1]
            if keep:
                # Kept chunks contribute no annotation — pad with dots.
                A += '.' * len(aln[0][1])
            else:
                A += aln[1][1]
            Y += aln[2][1]
            X_name = aln[0][0]
            A_name = aln[1][0]
            Y_name = aln[2][0]
def main(files_filename, output_filename, suffix, base_dir):
    """Concatenate per-chunk alignment files back into full alignments.

    NOTE(review): appears to be a reformatted duplicate of another main()
    with the same body in this file.

    files_filename is a JSON list of chunk file names; an empty string
    marks the end of one alignment, at which point the accumulated
    X / annotation / Y rows are flushed to output_filename. Chunks whose
    name contains 'keep' — or whose suffix-substituted replacement is
    missing or empty — are read from the original file and get their
    annotation row replaced by dots.
    """
    X = ""
    Y = ""
    A = ""
    with Open(output_filename, "w") as ff:
        files = json.load(Open(files_filename))
        total = len(files)
        done = 0
        for filename in files:
            # Progress report every 100 processed chunks.
            if done % 100 == 0:
                print "{}/{} {:.2}%".format(done, total,
                                            100.0 * done / total)
            if filename == "":
                # End-of-alignment marker: flush what we assembled.
                # NOTE(review): X_name/Y_name/A_name are only assigned
                # after a chunk is read — a marker before any chunk would
                # raise NameError; confirm inputs never start with "".
                Fasta.saveAlignmentPiece([(X_name, X), (Y_name, Y),
                                          (A_name, A)], ff)
                X = ""
                Y = ""
                A = ""
                continue
            done += 1
            old_filename = filename
            keep = False
            if filename.count("keep") == 0:
                # Swap the 2-character extension for the realigned suffix
                # and optionally redirect into base_dir; fall back to the
                # original file when the replacement is missing/empty.
                filename = filename[:-2] + suffix
                if base_dir != None:
                    filename = base_dir + "/" + filename.split("/")[-1]
                try:
                    with Open(filename, "r") as f:
                        l = len("".join(f).strip())
                        if l == 0:
                            filename = old_filename
                            keep = True
                except IOError:
                    filename = old_filename
                    keep = True
            if filename.count("keep") > 0:
                keep = True
            aln = list(Fasta.load(filename, ""))[0]
            # Each chunk must be a 3-row alignment with equal-length rows.
            assert len(aln) == 3
            assert len(aln[0][1]) == len(aln[1][1]) == len(aln[2][1])
            X += aln[0][1]
            if keep:
                # Kept chunks contribute no annotation — pad with dots.
                A += "." * len(aln[0][1])
            else:
                A += aln[1][1]
            Y += aln[2][1]
            X_name = aln[0][0]
            A_name = aln[1][0]
            Y_name = aln[2][0]
def main(input_files, output_file):
    """Draw several alignments as colored polylines in a single image.

    Each existing input file contributes one alignment path (distinct
    color, small per-file offset so lines don't overlap); positions whose
    annotation row is 'R' (repeats) are marked with small rectangles.
    Missing files and alignments without an annotation row are skipped.
    """
    global width  # pixels per alignment position, defined at module level
    alignments = [
        list(
            Fasta.load(name, '', Alignment,
                       ['^sequence1', '^sequence2', '^[av].*']))
        if os.path.exists(name) else None for name in input_files
    ]
    # Canvas size comes from the first alignment's ungapped sequences.
    x_len = len(Fasta.alnToSeq(alignments[0][0].sequences[0]))
    y_len = len(Fasta.alnToSeq(alignments[0][0].sequences[1]))
    I = Image.new('RGB', (x_len * width + 50, y_len * width + 50),
                  (255, 255, 255))
    D = ImageDraw.Draw(I)
    colors = [(0, 0, 0), (255, 0, 0), (0, 255, 0), (0, 0, 255),
              (255, 0, 255), (0, 255, 255)]
    i = -1
    for aln in alignments:
        i += 1
        if aln == None:
            continue
        aln = list(aln)
        if len(aln) == 0:
            continue
        aln = aln[0]
        try:
            # IndexError from sequences[2] (no annotation row) is caught
            # below and skips drawing for this file.
            annotation = aln.sequences[2]
            coords = aln.getCoordPairs()
            print coords
            # Small per-file offsets keep overlapping paths visible.
            x_shift = width / 2 + 25 + i
            y_shift = width / 2 + 25 + i * 2
            D.line([(x * width + x_shift, y * width + y_shift)
                    for x, y, _ in coords], fill=colors[i])
            if annotation != None:
                for x, y, ind in coords:
                    if annotation[ind] != 'R':
                        continue
                    D.rectangle([(x * width + x_shift - width / 4,
                                  y * width + y_shift - width / 4),
                                 (x * width + x_shift + width / 4,
                                  y * width + y_shift + width / 4)],
                                outline=colors[i])
        except IndexError:
            pass
        except IOError:
            pass
    del D
    I.save(output_file)
def main(input_file, output_file, trf):
    """Compute a TRF-based annotation track for alignments in input_file.

    Args:
        input_file: alignment file to annotate.
        output_file: destination for the JSON annotation track.
        trf: list of candidate paths to the trf executable; the first
            existing one is used.

    Raises:
        RuntimeError: when none of the candidate trf paths exists.
    """
    # THIS IS ONLY GENERATOR!!! (consumed lazily, exactly once)
    alns = (Alignment(a)
            for a in Fasta.load(input_file, '[.][0-9]+$', Alignment))
    # 1. run trf,
    for trf_executable in trf:
        if os.path.exists(trf_executable):
            trf = TRFDriver(trf_executable)
            break
    else:
        # fixed: without this, `trf.run` below raised a confusing
        # AttributeError on the path list when no executable was found
        raise RuntimeError('No trf executable found')
    repeats = trf.run(input_file)
    A = list(compute_annotation_track(alns, repeats))
    json.dump(A, Open(output_file, 'w'), indent=4)
def main(input_files, output_file):
    """Draw several alignments as colored polylines in a single image.

    NOTE(review): appears to be a reformatted duplicate of another main()
    with the same body in this file.

    Each existing input file contributes one alignment path (distinct
    color, small per-file offset so lines don't overlap); positions whose
    annotation row is 'R' (repeats) are marked with small rectangles.
    Missing files and alignments without an annotation row are skipped.
    """
    global width  # pixels per alignment position, defined at module level
    alignments = [list(Fasta.load(name, '', Alignment,
                                  ['^sequence1', '^sequence2', '^[av].*']))
                  if os.path.exists(name) else None for name in input_files]
    # Canvas size comes from the first alignment's ungapped sequences.
    x_len = len(Fasta.alnToSeq(alignments[0][0].sequences[0]))
    y_len = len(Fasta.alnToSeq(alignments[0][0].sequences[1]))
    I = Image.new('RGB', (x_len * width + 50, y_len * width + 50),
                  (255, 255, 255))
    D = ImageDraw.Draw(I)
    colors = [(0, 0, 0), (255, 0, 0), (0, 255, 0), (0, 0, 255),
              (255, 0, 255), (0, 255, 255)]
    i = -1
    for aln in alignments:
        i += 1
        if aln == None:
            continue
        aln = list(aln)
        if len(aln) == 0:
            continue
        aln = aln[0]
        try:
            # IndexError from sequences[2] (no annotation row) is caught
            # below and skips drawing for this file.
            annotation = aln.sequences[2]
            coords = aln.getCoordPairs()
            print coords
            # Small per-file offsets keep overlapping paths visible.
            x_shift = width / 2 + 25 + i
            y_shift = width / 2 + 25 + i * 2
            D.line([(x * width + x_shift, y * width + y_shift)
                    for x, y, _ in coords], fill=colors[i])
            if annotation != None:
                for x, y, ind in coords:
                    if annotation[ind] != 'R':
                        continue
                    D.rectangle([(x * width + x_shift - width / 4,
                                  y * width + y_shift - width / 4),
                                 (x * width + x_shift + width / 4,
                                  y * width + y_shift + width / 4)],
                                outline=colors[i])
        except IndexError:
            pass
        except IOError:
            pass
    del D
    I.save(output_file)
def expectation_generator(args, model, alignment_filename, annotations):
    """Yield Baum-Welch expectation counts for each alignment in a file.

    For every alignment: builds repeat generators from the 'trf' and
    'original_repeats' annotation tracks, attaches them to the model's
    'Repeat' state when that state exists, and yields a dict with the
    alignment probability plus transition and emission counts.

    Raises:
        Exception: when an alignment has fewer than two sequences.
    """
    for aln in Fasta.load(alignment_filename, args.alignment_regexp,
                          Alignment,
                          sequence_selectors=args.sequence_regexp):
        if len(aln.sequences) < 2:
            sys.stderr.write("ERROR: not enough sequences in file\n")
            # fixed: raising a plain string is a TypeError at runtime;
            # raise a real exception object instead
            raise Exception("ERROR: not enough sequences in file")
        seq1, seq2 = tuple(map(Fasta.alnToSeq, aln.sequences[:2]))
        positionGenerator = list(
            AlignmentBeamGenerator(aln, args.beam_width))
        RX = RepeatGenerator(None, args.repeat_width)
        RY = RepeatGenerator(None, args.repeat_width)
        for rt in ['trf', 'original_repeats']:
            if rt in annotations:
                RX.addRepeats(annotations[rt][aln.names[0]])
                RY.addRepeats(annotations[rt][aln.names[1]])
        RX.buildRepeatDatabase()
        RY.buildRepeatDatabase()
        if 'Repeat' in model.statenameToID:
            model.states[model.statenameToID['Repeat']].addRepeatGenerator(
                RX, RY)
        (transitions, emissions), probability = model.getBaumWelchCounts(
            seq1, 0, len(seq1), seq2, 0, len(seq2),
            positionGenerator=positionGenerator)
        yield {
            "probability": probability,
            "transitions": transitions,
            "emissions": emissions,
        }
def main(input_file, output_file): for trf_executable in trf_paths: if os.path.exists(trf_executable): trf = TRFDriver(trf_executable) #break if not trf: raise "No trf found" repeats = trf.run(input_file) with open(output_file, 'w') as f: for alignment in Fasta.load(input_file, '\.[0-9]*$', Alignment): if len(alignment.sequences) != 2: print 'error' continue #print alignment.names annotation = list('.' * len(alignment.sequences[0])) annotationX = list('.' * len(alignment.sequences[0])) annotationY = list('.' * len(alignment.sequences[0])) trf = None for seq_name in alignment.names: index = None for i in range(len(alignment.names)): if seq_name == alignment.names[i]: index = i translator = alignment.seq_to_aln[index] revtranslator = alignment.aln_to_seq[index] for repeat in repeats[seq_name]: for i in range(translator[repeat.start], translator[repeat.end]): annotation[i] = 'R' j = i - translator[repeat.start] if index == 0: annotationX[i] = repeat.consensus[revtranslator[j] % len(repeat.consensus)] else: annotationY[i] = repeat.consensus[revtranslator[j] % len(repeat.consensus)] d = defaultdict(int) ll = 0 for v in annotation: if v != 'R': if ll > 0: d[ll] += 1 ll = 0 else: ll += 1 #for x, y in sorted(d.iteritems(), key=lambda x: x[1]): # print '{}: {}'.format(x, y) #if len(d.keys()) > 0: # print('Number of repeats: {}, average length: {}, maximum length: {}, minimum length: {}'.format( # sum(d.values()), # sum([x * y for x, y in d.iteritems()])/ max(sum(d.values()), 1), # max(d.keys()), # min(d.keys()) # )) seqX = alignment.sequences nm = alignment.names[0] aln = [(alignment.names[0], alignment.sequences[0].replace('.', '-')), ('consensusX' + nm, ''.join(annotationX)), ('annotation' + nm, ''.join(annotation)), ('consensusY' + nm, ''.join(annotationY)), (alignment.names[1], alignment.sequences[1].replace('.','-'))] Fasta.saveAlignmentPiece(aln, f, -1)
def main(input_file, realign_output, do_not_touch_output, list_of_files_output, max_length, wrap_length, min_seq_length):
    """Split alignments at repeat-block boundaries into bounded intervals.

    Walks annotated alignments (rows: X, annotation, Y), computes a list
    of intervals no longer than roughly max_length that respect repeat
    block boundaries, and routes each interval either to a 'realign'
    output (mixed repeat/non-repeat content of sufficient length) or a
    'do not touch' output. The list of produced file names, with '' as
    an end-of-alignment marker, is dumped to list_of_files_output.
    """
    realign_counter = 0
    do_not_touch_counter = 0
    files = []
    for alignment in Fasta.load(input_file, '\.[0-9]*$'):
        if realign_counter % 100 == 0:
            print(realign_counter, do_not_touch_counter, alignment[0][0])
        alignment_len = len(alignment[0][1])
        annotation = alignment[2][1]
        # !!! We expect that first block is not repeat
        # Positions where the annotation changes, plus a sentinel block
        # past the end so the loop below always sees a trailing block.
        changes = [i for i in range(1, len(annotation))
                   if annotation[i - 1] != annotation[i]] + [len(annotation)]
        Blocks = zip(changes, changes[1:]) + [(len(annotation),
                                              len(annotation) + max_length + 10)]
        Blocks = [(0, Blocks[0][0])] + Blocks
        printed = 0
        block_start = 0  #None
        block_end = None  # NOTE(review): unused
        intervals = []
        # Odd indices are one block type, even the other (alternating
        # repeat / non-repeat runs).
        for block_id in range(1, len(Blocks), 2):
            current = Blocks[block_id]
            previous = Blocks[block_id - 1]
            # NOTE(review): block_start starts at 0 (the None initializer
            # is commented out), so this branch looks unreachable —
            # confirm whether that is intended.
            if block_start == None:
                startpp = max(printed, previous[0])
                if previous[1] - startpp > wrap_length:
                    intervals.append((printed, startpp))
                    printed = startpp
                    block_start = startpp
            else:
                # Add this block, or start a new one?
                if current[1] - block_start > max_length:
                    if previous[1] - previous[0] > wrap_length * 2:
                        # Long gap block: keep wrap_length of context on
                        # each side and emit the middle separately.
                        intervals.append((block_start,
                                          previous[0] + wrap_length))
                        intervals.append((previous[0] + wrap_length,
                                          previous[1] - wrap_length))
                        printed = previous[1] - wrap_length
                        block_start = previous[1] - wrap_length
                    else:
                        # Short gap block: split it down the middle.
                        split = (previous[0] + previous[1]) / 2
                        intervals.append((block_start, split))
                        block_start = split
                        printed = split
        # Start a new one
        intervals.append((printed, len(annotation)))
        # Intervals must tile the annotation exactly and be contiguous.
        assert(len(annotation) == sum([y - x for x, y in intervals]))
        for i in range(1, len(intervals)):
            assert(intervals[i - 1][1] == intervals[i][0])
        #t = list(range(0, alignment_len, max_length)) + [alignment_len]
        #intervals = zip(t, t[1:])
        for start, stop in intervals:
            if start >= len(annotation):
                continue
            if start == stop:
                continue
            assert(start < stop)
            ann = alignment[2][1][start:stop]
            output = None
            seq1 = alignment[0][1]
            seq2 = alignment[4][1]
            # NOTE(review): lengths are computed from the FULL sequences,
            # not the [start:stop] slice — confirm this is intended for
            # the min_seq_length test below.
            seq1_len = len(seq1) - seq1.count('-') - seq1.count('.')
            seq2_len = len(seq2) - seq2.count('-') - seq2.count('.')
            # All-repeat, repeat-free, or too-short pieces are not
            # realigned.
            if (ann.count('R') == 0
                    or min(seq1_len, seq2_len) < min_seq_length
                    or ann.count('R') == len(ann)):
                output = do_not_touch_output.format(id=do_not_touch_counter)
                do_not_touch_counter += 1
            else:
                output = realign_output.format(id=realign_counter)
                realign_counter += 1
            files.append(output)
            aln = [
                (alignment[0][0], alignment[0][1][start:stop]),
                (alignment[2][0], alignment[2][1][start:stop]),
                (alignment[4][0], alignment[4][1][start:stop])
            ]
            #Fasta.save(aln, output, width=-1)
        # '' marks the end of this alignment's chunk list for consumers.
        files.append('');
    with Open(list_of_files_output, 'w') as f:
        json.dump(files, f, indent=4)
def main(inp, out, alignment_regexp, sequence_regexp, trf=trf_paths):
    """Collect repeat-overlap statistics between two aligned sequences.

    Runs TRF on the input, projects each sequence's repeats into
    alignment coordinates, and counts per-column M/R totals plus, for
    each merged repeat segment, whether the repeat came from X, Y, or
    both ('RR', 'RM', 'MR'). Results are dumped as JSON to `out`.
    """
    # Use the first existing trf executable from the candidate paths.
    for trf_executable in trf:
        if os.path.exists(trf_executable):
            trf = TRFDriver(trf_executable, mathType=float)
            break
    repeats = trf.run(inp)
    stats = defaultdict(int)
    for aln in Fasta.load(inp, alignment_regexp, Alignment,
                          sequence_selectors=sequence_regexp):
        X_index = 0
        Y_index = 1
        X_trf = list(
            translate_repeat_to_annotation(repeats[aln.names[X_index]],
                                           aln.seq_to_aln[X_index]))
        Y_trf = list(
            translate_repeat_to_annotation(repeats[aln.names[Y_index]],
                                           aln.seq_to_aln[Y_index]))
        # Per-column annotations: 'M' = match/non-repeat, 'R' = repeat.
        # B_ann is the union of both sequences' repeat columns.
        X_ann = list("M" * len(aln.sequences[X_index]))
        Y_ann = list("M" * len(aln.sequences[Y_index]))
        B_ann = list("M" * len(aln.sequences[Y_index]))
        for repeat in X_trf:
            # Clamp repeats that run past the alignment end.
            # NOTE(review): mutates the repeat objects in place — confirm
            # they are not shared with other consumers.
            if repeat.end >= len(X_ann):
                repeat.end = len(X_ann) - 1
            rlen = 1 + repeat.end - repeat.start
            X_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
            B_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
        for repeat in Y_trf:
            if repeat.end >= len(Y_ann):
                repeat.end = len(Y_ann) - 1
            rlen = 1 + repeat.end - repeat.start
            Y_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
            B_ann[repeat.start:repeat.end + 1] = list("R" * rlen)
        assert (len(X_ann) == len(Y_ann) and len(B_ann) == len(Y_ann))
        M_count = len([x for x in B_ann if x == 'M'])
        R_count = len([x for x in B_ann if x == 'R'])
        # Count M->R transitions (with a leading sentinel 'M') = number
        # of contiguous repeat segments.
        R_segments_count = len([
            x for x in zip('M' + ''.join(B_ann), ''.join(B_ann) + 'M')
            if x[0] != 'R' and x[1] == 'R'
        ])
        stats['M_count'] += M_count
        stats['R_count'] += R_count
        stats['R_segment_count'] += R_segments_count
        # Indices where the annotation flips; consecutive pairs delimit
        # the R segments.
        changes = [
            i for i, x in zip(range(len(B_ann) + 1),
                              zip('M' + ''.join(B_ann),
                                  ''.join(B_ann) + 'M'))
            if x[0] != x[1]
        ]
        R_segments = [(changes[i], changes[i + 1])
                      for i in range(0, len(changes) - (len(changes) % 2),
                                     2)]
        assert (R_segments_count == len(R_segments))
        # Classify each merged repeat segment by which side(s) carry the
        # repeat: 'RR', 'RM', or 'MR'.
        for start, stop in R_segments:
            XX = 'M'
            YY = 'M'
            for i in range(start, stop):
                if X_ann[i] == 'R':
                    XX = 'R'
                if Y_ann[i] == 'R':
                    YY = 'R'
                assert (B_ann[i] == 'R')
            stats[XX + YY] += 1
    with Open(out, 'w') as f:
        json.dump(stats, f, indent=4)
def select_sequences(inp_filename, out_filename, sequences):
    """Extract the selected sequences of the first alignment into a new file."""
    loaded = Fasta.load(inp_filename, '', Alignment,
                        sequence_selectors=sequences)
    first_alignment = list(loaded)[0]
    name_seq_pairs = zip(first_alignment.names, first_alignment.sequences)
    Fasta.save(name_seq_pairs, out_filename)
def main(correct_file, aln_file, output_file, interval=None):
    """Score computed alignments against reference ('correct') alignments.

    Task ids come from SGE environment variables, or from `interval`
    when given. For each task, alignments are compared under three
    coordinate transforms (identity / expand_repeats / remove_repeats)
    and a JSON report of accuracy statistics is written per task.
    """
    task_ids = [None]
    if os.environ.has_key('SGE_TASK_ID'):
        if os.environ['SGE_TASK_ID'] != 'undefined':
            sge_task_id = int(os.environ['SGE_TASK_ID'])
            if not os.environ.has_key('SGE_STEP_SIZE'):
                sge_step_size = 1
            else:
                sge_step_size = int(os.environ['SGE_STEP_SIZE'])
            sge_task_last = int(os.environ['SGE_TASK_LAST'])
            task_ids = range(
                sge_task_id,
                min(sge_task_id + sge_step_size, sge_task_last + 1))
    if interval != None:
        task_ids = range(interval[0], interval[1] + 1)
    for task_id in task_ids:
        separator = ''
        output = {}
        for fun, tp in [(identity, 'standard'),
                        (expand_repeats, 'expanded_repeats'),
                        (remove_repeats, 'removed_repeats')]:
            try:
                for correct, alignment in zip(
                        Fasta.load(correct_file.format(id=task_id - 1),
                                   separator, Alignment),
                        Fasta.load(aln_file.format(id=task_id - 1),
                                   separator, Alignment)):
                    correct_len = len(correct.getCoordPairs(False))
                    total_len = correct_len * 2 - correct.sequences[0].count(
                        '-') - correct.sequences[2].count('-')
                    # ccc: transformed (x, annotation, y) coordinate
                    # triples of the reference alignment.
                    ccc = fun(correct.getCoordPairs(False), correct)
                    if tp == 'removed_repeats':
                        # Recompute lengths over the filtered coordinates.
                        correct_len = len(ccc)
                        total_len = 0
                        for v1, _, v2 in ccc:
                            if v1 >= 0:
                                total_len += 1
                            if v2 >= 0:
                                total_len += 1
                    acc = alignment.getCoordPairs(False)
                    # Drop the annotation column: keep (x, y) pairs only.
                    cc = map(lambda x: (x[0], x[2]), ccc)
                    if len(acc[0]) == 3:
                        ac = map(lambda x: (x[0], x[2]), acc)
                    elif len(acc[0]) == 2:
                        ac = acc
                    else:
                        ac = None
                    c = set(cc)
                    a = set(ac)
                    intersect = c.intersection(a)
                    not_in_c = c.difference(a)
                    not_in_a = a.difference(c)
                    symm_diff = c.symmetric_difference(a)
                    # Score counts non-gap positions in agreeing pairs.
                    score = 0
                    for v1, v2 in intersect:
                        if v1 >= 0:
                            score += 1
                        if v2 >= 0:
                            score += 1
                    # Accuracy bucketed by distance to the nearest repeat
                    # column in the reference annotation.
                    dists_correct = defaultdict(int)
                    dists_total = defaultdict(int)
                    position = dict()
                    dists = [99999999] * len(correct.sequences[1])
                    dst = 9999999
                    # NOTE(review): `a` (the set above) is shadowed by
                    # the annotation index in this loop.
                    for x, a, y in ccc:
                        position[(x, y)] = a
                    # Forward then backward pass: min distance to an 'R'.
                    for i in range(len(correct.sequences[1])):
                        if correct.sequences[1][i] == 'R':
                            dst = 0
                        else:
                            dst += 1
                        dists[i] = min(dists[i], dst)
                    for i in reversed(range(len(correct.sequences[1]))):
                        if correct.sequences[1][i] == 'R':
                            dst = 0
                        else:
                            dst += 1
                        dists[i] = min(dists[i], dst)
                    for pos in c:
                        d = dists[position[pos]]
                        if d == 0:
                            continue
                        dists_total[d] += 1
                        if pos in ac:
                            dists_correct[d] += 1

                    def getRepeatAnnotation(coord, annotation):
                        # Non-gap positions lying in 'R' columns, marked
                        # separately per side: (x, -1) and (-1, y).
                        if len(coord[0]) != 3:
                            return set()
                        ret = set()
                        for x, a, y in coord:
                            if annotation[a] == 'R':
                                if x >= 0:
                                    ret.add((x, -1))
                                if y >= 0:
                                    ret.add((-1, y))
                        return ret

                    crann = getRepeatAnnotation(correct.getCoordPairs(False),
                                                correct.sequences[1])
                    arann = getRepeatAnnotation(
                        alignment.getCoordPairs(False),
                        alignment.sequences[1])

                    def getRepeatBlocks(coord, annotation):
                        # Maximal runs of 'R' columns as
                        # ((x_min, x_max+1), (y_min, y_max+1)) ranges;
                        # -1 stands in for a side absent from the run.
                        if len(coord[0]) != 3:
                            return set()
                        ret = set()
                        x = set()
                        y = set()
                        for _x, a, _y in coord:
                            if annotation[a] == 'R':
                                if _x >= 0:
                                    x.add(_x)
                                if _y >= 0:
                                    y.add(_y)
                            else:
                                if len(x) + len(y) > 0:
                                    if len(x) == 0:
                                        x.add(-1)
                                    if len(y) == 0:
                                        y.add(-1)
                                    ret.add(((min(x), max(x) + 1),
                                             (min(y), max(y) + 1)))
                                    x = set()
                                    y = set()
                        if len(x) + len(y) > 0:
                            if len(x) == 0:
                                x.add(-1)
                            if len(y) == 0:
                                y.add(-1)
                            ret.add(
                                ((min(x), max(x) + 1), (min(y), max(y) + 1)))
                            x = set()
                            y = set()
                        return ret

                    cbann = getRepeatBlocks(correct.getCoordPairs(False),
                                            correct.sequences[1])
                    abann = getRepeatBlocks(alignment.getCoordPairs(False),
                                            alignment.sequences[1])

                    def dst(x1, x2):
                        # NOTE(review): shadows the integer `dst` used in
                        # the distance passes above.
                        if x1 == -1:
                            return 0
                        return x2 - x1

                    def getPoints(s):
                        return sum([
                            dst(x1, x2) + dst(y1, y2)
                            for ((x1, x2), (y1, y2)) in s
                        ])

                    # Find long segments that are correctly aligned
                    cseg = [1 if x in c else 0 for x in ac]
                    seg_len = []
                    length = 0
                    segment_length_histogram = defaultdict(int)
                    for x in cseg:
                        if x == 0 and length != 0:
                            segment_length_histogram[length] += 1
                        length = length * x + x
                        seg_len.append(length)
                    if length > 0:
                        segment_length_histogram[length] += 1
                    # NOTE(review): this overrides the getPoints function
                    # above, so the +Blk* metrics below count block set
                    # sizes rather than summed block lengths — confirm
                    # this is intentional.
                    getPoints = len
                    # NOTE: 'corect' / 'c-lenght' are misspelled but are
                    # runtime JSON keys consumed downstream — do not fix
                    # without updating consumers.
                    output[tp] = {
                        'corect': correct_file,
                        'alignment': aln_file,
                        'c-lenght': len(cc),
                        'a-length': len(ac),
                        'intersect': len(intersect),
                        '%correct':
                        100.0 - float(len(intersect) * 100) / correct_len
                        if correct_len > 0 else 100,
                        '+mistakes': len(intersect),
                        '+len': correct_len,
                        '+RepTP': len(crann & arann),
                        '+RepTN': total_len - len(crann | arann),
                        '+RepFP': len(arann - crann),
                        '+RepFN': len(crann - arann),
                        '+BlkTP': getPoints(cbann & abann),
                        '+BlkTN': 0,
                        '+BlkFP': getPoints(abann - cbann),
                        '+BlkFN': getPoints(cbann - abann),
                        '%score':
                        float(score) * 100 / total_len
                        if total_len > 0 else 0,
                        'c-a': len(not_in_c),
                        'a-c': len(not_in_a),
                        'symmetric_difference': len(symm_diff),
                        'correct_len_histogram': segment_length_histogram,
                        '@+dists_correct': dists_correct,
                        '@+dists_total': dists_total,
                    }
                    if correct_len == 0:
                        del output[tp]['%correct']
                    if total_len == 0:
                        del output[tp]['%score']
            except IOError:
                pass
        with Open(output_file.format(id=task_id - 1), 'w') as f:
            json.dump(output, f, indent=4)
def main(input_file, output_file): for trf_executable in trf_paths: if os.path.exists(trf_executable): trf = TRFDriver(trf_executable) #break if not trf: raise "No trf found" repeats = trf.run(input_file) with open(output_file, 'w') as f: for alignment in Fasta.load(input_file, '\.[0-9]*$', Alignment): if len(alignment.sequences) != 2: print 'error' continue #print alignment.names annotation = list('.' * len(alignment.sequences[0])) annotationX = list('.' * len(alignment.sequences[0])) annotationY = list('.' * len(alignment.sequences[0])) trf = None for seq_name in alignment.names: index = None for i in range(len(alignment.names)): if seq_name == alignment.names[i]: index = i translator = alignment.seq_to_aln[index] revtranslator = alignment.aln_to_seq[index] for repeat in repeats[seq_name]: for i in range(translator[repeat.start], translator[repeat.end]): annotation[i] = 'R' j = i - translator[repeat.start] if index == 0: annotationX[i] = repeat.consensus[ revtranslator[j] % len(repeat.consensus)] else: annotationY[i] = repeat.consensus[ revtranslator[j] % len(repeat.consensus)] d = defaultdict(int) ll = 0 for v in annotation: if v != 'R': if ll > 0: d[ll] += 1 ll = 0 else: ll += 1 #for x, y in sorted(d.iteritems(), key=lambda x: x[1]): # print '{}: {}'.format(x, y) #if len(d.keys()) > 0: # print('Number of repeats: {}, average length: {}, maximum length: {}, minimum length: {}'.format( # sum(d.values()), # sum([x * y for x, y in d.iteritems()])/ max(sum(d.values()), 1), # max(d.keys()), # min(d.keys()) # )) seqX = alignment.sequences nm = alignment.names[0] aln = [ (alignment.names[0], alignment.sequences[0].replace('.', '-')), ('consensusX' + nm, ''.join(annotationX)), ('annotation' + nm, ''.join(annotation)), ('consensusY' + nm, ''.join(annotationY)), (alignment.names[1], alignment.sequences[1].replace('.', '-')) ] Fasta.saveAlignmentPiece(aln, f, -1)
def main(correct_file, aln_file, output_file, interval=None):
    """Score computed alignments against reference ('correct') alignments.

    NOTE(review): appears to be a reformatted duplicate of another main()
    with the same body in this file.

    Task ids come from SGE environment variables, or from `interval`
    when given. For each task, alignments are compared under three
    coordinate transforms (identity / expand_repeats / remove_repeats)
    and a JSON report of accuracy statistics is written per task.
    """
    task_ids = [None]
    if os.environ.has_key('SGE_TASK_ID'):
        if os.environ['SGE_TASK_ID'] != 'undefined':
            sge_task_id = int(os.environ['SGE_TASK_ID'])
            if not os.environ.has_key('SGE_STEP_SIZE'):
                sge_step_size = 1
            else:
                sge_step_size = int(os.environ['SGE_STEP_SIZE'])
            sge_task_last = int(os.environ['SGE_TASK_LAST'])
            task_ids = range(
                sge_task_id,
                min(sge_task_id + sge_step_size, sge_task_last + 1)
            )
    if interval != None:
        task_ids = range(interval[0], interval[1] + 1)
    for task_id in task_ids:
        separator = ''
        output = {}
        for fun, tp in [(identity, 'standard'),
                        (expand_repeats, 'expanded_repeats'),
                        (remove_repeats, 'removed_repeats')]:
            try:
                for correct, alignment in zip(
                    Fasta.load(correct_file.format(id=task_id - 1),
                               separator, Alignment),
                    Fasta.load(aln_file.format(id=task_id - 1),
                               separator, Alignment)
                ):
                    correct_len = len(correct.getCoordPairs(False))
                    total_len = (correct_len * 2
                                 - correct.sequences[0].count('-')
                                 - correct.sequences[2].count('-'))
                    # ccc: transformed (x, annotation, y) coordinate
                    # triples of the reference alignment.
                    ccc = fun(correct.getCoordPairs(False), correct)
                    if tp == 'removed_repeats':
                        # Recompute lengths over the filtered coordinates.
                        correct_len = len(ccc)
                        total_len = 0
                        for v1, _, v2 in ccc:
                            if v1 >= 0:
                                total_len += 1
                            if v2 >= 0:
                                total_len += 1
                    acc = alignment.getCoordPairs(False)
                    # Drop the annotation column: keep (x, y) pairs only.
                    cc = map(lambda x: (x[0], x[2]), ccc)
                    if len(acc[0]) == 3:
                        ac = map(lambda x: (x[0], x[2]), acc)
                    elif len(acc[0]) == 2:
                        ac = acc
                    else:
                        ac = None
                    c = set(cc)
                    a = set(ac)
                    intersect = c.intersection(a)
                    not_in_c = c.difference(a)
                    not_in_a = a.difference(c)
                    symm_diff = c.symmetric_difference(a)
                    # Score counts non-gap positions in agreeing pairs.
                    score = 0
                    for v1, v2 in intersect:
                        if v1 >= 0:
                            score += 1
                        if v2 >= 0:
                            score += 1
                    # Accuracy bucketed by distance to the nearest repeat
                    # column in the reference annotation.
                    dists_correct = defaultdict(int)
                    dists_total = defaultdict(int)
                    position = dict()
                    dists = [99999999] * len(correct.sequences[1])
                    dst = 9999999
                    # NOTE(review): `a` (the set above) is shadowed by
                    # the annotation index in this loop.
                    for x, a, y in ccc:
                        position[(x, y)] = a
                    # Forward then backward pass: min distance to an 'R'.
                    for i in range(len(correct.sequences[1])):
                        if correct.sequences[1][i] == 'R':
                            dst = 0
                        else:
                            dst += 1
                        dists[i] = min(dists[i], dst)
                    for i in reversed(range(len(correct.sequences[1]))):
                        if correct.sequences[1][i] == 'R':
                            dst = 0
                        else:
                            dst += 1
                        dists[i] = min(dists[i], dst)
                    for pos in c:
                        d = dists[position[pos]]
                        if d == 0:
                            continue
                        dists_total[d] += 1
                        if pos in ac:
                            dists_correct[d] += 1

                    def getRepeatAnnotation(coord, annotation):
                        # Non-gap positions lying in 'R' columns, marked
                        # separately per side: (x, -1) and (-1, y).
                        if len(coord[0]) != 3:
                            return set()
                        ret = set()
                        for x, a, y in coord:
                            if annotation[a] == 'R':
                                if x >= 0:
                                    ret.add((x, -1))
                                if y >= 0:
                                    ret.add((-1, y))
                        return ret

                    crann = getRepeatAnnotation(correct.getCoordPairs(False),
                                                correct.sequences[1])
                    arann = getRepeatAnnotation(
                        alignment.getCoordPairs(False),
                        alignment.sequences[1])

                    def getRepeatBlocks(coord, annotation):
                        # Maximal runs of 'R' columns as
                        # ((x_min, x_max+1), (y_min, y_max+1)) ranges;
                        # -1 stands in for a side absent from the run.
                        if len(coord[0]) != 3:
                            return set()
                        ret = set()
                        x = set()
                        y = set()
                        for _x, a, _y in coord:
                            if annotation[a] == 'R':
                                if _x >= 0:
                                    x.add(_x)
                                if _y >= 0:
                                    y.add(_y)
                            else:
                                if len(x) + len(y) > 0:
                                    if len(x) == 0:
                                        x.add(-1)
                                    if len(y) == 0:
                                        y.add(-1)
                                    ret.add(((min(x), max(x) + 1),
                                             (min(y), max(y) + 1)))
                                    x = set()
                                    y = set()
                        if len(x) + len(y) > 0:
                            if len(x) == 0:
                                x.add(-1)
                            if len(y) == 0:
                                y.add(-1)
                            ret.add(((min(x), max(x) + 1),
                                     (min(y), max(y) + 1)))
                            x = set()
                            y = set()
                        return ret

                    cbann = getRepeatBlocks(correct.getCoordPairs(False),
                                            correct.sequences[1])
                    abann = getRepeatBlocks(alignment.getCoordPairs(False),
                                            alignment.sequences[1])

                    def dst(x1, x2):
                        # NOTE(review): shadows the integer `dst` used in
                        # the distance passes above.
                        if x1 == -1:
                            return 0
                        return x2 - x1

                    def getPoints(s):
                        return sum([dst(x1, x2) + dst(y1, y2)
                                    for ((x1, x2), (y1, y2)) in s])

                    # Find long segments that are correctly aligned
                    cseg = [1 if x in c else 0 for x in ac]
                    seg_len = []
                    length = 0
                    segment_length_histogram = defaultdict(int)
                    for x in cseg:
                        if x == 0 and length != 0:
                            segment_length_histogram[length] += 1
                        length = length * x + x
                        seg_len.append(length)
                    if length > 0:
                        segment_length_histogram[length] += 1
                    # NOTE(review): this overrides the getPoints function
                    # above, so the +Blk* metrics below count block set
                    # sizes rather than summed block lengths — confirm
                    # this is intentional.
                    getPoints = len
                    # NOTE: 'corect' / 'c-lenght' are misspelled but are
                    # runtime JSON keys consumed downstream — do not fix
                    # without updating consumers.
                    output[tp] = {
                        'corect': correct_file,
                        'alignment': aln_file,
                        'c-lenght': len(cc),
                        'a-length': len(ac),
                        'intersect': len(intersect),
                        '%correct':
                        100.0 - float(len(intersect) * 100) / correct_len
                        if correct_len > 0 else 100,
                        '+mistakes': len(intersect),
                        '+len': correct_len,
                        '+RepTP': len(crann & arann),
                        '+RepTN': total_len - len(crann | arann),
                        '+RepFP': len(arann - crann),
                        '+RepFN': len(crann - arann),
                        '+BlkTP': getPoints(cbann & abann),
                        '+BlkTN': 0,
                        '+BlkFP': getPoints(abann - cbann),
                        '+BlkFN': getPoints(cbann - abann),
                        '%score':
                        float(score) * 100 / total_len
                        if total_len > 0 else 0,
                        'c-a': len(not_in_c),
                        'a-c': len(not_in_a),
                        'symmetric_difference': len(symm_diff),
                        'correct_len_histogram': segment_length_histogram,
                        '@+dists_correct': dists_correct,
                        '@+dists_total': dists_total,
                    }
                    if correct_len == 0:
                        del output[tp]['%correct']
                    if total_len == 0:
                        del output[tp]['%score']
            except IOError:
                pass
        with Open(output_file.format(id=task_id - 1), 'w') as f:
            json.dump(output, f, indent=4)
def select_sequences(inp_filename, out_filename, sequences):
    """Extract the selected sequences of the first alignment into a new file."""
    loaded = Fasta.load(inp_filename, '', Alignment,
                        sequence_selectors=sequences)
    first_alignment = list(loaded)[0]
    name_seq_pairs = zip(first_alignment.names, first_alignment.sequences)
    Fasta.save(name_seq_pairs, out_filename)
def main(inp, out, alignment_regexp, sequence_regexp, trf=trf_paths):
    """Collect repeat-overlap statistics between two aligned sequences.

    NOTE(review): appears to be a reformatted duplicate of another main()
    with the same body in this file.

    Runs TRF on the input, projects each sequence's repeats into
    alignment coordinates, and counts per-column M/R totals plus, for
    each merged repeat segment, whether the repeat came from X, Y, or
    both ('RR', 'RM', 'MR'). Results are dumped as JSON to `out`.
    """
    # Use the first existing trf executable from the candidate paths.
    for trf_executable in trf:
        if os.path.exists(trf_executable):
            trf = TRFDriver(trf_executable, mathType=float)
            break
    repeats = trf.run(inp)
    stats = defaultdict(int)
    for aln in Fasta.load(
        inp, alignment_regexp, Alignment,
        sequence_selectors=sequence_regexp
    ):
        X_index = 0
        Y_index = 1
        X_trf = list(translate_repeat_to_annotation(
            repeats[aln.names[X_index]], aln.seq_to_aln[X_index]))
        Y_trf = list(translate_repeat_to_annotation(
            repeats[aln.names[Y_index]], aln.seq_to_aln[Y_index]))
        # Per-column annotations: 'M' = match/non-repeat, 'R' = repeat.
        # B_ann is the union of both sequences' repeat columns.
        X_ann = list("M" * len(aln.sequences[X_index]))
        Y_ann = list("M" * len(aln.sequences[Y_index]))
        B_ann = list("M" * len(aln.sequences[Y_index]))
        for repeat in X_trf:
            # Clamp repeats that run past the alignment end.
            # NOTE(review): mutates the repeat objects in place — confirm
            # they are not shared with other consumers.
            if repeat.end >= len(X_ann):
                repeat.end = len(X_ann) - 1
            rlen = 1 + repeat.end - repeat.start
            X_ann[repeat.start : repeat.end + 1] = list("R" * rlen)
            B_ann[repeat.start : repeat.end + 1] = list("R" * rlen)
        for repeat in Y_trf:
            if repeat.end >= len(Y_ann):
                repeat.end = len(Y_ann) - 1
            rlen = 1 + repeat.end - repeat.start
            Y_ann[repeat.start : repeat.end + 1] = list("R" * rlen)
            B_ann[repeat.start : repeat.end + 1] = list("R" * rlen)
        assert(len(X_ann) == len(Y_ann) and len(B_ann) == len(Y_ann))
        M_count = len([x for x in B_ann if x == 'M'])
        R_count = len([x for x in B_ann if x == 'R'])
        # Count M->R transitions (with a leading sentinel 'M') = number
        # of contiguous repeat segments.
        R_segments_count = len([x for x in zip('M' + ''.join(B_ann),
                                               ''.join(B_ann) + 'M')
                                if x[0] != 'R' and x[1] == 'R'])
        stats['M_count'] += M_count
        stats['R_count'] += R_count
        stats['R_segment_count'] += R_segments_count
        # Indices where the annotation flips; consecutive pairs delimit
        # the R segments.
        changes = [i for i, x in zip(
            range(len(B_ann) + 1),
            zip('M' + ''.join(B_ann), ''.join(B_ann) + 'M'))
            if x[0] != x[1]]
        R_segments = [(changes[i], changes[i+1])
                      for i in range(0, len(changes) - (len(changes) % 2),
                                     2)]
        assert(R_segments_count == len(R_segments))
        # Classify each merged repeat segment by which side(s) carry the
        # repeat: 'RR', 'RM', or 'MR'.
        for start, stop in R_segments:
            XX = 'M'
            YY = 'M'
            for i in range(start, stop):
                if X_ann[i] == 'R':
                    XX = 'R'
                if Y_ann[i] == 'R':
                    YY = 'R'
                assert(B_ann[i] == 'R')
            stats[XX + YY] += 1
    with Open(out, 'w') as f:
        json.dump(stats, f, indent=4);