Esempio n. 1
0
def OldPredictionAnalysis(align1, align2, outfile, widths = range(1,5), same = False, mode = 'a', cons_cut = 0.98, calc_pval = False, short_linkage_format = False):
    """Analyze the linkages between two alignments.

    A controller function which calculates the linkage between column
    windows of two alignment files. It loads both alignments, iterates
    over the window pairs produced by crazy_iter, calculates linkages and
    writes tab-delimited rows (columns: LINK_FIELDS) to the results file.

    Arguments:
    align1 -- Path to source alignment file.
    align2 -- Path to target alignment file.
    outfile -- Path to the results file.

    Kwargs:
    widths -- The desired column widths to check. Default: range(1,5)
    same -- A boolean indicating whether these files are the same protein.
            When True, window pairs for which getOverlap(...) > 0 are
            skipped. Default: False
    mode -- Which mode to open the results file. With 'a' and an existing
            file, the last row of that file is read with get_last and
            handed to crazy_iter as last_items (presumably so the run
            resumes where it stopped -- verify against crazy_iter).
            Default: 'a'
    cons_cut -- The conservation cutoff to use for ignoring columns; a
                window whose conservation is above it is reported as
                'too conserved' and not analysed again. Default: 0.98
    calc_pval -- Accepted for interface compatibility; not used by this
                 function. Default: False
    short_linkage_format -- When True, the 'too few' and 'too conserved'
                            placeholder rows are not written.
                            Default: False

    Returns:
    The number of window pairs that passed both the shared-sequence and
    the conservation filters (i.e. for which linkage rows were written)."""

    def get_signals(align1, align2, widths, same, last):
        """Yield (source_slice, target_slice, shared_ids, location).

        The slices and ids are None when the two windows share five or
        fewer sequence ids. Windows recorded in source_skip/target_skip
        (shared with the enclosing function) are not visited again."""
        for sw, tw, ss, ts in crazy_iter([0, align1.width], [0, align2.width], widths, last_items = last):
            source_span = (ss, ss+sw)
            target_span = (ts, ts+tw)
            if same and getOverlap(source_span, target_span) > 0:
                continue
            if source_span in source_skip or target_span in target_skip:
                continue

            a1 = align1.get_slice(ss, ss+sw)
            a2 = align2.get_slice(ts, ts+tw)
            if a1 is None:
                source_skip.add(source_span)
            if a2 is None:
                target_skip.add(target_span)
            if a1 is None or a2 is None:
                continue

            # A fresh dict per pair: the consumer mutates it in place.
            location = {'Source-Start':ss, 'Source-End':ss+sw,
                        'Target-Start':ts, 'Target-End':ts+tw}
            over = set(a1.seqs.keys()) & set(a2.seqs.keys())
            if len(over) > 5:
                yield a1, a2, sorted(over), location
            else:
                yield None, None, None, location

            # A window with fewer than 10 sequences is reported once and
            # then excluded from every later pairing.
            if len(a1.seqs) < 10:
                source_skip.add(source_span)
            if len(a2.seqs) < 10:
                target_skip.add(target_span)

    # Single-argument print(...) produces the same output as a Python 2
    # print statement and is also valid Python 3 syntax.
    print(' '.join(['widths!', str(widths)]))

    line_count = 0
    a1 = Alignment.alignment_from_file(align1)
    a2 = Alignment.alignment_from_file(align2)

    sprot = prot_from_path(align1)
    tprot = prot_from_path(align2)

    defaults = dict.fromkeys(LINK_FIELDS)
    defaults['Source-Prot'] = sprot
    defaults['Target-Prot'] = tprot
    # The location keys come from get_signals; drop them here so that
    # loc.update(defaults) does not overwrite them with None.
    for location_key in ('Source-Start', 'Source-End',
                         'Target-Start', 'Target-End'):
        defaults.pop(location_key)

    source_skip = set()
    target_skip = set()

    calculator = LinkCalculator()

    resuming = mode == 'a' and os.path.exists(outfile)
    if resuming:
        print('trying to get last line!')
        with open(outfile) as handle:
            last = get_last(csv.DictReader(handle, delimiter = '\t'))
    else:
        last = None

    # Only emit the header when starting a new (or empty) file; appending
    # it to existing results would leave a stray header row mid-file.
    # Must be decided before the file is opened for writing below.
    need_header = not (resuming and os.path.getsize(outfile) > 0)

    with open(outfile, mode) as handle:
        if need_header:
            handle.write('\t'.join(LINK_FIELDS)+'\n')
        writer = csv.DictWriter(handle, fieldnames = LINK_FIELDS,
                                delimiter = '\t')
        for slice1, slice2, seqs, loc in get_signals(a1, a2, widths, same, last):
            loc.update(defaults)
            if slice1 is None:
                loc.update({'Source-Seq':None,
                            'Target-Seq':None,
                            'Correct-Num':'too few',
                            'Total-Num':'too few',
                            'This-Score': 0})
                if not short_linkage_format:
                    writer.writerow(loc)
                continue
            if not loc['Target-Start'] % 10:
                print('%(Source-Prot)s,%(Target-Prot)s,%(Source-Start)i,%(Target-Start)i' % loc)
            s1, m1 = slice1.get_signal(seqs)
            s2, m2 = slice2.get_signal(seqs)

            # Reverse mappings: signal value -> original key of m1/m2.
            rm1 = dict((y, x) for x, y in m1.items())
            rm2 = dict((y, x) for x, y in m2.items())

            seq1 = ''.join(rm1[s].upper() for s in s1)
            seq2 = ''.join(rm2[s].upper() for s in s2)

            # Occurrence counts per signal value.
            c1 = make_counts(s1)
            c2 = make_counts(s2)

            # float() forces true division even when the module does not
            # enable it via `from __future__ import division`.
            loc['Source-Cons'] = max(float(x)/len(s1) for x in c1.values())
            loc['Target-Cons'] = max(float(x)/len(s2) for x in c2.values())

            source_conserved = loc['Source-Cons'] > cons_cut
            target_conserved = loc['Target-Cons'] > cons_cut
            if source_conserved or target_conserved:
                loc.update({'Correct-Num':'too conserved',
                            'Total-Num':'too conserved'})
                if not short_linkage_format:
                    writer.writerow(loc)
                if source_conserved:
                    source_skip.add((loc['Source-Start'], loc['Source-End']))
                if target_conserved:
                    target_skip.add((loc['Target-Start'], loc['Target-End']))
                continue

            mappings = prediction_mapping(tuple(s1), tuple(s2))
            loc['Total-Score'] = float(sum(z for _, _, z in mappings))/len(s1)
            for field, val in calculator.calculate_all(seq1, seq2):
                loc[field] = val

            line_count += 1
            # One output row per (source, target, count) mapping.
            for source, target, val in mappings:
                loc.update({'Source-Seq':rm1[source],
                            'Target-Seq':rm2[target],
                            'Correct-Num':val,
                            'Total-Num':c1[source],
                            'This-Score': float(val)/c1[source]})
                writer.writerow(loc)
            # Push each finished window pair to disk so an interrupted run
            # leaves a usable results file.
            handle.flush()
            os.fsync(handle.fileno())
    return line_count
Esempio n. 2
0
def PredictionAnalysis(align1, align2, outfile, cons_cut = 0.99, **kwargs):
    """Calculate linkages between every column pair of two alignments.

    Loads both alignments, restricts them to the sequence ids they share,
    and for every (source column, target column) pair writes tab-delimited
    rows (columns: LINK_FIELDS) to outfile. Positions where either column
    holds a gap ('-') are dropped before any statistic is computed.

    Arguments:
    align1 -- Path to source alignment file.
    align2 -- Path to target alignment file.
    outfile -- Path to the results file. It is (re)written from scratch
               when it does not exist or when check_headers(outfile) is
               truthy; otherwise rows are appended and the run resumes
               from the last row reported by get_last.

    Kwargs:
    cons_cut -- Accepted for interface compatibility; not used by this
                function (columns are only skipped when their
                conservation equals 1). Default: 0.99
    **kwargs -- Accepted and ignored.

    Returns:
    None"""

    if not os.path.exists(outfile) or check_headers(outfile):
        mode = 'w'
        last = None
    else:
        mode = 'a'
        with open(outfile) as handle:
            last = get_last(csv.DictReader(handle, delimiter = '\t'))

    a1 = Alignment.alignment_from_file(align1)
    a2 = Alignment.alignment_from_file(align2)

    sprot = prot_from_path(align1)
    tprot = prot_from_path(align2)

    defaults = dict.fromkeys(LINK_FIELDS)
    defaults['Source-Prot'] = sprot
    defaults['Target-Prot'] = tprot
    for location_key in ('Source-Start', 'Source-End',
                         'Target-Start', 'Target-End'):
        defaults.pop(location_key)

    calculator = LinkCalculator()
    # Used to blank the calculator's fields after the first row of a pair.
    rmheaders = dict((head, None) for head in calculator.get_fields())

    headers = sorted(set(a1.seqs.keys()) & set(a2.seqs.keys()))

    # product() varies the target column fastest, so pairs arrive in
    # lexicographic (source index, target index) order.
    iterable = product(izip(count(), a1.iterate_columns(headers)),
                        izip(count(), a2.iterate_columns(headers)))
    if last:
        # NOTE(review): assumes last[2] / last[3] are the integer source /
        # target column indices of the last written row -- confirm against
        # get_last.
        resume_at = (last[2], last[3])
        # dropwhile takes the predicate first; the tuple comparison matches
        # product()'s ordering so everything before the resume pair is
        # skipped.
        fiterable = dropwhile(lambda x: (x[0][0], x[1][0]) < resume_at,
                              iterable)
    else:
        fiterable = iterable

    with open(outfile, mode) as ohandle:
        writer = csv.DictWriter(ohandle, LINK_FIELDS, delimiter = '\t')
        if mode == 'w':
            writer.writerow(dict(zip(LINK_FIELDS, LINK_FIELDS)))

        for (ind1, seq1), (ind2, seq2) in fiterable:

            # Keep only positions where neither column has a gap.
            paired = [(s1, s2) for s1, s2 in zip(seq1, seq2)
                      if s1 != '-' and s2 != '-']
            cseq1 = ''.join(s1 for s1, _ in paired)
            cseq2 = ''.join(s2 for _, s2 in paired)

            if not cseq1 or not cseq2:
                continue

            c1 = make_counts(cseq1)
            c2 = make_counts(cseq2)

            row = dict(defaults)
            row['Source-Start'] = ind1
            row['Source-End'] = ind1+1
            row['Target-Start'] = ind2
            row['Target-End'] = ind2+1

            # float() forces true division even when the module does not
            # enable it via `from __future__ import division`.
            row['Source-Cons'] = max(float(x)/len(cseq1) for x in c1.values())
            row['Target-Cons'] = max(float(x)/len(cseq2) for x in c2.values())
            if row['Source-Cons'] == 1 or row['Target-Cons'] == 1:
                # A fully conserved column: record the pair, skip the rest.
                writer.writerow(row)
                continue

            row['Source-Entropy'] = calculate_entropy(cseq1)
            row['Target-Entropy'] = calculate_entropy(cseq2)

            for field, val in calculator.calculate_all(cseq1, cseq2):
                row[field] = val

            mappings = prediction_mapping(cseq1, cseq2)
            row['Total-Score'] = float(sum(z for _, _, z in mappings))/len(cseq1)

            for position, (source, target, val) in enumerate(mappings):
                if position == 1:
                    # Only the first row of a column pair carries the
                    # fields listed by calculator.get_fields(); they are
                    # blanked on the remaining rows.
                    row.update(rmheaders)
                row.update({'Source-Seq':source,
                            'Target-Seq':target,
                            'Correct-Num':val,
                            'Total-Num':c1[source],
                            'This-Score': float(val)/c1[source]})
                writer.writerow(row)
            # Push each finished column pair to disk so an interrupted run
            # can be resumed from the file.
            ohandle.flush()
            os.fsync(ohandle.fileno())