from __future__ import division  # conservation and linkage scores are fractional

import os
import csv
from itertools import product, izip, count, dropwhile

# NOTE: the remaining names used below (Alignment, LinkCalculator,
# LINK_FIELDS, crazy_iter, getOverlap, get_last, prot_from_path,
# make_counts, prediction_mapping, calculate_entropy, check_headers)
# are helpers defined elsewhere in this package.


def OldPredictionAnalysis(align1, align2, outfile, widths=range(1, 5),
                          same=False, mode='a', cons_cut=0.98,
                          calc_pval=False, short_linkage_format=False):
    """Analyzes the linkages between 2 alignments.

    A controller function which calculates the Linkage between columns in
    the alignment files. This function takes care of opening/loading
    alignments, iterating through the columns, calculating linkages, and
    writing to an output file.

    Arguments:
    align1 -- Path to source alignment file.
    align2 -- Path to target alignment file.
    outfile -- Path to the results file.

    Kwargs:
    widths -- The desired column widths to check. Default: range(1, 5)
    same -- A boolean indicating whether these files are the same protein.
            Default: False
    mode -- Which mode to open the results file in. Default: 'a'
    cons_cut -- The conservation cutoff above which columns are ignored.
                Default: 0.98
    calc_pval -- A boolean indicating whether to calculate p-values for
                 linkages. Default: False
    short_linkage_format -- If True, skip writing rows for column pairs
                            that are too conserved or have too few
                            overlapping sequences. Default: False

    Returns:
    The number of linkage rows written."""

    def get_signals(align1, align2, widths, same, last):
        # Yield every (source-slice, target-slice) pair, resuming from
        # `last` if a previous run was interrupted. Closes over the
        # source_skip/target_skip sets defined in the enclosing scope.
        for sw, tw, ss, ts in crazy_iter([0, align1.width],
                                         [0, align2.width],
                                         widths, last_items=last):
            # When comparing a protein against itself, skip overlapping slices.
            if same and getOverlap((ss, ss + sw), (ts, ts + tw)) > 0:
                continue
            if (ss, ss + sw) not in source_skip and (ts, ts + tw) not in target_skip:
                a1 = align1.get_slice(ss, ss + sw)
                a2 = align2.get_slice(ts, ts + tw)
                if a1 is None:
                    source_skip.add((ss, ss + sw))
                if a2 is None:
                    target_skip.add((ts, ts + tw))
                if a1 is None or a2 is None:
                    continue
                over = set(a1.seqs.keys()) & set(a2.seqs.keys())
                if len(over) > 5:
                    yield a1, a2, sorted(over), {'Source-Start': ss,
                                                 'Source-End': ss + sw,
                                                 'Target-Start': ts,
                                                 'Target-End': ts + tw}
                else:
                    # Too few overlapping sequences to calculate a linkage.
                    yield None, None, None, {'Source-Start': ss,
                                             'Source-End': ss + sw,
                                             'Target-Start': ts,
                                             'Target-End': ts + tw}
                    if len(a1.seqs) < 10:
                        source_skip.add((ss, ss + sw))
                    if len(a2.seqs) < 10:
                        target_skip.add((ts, ts + tw))

    print 'widths!', widths
    line_count = 0
    a1 = Alignment.alignment_from_file(align1)
    a2 = Alignment.alignment_from_file(align2)
    sprot = prot_from_path(align1)
    tprot = prot_from_path(align2)
    defaults = dict(zip(LINK_FIELDS, [None] * len(LINK_FIELDS)))
    defaults['Source-Prot'] = sprot
    defaults['Target-Prot'] = tprot
    defaults.pop('Source-Start')
    defaults.pop('Source-End')
    defaults.pop('Target-Start')
    defaults.pop('Target-End')
    source_skip = set()
    target_skip = set()
    calculator = LinkCalculator()

    if mode == 'a' and os.path.exists(outfile):
        # Resume an interrupted run from the last row already written.
        print 'trying to get last line!'
        with open(outfile) as handle:
            last = get_last(csv.DictReader(handle, delimiter='\t'))
    else:
        last = None

    with open(outfile, mode) as handle:
        handle.write('\t'.join(LINK_FIELDS) + '\n')
        writer = csv.DictWriter(handle, fieldnames=LINK_FIELDS, delimiter='\t')
        for slice1, slice2, seqs, loc in get_signals(a1, a2, widths, same, last):
            loc.update(defaults)
            if slice1 is None:
                loc.update({'Source-Seq': None, 'Target-Seq': None,
                            'Correct-Num': 'too few', 'Total-Num': 'too few',
                            'This-Score': 0})
                #print 'few %(Source-Start)i, %(Source-End)i, %(Target-Start)i, %(Target-End)i' % loc
                if not short_linkage_format:
                    writer.writerow(loc)
                continue
            # Progress report every tenth target column.
            if not loc['Target-Start'] % 10:
                print '%(Source-Prot)s,%(Target-Prot)s,%(Source-Start)i,%(Target-Start)i' % loc
            s1, m1 = slice1.get_signal(seqs)
            s2, m2 = slice2.get_signal(seqs)
            # create reverse mappings from signal values back to residues
            rm1 = dict([(y, x) for x, y in m1.items()])
            rm2 = dict([(y, x) for x, y in m2.items()])
            seq1 = ''.join(rm1[s].upper() for s in s1)
            seq2 = ''.join(rm2[s].upper() for s in s2)
            # create count dictionaries
            c1 = make_counts(s1)
            c2 = make_counts(s2)
            loc['Source-Cons'] = max(x / len(s1) for x in c1.values())
            loc['Target-Cons'] = max(x / len(s2) for x in c2.values())
            if loc['Source-Cons'] > cons_cut or loc['Target-Cons'] > cons_cut:
                # Nearly invariant columns carry no linkage signal, so mark
                # them and skip those slices for the rest of the run.
                loc.update({'Correct-Num': 'too conserved',
                            'Total-Num': 'too conserved'})
                if not short_linkage_format:
                    writer.writerow(loc)
                if loc['Source-Cons'] > cons_cut:
                    source_skip.add((loc['Source-Start'], loc['Source-End']))
                if loc['Target-Cons'] > cons_cut:
                    target_skip.add((loc['Target-Start'], loc['Target-End']))
                continue
            mappings = prediction_mapping(tuple(s1), tuple(s2))
            score = sum([z for _, _, z in mappings]) / len(s1)
            loc['Total-Score'] = score
            for field, val in calculator.calculate_all(seq1, seq2):
                loc[field] = val
            #print '%(Source-Start)i, %(Source-End)i, %(Target-Start)i, %(Target-End)i, %(Total-Score)f' % loc
            line_count += 1
            for source, target, val in mappings:
                loc.update({'Source-Seq': rm1[source],
                            'Target-Seq': rm2[target],
                            'Correct-Num': val,
                            'Total-Num': c1[source],
                            'This-Score': val / c1[source]})
                writer.writerow(loc)
            handle.flush()
            os.fsync(handle.fileno())
    return line_count
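
# For orientation, a minimal self-contained sketch of the conservation test
# applied above. make_counts is defined elsewhere in this module; this
# standalone version (a hypothetical helper, not used by the functions here)
# only shows the arithmetic behind 'Source-Cons'/'Target-Cons':
def _conservation_sketch(column, cons_cut=0.98):
    """Return True when a column is too conserved to carry linkage signal.

    e.g. _conservation_sketch('AAAB') -> False (conservation 0.75),
         _conservation_sketch('AAAA') -> True  (conservation 1.0).
    """
    counts = {}
    for residue in column:
        counts[residue] = counts.get(residue, 0) + 1
    # Fraction of sequences sharing the most common residue.
    cons = max(counts.values()) / float(len(column))
    return cons > cons_cut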

def PredictionAnalysis(align1, align2, outfile, cons_cut=0.99, **kwargs):
    if not os.path.exists(outfile):
        mode = 'w'
        last = None
    elif check_headers(outfile):
        # Existing output has unexpected headers: start the file over.
        mode = 'w'
        last = None
    else:
        # Resume from the last row of an interrupted run.
        mode = 'a'
        with open(outfile) as handle:
            iterable = csv.DictReader(handle, delimiter='\t')
            last = get_last(iterable)

    a1 = Alignment.alignment_from_file(align1)
    a2 = Alignment.alignment_from_file(align2)
    sprot = prot_from_path(align1)
    tprot = prot_from_path(align2)
    defaults = dict(zip(LINK_FIELDS, [None] * len(LINK_FIELDS)))
    defaults['Source-Prot'] = sprot
    defaults['Target-Prot'] = tprot
    defaults.pop('Source-Start')
    defaults.pop('Source-End')
    defaults.pop('Target-Start')
    defaults.pop('Target-End')
    calculator = LinkCalculator()
    rmheaders = dict((head, None) for head in calculator.get_fields())

    # Only sequences present in both alignments contribute columns.
    headers = sorted(set(a1.seqs.keys()) & set(a2.seqs.keys()))
    iterable = product(izip(count(), a1.iterate_columns(headers)),
                       izip(count(), a2.iterate_columns(headers)))
    if last:
        # dropwhile takes the predicate first; skip pairs already processed.
        fiterable = dropwhile(lambda x: x[0][0] < last[2] and x[1][0] < last[3],
                              iterable)
    else:
        fiterable = iterable

    ohandle = open(outfile, mode)
    writer = csv.DictWriter(ohandle, LINK_FIELDS, delimiter='\t')
    if mode == 'w':
        writer.writerow(dict(zip(LINK_FIELDS, LINK_FIELDS)))
    for (ind1, seq1), (ind2, seq2) in fiterable:
        # Drop positions where either column has a gap.
        cseq1 = ''
        cseq2 = ''
        for s1, s2 in zip(seq1, seq2):
            if s1 != '-' and s2 != '-':
                cseq1 += s1
                cseq2 += s2
        if not cseq1 or not cseq2:
            continue
        c1 = make_counts(cseq1)
        c2 = make_counts(cseq2)
        row = dict()
        row.update(defaults)
        row['Source-Start'] = ind1
        row['Source-End'] = ind1 + 1
        row['Target-Start'] = ind2
        row['Target-End'] = ind2 + 1
        row['Source-Cons'] = max(x / len(cseq1) for x in c1.values())
        row['Target-Cons'] = max(x / len(cseq2) for x in c2.values())
        if row['Source-Cons'] == 1 or row['Target-Cons'] == 1:
            # Perfectly conserved columns carry no linkage information.
            writer.writerow(row)
            continue
        row['Source-Entropy'] = calculate_entropy(cseq1)
        row['Target-Entropy'] = calculate_entropy(cseq2)
        for field, val in calculator.calculate_all(cseq1, cseq2):
            row[field] = val
        mappings = prediction_mapping(cseq1, cseq2)
        score = sum([z for _, _, z in mappings]) / len(cseq1)
        row['Total-Score'] = score

        # Write the first mapping with the full set of linkage statistics ...
        source, target, val = mappings[0]
        row.update({'Source-Seq': source, 'Target-Seq': target,
                    'Correct-Num': val, 'Total-Num': c1[source],
                    'This-Score': val / c1[source]})
        writer.writerow(row)
        # ... then blank the calculator fields so the remaining mappings
        # do not repeat them.
        row.update(rmheaders)
        for source, target, val in mappings[1:]:
            row.update({'Source-Seq': source, 'Target-Seq': target,
                        'Correct-Num': val, 'Total-Num': c1[source],
                        'This-Score': val / c1[source]})
            writer.writerow(row)
        ohandle.flush()
        os.fsync(ohandle.fileno())
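
# A minimal usage sketch; the paths below are hypothetical placeholders,
# not files shipped with this module. PredictionAnalysis decides on its
# own whether to overwrite or resume the output file, so re-running after
# an interruption is safe.
if __name__ == '__main__':
    PredictionAnalysis('alignments/source.aln',
                       'alignments/target.aln',
                       'results/linkages.tsv')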