# Std-lib imports this section relies on (these may already be present at
# module level elsewhere in the file).
import csv
import logging
from functools import partial
from threading import Thread
from Queue import Queue  # Python 2; use `queue` on Python 3
from multiprocessing import TimeoutError


def PredictionAnalysis(align1, align2, outfile, granular=True, open_mode='w',
                       limit_functions=set(), **kwargs):

    a1 = Alignment.alignment_from_file(align1)
    a2 = Alignment.alignment_from_file(align2)
    print('loaded alignments')

    sprot = prot_from_path(align1)
    tprot = prot_from_path(align2)

    # Template row: every link field defaults to None except the two protein
    # names. The per-window start/end fields are filled in later by the
    # loader, so drop them from the template.
    defaults = dict(zip(LinkFields.LINK_FIELDS,
                        [None] * len(LinkFields.LINK_FIELDS)))
    defaults['S1-Prot'] = sprot
    defaults['S2-Prot'] = tprot
    defaults.pop('Source-Start')
    defaults.pop('Source-End')
    defaults.pop('Target-Start')
    defaults.pop('Target-End')

    calculator = LinkUtils.LinkCalculator()
    rmheaders = dict((head, None) for head in calculator.get_fields())
    submats = LinkUtils.get_all_sub_mats()

    ohandle = open(outfile, open_mode)
    owriter = csv.DictWriter(ohandle, LinkFields.LINK_FIELDS,
                             delimiter='\t', extrasaction='ignore')
    if open_mode == 'w':
        # Fresh file: write the header row and start with nothing done.
        owriter.writerow(dict(zip(LinkFields.LINK_FIELDS,
                                  LinkFields.LINK_FIELDS)))
        done = set()
    else:
        # Appending: skip the items already present in the output file.
        done = get_done(outfile)

    if granular:
        # Synchronous path: compute each window in-process, write as we go.
        for row in task_loader(None, a1, a2, defaults, submats, 50,
                               align1 == align2,
                               limit_functions=limit_functions,
                               found_items=done):
            owriter.writerows(convert_row_to_writeable_rows(row, rmheaders))
    else:
        # Asynchronous path: a background thread fills the queue with
        # pending results; the main thread drains it and writes rows.
        process_que = Queue(1000)
        loader = Thread(target=task_loader,
                        args=(process_que, a1, a2, defaults, submats, 50,
                              align1 == align2),
                        kwargs={'limit_functions': limit_functions,
                                'found_items': done})
        loader.start()
        print('waiting for first')
        item = process_que.get()
        while item is not None:
            try:
                # timeout/interval are forwarded to the loader's async result
                # objects (interval is not a std-lib AsyncResult kwarg).
                row = item.get(timeout=60 * 30, interval=60 * 1)
            except TimeoutError:
                logging.warning('no result for one!')
                item = process_que.get()
                continue
            logging.info('%i %i' % (row['S1-Start'], row['S2-End']))
            owriter.writerows(convert_row_to_writeable_rows(row, rmheaders))
            item = process_que.get()
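# Usage sketch (assumption, not part of the original module): a hypothetical
# driver showing one way to invoke PredictionAnalysis. The alignment and
# output paths are made-up placeholders; limit_functions restricts the run
# to the named statistics. Wrapped in a function so it does not execute on
# import.
def example_prediction_run():
    PredictionAnalysis('alignments/gp120.aln',    # hypothetical input path
                       'alignments/gp41.aln',     # hypothetical input path
                       'results/gp120-gp41.tsv',  # hypothetical output path
                       granular=True,
                       limit_functions=set(['Mutual_Info', 'OMES']))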
def link_calculator(row, submats, seq1, seq2, granular=False,
                    limit_functions=set()):
    c1 = AlignUtils.make_counts(seq1)
    c2 = AlignUtils.make_counts(seq2)

    row['S1-Entropy'] = AlignUtils.calculate_entropy(seq1)
    row['S2-Entropy'] = AlignUtils.calculate_entropy(seq2)
    row['S12-Mapping'] = LinkUtils.prediction_mapping(seq1, seq2)
    row['S21-Mapping'] = LinkUtils.prediction_mapping(seq2, seq1)
    row['SeqLength'] = len(seq1)

    # Fraction of the most common item in each column. float() forces true
    # division; on Python 2 the bare int/int here truncated to 0 unless the
    # module also imported __future__ division.
    row['S1-Cons'] = max(float(x) / len(seq1) for x in c1.values())
    row['S2-Cons'] = max(float(x) / len(seq2) for x in c2.values())
    if row['S1-Cons'] > 0.99999 or row['S2-Cons'] > 0.99999:
        # A (nearly) invariant column carries no covariation signal; skip
        # the expensive linkage statistics.
        return row
    logging.info('%s\t%f\t%s\t%f' % (seq1, row['S1-Cons'],
                                     seq2, row['S2-Cons']))

    # Build the list of (name, function, extra-args) statistics to run. In
    # granular mode the substitution matrix is passed through as a pre-arg
    # for the celery task; otherwise it is bound with partial().
    processfuns = []
    for name, mat in submats:
        if granular:
            processfuns.append(('SBASC_' + name,
                                LinkUtils.calculate_SBASC, (mat,)))
        else:
            processfuns.append(('SBASC_' + name,
                                partial(LinkUtils.calculate_SBASC, mat), ()))
    processfuns.append(('Mutual_Info', LinkUtils.calculate_mutual_info, ()))
    processfuns.append(('OMES', LinkUtils.calculate_OMES, ()))
    processfuns.append(('Linkage', LinkUtils.calculate_mapping, ()))
    suffs = ['_raw', '_pval', '_null', '_count']

    if limit_functions:
        processfuns = [x for x in processfuns if x[0] in limit_functions]

    for name, func, evals in processfuns:
        if granular:
            logging.info('calculating %s %i %i' % (name, row['S1-Start'],
                                                   row['S2-Start']))
            res = LinkUtils.celery_calculate_vals(seq1, seq2, func,
                                                  preargs=evals)
        else:
            res = LinkUtils.calculate_vals(seq1, seq2, func)
        for val, suff in zip(res, suffs):
            row[name + suff] = val
            logging.info(name + suff + ':' + str(val))
    return row
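# Minimal sketch (assumption): driving link_calculator directly on one pair
# of alignment columns. The column strings and the blank row template are
# hypothetical stand-ins for what task_loader builds per window. Wrapped in
# a function so it does not execute on import.
def example_link_calculation():
    row = dict.fromkeys(LinkFields.LINK_FIELDS)
    row['S1-Start'] = row['S2-Start'] = 0
    row = link_calculator(row, LinkUtils.get_all_sub_mats(),
                          'AAAGA',  # hypothetical column from alignment 1
                          'TTTCT',  # hypothetical column from alignment 2
                          granular=False,
                          limit_functions=set(['Mutual_Info']))
    print(row['Mutual_Info_pval'])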