for g in gsets[ii]: counts[g] = counts.get(g, 0) + 1 mx = max(counts.values()) genesets[ii] = set( [x for x, y in counts.iteritems() if y == mx]) print 'empty common genes:', ii, 'clone_size:', clone_size, 'mx-genecount', mx, 'newgeneset:', genesets[ ii], em for genes, segtype in zip(genesets, segtypes_lowercase): assert genes tag = segtype + '_genes' assert tag in outl # should already be there, now over-writing outl[tag] = ';'.join(sorted(genes)) ## update reps reps = sorted(set((util.get_rep(x, organism) for x in genes))) tag = segtype + '_reps' assert tag in outl # should already be there, now over-writing outl[tag] = ';'.join(reps) ## update countreps countreps = sorted( set((util.get_mm1_rep_gene_for_counting(x, organism) for x in genes))) tag = segtype + '_countreps' assert tag in outl # should already be there, now over-writing outl[tag] = ';'.join(countreps) total_clones += 1 out.write(make_tsv_line(outl, outfields, '-') + '\n')
def _warn_once(message):
    """Print ``message`` the first time it is seen; stay silent afterwards.

    The module-level ``warnings`` collection records which messages have
    already been shown.
    """
    if message not in warnings:
        print(message)
        warnings.add(message)


def reconstruct_field_from_data(field, l, organism):
    """Best-effort reconstruction of ``field`` from the parsed input line ``l``.

    Arguments:
        field: name of the field to fill in.
        l: mapping of field name -> value for the current input line.
        organism: passed through to the ``util`` gene-representative helpers.

    Returns:
        The reconstructed value, or None when it cannot be derived.

    Resolution order:
      * ``field`` already present in ``l``: returned unchanged.
      * 'subject': taken from the 'mouse' entry when present.
      * '*_prob': defaults to 1.0 (warns once).
      * 'clone_size': defaults to 1 (warns once).
      * 'va_'/'ja_'/'vb_'/'jb_' + gene/genes/rep/reps/countreps: derived from
        the sibling '<prefix>gene', '<prefix>genes' or '<prefix>blast_hits'
        entries.
      * '*_quals': a '.'-joined run of '60', one per character of the
        '<first five chars of field>_nucseq' entry.

    Three cases return None without printing anything: '<prefix>gene' when
    '<prefix>genes' is absent, '<prefix>genes' when the blast hits are '-',
    and '*_quals' when the nucseq entry is absent.  Every other failure,
    including any exception raised while reconstructing, prints a
    diagnostic and returns None; exceptions are deliberately not propagated.

    All diagnostics use single-argument ``print(...)`` so the output is the
    same under the Python 2 print statement and the Python 3 print function.
    """
    try:
        if field in l:
            return l[field]

        if field == 'subject':
            if 'mouse' in l:
                return l['mouse']

        elif field.endswith('_prob'):
            _warn_once(prob_warning)
            return 1.0

        elif field == 'clone_size':
            _warn_once(clone_size_warning)
            return 1

        elif field[:3] in ('va_', 'ja_', 'vb_', 'jb_'):
            # The four gene families are handled the same way.
            prefix = field[:3]
            tag = field[3:]
            gene_key = prefix + 'gene'
            genes_key = prefix + 'genes'
            hits_key = prefix + 'blast_hits'

            if tag == 'gene':
                # Only one place to get this: the first (sorted) entry of
                # the genes list.
                if genes_key not in l:
                    return None
                return sorted(l[genes_key].split(listsep))[0]

            elif tag == 'genes':
                if hits_key in l:
                    hits = l[hits_key]
                    if hits == '-':
                        # Failed to find any hits.
                        return None
                    return listsep.join(sorted(util.get_top_genes(hits)))
                elif gene_key in l:
                    # Just take the one gene we know.
                    return l[gene_key]

            elif tag == 'rep':
                if gene_key in l:
                    if "*" in l[gene_key]:
                        return util.get_rep(l[gene_key], organism)
                    # No '*' in the gene name: pass it through as-is.
                    _warn_once(rep_warning)
                    return l[gene_key]

            elif tag == 'reps':
                # 'genes' should already have been filled in by the time we
                # get here; a missing key raises KeyError, which is reported
                # by the handler below.
                if "*" in l[gene_key]:
                    return listsep.join(
                        sorted(
                            util.get_rep(x, organism)
                            for x in l[genes_key].split(listsep)))
                _warn_once(rep_warning)
                return l[gene_key]

            elif tag == 'countreps':
                if "*" in l[gene_key]:
                    return listsep.join(
                        sorted(
                            util.countreps_from_genes(
                                l[genes_key].split(listsep), organism)))
                _warn_once(rep_warning)
                return l[gene_key]

        elif field.endswith('_quals'):
            seqfield = field[:5] + '_nucseq'
            if seqfield not in l:
                return None
            # One placeholder quality score of 60 per sequence character.
            return '.'.join(['60'] * len(l[seqfield]))

    except Exception as inst:
        # Not the best way to handle it, but reconstruction is best-effort:
        # report the problem and fall through to the failure path.
        print('Hit an exception trying to get field {} from line {}'.format(
            field, inst))

    print('Failed to reconstruct {} from the input fields: {}'.format(
        field, ' '.join(sorted(l.keys()))))
    return None
def __init__(self, organism, va_genes, vb_genes, cdr3a, cdr3b):
    """Record representative V genes and CDR3 values on the instance.

    Every gene in ``va_genes`` and ``vb_genes`` is mapped through
    ``util.get_rep`` for ``organism``; the results are stored as frozensets
    in ``va_reps`` / ``vb_reps``, so duplicate representatives collapse.
    ``cdr3a`` and ``cdr3b`` are stored via a full slice (``[:]``).
    """
    alpha_reps = [util.get_rep(gene, organism) for gene in va_genes]
    self.va_reps = frozenset(alpha_reps)

    beta_reps = [util.get_rep(gene, organism) for gene in vb_genes]
    self.vb_reps = frozenset(beta_reps)

    self.cdr3a = cdr3a[:]
    self.cdr3b = cdr3b[:]