def mutationTally(conf, args):  # old proc6
    '''Write a tab-separated tally of the lines mutated in each gene.

    Every genomediff file listed in conf.GENOMEDIFF_FILES is parsed against
    the reference genome conf.REF_GENOME into one shared dictionary, which
    is passed to mutated_lines_per_gene together with conf.snp_types.

    One row per (tag, data) entry is written to the file named by
    fileName(args), with the columns:

        tag, tuple of data['genes'], len(data['lines']),
        tuple of data['lines']

    plus a trailing tuple of data['gene_product'] when
    conf.GENE_PRODUCT == True.
    '''
    # Reference annotation notes carried over from the original author:
    # 4327 CDS, 4397 gene; feature types seen are 'rRNA', 'repeat_region',
    # 'tRNA', 'source', 'misc_feature', 'CDS', 'gene'.
    reference = utils.parse_genbank(conf.REF_GENOME)
    destination = fileName(args)

    # The return value of parse_genomediff is ignored here; presumably it
    # fills `mutations` in place -- confirm against its definition.
    mutations = {}
    for gd_path in conf.GENOMEDIFF_FILES:
        parse_genomediff(gd_path, reference, genomediffs=mutations)
    print('\n')

    # NOTE(review): the original comment called this "a dict", but it is
    # unpacked as (tag, data) pairs below -- confirm the return type.
    tally = mutated_lines_per_gene(mutations, conf.snp_types)
    with open(destination, 'wb') as handle:
        for tag, data in tally:
            fields = [
                str(tag),
                str(tuple(data['genes'])),
                str(len(data['lines'])),
                str(tuple(data['lines'])),
            ]
            if conf.GENE_PRODUCT == True:
                fields.append(str(tuple(data['gene_product'])))
            handle.write('\t'.join(fields) + '\n')
def dNdS(conf, args):  # old proc4
    '''
    Calculates dN/dS for all genes using the mutations in the provided
    genomediff files, and prints the totals to stdout.

    conf supplies REF_GENOME (the GenBank reference) and GENOMEDIFF_FILES
    (the genomediff files to read).  Nothing is returned.

    When the synonymous total dS is zero the overall ratio is reported as
    'undefined' instead of raising ZeroDivisionError.
    '''
    # NOTE(review): out_fn is never used below; the call is kept in case
    # fileName() validates args or has other side effects.
    out_fn = fileName(args)
    record = utils.parse_genbank(conf.REF_GENOME)

    # genomediffs will be the master dictionary of mutations.
    # Each mutation stores the line it came from, and is uniquely id'd.
    genomediffs = {}
    for gd_file in conf.GENOMEDIFF_FILES:
        parse_genomediff(gd_file, record, genomediffs)
    print('\n')

    (dNdS_counts, dNtotal, dStotal,
     dNdS1, dNdS2, dNdS3plus) = calculate_dNdS(genomediffs)

    # Guard the ratio: a data set without synonymous mutations has dS == 0.
    if float(dStotal) == 0:
        ratio = 'undefined'
    else:
        ratio = float(dNtotal) / float(dStotal)

    # Single pre-formatted strings reproduce the spacing of the original
    # comma-separated print statements and behave the same on Python 2 and 3.
    print("dN: %s  dS: %s  dN/dS: %s" % (dNtotal, dStotal, ratio))
    print("dN/dS 1: %s \ndN/dS 2: %s \ndN/dS 3+: %s"
          % (dNdS1, dNdS2, dNdS3plus))
def Statisticulate(conf, snptotal, tree_and_annotation=None, reps=1000):
    '''
    Calculates statistics of parallel evolution given where mutations
    occurred.  Right now, works only for nonsynonymous mutations.

    conf                -- supplies REF_GENOME and GENOMEDIFF_FILES.
    snptotal            -- number of mutations thrown at the genome in each
                           simulated replicate.
    tree_and_annotation -- None (default) to assume a star phylogeny and
                           count mutations straight from the genomediff
                           files, or a (gtree, col_annotation) pair where
                           col_annotation holds (column, pos, locus_tag)
                           tuples and gtree[0]['cost'] holds one dict per
                           column.
    reps                -- number of simulated replicates.

    For every locus_tag with at least one observed nonsynonymous mutation,
    prints the fraction of replicates in which the simulated number of hits
    in that gene was at least the observed number.  Nothing is returned.
    '''
    ref_record = utils.parse_genbank(conf.REF_GENOME)

    # Observed number of nonsynonymous mutations per locus_tag.
    nonsynonymous_mutations = {}
    if tree_and_annotation is None:
        # default: assume star phylogeny.
        for gd_file in conf.GENOMEDIFF_FILES:
            gd_dict = parse_genomediff(gd_file, ref_record)
            for gd in gd_dict.values():
                if gd.mut_type == 'SNP' and gd.snp_type == 'nonsynonymous':
                    # These all only have one locus_tag, and we can't
                    # use a list as a key, so just get the value.
                    locus_tag = gd.locus_tag[0]
                    nonsynonymous_mutations[locus_tag] = \
                        nonsynonymous_mutations.get(locus_tag, 0) + 1
    else:
        # a tree and annotation of mutation is provided.
        gtree, col_annotation = tree_and_annotation
        assert gtree is not None
        assert col_annotation is not None
        # The root of gtree contains information about independent
        # mutations: the cost at position X at the root is the number of
        # independent mutations at position X (based on the given phylogeny).
        mut_counts = [min(x.values()) for x in gtree[0]['cost']]
        for i, (_column, _pos, locus_tag) in enumerate(col_annotation):
            nonsynonymous_mutations[locus_tag] = \
                nonsynonymous_mutations.get(locus_tag, 0) + mut_counts[i]

    # How many replicates reach the observed count in each gene?
    pval_numerator = dict.fromkeys(nonsynonymous_mutations, 0)
    # presumably genes[i] is the locus_tag whose cumulative probability is
    # cdf[i] -- confirm against formGenomeCDF.
    genes, cdf = formGenomeCDF(ref_record,
                               list(nonsynonymous_mutations.keys()))
    for _replicate in range(reps):
        # One null distribution per replicate, accumulated over all
        # snptotal draws.  (It used to be reset on every draw, so only the
        # final draw of each replicate was ever counted.)
        nulldist = dict.fromkeys(nonsynonymous_mutations, 0)
        for _m in range(snptotal):
            # draw a random number, and see which gene mutated.
            rando = random.random()
            if rando <= cdf[-1]:
                # rando is in the gene set.
                for i, x in enumerate(cdf):
                    if rando <= x:
                        nulldist[genes[i]] += 1
                        break  # found the right bin.
        for g in nulldist:
            if nulldist[g] >= nonsynonymous_mutations[g]:
                pval_numerator[g] += 1

    pvals = dict((k, float(v) / float(reps))
                 for k, v in pval_numerator.items())
    for k, v in pvals.items():
        print("locus_tag: %s p-value: %s" % (k, v))
def BasicSNPCount(conf):
    '''Return the total number of nonsynonymous SNPs found across all of
    conf.GENOMEDIFF_FILES.

    Every mutation is counted once per file it appears in, i.e. all
    mutations are assumed independent (star phylogeny).
    '''
    reference = utils.parse_genbank(conf.REF_GENOME)
    total = 0
    for gd_path in conf.GENOMEDIFF_FILES:
        mutations = parse_genomediff(gd_path, reference)
        # Only SNPs are considered; snp_type is consulted for SNPs only.
        total += sum(
            1 for mut in mutations.values()
            if mut.mut_type == 'SNP' and mut.snp_type == 'nonsynonymous')
    return total
def analyticalEJB(conf, args):  # old proc5
    '''analytical solution

    Parses every genomediff file in conf.GENOMEDIFF_FILES against the
    reference genome conf.REF_GENOME into one shared dictionary and runs
    snpcount() over it.  Nothing is returned or written yet.
    '''
    # NOTE(review): out_fn is not used by the code that remains; the call
    # is kept in case fileName() validates args.
    out_fn = fileName(args)
    record = utils.parse_genbank(conf.REF_GENOME)
    # utils.print_genbank_summary(record)
    genomediffs = {}
    for gd_file in conf.GENOMEDIFF_FILES:
        parse_genomediff(gd_file, record, genomediffs=genomediffs)
    print('\n')
    snpcounting = snpcount(genomediffs, conf.GENOMEDIFF_FILES,
                           conf.snp_types)
    # NOTE(review): the function used to end with an unterminated ''' that
    # appears to have opened a disabled section whose contents are missing.
    # The dangling quote was removed because it swallowed the definition
    # that follows; `snpcounting` is computed but not consumed yet.
def infoRegion(conf, args):  # old proc7
    '''Procedure 7: find most informative regions of the genome.

    Take the union of all genome diffs (position and originating line info
    are needed), then find the windows that are most dense with SNPs for
    freq-seq.  Windows must contain haplotypes that distinguish all (or
    many) LTEE pops.

    The chosen windows are printed via printWindows(); nothing is returned.
    args is accepted but not used here.  conf.GENOMEDIFF_FILES is sorted in
    place as a side effect.
    '''
    ref_record = utils.parse_genbank(conf.REF_GENOME)
    conf.GENOMEDIFF_FILES.sort()  # sort to ensure order is always the same

    mut_list = []
    for gd_file in conf.GENOMEDIFF_FILES:
        gd_dict = parse_genomediff(gd_file, ref_record)
        # extend() appends in place: no re-copying of the growing list per
        # file, and it also accepts a dict view.
        mut_list.extend(gd_dict.values())

    windows2 = makeWindows(ref_record, mut_list)
    markers = pickWindows(conf, windows2)
    printWindows(markers)
def SNPsToAlignment(conf):
    '''Build an alignment of every SNP position seen in the genomediffs.

    Rows are the lexicographically sorted conf.GENOMEDIFF_FILES, and the
    last row is the reference sequence.  Columns are all positions that
    evolved in the set of genomes.

    Returns (msa, msa_annotation): the MultipleSeqAlignment and, for each
    column, a tuple (column index, position, locus_tag).

    conf.GENOMEDIFF_FILES is sorted in place as a side effect.
    '''
    ref_record = utils.parse_genbank(conf.REF_GENOME)
    # utils.print_genbank_summary(ref_record)

    conf.GENOMEDIFF_FILES.sort()  # so the diffs can be assumed sorted.
    gd_files = conf.GENOMEDIFF_FILES

    # each elt in snps is a tuple:
    # (position, old_base, new_base, locus_tag, label)
    snps = []
    for gd_file in gd_files:
        gd_dict = parse_genomediff(gd_file, ref_record)
        for mut in gd_dict.values():
            if mut.mut_type != 'SNP':  # only consider SNPs
                continue
            # NOTE: parse_genomediff converts 1-based indexing to 0-based
            # indexing; the "+ 1" changes it back so reporting is
            # consistent with the original gd files.
            snps.append((mut.position + 1, mut.old_base, mut.new_base,
                         mut.locus_tag[0], gd_file))
    snps.sort(key=lambda elt: elt[0])  # sort by position.

    cols = sorted(set(elt[0] for elt in snps))

    # Lookup tables replace repeated list.index() scans.  setdefault keeps
    # the first match, exactly as list.index() did.
    col_index = dict((pos, j) for j, pos in enumerate(cols))
    row_index = {}
    for i, gd_file in enumerate(gd_files):
        row_index.setdefault(gd_file, i)

    # The LAST row of the alignment is the reference.
    alignment = [[''] * len(cols) for _ in range(len(gd_files) + 1)]
    locus_by_pos = {}
    for pos, old_base, new_base, locus_tag, gd_file in snps:
        j = col_index[pos]
        alignment[-1][j] = old_base  # the reference sequence.
        alignment[row_index[gd_file]][j] = new_base
        # the first SNP (in sorted order) at a position names its locus.
        locus_by_pos.setdefault(pos, locus_tag)

    # now fill the empty entries in the matrix w/ the ref seq value.
    ref = alignment[-1]
    for row in alignment:
        for j, base in enumerate(row):
            if base == '':
                row[j] = ref[j]

    str_alignment = [''.join(row) for row in alignment]
    aln_ids = [os.path.splitext(gd)[0] for gd in gd_files]
    aln_ids.append(ref_record.id)  # add the reference.
    site_recs = [SeqRecord(Seq(seq), id=name)
                 for seq, name in zip(str_alignment, aln_ids)]
    # turn into a Biopython Alignment object.
    msa = MultipleSeqAlignment(site_recs)

    # return both the msa as well as the position and gene for each column
    # in the alignment.
    msa_annotation = [(i, pos, locus_by_pos.get(pos))
                      for i, pos in enumerate(cols)]
    return msa, msa_annotation