Ejemplo n.º 1
0
def mutationTally(conf, args):  # old proc6
    '''Write, for each tag, the genes and lines that carry mutations.

    Parses conf.REF_GENOME, feeds every file in conf.GENOMEDIFF_FILES through
    parse_genomediff into one shared dictionary, tallies the result with
    mutated_lines_per_gene and writes one tab-separated row per entry to the
    file named by fileName(args).

    Row columns: tag, tuple of genes, number of lines, tuple of lines and,
    only when conf.GENE_PRODUCT == True, tuple of gene products.
    '''
    record = utils.parse_genbank(conf.REF_GENOME)

    # Original author's note on the reference: 4327 CDS, 4397 gene features.
    # Feature types seen: 'rRNA', 'repeat_region', 'tRNA', 'source',
    # 'misc_feature', 'CDS', 'gene'.
    out_fn = fileName(args)

    # The same dictionary is handed to every call; presumably
    # parse_genomediff adds each file's mutations to it in place.
    genomediffs = {}
    for gd_file in conf.GENOMEDIFF_FILES:
        parse_genomediff(gd_file, record, genomediffs=genomediffs)
    print('\n')

    counts = mutated_lines_per_gene(genomediffs, conf.snp_types)
    # Iterating a dict yields only its keys, so the (tag, data) unpacking
    # below would fail on a mapping. Use its items in that case; any other
    # iterable of (tag, data) pairs is consumed exactly as before.
    if isinstance(counts, dict):
        rows = counts.items()
    else:
        rows = counts

    with open(out_fn, 'wb') as fp:
        for tag, data in rows:
            fields = [
                str(tag),
                str(tuple(data['genes'])),
                str(len(data['lines'])),
                str(tuple(data['lines'])),
            ]
            # Explicit comparison kept on purpose: truthy values other than
            # True/1 must not switch the extra column on.
            if conf.GENE_PRODUCT == True:
                fields.append(str(tuple(data['gene_product'])))
            fp.write('\t'.join(fields) + '\n')
Ejemplo n.º 2
0
def dNdS(conf, args):  # old proc4
    '''
    Calculates dN/dS for all genes using the mutations in the
    provided genomediff files, and prints the totals to stdout.

    conf supplies REF_GENOME and GENOMEDIFF_FILES; args is only passed to
    fileName. Nothing is returned. When dS is zero the overall ratio is
    reported as NA instead of raising ZeroDivisionError.
    '''
    # NOTE(review): out_fn is computed but never used in this function;
    # the call is kept in case fileName validates args - confirm.
    out_fn = fileName(args)
    record = utils.parse_genbank(conf.REF_GENOME)

    # genomediffs will be the master dictionary of mutations.
    # Each mutation stores the line it came from, and is uniquely id'd.
    genomediffs = {}
    for gd_file in conf.GENOMEDIFF_FILES:
        parse_genomediff(gd_file, record, genomediffs)
    print('\n')

    dNdS_counts, dNtotal, dStotal, dNdS1, dNdS2, dNdS3plus = \
        calculate_dNdS(genomediffs)

    # No synonymous mutations means the ratio is undefined; report that
    # rather than aborting before the per-class ratios are printed.
    if float(dStotal) == 0.0:
        ratio = 'NA'
    else:
        ratio = float(dNtotal) / float(dStotal)

    # The spacing (including the blank before each line break) reproduces
    # what the former comma-separated print statements emitted.
    print("dN: %s   dS: %s   dN/dS: %s" % (dNtotal, dStotal, ratio))
    print("dN/dS 1: %s \ndN/dS 2: %s \ndN/dS 3+: %s"
          % (dNdS1, dNdS2, dNdS3plus))
Ejemplo n.º 3
0
def Statisticulate(conf, snptotal, tree_and_annotation=None, reps=1000):
    ''' Calculates statistics of parallel evolution given where mutations
    occurred. Right now, works only for nonsynonymous mutations.

    conf: supplies REF_GENOME and GENOMEDIFF_FILES.
    snptotal: number of mutations thrown onto the genome per replicate.
    tree_and_annotation: None (every mutation in the genomediff files is
        counted once, i.e. a star phylogeny is assumed) or a pair
        (gtree, col_annotation) whose per-column counts are used instead.
    reps: number of simulated replicates.

    Prints one "locus_tag ... p-value ..." line per mutated gene, where the
    p-value is the fraction of replicates in which the gene received at
    least as many simulated hits as were observed. Returns nothing.
    '''
    ref_record = utils.parse_genbank(conf.REF_GENOME)

    # Observed number of nonsynonymous mutations per locus_tag.
    nonsynonymous_mutations = {}
    if tree_and_annotation is None:  # default: assume star phylogeny.
        for gd_file in conf.GENOMEDIFF_FILES:
            gd_dict = parse_genomediff(gd_file, ref_record)
            for gd in gd_dict.values():
                if gd.mut_type == 'SNP' and gd.snp_type == 'nonsynonymous':
                    # These all only have one locus_tag, and a list cannot
                    # be a dict key, so take its single value.
                    locus_tag = gd.locus_tag[0]
                    nonsynonymous_mutations[locus_tag] = \
                        nonsynonymous_mutations.get(locus_tag, 0) + 1
    else:  # a tree and annotation of mutation is provided.
        gtree, col_annotation = tree_and_annotation
        assert gtree is not None
        assert col_annotation is not None
        # Per the original notes: the root of gtree holds, for each
        # position, the cost = number of independent mutations at that
        # position under the given phylogeny.
        mut_counts = [min(x.values()) for x in gtree[0]['cost']]
        for i, mut_tuple in enumerate(col_annotation):
            column, pos, locus_tag = mut_tuple
            nonsynonymous_mutations[locus_tag] = \
                nonsynonymous_mutations.get(locus_tag, 0) + mut_counts[i]

    # How many replicates hit each gene at least as often as observed?
    pval_numerator = {k: 0 for k in nonsynonymous_mutations}
    genes, cdf = formGenomeCDF(ref_record, nonsynonymous_mutations.keys())
    for replicate in range(reps):
        # One null distribution per replicate. It used to be reset for
        # every single draw, so a replicate never accumulated more than one
        # hit and genes with two or more observed mutations always got
        # p = 0 (and snptotal == 0 left the name undefined).
        nulldist = {k: 0 for k in nonsynonymous_mutations}
        for m in range(snptotal):
            # draw a random number, and see which gene mutated.
            rando = random.random()
            if rando <= cdf[-1]:  # rando is in the gene set.
                for i, x in enumerate(cdf):
                    if rando <= x:
                        nulldist[genes[i]] += 1
                        break  # found the right bin.
        for g in nulldist:
            if nulldist[g] >= nonsynonymous_mutations[g]:
                pval_numerator[g] += 1

    pvals = {k: float(v) / float(reps) for k, v in pval_numerator.items()}
    for k, v in pvals.items():
        print("locus_tag: %s p-value: %s" % (k, v))
Ejemplo n.º 4
0
def BasicSNPCount(conf):
    ''' Count the nonsynonymous SNPs over every file in
        conf.GENOMEDIFF_FILES and return the total.

        Each occurrence is counted separately, i.e. all mutations are
        treated as independent (star phylogeny).'''
    ref_record = utils.parse_genbank(conf.REF_GENOME)
    total = 0
    for gd_file in conf.GENOMEDIFF_FILES:
        mutations = parse_genomediff(gd_file, ref_record).values()
        # snp_type is only consulted for entries that are SNPs.
        total += sum(
            1 for mut in mutations
            if mut.mut_type == 'SNP' and mut.snp_type == 'nonsynonymous')
    return total
Ejemplo n.º 5
0
def analyticalEJB(conf, args):  # old proc5
    '''analytical solution'''
    out_fn = fileName(args)
    record = utils.parse_genbank(conf.REF_GENOME)
    # utils.print_genbank_summary(record)

    genomediffs = {}
    for gd_file in conf.GENOMEDIFF_FILES:
        parse_genomediff(gd_file, record, genomediffs=genomediffs)
    print '\n'

    snpcounting = snpcount(genomediffs, conf.GENOMEDIFF_FILES, conf.snp_types)

    '''
Ejemplo n.º 6
0
def infoRegion(conf, args):  # old proc7
    '''Procedure 7: find most informative regions of the genome.

    Pools the mutations of every genomediff file (the union of all genome
    diffs), builds windows over them with makeWindows, lets pickWindows
    choose the markers and prints those with printWindows.

    Per the original notes, the goal is windows dense with SNPs for
    freq-seq, whose haplotypes distinguish all (or many) LTEE pops.

    conf.GENOMEDIFF_FILES is sorted in place. args is not used here.
    '''
    ref_record = utils.parse_genbank(conf.REF_GENOME)
    conf.GENOMEDIFF_FILES.sort()  # fixed order, so runs are reproducible

    all_mutations = []
    for gd_path in conf.GENOMEDIFF_FILES:
        all_mutations.extend(parse_genomediff(gd_path, ref_record).values())

    candidate_windows = makeWindows(ref_record, all_mutations)
    printWindows(pickWindows(conf, candidate_windows))
Ejemplo n.º 7
0
def SNPsToAlignment(conf):
    ''' Build an alignment of all SNP positions across the genomediffs.

    Rows are the lexicographically sorted conf.GENOMEDIFF_FILES (the list
    is sorted in place), and the LAST row is the reference sequence.
    Columns are all positions that evolved in the set of genomes; a genome
    without a SNP at a position shows the reference base there.

    Returns (msa, msa_annotation): msa is a MultipleSeqAlignment and
    msa_annotation holds one (column index, position, locus_tag) tuple per
    column, with the position reported 1-based.
    '''
    ref_record = utils.parse_genbank(conf.REF_GENOME)

    conf.GENOMEDIFF_FILES.sort()  # so the diffs can be assumed sorted.
    gd_files = conf.GENOMEDIFF_FILES

    # each elt in snps is a tuple:
    # (position, old_base, new_base, locus_tag, label)
    snps = []
    for gd_file in gd_files:
        gd_dict = parse_genomediff(gd_file, ref_record)
        for v in gd_dict.values():
            if v.mut_type != 'SNP':  # only consider SNPs
                continue
            # NOTE (from the original author): parse_genomediff converts
            # 1-based indexing to 0-based; the + 1 changes it back so the
            # report is consistent with the original gd files.
            snps.append(
                (v.position + 1,
                 v.old_base,
                 v.new_base,
                 v.locus_tag[0],
                 gd_file))
    snps.sort(key=lambda elt: elt[0])  # sort by position (stable).

    cols = sorted(set(elt[0] for elt in snps))
    # Constant-time lookups replace list.index() per SNP. setdefault keeps
    # the FIRST row of a repeated file name, exactly as list.index() did.
    col_of = {pos: j for j, pos in enumerate(cols)}
    row_of = {}
    for i, gd_file in enumerate(gd_files):
        row_of.setdefault(gd_file, i)

    # The LAST row of the alignment is the reference.
    alignment = [[''] * len(cols) for _ in range(len(gd_files) + 1)]
    ref = alignment[-1]
    locus_of = {}  # locus_tag of the first SNP seen at each position
    for pos, old_base, new_base, locus, gd_file in snps:
        j = col_of[pos]
        ref[j] = old_base
        alignment[row_of[gd_file]][j] = new_base
        locus_of.setdefault(pos, locus)

    # now fill the empty entries in the matrix w/ the ref seq value.
    for row in alignment:
        for j, base in enumerate(row):
            if base == '':
                row[j] = ref[j]

    str_alignment = [''.join(row) for row in alignment]
    aln_ids = [os.path.splitext(gd)[0] for gd in gd_files]
    aln_ids = aln_ids + [ref_record.id]  # add the reference.
    site_recs = [SeqRecord(Seq(x), id=y)
                 for x, y in zip(str_alignment, aln_ids)]
    # turn into a Biopython Alignment object.
    msa = MultipleSeqAlignment(site_recs)

    # return both the msa as well as the position and gene for each column
    # in the alignment.
    msa_annotation = [(i, pos, locus_of[pos]) for i, pos in enumerate(cols)]
    return msa, msa_annotation