Python get_xml_data Examples, parse_dbsnp_xml.get_xml_data Python Examples

Example #1

0

Show file

File: get_dbsnp_variability_for_all_uces.py Project: crinfante/2011-fairclothetal-systbiol-uce

def main():
    """Write one CSV line per usable SNP: id, relative position, frequency, 1000G flag.

    Reads comma-separated rows from ``args.dbsnp`` with the fields
    ``uce, chromo, start, end, snp, snps, snpe``; rows beginning with
    ``UCE`` are skipped (presumably the header line).  A SNP is reported
    only when ``snpe - snps`` is at most 1, it has not been reported
    already, and its UCE is not listed in the duplicates.  Output goes to
    ``args.output`` under the header ``rsid,pos,maf,1000g``.
    """
    args = get_args()
    # Fall back to an empty set (not None) so the ``uce in dupes`` test below
    # stays valid when no duplicates file was supplied.
    dupes = get_dupes(args.dupefile) if args.dupefile else set()
    # get dbSNP data
    all_snps = get_xml_data(args.xml)
    used = set()
    args.output.write('rsid,pos,maf,1000g\n')
    # iterate over intersections
    for row in args.dbsnp:
        if row.startswith('UCE'):
            continue
        uce, chromo, start, end, snp, snps, snpe = row.strip('\n').split(',')
        start, end, snps, snpe = map(int, [start, end, snps, snpe])
        # skip multi-base records, SNPs already written, and duplicate UCEs
        if snpe - snps > 1 or snp in used or uce in dupes:
            continue
        # position of the SNP start relative to the middle of the UCE
        middle = int(round((start + end)/2, 0))
        rel_snp_pos = snps - middle
        # all_snps is keyed by the SNP name with surrounding 'r'/'s'
        # characters stripped (presumably the bare numeric rs id)
        record = all_snps[snp.strip('rs')]
        validated = record.val_1000G
        thousandg = bool(validated and validated.lower() == 'true')
        # a missing/empty frequency is reported as 0.0
        freq = float(record.freq_freq) if record.freq_freq else 0.0
        args.output.write("{0},{1},{2},{3}\n".format(
            snp,
            rel_snp_pos,
            freq,
            thousandg
        ))
        # make sure we skip any duplicates
        used.add(snp)

Example #2

0

Show file

def main():
    """Summarise SNP frequencies by position relative to UCE centres.

    Reads comma-separated rows (``uce, chromo, start, end, snp, snps,
    snpe``) from the file named by ``args.dbsnp``, skipping rows that begin
    with ``UCE``.  For every UCE a zero-filled array as long as the longest
    UCE span (+1) is kept, and the ``freq_freq`` of each qualifying SNP is
    stored at its offset from the UCE middle.  Three series are written to
    ``args.output`` under the header ``pos,avg,ci,datatype``:

    * ``running``        - windowed average of the per-position count of
      positive entries;
    * ``mean_hetero``    - mean and 1.96 * standard error of the non-zero
      entries at each position;
    * ``running_hetero`` - windowed average of ``numpy.mean(stack, axis=1)``.

    When ``args.output2`` is set, every eligible SNP is also written to it
    as a CSV row.
    """
    args = get_args()
    # Fall back to an empty set (not None) so the ``uce not in dupes`` test
    # below does not raise TypeError when no duplicates file is supplied.
    dupes = get_dupes(args.dupefile) if args.dupefile else set()
    used = set()
    # Read the data rows once and close the file deterministically.
    # NOTE: the 'U' mode flag was removed in Python 3.11; it is kept here to
    # preserve the original newline handling on older interpreters.
    with open(args.dbsnp, 'rU') as infile:
        rows = [
            line.strip('\n').split(',')
            for line in infile if not line.startswith('UCE')
        ]
    # longest UCE span (end - start) over all rows
    mx = max([int(fields[3]) - int(fields[2]) for fields in rows])
    # get the SNP metadata
    all_snps = get_xml_data(args.xml)
    # find the middle
    overall_middle = int(round(mx / 2, 0))
    # per-UCE arrays of frequencies, indexed by position in the longest array
    d = {}
    if args.output2:
        args.output2.write(
            'UCE,chromo,uce-start,uce-end,snp-name,snp-start,snp-end,1000gvalidated,freq\n'
        )
    # iterate over intersections
    for uce, chromo, start, end, snp, snps, snpe in rows:
        start, end, snps, snpe = map(int, [start, end, snps, snpe])
        # get middle of this UCE
        middle = int(round((start + end) / 2, 0))
        if snp in used:
            continue
        # only single-base records on non-duplicate UCEs are eligible
        if not snpe - snps > 1 and uce not in dupes:
            record = all_snps[snp.strip('rs')]
            if record.val_1000G == 'true' and record.freq_freq is not None:
                if uce not in d:
                    d[uce] = numpy.zeros(mx + 1)
                rel_snp_pos = snps - middle
                d[uce][overall_middle + rel_snp_pos] = record.freq_freq
            if args.output2:
                args.output2.write("{},{},{},{},{},{},{},{},{}\n".format(
                    uce, chromo, start, end, snp, snps, snpe,
                    record.val_1000G,
                    record.freq_freq))
        # every SNP name is considered only once, eligible or not
        used.add(snp)
    stack = numpy.array([d[uce] for uce in d])
    # compute the running average
    win = 25
    weightings = numpy.repeat(1.0, win) / win
    # per-position count of UCEs holding a positive value
    data = sum(stack > 0)
    # the slice drops the partially-overlapping windows at both ends
    running = numpy.convolve(data, weightings)[win - 1:-(win - 1)]
    args.output.write("pos,avg,ci,datatype\n")
    for base, value in enumerate(running):
        pos = base - overall_middle
        args.output.write("{},{},,running\n".format(pos, value))
    # also output the average heterozygosity of 1000 Genome validated, hetero SNPs.
    for base in range(len(stack[0])):
        pos = base - overall_middle
        column = stack[:, base]
        # keep only the UCEs that have a value at this position
        nonzero = column[numpy.where(column != 0)[0]]
        avg = numpy.mean(nonzero)
        ci = 1.96 * (numpy.std(nonzero, ddof=1) / numpy.sqrt(len(nonzero)))
        args.output.write("{},{},{},mean_hetero\n".format(pos, avg, ci))
    # NOTE(review): axis=1 yields one mean per UCE, yet the loop below indexes
    # `running` by column position up to len(stack[0]); this raises IndexError
    # whenever `running` is shorter than that.  Confirm whether axis=0 and
    # range(len(running)) were intended.
    data = numpy.mean(stack, axis=1)
    running = numpy.convolve(data, weightings)[win - 1:-(win - 1)]
    for base in range(len(stack[0])):
        pos = base - overall_middle
        args.output.write("{},{},,running_hetero\n".format(pos, running[base]))

Example #3

0

Show file

File: get_dbsnp_freq_stats.py Project: crinfante/2011-fairclothetal-systbiol-uce

def main():
    """Compute position-wise dbSNP frequency statistics around UCE middles.

    Input rows come from the file named by ``args.dbsnp`` as
    ``uce, chromo, start, end, snp, snps, snpe`` (comma separated); rows
    starting with ``UCE`` are ignored.  Each UCE gets a zero-filled array
    sized to the longest UCE span (+1) into which the ``freq_freq`` value
    of every qualifying SNP is placed at its offset from the UCE middle.
    ``args.output`` receives the header ``pos,avg,ci,datatype`` followed by
    the ``running``, ``mean_hetero`` and ``running_hetero`` series.  If
    ``args.output2`` is set, each eligible SNP is also written there as a
    CSV row.
    """
    args = get_args()
    # An empty set instead of None keeps ``uce not in dupes`` from raising
    # TypeError when the script runs without a duplicates file.
    if args.dupefile:
        dupes = get_dupes(args.dupefile)
    else:
        dupes = set()
    used = set()
    # Load the data rows in a single pass; the context manager closes the
    # handle that the original left open.
    # NOTE: the 'U' mode flag was removed in Python 3.11; it is retained to
    # keep the original newline handling on older interpreters.
    with open(args.dbsnp, 'rU') as handle:
        records = [
            row.strip('\n').split(',')
            for row in handle if not row.startswith('UCE')
        ]
    # longest UCE span (end - start)
    mx = max([int(rec[3]) - int(rec[2]) for rec in records])
    # get the SNP metadata
    all_snps = get_xml_data(args.xml)
    # find the middle
    overall_middle = int(round(mx/2, 0))
    # dict holding one frequency array per UCE
    d = {}
    if args.output2:
        args.output2.write('UCE,chromo,uce-start,uce-end,snp-name,snp-start,snp-end,1000gvalidated,freq\n')
    # iterate over intersections
    for uce, chromo, start, end, snp, snps, snpe in records:
        start, end, snps, snpe = map(int, [start, end, snps, snpe])
        # get middle of this UCE
        middle = int(round((start + end)/2, 0))
        if snp in used:
            continue
        single_base = not snpe - snps > 1
        if single_base and uce not in dupes:
            meta = all_snps[snp.strip('rs')]
            # only SNPs flagged 'true' for 1000G with a frequency are stored
            if meta.val_1000G == 'true' and meta.freq_freq is not None:
                if uce not in d:
                    d[uce] = numpy.zeros(mx + 1)
                rel_snp_pos = snps - middle
                d[uce][overall_middle + rel_snp_pos] = meta.freq_freq
            if args.output2:
                args.output2.write("{},{},{},{},{},{},{},{},{}\n".format(
                    uce, chromo, start, end, snp, snps,
                    snpe, meta.val_1000G,
                    meta.freq_freq))
        # mark the SNP as seen whether or not it was eligible
        used.add(snp)
    stack = numpy.array([d[uce] for uce in d])
    # compute the running average
    win = 25
    weightings = numpy.repeat(1.0, win) / win
    # number of UCEs with a positive value at each position
    data = sum(stack > 0)
    # slicing keeps only the windows that fully overlap the data
    running = numpy.convolve(data, weightings)[win-1:-(win-1)]
    args.output.write("pos,avg,ci,datatype\n")
    for base, smoothed in enumerate(running):
        pos = base - overall_middle
        args.output.write("{},{},,running\n".format(pos, smoothed))
    # also output the average heterozygosity of 1000 Genome validated, hetero SNPs.
    for base in range(len(stack[0])):
        pos = base - overall_middle
        col = stack[:, base]
        # restrict to UCEs that actually have a value here
        present = col[numpy.where(col != 0)[0]]
        avg = numpy.mean(present)
        ci = 1.96 * (numpy.std(present, ddof=1)/numpy.sqrt(len(present)))
        args.output.write("{},{},{},mean_hetero\n".format(pos, avg, ci))
    # NOTE(review): the mean is taken along axis 1 (one value per UCE) while
    # the loop below walks column positions up to len(stack[0]); `running`
    # can be shorter than that and raise IndexError.  Confirm whether axis=0
    # and range(len(running)) were intended.
    data = numpy.mean(stack, axis=1)
    running = numpy.convolve(data, weightings)[win-1:-(win-1)]
    for base in range(len(stack[0])):
        pos = base - overall_middle
        args.output.write("{},{},,running_hetero\n".format(pos, running[base]))