Esempio n. 1
0
def main():
    global prev_lines
    #parse ARGs
    parser = argparse.ArgumentParser(description='Filter SNP positions by call quality and min. coverage. Awaits filenames for M, P .vcf files, and M, P .sam files.')
    parser.add_argument('filenames', type=str, nargs='+', help='paths to .vcf files with M, P SNPs and to corresponding .sam files')
    args = parser.parse_args()
    
    if len(args.filenames) != 4: exit("Unexpected number of arguments passed! Expecting 4 filenames.")
    
    #treat these as CONSTANTS!
    M = 0; P = 1; MR = 2; PR = 3;
    ALL_VCF = [M, P]
    ALL = [M, P, MR, PR]
    
    '''
    Get union of the M and P SNP positions
    '''
    print "  Stage 1"
    #list of input files
    in_files = [open(args.filenames[i], "r" ) for i in ALL]
    
    #dictionary to store the union of SNP positions
    data = dict()
    loci = dict()
    
    #read the first SNP from M, P vcf files
    snps = [[] for i in ALL_VCF]
    for i in ALL_VCF:
        #skip the header
        line = in_files[i].readline()
        while len(line) > 0 and line[0] == '#': line = in_files[i].readline()
        #split
        snps[i] = line.split('\t')
    
    #get list of all positions in UNION of M and P SNP positions
    while len(snps[M])>2 or len(snps[P])>2: #while there is a SNP positions in M or P
        #get the position of SNP that occure first
        for i in ALL_VCF:
            if snps[i][0] == '': #if an input files is already at EOF
                snps[i][0] = 'chrZZ'
                snps[i].append(1e15)
            else: #convert to int
                snps[i][1] = int(snps[i][1])
        #chromosome
        min_chr = min(snps[M][0], snps[P][0])
        #position
        min_pos = 1e15
        for i in ALL_VCF:
            if min_chr == snps[i][0] and snps[i][1] < min_pos:
                min_pos = snps[i][1]
        
        #get alleles
        alleles = [['.', '.'] for x in [M, P]]
        for i in [M, P]:
            #if there is a SNP in the data at this position, use it
            if min_chr == snps[i][0] and min_pos == snps[i][1]:
                alleles[i] = [snps[i][3][0], snps[i][4][0]]

        #if there is no info on some allele, impute the reference allele
        ref_allele = alleles[M][0]
        if ref_allele == '.': ref_allele = alleles[P][0]
        for i in [M, P]:
            for a in [0, 1]:
                if alleles[i][a] == '.': alleles[i][a] = ref_allele

        #check for homozygous alternative sites
        for i in [M, P]:
            #if there is a SNP in the data at this position
            if min_chr == snps[i][0] and min_pos == snps[i][1]:
                info = snps[i]
                if len(info) <= 2: continue
                #parse out genotype config info
                gt = info[9].split(':')[0]
                #if homozygous alternative
                if gt[0] == '1':
                    alleles[i][0] = snps[i][4][0]
                if gt[2] == '1':
                    alleles[i][1] = snps[i][4][0]
                if gt[0] == '0':
                    alleles[i][0] = snps[i][3][0]
                if gt[2] == '0':
                    alleles[i][1] = snps[i][3][0]
        
        #take note that for this position we need to get allele counts in plasma samaples
        loci[min_pos] = alleles
        sp.add_pos(min_pos, data)
        
        #read input: next SNP
        for i in ALL_VCF:
            if min_chr >= snps[i][0] and min_pos >= snps[i][1]:
                snps[i] = in_files[i].readline().split('\t')
        #END WHILE
        
        
    '''
    Get coverage information of the SNP positions from corresponding .sam files
    '''
    print "  Stage 2"
    #fetch allele support for the UNION positions in maternal and paternal reads
    #set up datastructures for counting allele support in diffrenct SAM files
    posInfo = [dict() for i in ALL]
    for R in [MR, PR]:
        posInfo[R] = copy.deepcopy(data)
    
    #fetch the reads in plasma SAM file and get counts for the positions originally specified in 'data'   
    for R in [MR, PR]:
        while True:
            line = in_files[R].readline()
            if not line: break
            if len(line) > 0 and line[0] == '@': continue
            sp.pile_up(sp.mapping_parser(line), posInfo[R])
                
    
    '''
    Filter the SNP positions according to call quality and coverage
    '''
    print "  Stage 3"
    #reopen VCF files
    for f in in_files: f.close()
    in_files = [open(args.filenames[i], "r" ) for i in ALL_VCF]
    #list of output files
    out_files = [open(args.filenames[i][:-3]+"ftr.vcf", "w") for i in ALL_VCF]
    
    #read the first SNP from M, P vcf files
    snps = [[] for i in ALL_VCF]
    for i in ALL_VCF:
        snps[i] = getlineFromFile(in_files, out_files, i)
        
    #positions ignored from the union of M and P
    ignored_pos = 0
    
    while len(snps[M])>2 or len(snps[P])>2: #while there is a SNP positions in M or P
        #get the position of SNP that occure first
        for i in ALL_VCF:
            if snps[i][0] == '': #if an input files is already at EOF
                snps[i][0] = 'chrZZ'
                snps[i].append(1e15)
            else: #convert to int
                snps[i][1] = int(snps[i][1])
        #chromosome
        min_chr = min(snps[M][0], snps[P][0])
        #position
        min_pos = 1e15
        for i in ALL_VCF:
            if min_chr == snps[i][0] and snps[i][1] < min_pos:
                min_pos = snps[i][1]
        
        #get genotype call quality
        callQ = [0. for i in ALL_VCF]
        for i in ALL_VCF:
            #if there is a SNP in the data at this position
            if min_chr == snps[i][0] and min_pos == snps[i][1]:
                info = snps[i]
                if len(info) <= 2: continue
                #parse out quality info
                callQ[i] = float(info[5])
        qualityOK = bool(callQ[M] >= 75 or callQ[P] >= 75)
        
        #get coverage info
        alleles = loci[min_pos]
        coverage = [0 for i in [M, P]]
        count_sum = [0 for i in [M, P]]
        for i in [M, P]:
            a1 = alleles[i][0]
            a2 = alleles[i][1]
            count_a1 = 0
            count_a2 = 0
            try: count_a1 = posInfo[i+2][min_pos][a1]
            except: print i, min_pos, a1, posInfo[i+2][min_pos], alleles[i]
            try: count_a2 = posInfo[i+2][min_pos][a2]
            except: print i, min_pos, a2, posInfo[i+2][min_pos], alleles[i]
            
            count_sum[i] = sum(posInfo[i+2][min_pos].values())
            coverage[i] = count_a1 + count_a2 #posInfo[i+2][min_pos][a1] + posInfo[i+2][min_pos][a2]
            """ #using mpileup to get the coverage info
            cmd = 'samtools mpileup -r %(chr)s:%(pos)d-%(pos)d __%(gnm)s.part.bam' % {'chr':min_chr, 'pos':min_pos, 'gnm':'MP'[i]}
            process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            out, err = process.communicate()
            if process.returncode == 0:
                fields = out.split('\t')
                if len(fields) < 4:
                    #there is no coverage in the BAM file for this position
                    coverage[i] = 0
                    #print "!!! :", out, "|", err, "|", cmd
                else:
                    coverage[i] = int(fields[3])
                #print out, '>>coverage>>', coverage[i]
            else:
                print err
            """
        coverageOK = bool(coverage[M] >= 15 and coverage[P] >= 15)
        contaminationOK = False
        try: contaminationOK = bool(float(coverage[M])/count_sum[M] >= 0.9 and float(coverage[P])/count_sum[P] >= 0.9)
        except: contaminationOK = False
            
        if coverageOK and not contaminationOK:
            print min_pos, "- contamination M:", coverage[M], posInfo[M+2][min_pos], alleles[M], "P:", coverage[P], posInfo[P+2][min_pos], alleles[P]
        
        #MOK = bool(callQ[M] >= 75 or coverage[M] >= 15)
        #POK = bool(callQ[P] >= 75 or coverage[P] >= 15)
        #if not (MOK and POK): #ignore positions that are not good enough
        if not (qualityOK and coverageOK and contaminationOK): #ignore positions that are not good enough
            ignored_pos += 1
            for i in ALL_VCF:
                #if there is a SNP in the data at this position, skip it
                if min_chr == snps[i][0] and min_pos == snps[i][1]:
                    #print min_pos, callQ[M], callQ[P], coverage[M], coverage[P],  prev_lines[i],
                    prev_lines[i] = ''      
            
        
        #read input: next SNP
        for i in ALL_VCF:
            if min_chr >= snps[i][0] and min_pos >= snps[i][1]:
                snps[i] = getlineFromFile(in_files, out_files, i)
        
        #END WHILE
    
    print "Low quality positions ignored in the region:", ignored_pos  
Esempio n. 2
0
def main():
    global prev_lines
    #parse ARGs
    parser = argparse.ArgumentParser(
        description=
        'Filter SNP positions by call quality and min. coverage. Awaits filenames for M, P .vcf files, and M, P .sam files.'
    )
    parser.add_argument(
        'filenames',
        type=str,
        nargs='+',
        help=
        'paths to .vcf files with M, P SNPs and to corresponding .sam files')
    args = parser.parse_args()

    if len(args.filenames) != 4:
        exit("Unexpected number of arguments passed! Expecting 4 filenames.")

    #treat these as CONSTANTS!
    M = 0
    P = 1
    MR = 2
    PR = 3
    ALL_VCF = [M, P]
    ALL = [M, P, MR, PR]
    '''
    Get union of the M and P SNP positions
    '''
    print "  Stage 1"
    #list of input files
    in_files = [open(args.filenames[i], "r") for i in ALL]

    #dictionary to store the union of SNP positions
    data = dict()
    loci = dict()

    #read the first SNP from M, P vcf files
    snps = [[] for i in ALL_VCF]
    for i in ALL_VCF:
        #skip the header
        line = in_files[i].readline()
        while len(line) > 0 and line[0] == '#':
            line = in_files[i].readline()
        #split
        snps[i] = line.split('\t')

    #get list of all positions in UNION of M and P SNP positions
    while len(snps[M]) > 2 or len(
            snps[P]) > 2:  #while there is a SNP positions in M or P
        #get the position of SNP that occure first
        for i in ALL_VCF:
            if snps[i][0] == '':  #if an input files is already at EOF
                snps[i][0] = 'chrZZ'
                snps[i].append(1e15)
            else:  #convert to int
                snps[i][1] = int(snps[i][1])
        #chromosome
        min_chr = min(snps[M][0], snps[P][0])
        #position
        min_pos = 1e15
        for i in ALL_VCF:
            if min_chr == snps[i][0] and snps[i][1] < min_pos:
                min_pos = snps[i][1]

        #get alleles
        alleles = [['.', '.'] for x in [M, P]]
        for i in [M, P]:
            #if there is a SNP in the data at this position, use it
            if min_chr == snps[i][0] and min_pos == snps[i][1]:
                alleles[i] = [snps[i][3][0], snps[i][4][0]]

        #if there is no info on some allele, impute the reference allele
        ref_allele = alleles[M][0]
        if ref_allele == '.': ref_allele = alleles[P][0]
        for i in [M, P]:
            for a in [0, 1]:
                if alleles[i][a] == '.': alleles[i][a] = ref_allele

        #check for homozygous alternative sites
        for i in [M, P]:
            #if there is a SNP in the data at this position
            if min_chr == snps[i][0] and min_pos == snps[i][1]:
                info = snps[i]
                if len(info) <= 2: continue
                #parse out genotype config info
                gt = info[9].split(':')[0]
                #if homozygous alternative
                if gt[0] == '1':
                    alleles[i][0] = snps[i][4][0]
                if gt[2] == '1':
                    alleles[i][1] = snps[i][4][0]
                if gt[0] == '0':
                    alleles[i][0] = snps[i][3][0]
                if gt[2] == '0':
                    alleles[i][1] = snps[i][3][0]

        #take note that for this position we need to get allele counts in plasma samaples
        loci[min_pos] = alleles
        sp.add_pos(min_pos, data)

        #read input: next SNP
        for i in ALL_VCF:
            if min_chr >= snps[i][0] and min_pos >= snps[i][1]:
                snps[i] = in_files[i].readline().split('\t')
        #END WHILE
    '''
    Get coverage information of the SNP positions from corresponding .sam files
    '''
    print "  Stage 2"
    #fetch allele support for the UNION positions in maternal and paternal reads
    #set up datastructures for counting allele support in diffrenct SAM files
    posInfo = [dict() for i in ALL]
    for R in [MR, PR]:
        posInfo[R] = copy.deepcopy(data)

    #fetch the reads in plasma SAM file and get counts for the positions originally specified in 'data'
    for R in [MR, PR]:
        while True:
            line = in_files[R].readline()
            if not line: break
            if len(line) > 0 and line[0] == '@': continue
            sp.pile_up(sp.mapping_parser(line), posInfo[R])
    '''
    Filter the SNP positions according to call quality and coverage
    '''
    print "  Stage 3"
    #reopen VCF files
    for f in in_files:
        f.close()
    in_files = [open(args.filenames[i], "r") for i in ALL_VCF]
    #list of output files
    out_files = [
        open(args.filenames[i][:-3] + "ftr.vcf", "w") for i in ALL_VCF
    ]

    #read the first SNP from M, P vcf files
    snps = [[] for i in ALL_VCF]
    for i in ALL_VCF:
        snps[i] = getlineFromFile(in_files, out_files, i)

    #positions ignored from the union of M and P
    ignored_pos = 0

    while len(snps[M]) > 2 or len(
            snps[P]) > 2:  #while there is a SNP positions in M or P
        #get the position of SNP that occure first
        for i in ALL_VCF:
            if snps[i][0] == '':  #if an input files is already at EOF
                snps[i][0] = 'chrZZ'
                snps[i].append(1e15)
            else:  #convert to int
                snps[i][1] = int(snps[i][1])
        #chromosome
        min_chr = min(snps[M][0], snps[P][0])
        #position
        min_pos = 1e15
        for i in ALL_VCF:
            if min_chr == snps[i][0] and snps[i][1] < min_pos:
                min_pos = snps[i][1]

        #get genotype call quality
        callQ = [0. for i in ALL_VCF]
        for i in ALL_VCF:
            #if there is a SNP in the data at this position
            if min_chr == snps[i][0] and min_pos == snps[i][1]:
                info = snps[i]
                if len(info) <= 2: continue
                #parse out quality info
                callQ[i] = float(info[5])
        qualityOK = bool(callQ[M] >= 75 or callQ[P] >= 75)

        #get coverage info
        alleles = loci[min_pos]
        coverage = [0 for i in [M, P]]
        count_sum = [0 for i in [M, P]]
        for i in [M, P]:
            a1 = alleles[i][0]
            a2 = alleles[i][1]
            count_a1 = 0
            count_a2 = 0
            try:
                count_a1 = posInfo[i + 2][min_pos][a1]
            except:
                print i, min_pos, a1, posInfo[i + 2][min_pos], alleles[i]
            try:
                count_a2 = posInfo[i + 2][min_pos][a2]
            except:
                print i, min_pos, a2, posInfo[i + 2][min_pos], alleles[i]

            count_sum[i] = sum(posInfo[i + 2][min_pos].values())
            coverage[
                i] = count_a1 + count_a2  #posInfo[i+2][min_pos][a1] + posInfo[i+2][min_pos][a2]
            """ #using mpileup to get the coverage info
            cmd = 'samtools mpileup -r %(chr)s:%(pos)d-%(pos)d __%(gnm)s.part.bam' % {'chr':min_chr, 'pos':min_pos, 'gnm':'MP'[i]}
            process = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            out, err = process.communicate()
            if process.returncode == 0:
                fields = out.split('\t')
                if len(fields) < 4:
                    #there is no coverage in the BAM file for this position
                    coverage[i] = 0
                    #print "!!! :", out, "|", err, "|", cmd
                else:
                    coverage[i] = int(fields[3])
                #print out, '>>coverage>>', coverage[i]
            else:
                print err
            """
        coverageOK = bool(coverage[M] >= 15 and coverage[P] >= 15)
        contaminationOK = False
        try:
            contaminationOK = bool(
                float(coverage[M]) / count_sum[M] >= 0.9
                and float(coverage[P]) / count_sum[P] >= 0.9)
        except:
            contaminationOK = False

        if coverageOK and not contaminationOK:
            print min_pos, "- contamination M:", coverage[M], posInfo[
                M + 2][min_pos], alleles[M], "P:", coverage[P], posInfo[
                    P + 2][min_pos], alleles[P]

        #MOK = bool(callQ[M] >= 75 or coverage[M] >= 15)
        #POK = bool(callQ[P] >= 75 or coverage[P] >= 15)
        #if not (MOK and POK): #ignore positions that are not good enough
        if not (qualityOK and coverageOK and
                contaminationOK):  #ignore positions that are not good enough
            ignored_pos += 1
            for i in ALL_VCF:
                #if there is a SNP in the data at this position, skip it
                if min_chr == snps[i][0] and min_pos == snps[i][1]:
                    #print min_pos, callQ[M], callQ[P], coverage[M], coverage[P],  prev_lines[i],
                    prev_lines[i] = ''

        #read input: next SNP
        for i in ALL_VCF:
            if min_chr >= snps[i][0] and min_pos >= snps[i][1]:
                snps[i] = getlineFromFile(in_files, out_files, i)

        #END WHILE

    print "Low quality positions ignored in the region:", ignored_pos
Esempio n. 3
0
def main():
    #parse ARGs
    parser = argparse.ArgumentParser(description='Prepare SNP support data for FCNV. Read filenames: for joined M&P phased .vcf file; plasma, M, and P .sam files; and for centromeres list.')
    parser.add_argument('filenames', type=str, nargs='+', help='paths to 1) .vcf file with phased M & P SNPs; 2) reads in SAM format for plasma, M, and P samples; 3) centromeres list file.')
    args = parser.parse_args()
    
    if len(args.filenames) != 5: exit("Unexpected number of arguments passed! Expecting 5 filenames.")
    
    #treat these as CONSTANTS!
    MP = 0; PLR = 1; MR = 2; PR = 3; CT = 4; #in_files
    ALL = [MP, PLR, MR, PR, CT]
    M = 0; P = 1; #maternal, paternal
    ALDOC = 0; GT = 1; #out_files: allele DOC and ground truth
    
    #list of input files
    in_files = [open(args.filenames[i], "r" ) for i in ALL]
    
    #read centromeres positions
    centromeres = dict()
    for line in in_files[CT].readlines():
        line = line.rstrip('\n').split('\t')
        if line[0] not in centromeres.keys(): centromeres[line[0]] = []
        centromeres[line[0]] += [(int(line[1]), int(line[2]))]
        
    
    #allele counts in plasma samples for particular positions
    pos_data = dict()
    loci = dict()
    processed_chr = ''
    skipped_in_centromere = 0
    
    print "  Getting union of SNP positions and corresponding list of alleles " + datetime.now().strftime('%m-%d-%H-%M')
    #read SNPs from M, P, F vcf files
    snps = [[] for i in [M, P]]
    
    #get genotypes for all positions in UNION of M and P SNP positions
    while True:
        line = in_files[MP].readline()
        #skip if part of the header
        while len(line) > 0 and line[0] == '#': line = in_files[MP].readline()
        if not line: break
        
        fields = line.rstrip('\n').split('\t')
        if processed_chr == '': processed_chr = 'chr'+fields[0]
        if processed_chr != 'chr'+fields[0]:
            print "WARNING: multiple chromosomes in the input", processed_chr, "|", 'chr'+fields[0]
        pos = int(fields[1])
        ref = fields[3]
        alt = fields[4]
        
        #get M and P haplotypes
        snps[M] = fields[9].split(':')[0].split('|')
        for x in [0, 1]:
            if snps[M][x] == '0': snps[M][x] = ref
            else: snps[M][x] = alt
            
        snps[P] = fields[10].split(':')[0].split('|')
        for x in [0, 1]:
            if snps[P][x] == '0': snps[P][x] = ref
            else: snps[P][x] = alt
        
        #if in centromere region, skip
        centromere_regions = centromeres[processed_chr]
        if is_within_intervals(pos, centromere_regions):
            skipped_in_centromere += 1
            continue
        
        #take note that for this position we need to get allele counts in plasma samaples
        alleles = (snps[M], snps[P])
        loci[pos] = alleles
        sp.add_pos(pos, pos_data)
        
        #END WHILE
    
    print "  Piling up the reads " + datetime.now().strftime('%m-%d-%H-%M')
    #set up datastructures for counting allele support in diffrenct SAM files
    posInfo = [dict() for i in ALL]
    for R in [PLR, MR, PR]:
        posInfo[R] = copy.deepcopy(pos_data)
        
    #fetch the reads in plasma SAM file and get counts for the positions originally specified in 'pos_data'
    for R in [PLR, MR, PR]:
        while True:
            line = in_files[R].readline()
            if not line: break
            if len(line) > 0 and line[0] == '@': continue
            sp.pile_up(sp.mapping_parser(line), posInfo[R])    
    
    
    print "  Writing output " + datetime.now().strftime('%m-%d-%H-%M')
    #list of output files
    out_files = [None for i in [ALDOC, GT]]
    out_files[ALDOC] = open(processed_chr + "_alleles_docOWN.txt", "w")
    out_files[GT] = open(processed_chr + "_targetOWN.txt", "w")
    print >>out_files[ALDOC], '#POS\tA\tC\tG\tT\tM_hapA\tM_hapB\tDP_hapA\tDP_hapB\tP_hapA\tP_hapB\tDP_hapA\tDP_hapB'
    
    skipped_low_doc = 0
    #print info / compute stats for each SNP position
    for pos in sorted(pos_data.keys()):
        alleles = loci[pos]
        
        #print the plasma allele counts 
        nuc_counts = posInfo[PLR][pos]
        tmp = []
        for nuc in 'ACGT': #to make sure they are in the right order
            try:
                tmp.append(str(nuc_counts[nuc]))
            except KeyError:
                tmp.append('0')
                
        #if the plasma coverage is too low, skip this position        
        if sum(map(int, tmp)) < 50: 
            print pos, "- low overall coverage", sum(map(int, tmp))
            skipped_low_doc += 1
            continue
        
        print >>out_files[ALDOC], str(pos) + '\t' + '\t'.join(tmp),
        
        #output M, P alleles at this SNP locus
        for i, r in [(M, MR), (P, PR)]:
            a1 = alleles[i][0]
            a2 = alleles[i][1]
            count_a1 = 0
            count_a2 = 0
            try: count_a1 = posInfo[r][pos][a1]
            except: print i, pos, a1, posInfo[r][pos], alleles[i]
            try: count_a2 = posInfo[r][pos][a2]
            except: print i, pos, a2, posInfo[r][pos], alleles[i]
            
            if a1 == a2:
                count_a1 /= 2.
                count_a2 /= 2.
            
            print >>out_files[ALDOC], '\t{0}\t{1}\t{2}\t{3}'.format(a1, a2, count_a1, count_a2),
        print >>out_files[ALDOC], '\n',
        
        if 10181440 <= pos and pos <= 10281440:
            print >>out_files[GT], '{0}\t{1}\t{2}\t{3}'.format(pos, 'N', 'N', 6)
        else:
            print >>out_files[GT], '{0}\t{1}\t{2}\t{3}'.format(pos, 'N', 'N', 3)
     
    print "Low overall coverage positions ignored:", skipped_low_doc
    print "Ignored positions in centromere regions:", skipped_in_centromere   
    print "DONE " + datetime.now().strftime('%m-%d-%H-%M')
def main():
    #parse ARGs
    parser = argparse.ArgumentParser(
        description=
        'Analyze mixture and allele ratios for SNP positions in union(M, P). Read filenames for M, P .vcf files. Further M, F .sam files with reads that together form plasma reads.'
    )
    parser.add_argument(
        'filenames',
        type=str,
        nargs='+',
        help=
        'paths to .vcf files with M, P SNPs and *sorted* M, F reads in SAM format'
    )
    args = parser.parse_args()

    if len(args.filenames) != 4:
        die("Unexpected number of arguments passed! Expecting 4 filenames.")

    #treat these as CONSTANTS!
    M = 0
    P = 1
    MR = 2
    FR = 3
    ALL = [M, P, MR, FR]

    #list of input files
    in_files = [open(args.filenames[i], "r") for i in ALL]

    #list of output files
    #out_files = [None for i in ALL]
    #out_files[M] = open("M_alleles.txt", "w")

    #union of maternal and paternal SNP positions
    data = dict()
    loci = dict()

    #read SNPs from M, P, F vcf files
    snps = [[] for i in [M, P]]
    for i in [M, P]:
        #skip the header
        line = in_files[i].readline()
        while len(line) > 0 and line[0] == '#':
            line = in_files[i].readline()
        #split
        snps[i] = line.split('\t')

    #union the maternal and paternals SNPs positions
    while len(snps[M]) > 2 or len(
            snps[P]) > 2:  #while there is a SNP positions in M or P
        #get the position of SNP that occure first
        for i in [M, P]:
            if snps[i][0] == '':  #if an input files is already at EOF
                snps[i][0] = 'chrZZ'
                snps[i].append(1e15)
            else:  #convert to int
                snps[i][1] = int(snps[i][1])
        #chromosome
        min_chr = min(snps[M][0], snps[P][0])
        #position
        min_pos = 1e15
        for i in [M, P]:
            if min_chr == snps[i][0] and snps[i][1] < min_pos:
                min_pos = snps[i][1]

        #get alleles
        alleles = [['.', '.'] for x in [M, P]]
        for i in [M, P]:
            #if there is a SNP in the data at this position, use it
            if min_chr == snps[i][0] and min_pos == snps[i][1]:
                alleles[i] = [snps[i][3], snps[i][4]]

        #if there is no info on some allele, impute the reference allele
        ref_allele = alleles[M][0]
        if ref_allele == '.': ref_allele = alleles[P][0]
        for i in [M, P]:
            for a in [0, 1]:
                if alleles[i][a] == '.': alleles[i][a] = ref_allele

        #organize the haplotypes in M, P (phased VCF files)
        for i in [M, P]:
            #if there is a SNP in the data at this position
            if min_chr == snps[i][0] and min_pos == snps[i][1]:
                info = snps[i]
                if len(info) <= 2: continue
                #parse out haplotype config info
                ht = map(int, info[9].split('/'))
                #get the configuration
                phased_alleles = [alleles[i][ht[0]], alleles[i][ht[1]]]
                alleles[i] = phased_alleles

        #take note that for this position we need to get allele counts in plasma samaples
        sp.add_pos(min_pos, data)
        #loci[min_pos] = alleles
        #print min_pos, ": M:", alleles[M], " P:", alleles[P], " F:", alleles[F]

        #read input: next SNP
        for i in [M, P]:
            if min_chr >= snps[i][0] and min_pos >= snps[i][1]:
                snps[i] = in_files[i].readline().split('\t')

    #fetch the maternal and fetal portion of plasma reads from SAM files,
    # and get counts for the positions specified in 'data'
    posInfo = [dict() for i in ALL]
    posInfo[MR] = copy.deepcopy(data)
    posInfo[FR] = copy.deepcopy(data)

    for R in [MR, FR]:
        while True:
            line = in_files[R].readline()
            if not line: break
            if len(line) > 0 and line[0] == '@': continue
            sp.pile_up(sp.mapping_parser(line), posInfo[R])

    #compute and print the stats
    for pos in sorted(data.keys()):
        MR_nuc_counts = posInfo[MR][pos]
        FR_nuc_counts = posInfo[FR][pos]
        try:
            local_mix_ratio = float(sum(FR_nuc_counts.values())) / (
                sum(FR_nuc_counts.values()) + 9 * sum(MR_nuc_counts.values()))
        except ZeroDivisionError:
            local_mix_ratio = 0

        print pos, local_mix_ratio,
        '''
        for i, NC in enumerate([MR_nuc_counts, FR_nuc_counts]):
            tmp = []
            for nuc in 'ACGT': #to make sure they are in the right order
                try:
                    tmp.append(NC[nuc])
                except KeyError:
                    tmp.append(0)
            try:    
                summ = float(sum(tmp))
                tmp = [tmp[i]/summ for i in range(len(tmp))]
            except ZeroDivisionError:
                pass
                #tmp = [0]
            #print sorted(tmp),
            try:
                ind1 = 'ACGT'.index(loci[pos][i][0].upper())
                ind2 = 'ACGT'.index(loci[pos][i][1].upper())
                print abs(tmp[ind1] - tmp[ind2]), 
            except IndexError:
                print "X", loci[pos][i][0].upper(), loci[pos][i][1].upper(), tmp,
                print 0, 
        '''
        print " "
def main():
    #parse ARGs
    parser = argparse.ArgumentParser(description='Analyze mixture and allele ratios for SNP positions in union(M, P). Read filenames for M, P .vcf files. Further M, F .sam files with reads that together form plasma reads.')
    parser.add_argument('filenames', type=str, nargs='+', help='paths to .vcf files with M, P SNPs and *sorted* M, F reads in SAM format')
    args = parser.parse_args()
    
    if len(args.filenames) != 4: die("Unexpected number of arguments passed! Expecting 4 filenames.")
    
    #treat these as CONSTANTS!
    M = 0; P = 1; MR = 2; FR = 3;
    ALL = [M, P, MR, FR]
    
    #list of input files
    in_files = [open(args.filenames[i], "r" ) for i in ALL]
    
    #list of output files
    #out_files = [None for i in ALL]
    #out_files[M] = open("M_alleles.txt", "w")

    
    #union of maternal and paternal SNP positions
    data = dict()
    loci = dict()
    
    #read SNPs from M, P, F vcf files
    snps = [[] for i in [M, P]]
    for i in [M, P]:
        #skip the header
        line = in_files[i].readline()
        while len(line) > 0 and line[0] == '#': line = in_files[i].readline()
        #split
        snps[i] = line.split('\t')
    
    #union the maternal and paternals SNPs positions
    while len(snps[M])>2 or len(snps[P])>2: #while there is a SNP positions in M or P
        #get the position of SNP that occure first
        for i in [M, P]:
            if snps[i][0] == '': #if an input files is already at EOF
                snps[i][0] = 'chrZZ'
                snps[i].append(1e15)
            else: #convert to int
                snps[i][1] = int(snps[i][1])
        #chromosome
        min_chr = min(snps[M][0], snps[P][0])
        #position
        min_pos = 1e15
        for i in [M, P]:
            if min_chr == snps[i][0] and snps[i][1] < min_pos:
                min_pos = snps[i][1]
        
        #get alleles
        alleles = [['.', '.'] for x in [M, P]]
        for i in [M, P]:
            #if there is a SNP in the data at this position, use it
            if min_chr == snps[i][0] and min_pos == snps[i][1]:
                alleles[i] = [snps[i][3], snps[i][4]]

            
        #if there is no info on some allele, impute the reference allele
        ref_allele = alleles[M][0]
        if ref_allele == '.': ref_allele = alleles[P][0]
        for i in [M, P]:
            for a in [0, 1]:
                if alleles[i][a] == '.': alleles[i][a] = ref_allele

        #organize the haplotypes in M, P (phased VCF files)
        for i in [M, P]:
            #if there is a SNP in the data at this position
            if min_chr == snps[i][0] and min_pos == snps[i][1]:
                info = snps[i]
                if len(info) <= 2: continue
                #parse out haplotype config info
                ht = map(int, info[9].split('/'))
                #get the configuration
                phased_alleles = [alleles[i][ht[0]], alleles[i][ht[1]]]
                alleles[i] = phased_alleles
                  
        #take note that for this position we need to get allele counts in plasma samaples
        sp.add_pos(min_pos, data)
        #loci[min_pos] = alleles
        #print min_pos, ": M:", alleles[M], " P:", alleles[P], " F:", alleles[F]
            
        #read input: next SNP
        for i in [M, P]:
            if min_chr >= snps[i][0] and min_pos >= snps[i][1]:
                snps[i] = in_files[i].readline().split('\t')
    
    #fetch the maternal and fetal portion of plasma reads from SAM files,
    # and get counts for the positions specified in 'data'
    posInfo = [dict() for i in ALL]
    posInfo[MR] = copy.deepcopy(data)
    posInfo[FR] = copy.deepcopy(data)
    
    for R in [MR, FR]:
        while True:
            line = in_files[R].readline()
            if not line: break
            if len(line) > 0 and line[0] == '@': continue
            sp.pile_up(sp.mapping_parser(line), posInfo[R])

    #compute and print the stats
    for pos in sorted(data.keys()):
        MR_nuc_counts = posInfo[MR][pos]
        FR_nuc_counts = posInfo[FR][pos]
        try:
            local_mix_ratio = float(sum(FR_nuc_counts.values())) / (sum(FR_nuc_counts.values()) + 9*sum(MR_nuc_counts.values()))
        except ZeroDivisionError:
            local_mix_ratio = 0
        
        print pos, local_mix_ratio,
        '''
        for i, NC in enumerate([MR_nuc_counts, FR_nuc_counts]):
            tmp = []
            for nuc in 'ACGT': #to make sure they are in the right order
                try:
                    tmp.append(NC[nuc])
                except KeyError:
                    tmp.append(0)
            try:    
                summ = float(sum(tmp))
                tmp = [tmp[i]/summ for i in range(len(tmp))]
            except ZeroDivisionError:
                pass
                #tmp = [0]
            #print sorted(tmp),
            try:
                ind1 = 'ACGT'.index(loci[pos][i][0].upper())
                ind2 = 'ACGT'.index(loci[pos][i][1].upper())
                print abs(tmp[ind1] - tmp[ind2]), 
            except IndexError:
                print "X", loci[pos][i][0].upper(), loci[pos][i][1].upper(), tmp,
                print 0, 
        '''    
        print " "
Esempio n. 6
0
def main():
    #parse ARGs
    parser = argparse.ArgumentParser(
        description=
        'Prepare SNP data for FCNV. Read filenames: for M, P, and F .vcf files; plasma, M, and P .sam files; and for centromeres list.'
    )
    parser.add_argument(
        'filenames',
        type=str,
        nargs='+',
        help=
        'paths to 1) .vcf files with M, P, F SNPs; 2) reads in SAM format for plasma, M, and P samples; 3) centromeres list file.'
    )
    args = parser.parse_args()

    if len(args.filenames) != 7:
        exit("Unexpected number of arguments passed! Expecting 7 filenames.")

    #treat these as CONSTANTS!
    M = 0
    P = 1
    F = 2
    PLASMA = 3
    MR = 4
    PR = 5
    CT = 6
    ALL = [M, P, F, PLASMA, MR, PR, CT]

    #list of input files
    in_files = [open(args.filenames[i], "r") for i in ALL]

    #list of output files
    out_files = [None for i in [M, P, F, PLASMA]]
    out_files[M] = open("M_alleles.txt", "w")
    out_files[P] = open("P_alleles.txt", "w")
    out_files[F] = open("F_alleles.txt", "w")
    out_files[PLASMA] = open("plasma_samples.txt", "w")
    date = datetime.now().strftime('%m-%d-%H-%M')
    out_pos_file = open("positions" + date + ".txt", "w")

    #read centromeres positions
    centromeres = dict()
    for line in in_files[CT].readlines():
        line = line.rstrip('\n').split('\t')
        if line[0] not in centromeres.keys(): centromeres[line[0]] = []
        centromeres[line[0]] += [(int(line[1]), int(line[2]))]

    #allele counts in plasma samples for particular positions
    data = dict()
    loci = dict()
    processed_chr = ''

    print "  Getting union of SNP positions and corresponding list of alleles"
    #read SNPs from M, P, F vcf files
    snps = [[] for i in [M, P, F]]
    for i in [M, P, F]:
        #skip the header
        line = in_files[i].readline()
        while len(line) > 0 and line[0] == '#':
            line = in_files[i].readline()
        #split
        snps[i] = line.split('\t')

    #get genotypes for all positions in UNION of M and P SNP positions
    while len(snps[M]) > 2 or len(
            snps[P]) > 2:  #while there is a SNP positions in M or P
        #get the position of SNP that occure first
        for i in [M, P, F]:
            if snps[i][0] == '':  #if an input files is already at EOF
                snps[i][0] = 'chrZZ'
                snps[i].append(1e15)
            else:  #convert to int
                snps[i][1] = int(snps[i][1])
        #chromosome
        min_chr = min(snps[M][0], snps[P][0])
        if processed_chr == '': processed_chr = min_chr
        if processed_chr != min_chr:
            print "WARNING: multiple chromosomes in the input", processed_chr, "|", min_chr
        #position
        min_pos = 1e15
        for i in [M, P]:
            if min_chr == snps[i][0] and snps[i][1] < min_pos:
                min_pos = snps[i][1]

        #get alleles
        alleles = [['.', '.'] for x in [M, P, F]]
        for i in [M, P, F]:
            #if there is a SNP in the data at this position, use it
            if min_chr == snps[i][0] and min_pos == snps[i][1]:
                alleles[i] = [snps[i][3][0], snps[i][4][0]]

        #if there is no info on some allele, impute the reference allele
        ref_allele = alleles[M][0]
        if ref_allele == '.': ref_allele = alleles[P][0]
        for i in [M, P, F]:
            for a in [0, 1]:
                if alleles[i][a] == '.': alleles[i][a] = ref_allele

        #check for homozygous alternative sites in Fetal VCF
        for i in [F]:
            #if there is a SNP in the data at this position
            if min_chr == snps[i][0] and min_pos == snps[i][1]:
                info = snps[i]
                if len(info) <= 2: continue
                #parse out genotype config info
                gt = info[9].split(':')[0]
                #if homozygous alternative
                if gt[0] == '1':
                    alleles[i][0] = snps[i][4][0]
                if gt[2] == '1':
                    alleles[i][1] = snps[i][4][0]
                if gt[0] == '0':
                    alleles[i][0] = snps[i][3][0]
                if gt[2] == '0':
                    alleles[i][1] = snps[i][3][0]

        #organize the haplotypes in M, P (phased VCF files)
        for i in [M, P]:
            #if there is a SNP in the data at this position
            if min_chr == snps[i][0] and min_pos == snps[i][1]:
                info = snps[i]
                if len(info) <= 2: continue
                #parse out haplotype config info
                ht = map(int, info[9].split('/'))
                #get the configuration
                phased_alleles = [alleles[i][ht[0]], alleles[i][ht[1]]]
                alleles[i] = phased_alleles

        #take note that for this position we need to get allele counts in plasma samaples
        loci[min_pos] = alleles
        sp.add_pos(min_pos, data)

        #read input: next SNP
        for i in [M, P, F]:
            if min_chr >= snps[i][0] and min_pos >= snps[i][1]:
                snps[i] = in_files[i].readline().split('\t')

        #END WHILE

    print "  Aligning the reads"
    #set up datastructures for counting allele support in diffrenct SAM files
    posInfo = [dict() for i in ALL]
    for R in [PLASMA, MR, PR]:
        posInfo[R] = copy.deepcopy(data)

    #fetch the reads in plasma SAM file and get counts for the positions originally specified in 'data'
    for R in [PLASMA, MR, PR]:
        while True:
            line = in_files[R].readline()
            if not line: break
            if len(line) > 0 and line[0] == '@': continue
            sp.pile_up(sp.mapping_parser(line), posInfo[R])

    print "  Writing output"
    skipped_in_centromere = 0
    skipped_low = 0
    centromere_regions = centromeres[processed_chr]
    #print info / compute stats for each SNP position
    for pos in sorted(data.keys()):
        alleles = loci[pos]
        #if alleles[M][0] != alleles[M][1]:

        #print the plasma allele counts
        nuc_counts = posInfo[PLASMA][pos]
        tmp = []
        for nuc in 'ACGT':  #to make sure they are in the right order
            try:
                tmp.append(str(nuc_counts[nuc]))
            except KeyError:
                tmp.append('0')

        #if the plasma coverage is too low, skip this position
        if sum(map(int, tmp)) < 20:
            print pos, "- low overall coverage", sum(map(int, tmp))
            skipped_low += 1
            continue
        #if in centromere region, skip
        if is_within_intervals(pos, centromere_regions):
            skipped_in_centromere += 1
            continue

        print >> out_files[PLASMA], ' '.join(tmp)

        #output M, P, F alleles at this SNP locus
        for i, r in [(M, MR), (P, PR)]:
            a1 = alleles[i][0]
            a2 = alleles[i][1]
            count_a1 = 0
            count_a2 = 0
            try:
                count_a1 = posInfo[r][pos][a1]
            except:
                print i, pos, a1, posInfo[r][pos], alleles[i]
            try:
                count_a2 = posInfo[r][pos][a2]
            except:
                print i, pos, a2, posInfo[r][pos], alleles[i]

            if a1 == a2:
                count_a1 /= 2.
                count_a2 /= 2.

            print >> out_files[i], a1, a2, count_a1, count_a2

        print >> out_files[F], alleles[F][0], alleles[F][1], 3
        print >> out_pos_file, pos, "- M:", alleles[M], " P:", alleles[
            P], " F:", alleles[F]

    print "Low overall coverage positions ignored:", skipped_low
    print "Ignored positions in centromere regions:", skipped_in_centromere
Esempio n. 7
0
def main():
    #parse ARGs
    parser = argparse.ArgumentParser(description='Prepare SNP data for FCNV. Read filenames: for M, P, and F .vcf files; plasma, M, and P .sam files; and for centromeres list.')
    parser.add_argument('filenames', type=str, nargs='+', help='paths to 1) .vcf files with M, P, F SNPs; 2) reads in SAM format for plasma, M, and P samples; 3) centromeres list file.')
    args = parser.parse_args()
    
    if len(args.filenames) != 7: exit("Unexpected number of arguments passed! Expecting 7 filenames.")
    
    #treat these as CONSTANTS!
    M = 0; P = 1; F = 2; PLASMA = 3; MR = 4; PR = 5; CT = 6;
    ALL = [M, P, F, PLASMA, MR, PR, CT]
    
    #list of input files
    in_files = [open(args.filenames[i], "r" ) for i in ALL]
    
    #list of output files
    out_files = [None for i in [M, P, F, PLASMA]]
    out_files[M] = open("M_alleles.txt", "w")
    out_files[P] = open("P_alleles.txt", "w")
    out_files[F] = open("F_alleles.txt", "w")
    out_files[PLASMA] = open("plasma_samples.txt", "w")
    date = datetime.now().strftime('%m-%d-%H-%M')
    out_pos_file = open("positions" + date + ".txt", "w")
    
    #read centromeres positions
    centromeres = dict()
    for line in in_files[CT].readlines():
        line = line.rstrip('\n').split('\t')
        if line[0] not in centromeres.keys(): centromeres[line[0]] = []
        centromeres[line[0]] += [(int(line[1]), int(line[2]))]
        
    
    #allele counts in plasma samples for particular positions
    data = dict()
    loci = dict()
    processed_chr = ''
    
    print "  Getting union of SNP positions and corresponding list of alleles"
    #read SNPs from M, P, F vcf files
    snps = [[] for i in [M, P, F]]
    for i in [M, P, F]:
        #skip the header
        line = in_files[i].readline()
        while len(line) > 0 and line[0] == '#': line = in_files[i].readline()
        #split
        snps[i] = line.split('\t')
    
    #get genotypes for all positions in UNION of M and P SNP positions
    while len(snps[M])>2 or len(snps[P])>2: #while there is a SNP positions in M or P
        #get the position of SNP that occure first
        for i in [M, P, F]:
            if snps[i][0] == '': #if an input files is already at EOF
                snps[i][0] = 'chrZZ'
                snps[i].append(1e15)
            else: #convert to int
                snps[i][1] = int(snps[i][1])
        #chromosome
        min_chr = min(snps[M][0], snps[P][0])
        if processed_chr == '': processed_chr = min_chr
        if processed_chr != min_chr: print "WARNING: multiple chromosomes in the input", processed_chr, "|", min_chr
        #position
        min_pos = 1e15
        for i in [M, P]:
            if min_chr == snps[i][0] and snps[i][1] < min_pos:
                min_pos = snps[i][1]
        
        #get alleles
        alleles = [['.', '.'] for x in [M, P, F]]
        for i in [M, P, F]:
            #if there is a SNP in the data at this position, use it
            if min_chr == snps[i][0] and min_pos == snps[i][1]:
                alleles[i] = [snps[i][3][0], snps[i][4][0]]

            
        #if there is no info on some allele, impute the reference allele
        ref_allele = alleles[M][0]
        if ref_allele == '.': ref_allele = alleles[P][0]
        for i in [M, P, F]:
            for a in [0, 1]:
                if alleles[i][a] == '.': alleles[i][a] = ref_allele

        #check for homozygous alternative sites in Fetal VCF
        for i in [F]:
            #if there is a SNP in the data at this position
            if min_chr == snps[i][0] and min_pos == snps[i][1]:
                info = snps[i]
                if len(info) <= 2: continue
                #parse out genotype config info
                gt = info[9].split(':')[0]
                #if homozygous alternative
                if gt[0] == '1':
                    alleles[i][0] = snps[i][4][0]
                if gt[2] == '1':
                    alleles[i][1] = snps[i][4][0]
                if gt[0] == '0':
                    alleles[i][0] = snps[i][3][0]
                if gt[2] == '0':
                    alleles[i][1] = snps[i][3][0] 
                    
        #organize the haplotypes in M, P (phased VCF files)
        for i in [M, P]:
            #if there is a SNP in the data at this position
            if min_chr == snps[i][0] and min_pos == snps[i][1]:
                info = snps[i]
                if len(info) <= 2: continue
                #parse out haplotype config info
                ht = map(int, info[9].split('/'))
                #get the configuration
                phased_alleles = [alleles[i][ht[0]], alleles[i][ht[1]]]
                alleles[i] = phased_alleles
        
        #take note that for this position we need to get allele counts in plasma samaples
        loci[min_pos] = alleles
        sp.add_pos(min_pos, data)
            
        #read input: next SNP
        for i in [M, P, F]:
            if min_chr >= snps[i][0] and min_pos >= snps[i][1]:
                snps[i] = in_files[i].readline().split('\t')

        #END WHILE
    
    print "  Aligning the reads"
    #set up datastructures for counting allele support in diffrenct SAM files
    posInfo = [dict() for i in ALL]
    for R in [PLASMA, MR, PR]:
        posInfo[R] = copy.deepcopy(data)
        
    #fetch the reads in plasma SAM file and get counts for the positions originally specified in 'data'
    for R in [PLASMA, MR, PR]:
        while True:
            line = in_files[R].readline()
            if not line: break
            if len(line) > 0 and line[0] == '@': continue
            sp.pile_up(sp.mapping_parser(line), posInfo[R])    
    
    print "  Writing output"
    skipped_in_centromere = 0
    skipped_low = 0
    centromere_regions = centromeres[processed_chr]
    #print info / compute stats for each SNP position
    for pos in sorted(data.keys()):
        alleles = loci[pos]
        #if alleles[M][0] != alleles[M][1]:
        
        #print the plasma allele counts 
        nuc_counts = posInfo[PLASMA][pos]
        tmp = []
        for nuc in 'ACGT': #to make sure they are in the right order
            try:
                tmp.append(str(nuc_counts[nuc]))
            except KeyError:
                tmp.append('0')
                
        #if the plasma coverage is too low, skip this position        
        if sum(map(int, tmp)) < 20: 
            print pos, "- low overall coverage", sum(map(int, tmp))
            skipped_low += 1
            continue
        #if in centromere region, skip
        if is_within_intervals(pos, centromere_regions):
            skipped_in_centromere += 1
            continue
        
        print >>out_files[PLASMA], ' '.join(tmp)
        
        #output M, P, F alleles at this SNP locus
        for i, r in [(M, MR), (P, PR)]:
            a1 = alleles[i][0]
            a2 = alleles[i][1]
            count_a1 = 0
            count_a2 = 0
            try: count_a1 = posInfo[r][pos][a1]
            except: print i, pos, a1, posInfo[r][pos], alleles[i]
            try: count_a2 = posInfo[r][pos][a2]
            except: print i, pos, a2, posInfo[r][pos], alleles[i]
            
            if a1 == a2:
                count_a1 /= 2.
                count_a2 /= 2.
            
            print >>out_files[i], a1, a2, count_a1, count_a2
            
        print >>out_files[F], alleles[F][0], alleles[F][1], 3
        print >>out_pos_file, pos, "- M:", alleles[M], " P:", alleles[P], " F:", alleles[F]
     
    print "Low overall coverage positions ignored:", skipped_low
    print "Ignored positions in centromere regions:", skipped_in_centromere