Example #1
def parse_vcf(vcffile, logfh):
    """Parse sample VCF file."""
    logfh.write("Parsing vcffile: {} -".format(vcffile))
    fh = open_file(vcffile)
    lines = fh.readlines()
    fh.close()
    while lines: # skip vcf header
        if not lines[0].startswith('##'):
            break
        lines.pop(0)
    fields = lines.pop(0).lstrip('#').rstrip().split("\t")
    variants = {}
    for line in lines:
        vals = line.rstrip().split("\t")
        d = dict(zip(fields, vals))
        if d['QUAL'][0].isdigit():
            pos = int(d['POS'])
            variants[pos] = d
    logfh.write(" {} variants\n".format(len(variants)))
    # special case for counting (TG)9-9T
#    if 87842 in variants and 87823 not in variants:
#        variants[87823] = {'CHROM':'CFTR', 'POS':'87823', 
#                           'REF':'ATG', 'ALT':'A' }
#    if 87844 in variants and 87845 not in variants:
#        variants[87845] = {'CHROM':'CFTR', 'POS':'87845', 
#                           'REF':'G', 'ALT':'T' }
    # special case for counting (TG)13-5T
#    if 87846 in variants and 87823 not in variants:
#        variants[87823] = {'CHROM':'CFTR', 'POS':'87823', 
#                           'REF':'A', 'ALT':'ATG' }
#    if 87847 in variants and variants[87847]['REF']=='T' and\
#        variants[87847]['ALT']=='TG':
#        variants[87847]['ALT'] += ',G'
    return (fields, variants)
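Every parse_vcf variant in this listing calls an open_file helper that is not shown. Below is a minimal sketch of the interface those calls assume, under the assumption that the helper only needs to handle gzip-compressed and plain-text VCFs transparently; the real helper in the project may do more.

import gzip

def open_file(path, mode='r'):
    """Open a plain or gzip-compressed file in text mode (assumed helper; sketch only)."""
    if path.endswith('.gz'):
        # gzip.open defaults to binary mode; force text mode so readlines() returns str
        return gzip.open(path, mode if mode.endswith('t') else mode + 't')
    return open(path, mode)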
Example #2
def separate_vcf(vcffile, ref, outdir, args):
    sys.stderr.write("\nReading {}\n".format(vcffile))
    fh = open_file(vcffile)
    fieldlist = [ l for l in fh.readlines() if l.startswith('#CHROM') ]
    fh.close()
    if fieldlist:
        fields = fieldlist[0].rstrip().split("\t")
        samples = fields[9:]
    else:
        sys.stderr.write("Could not find VCF header in {}".format(vcffile))
        sys.exit(1)
    sys.stderr.write("  Found {} samples.\n".format(len(samples)))
    outvcfs = []
    logfh = sys.stderr
    if args.logdir:
        logfile = os.path.basename(vcffile).replace('.vcf','') +\
                  ".separate_vcf.log"
        logfile = os.path.join(args.logdir, logfile)
        logfh = open(logfile, 'w')
    for sample in samples:
        outvcf = "{}.separated.vcf".format(sample)
        if outdir: outvcf = os.path.join(outdir, outvcf)
        run_select_variants(vcffile, outvcf, sample, ref, logfh, args)
    if args.logdir:
        logfh.close()
    return samples
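The per-sample extraction itself happens in run_select_variants, which is not part of this listing. Here is a sketch of what such a wrapper could look like, using GATK 3's SelectVariants as a stand-in for the real tool; the command line below is an assumption, not the project's actual implementation.

import subprocess

def run_select_variants(vcffile, outvcf, sample, ref, logfh, args):
    """Extract one sample from a joint VCF (assumed wrapper; sketch only)."""
    # Hypothetical command; the project may call a different tool or flags.
    cmd = ['java', '-jar', 'GenomeAnalysisTK.jar', '-T', 'SelectVariants',
           '-R', ref, '-V', vcffile, '-sn', sample, '-o', outvcf]
    logfh.write("  {}: {}\n".format(sample, " ".join(cmd)))
    subprocess.check_call(cmd, stdout=logfh, stderr=logfh)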
Example #3
def parse_vcf(vcffile):
    """Parse variants in joint vcf file."""
    sys.stderr.write("VCF file {} -".format(vcffile))
    fh = open_file(vcffile)
    lines = fh.readlines()
    fh.close()
    while lines: # skip vcf header
        if not lines[0].startswith('##'):
            break
        lines.pop(0)
    fields = lines.pop(0).lstrip('#').rstrip().split("\t")
    variants = []
    for line in lines:
        vals = line.rstrip().split("\t")
        d = dict(zip(fields, vals))
        variants.append(d)
    sys.stderr.write(" {} variants\n".format(len(variants)))
    return (fields, variants)
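In a joint VCF the columns after FORMAT are the per-sample genotype columns, so each dictionary returned above also maps every sample name to its raw colon-separated genotype string. A small sketch of how those columns might be unpacked; the sample name 'NA12878' in the usage comment is purely illustrative.

def genotype_for_sample(variant, sample):
    """Zip the FORMAT keys with one sample's colon-separated values."""
    keys = variant['FORMAT'].split(':')   # e.g. GT:AD:DP:GQ:PL
    vals = variant[sample].split(':')
    return dict(zip(keys, vals))

# Hypothetical usage with the parse_vcf above:
# fields, variants = parse_vcf('joint.vcf')
# for v in variants:
#     gt = genotype_for_sample(v, 'NA12878').get('GT', './.')
#     print(v['CHROM'], v['POS'], v['REF'], v['ALT'], gt)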
Example #4
def parse_vcf(vcffile):
    """Parse vcf file; return (header lines, field names, variants keyed by position)."""
    fh = open_file(vcffile)
    lines = fh.readlines()
    fh.close()
    header = []
    while lines: # vcf header
        if lines[0].startswith('#'):
            header.append(lines.pop(0))
        else:
            break
    fields = header[-1].lstrip('#').rstrip().split("\t")
    vcfinfo = {}
    for line in lines:
        vals = line.rstrip().split("\t")
        d = dict(zip(fields, vals))
        poskey = create_pos_key(d['CHROM'], d['POS'])
        if poskey in vcfinfo:
            sys.stderr.write("  Variant pos {} duplicated\n".format(poskey))
        vcfinfo[poskey] = d
    return (header, fields, vcfinfo)
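This version keys variants on the value returned by create_pos_key, which is not shown. A plausible sketch, assuming the key is just the chromosome joined to a zero-padded position so that string order matches genomic order; the real helper may format the key differently.

def create_pos_key(chrom, pos):
    """Build a sortable 'CHROM:POS' key (assumed helper; sketch only)."""
    return "{}:{:0>9}".format(chrom, pos)

# e.g. create_pos_key('CFTR', '87842') -> 'CFTR:000087842'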
Example #5
File: cftr.py Project: eulaf/CFseq
def parse_bedfile(bedfile):
    """Parse ROI from bed file."""
    sys.stderr.write("\nReading {}\n".format(bedfile))
    fh = open_file(bedfile)
    lines = fh.readlines()
    fh.close()

    roi = defaultdict(dict)
    fields = ['CHROM', 'START', 'END', 'NAME']
    numlines = 0
    for line in lines:
        vals = line.rstrip().split("\t")
        d = dict(zip(fields, vals))
        d['START'] = int(d['START']) + 1 # convert to 1-based
        d['END'] = int(d['END'])
        roi[d['CHROM']][d['START']] = d
        numlines += 1
    sys.stderr.write("  Got {} lines in {} chrom\n".format(numlines,
                     len(roi.keys())))
    return roi
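BED intervals are 0-based and half-open, which is why START gets 1 added while END is left alone. A quick illustration with a made-up ROI line; the coordinates and name below are invented for the example.

line = "CFTR\t87500\t87650\texon_intron_9\n"   # hypothetical BED line
vals = line.rstrip().split("\t")
d = dict(zip(['CHROM', 'START', 'END', 'NAME'], vals))
d['START'] = int(d['START']) + 1   # 0-based half-open -> 1-based inclusive
d['END'] = int(d['END'])
# d == {'CHROM': 'CFTR', 'START': 87501, 'END': 87650, 'NAME': 'exon_intron_9'}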
Example #6
def trim_primers(fqfile, alignout, max_trim_len, primerinfo, outlabel, logfh,
                 args):
    """Returns trimmed fastq file and file with list of sequence names"""
    trimmedfq = outlabel + ".trimmed.fastq"
    seqfile = outlabel + ".seqlist.txt"
    logfh.write("    Trimming fq: {}\n".format(trimmedfq))
    if have_files([trimmedfq, seqfile], args.force, stderr=logfh):
        logfh.write("      Already have {}\n".format(trimmedfq))
        return (trimmedfq, seqfile)
    aligns = parse_alignout(alignout)
    seqlist = []
    with open(trimmedfq, 'w') as outfq:
        inseq = FastQParser(fqfile)
        for seqrec in inseq:
            seqlist.append(seqrec.id)
            if seqrec.id in aligns:
                primer = aligns[seqrec.id]['primer']
                if primerinfo[primer]['overlap']:
                    primerend = aligns[seqrec.id]['end'] +\
                                aligns[seqrec.id]['left']
                    subrec = seqrec[primerend:]
                    if args.debug:
                        logfh.write("{}\tTrimming\t{}\n".format(
                            primer, seqrec.id))
                else:
                    if args.debug:
                        logfh.write("{}\tNot trimming\t{}\n".format(
                            primer, seqrec.id))
                    subrec = seqrec
            else:  #trim default max_primer_len+2
                subrec = seqrec[max_trim_len:]
            outfq.write("{}\n".format(subrec.fastq()))
    logfh.write("    Seq list: {}\n".format(seqfile))
    if have_file(seqfile, True, stderr=logfh):
        logfh.write("      Still have {}\n".format(seqfile))
        sys.exit()
    with open_file(seqfile, 'w') as ifh:
        ifh.write("\n".join(seqlist) + "\n")
    return (trimmedfq, seqfile)
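trim_primers slices FASTQ records and re-serializes them with .fastq(), an interface provided by a FastQParser that is not in this listing. Below is a minimal sketch of that interface as the call sites above use it; only the class name and the .id, slicing, and .fastq() behaviour are taken from the code, the rest is an assumption.

class FastQRecord(object):
    """Sliceable FASTQ record matching the usage in trim_primers (sketch)."""
    def __init__(self, rid, seq, qual):
        self.id, self.seq, self.qual = rid, seq, qual

    def __getitem__(self, key):
        # seqrec[start:] trims sequence and qualities together
        return FastQRecord(self.id, self.seq[key], self.qual[key])

    def fastq(self):
        # 4-line FASTQ serialization; trim_primers appends the trailing newline
        return "@{}\n{}\n+\n{}".format(self.id, self.seq, self.qual)

class FastQParser(object):
    """Iterate FastQRecord objects from a FASTQ file (assumed parser; sketch only)."""
    def __init__(self, fqfile):
        self.fh = open(fqfile)

    def __iter__(self):
        while True:
            hdr = self.fh.readline()
            if not hdr:
                break
            seq = self.fh.readline().rstrip()
            self.fh.readline()                  # '+' separator line
            qual = self.fh.readline().rstrip()
            yield FastQRecord(hdr.rstrip()[1:].split()[0], seq, qual)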