Exemple #1
0
def main(opts):
    """Filter the records of a VCF file and write the survivors to a new VCF.

    Each record goes through, in order: ``genotype_depth`` (call rate),
    ``filter_qual`` and ``filter_depth``, ``filter_sample_depth``,
    ``filter_site`` (heterozygous fraction), ``sample_het`` and, when a pair
    file was given, ``check_pairs``. A final ``genotype_depth`` check decides
    whether the resulting record is written.

    Args:
        opts: parsed options. The attributes read here are ``vcf_file``,
            ``output``, ``pair_file``, ``call_rate``, ``record_qual``,
            ``record_depth``, ``sample_depth`` and ``fraction_het``.
    """
    # Both files are opened in ``with`` blocks so they are closed even when a
    # filter raises part-way through the input.
    with open(opts.vcf_file, "rb") as vcf_handle:
        vcf_reader = vcf.Reader(vcf_handle)

        with open(opts.output, 'w') as output_handle:
            writer_willem = vcf.Writer(output_handle, vcf_reader,
                                       lineterminator='\n')

            # The producing tool is looked up in the third raw header line.
            tool_line = vcf_reader._header_lines[2]
            if "samtools" in tool_line:
                vcf_type = "samtools"
            elif "freeBayes" in tool_line:
                vcf_type = "freeBayes"
            else:
                print("Unknown vcf type, tool only handels vcf's produced by samtools or freeBayes")
                vcf_type = "unknown"

            # Only if a pair file is specified: one comma-separated list of
            # fields per line.
            pairs = None
            if opts.pair_file is not None:
                print("its not none")
                with open(opts.pair_file, "r") as pair_handle:
                    pairs = [line.rstrip().split(",") for line in pair_handle]

            # For every record apply the filters.
            for record in vcf_reader:
                if not genotype_depth(record, opts.call_rate):
                    continue

                # Both record-level filters are always evaluated.
                qual_pass = filter_qual(record, opts.record_qual)
                depth_pass = filter_depth(record, opts.record_depth)
                if not (qual_pass and depth_pass):
                    continue

                new_record = filter_sample_depth(record, opts.sample_depth,
                                                 vcf_type)
                if not filter_site(new_record, opts.fraction_het):
                    continue

                new_record = sample_het(new_record, vcf_type)
                if pairs is not None:
                    new_record = check_pairs(new_record, pairs, vcf_type)

                # Re-check the call rate on the fully filtered record.
                if genotype_depth(new_record, opts.call_rate):
                    writer_willem.write_record(new_record)

            writer_willem.close()
Exemple #2
0
            sampleToBam[fields[0]] = fields[1]

# Parse the optional tab-separated file named by args.unmappedToBam into a
# dict mapping each line's first column to its second column. The dict stays
# empty when the option is not set.
unmappedToBam = dict()
if args.unmappedToBam:
    with open(args.unmappedToBam) as f:
        for line in f:
            # Only the first two tab-separated columns are used.
            fields = line.strip().split('\t')
            unmappedToBam[fields[0]] = fields[1]

# Parse VCF: walk the records of args.vcfFile (when given) and, for every
# record that passes the size and site filters, collect its carrier samples.
sv = collections.defaultdict(list)
observedSamples = set()
if args.vcfFile:
    # Compression is chosen from the '.gz' file extension.
    # NOTE(review): 'r' is passed as the second positional argument of
    # vcf.Reader, not as the mode of open() -- confirm this is intended.
    vcf_reader = vcf.Reader(
        open(args.vcfFile), 'r',
        compressed=True) if args.vcfFile.endswith('.gz') else vcf.Reader(
            open(args.vcfFile), 'r', compressed=False)
    for record in vcf_reader:
        # Variant length taken as INFO/END minus POS.
        svlen = record.INFO['END'] - record.POS
        # Keep records no longer than maxSize; when args.siteFilter is set,
        # additionally require an empty FILTER list.
        if (svlen <= maxSize) and ((not args.siteFilter) or
                                   (len(record.FILTER) == 0)):
            # Records with SVTYPE "TRA" are skipped.
            if record.INFO['SVTYPE'] == "TRA":
                continue
            carrier = set()
            dupBounds = set()
            delBounds = set()
            for call in record.samples:
                # A carrier is a sample listed in sampleToBam whose genotype
                # is called and whose gt_type is not 0.
                if (call.sample in sampleToBam.keys()) and (call.called) and (
                        call.gt_type != 0):
                    carrier.add(call.sample)
# inputs: file paths and the reference name are taken from the `snakemake`
# object (its `input` and `params` mappings).
vcf_file = snakemake.input["vcf_file"]
merged_vcf = snakemake.input["merged_vcf"]
gbk_file = snakemake.input["gbk_file"]
reference = snakemake.params["reference"]

# rename reference if assembled genome: strip every "_assembled_genome"
# occurrence from the reference name
if "_assembled_genome" in reference:
    reference = re.sub("_assembled_genome", "", reference)

# output: path of the HTML report
report_file = snakemake.output["html_file"]

# parse vcf
# All records are read eagerly into a list, so the latin-1 decoded file
# handle can be closed as soon as parsing is done instead of being leaked.
with codecs.open(merged_vcf, 'r', 'latin-1') as merged_vcf_handle:
    merged_vcf_records = list(vcf.Reader(merged_vcf_handle))


def parse_gbk(gbk_file):
    '''
    Read a GenBank file and return its records as a dictionary.
    Input: path of the GenBank file
    Output: the dictionary built by SeqIO.to_dict from the parsed records
    '''
    with open(gbk_file, "r") as handle:
        parsed_records = SeqIO.parse(handle, 'genbank')
        # to_dict consumes the parser here, while the file is still open
        return SeqIO.to_dict(parsed_records)


def get_neiboring_orf(position, feature_list):
    '''
    Identify neiboring feature of variant position.
    Input: SeqRecord and position (integer)
    Output: List of two strings with the closest features located before and after the input position
    (either locus_tags/gene names).
Exemple #4
0
 def test_issue_16(self):
     """The first record of issue-16.vcf must have no QUAL value."""
     reader = vcf.Reader(fh('issue-16.vcf'))
     # The next() builtin works on Python 2 and 3; the .next() method is
     # only the Python 2 spelling of the iterator protocol.
     n = next(reader)
     assert n.QUAL is None
Exemple #5
0
            REG_START, REG_END = None, None
        print('CHROM = {}'.format(REG_CHROM))
        print('START = {}'.format(REG_START))
        print('END = {}'.format(REG_END))
        # CHECK HERE
    if args.nonorm: NORM = False
    DEBUG = args.debug
    PRINT_ALLELES = args.alleles
    COUNTFILTERS = args.countfilters
    if args.filter: FILTER = True

    VCFFILE = args.vcf
    OUTFILE = args.out

    # If region is specified, just parse through that.
    vcf_reader = vcf.Reader(open(VCFFILE, "rb"))
    if args.region is not None:
        try:
            vcf_reads = vcf_reader.fetch(str(REG_CHROM), int(REG_START),
                                         int(REG_END))
        except:
            vcf_reads = vcf_reader.fetch(str(REG_CHROM))
    else:
        vcf_reads = vcf_reader
    if SAMPLES == []: SAMPLES = vcf_reads.samples
    SAMPLES = [item for item in SAMPLES if item in vcf_reads.samples]

    counters = {
        "numloci": 0,
        "minmaf": 0,
        "minsamples": 0,
Exemple #6
0
 def testOpenFilehandle(self):
     """A Reader built from an open handle exposes the samples and file name."""
     reader = vcf.Reader(fh('example-4.0.vcf'))
     self.assertEqual(self.samples, reader.samples)
     # Only the last path component is compared
     base_name = os.path.basename(reader.filename)
     self.assertEqual('example-4.0.vcf', base_name)
Exemple #7
0
 def testOpenFilehandleGzipped(self):
     """A Reader built from a binary handle on tb.vcf.gz lists the expected samples."""
     gz_handle = fh('tb.vcf.gz', 'rb')
     reader = vcf.Reader(gz_handle)
     self.assertEqual(self.samples, reader.samples)
Exemple #8
0
 def setUp(self):
     """Create a fresh Reader on self.filename for each test."""
     handle = fh(self.filename)
     self.reader = vcf.Reader(handle)
Exemple #9
0
    def testParse(self):
        """samtools.vcf must expose one sample and eleven records."""
        reader = vcf.Reader(fh('samtools.vcf'))
        sample_count = len(reader.samples)
        self.assertEqual(sample_count, 1)

        # Exhaust the reader to count its records
        record_count = len(list(reader))
        self.assertEqual(record_count, 11)
                    dest='vcfFile1',
                    type=str,
                    help='vcfFile1',
                    nargs='?',
                    default=None)
# -R / --keepRefCalls: boolean flag, off by default.
parser.add_argument('-R',
                    '--keepRefCalls',
                    action="store_true",
                    dest='keepRefCalls',
                    help='do not remove calls in which only ref is called',
                    default=False)

args = parser.parse_args()

# Open the input VCF and wrap it in a PyVCF reader.
vcfFile1 = open(args.vcfFile1, 'r')
reader1 = vcf.Reader(vcfFile1)

# Derive the output name from the input name: '.vcf' becomes '.MKSNGL.vcf',
# then a remaining '.vcf.gz' suffix is turned into '.vcf'.
# NOTE(review): `replace` is presumably string.replace (Python 2) -- confirm.
vcfoutF1 = replace(args.vcfFile1, '.vcf', '.MKSNGL.vcf')
vcfoutF1 = replace(vcfoutF1, '.vcf.gz', '.vcf')
print >> sys.stdout, "VCFOUT1", vcfoutF1
# From here on vcfoutF1 is the open output handle, no longer the file name.
vcfoutF1 = open(vcfoutF1, 'w')

# Register a 'Singleton' FILTER entry on the reader.
_Filter = collections.namedtuple('Filter', ['id', 'desc'])
reader1.filters['Singleton'] = _Filter(id='Singleton',
                                       desc='only one minor variant at locus')

# A shallow copy of the reader serves as the template for the writer.
reader2 = copy.copy(reader1)

vcfout1 = vcf.Writer(vcfoutF1, reader2)

#makeCallData = vcf.model.make_calldata_tuple(("GT","ALTP","REFP","GP"))
Exemple #11
0
 def test_meta(self):
     """Both GATKCommandLine metadata entries carry the expected CommandLineOptions."""
     # expect no exceptions raised
     reader = vcf.Reader(fh('gatk_26_meta.vcf'))
     assert 'GATKCommandLine' in reader.metadata
     expected_options = (
         '"analysis_type=LeftAlignAndTrimVariants"',
         '"analysis_type=VariantAnnotator annotation=[HomopolymerRun, VariantType, TandemRepeatAnnotator]"',
     )
     for index, expected in enumerate(expected_options):
         self.assertEqual(
             reader.metadata['GATKCommandLine'][index]['CommandLineOptions'],
             expected)
     self.assertEqual(reader.metadata['GATKCommandLine'][1]['CommandLineOptions'], '"analysis_type=VariantAnnotator annotation=[HomopolymerRun, VariantType, TandemRepeatAnnotator]"')
Exemple #12
0
    def simulate_error_sample(self,
                              alpha,
                              beta,
                              error_vcf_path,
                              tumor_bam_path,
                              normal_bam_path,
                              output_bam_path,
                              margin=700):
        """Write reads sampled around every site of an error VCF.

        For each record of ``error_vcf_path`` a (tumor, normal) proportion
        pair is drawn with ``np.random.dirichlet([alpha, beta])``. Reads
        within ``margin`` bases of the record position are collected from
        both inputs, and every collected group holding at least two reads is
        written with probability equal to the proportion of its source.
        Written reads get an ``error_<n>_`` prefix on their query name, where
        ``n`` counts the groups written so far.

        Args:
            alpha: first Dirichlet concentration (tumor proportion).
            beta: second Dirichlet concentration (normal proportion).
            error_vcf_path: path of the VCF listing the sites.
            tumor_bam_path: path handed to the tumor ``BamReader``.
            normal_bam_path: path handed to the normal ``BamReader``.
            output_bam_path: path of the output file (pysam mode ``'w'``),
                written with the tumor header.
            margin: half-width of the window searched around each position.
        """
        print("============= simulate_error_sample =============")  # debug
        tumor_reader = BamReader()
        normal_reader = BamReader()

        tumor_reads = ReadCollector()
        normal_reads = ReadCollector()
        num_output_pair = 0

        # The VCF handle joins the ``with`` so it is closed on exit, and the
        # output file is closed in ``finally`` even when a record fails.
        with open(error_vcf_path, 'r') as vcf_handle, \
                tumor_reader.prepare(tumor_bam_path), \
                normal_reader.prepare(normal_bam_path):
            vcf_reader = vcf.Reader(vcf_handle)
            output_bam = pysam.AlignmentFile(output_bam_path,
                                             'w',
                                             header=tumor_reader.bam.header)
            try:
                for record in vcf_reader:
                    Chr = record.CHROM
                    pos = record.POS

                    p = list(np.random.dirichlet([alpha, beta], 1).flat)
                    tumor_proportion_here = p[0]
                    normal_proportion_here = p[1]

                    # debug: predicted counts from the first sample only
                    rct, rcn, act, acn = 0, 0, 0, 0
                    for sample in record.samples:
                        rct += sample["RCT"]
                        rcn += sample["RCN"]
                        act += sample["ACT"]
                        acn += sample["ACN"]
                        break

                    ref_prediction = int(rct * tumor_proportion_here +
                                         rcn * normal_proportion_here)
                    alt_prediction = int(act * tumor_proportion_here +
                                         acn * normal_proportion_here)
                    print("{}:{}, (tumor proportion, normal proportion): {}"
                          ", (ref, alt) predicted: {}".format(
                              record.CHROM, record.POS,
                              (tumor_proportion_here, normal_proportion_here),
                              (ref_prediction, alt_prediction)))  # debug

                    tumor_reads.clear()
                    normal_reads.clear()
                    for read in tumor_reader.search(Chr,
                                                    pos - margin,
                                                    pos + margin,
                                                    f_flag=0,
                                                    F_flag=2816):
                        tumor_reads.push(read)

                    for read in normal_reader.search(Chr,
                                                     pos - margin,
                                                     pos + margin,
                                                     f_flag=0,
                                                     F_flag=2816):
                        normal_reads.push(read)

                    # Tumor groups are emitted before normal groups; a random
                    # number is drawn only for groups with at least two reads.
                    for collected, keep_probability in (
                            (tumor_reads, tumor_proportion_here),
                            (normal_reads, normal_proportion_here)):
                        for _, read_group in collected:
                            reads = read_group[1]
                            if (len(reads) >= 2
                                    and keep_probability >= np.random.rand()):
                                for read in reads:
                                    read.query_name = 'error_' + str(
                                        num_output_pair) + '_' + read.query_name
                                    output_bam.write(read)
                                num_output_pair += 1
            finally:
                output_bam.close()
Exemple #13
0
    def simulate_sample(self,
                        proportion,
                        vcf_path,
                        tumor_bam_path,
                        normal_bam_path,
                        output_bam_path,
                        min_vaf=0.0,
                        max_vaf=1.0,
                        margin=700,
                        node_info_tag='NODE'):
        """Write reads sampled around every VCF site that passes a VAF filter.

        For each record the tumor proportion is the sum of ``proportion[node]``
        over the '/'-separated integer nodes stored in
        ``record.INFO[node_info_tag]``; the normal proportion is one minus
        that sum. Records whose variant allele fraction
        (``ACT / (RCT + ACT)`` of the single sample) lies outside
        ``[min_vaf, max_vaf]`` are logged to stderr and skipped. For the
        others, reads within ``margin`` bases of the position are collected
        from both inputs and each group of at least two reads is written with
        probability equal to the proportion of its source. Written reads get a
        ``tumor_<n>_`` prefix on their query name.

        Args:
            proportion: indexable by integer node, giving each node's share.
            vcf_path: path of the VCF listing the sites.
            tumor_bam_path: path handed to the tumor ``BamReader``.
            normal_bam_path: path handed to the normal ``BamReader``.
            output_bam_path: path of the output file (pysam mode ``'w'``),
                written with the tumor header.
            min_vaf: lowest accepted variant allele fraction (inclusive).
            max_vaf: highest accepted variant allele fraction (inclusive).
            margin: half-width of the window searched around each position.
            node_info_tag: INFO key holding the node list.

        Raises:
            Exception: when a record does not hold exactly one sample.
        """
        sys.stderr.write("min_vaf: " + str(min_vaf) + ", max_vaf: " +
                         str(max_vaf) + "\n")
        tumor_reader = BamReader()
        normal_reader = BamReader()

        tumor_reads = ReadCollector()
        normal_reads = ReadCollector()
        num_output_pair = 0

        # The VCF handle joins the ``with`` so it is closed on exit, and the
        # output file is closed in ``finally`` even when a record fails.
        with open(vcf_path, 'r') as vcf_handle, \
                tumor_reader.prepare(tumor_bam_path), \
                normal_reader.prepare(normal_bam_path):
            vcf_reader = vcf.Reader(vcf_handle)
            output_bam = pysam.AlignmentFile(output_bam_path,
                                             'w',
                                             header=tumor_reader.bam.header)
            try:
                for record in vcf_reader:
                    Chr = record.CHROM
                    pos = record.POS
                    nodes = [
                        int(node)
                        for node in re.split('/', record.INFO[node_info_tag])
                    ]

                    tumor_proportion_here = 0.0
                    normal_proportion_here = 1.0
                    for node in nodes:
                        tumor_proportion_here += proportion[node]
                        normal_proportion_here -= proportion[node]

                    if len(record.samples) != 1:
                        raise Exception(
                            "Unexpected number of samples in answer vcf.")

                    sample = record.samples[0]
                    rct = int(sample["RCT"])
                    act = int(sample["ACT"])
                    depth = rct + act
                    # NOTE(review): raises ZeroDivisionError when both counts
                    # are zero -- confirm whether such records can occur.
                    vaf = (1.0 * act) / (depth * 1.0)

                    # debug: the same prediction is logged for kept and
                    # dropped records; only the leading label differs.
                    rcn = int(sample["RCN"])
                    acn = int(sample["ACN"])
                    ref_prediction = int(rct * tumor_proportion_here +
                                         rcn * normal_proportion_here)
                    alt_prediction = int(act * tumor_proportion_here +
                                         acn * normal_proportion_here)
                    vaf_in_range = min_vaf <= vaf <= max_vaf
                    label = "passed: " if vaf_in_range else "filtered: "
                    sys.stderr.write(
                        label + str(record.CHROM) + ":" + str(record.POS) +
                        ", (tumor proportion, normal proportion): " +
                        str((tumor_proportion_here, normal_proportion_here)) +
                        ", (ref, alt) predicted: " +
                        str((ref_prediction, alt_prediction)) + "\n")
                    if not vaf_in_range:
                        continue

                    tumor_reads.clear()
                    normal_reads.clear()
                    for read in tumor_reader.search(Chr,
                                                    pos - margin,
                                                    pos + margin,
                                                    f_flag=0,
                                                    F_flag=2816):
                        tumor_reads.push(read)

                    for read in normal_reader.search(Chr,
                                                     pos - margin,
                                                     pos + margin,
                                                     f_flag=0,
                                                     F_flag=2816):
                        normal_reads.push(read)

                    # Tumor groups are emitted before normal groups; a random
                    # number is drawn only for groups with at least two reads.
                    for collected, keep_probability in (
                            (tumor_reads, tumor_proportion_here),
                            (normal_reads, normal_proportion_here)):
                        for _, read_group in collected:
                            reads = read_group[1]
                            if (len(reads) >= 2
                                    and keep_probability >= np.random.rand()):
                                for read in reads:
                                    read.query_name = 'tumor_' + str(
                                        num_output_pair) + '_' + read.query_name
                                    output_bam.write(read)
                                num_output_pair += 1
            finally:
                output_bam.close()
Exemple #14
0
    'ALAA20-3_DNA366', 'BELA18-1_DNA57', 'BELA18-3_DNA58', 'BELA18-4_DNA59',
    'BELC18-1_DNA127', 'BELC18-2_DNA128', 'BELC18-4_DNA129'
]

# List (and print) the filtered VCF files found under test_data/.
filelist = glob.glob("test_data/*.filter.vcf")
print(filelist)

data = {}

# Output files, opened for writing in the current working directory.
destf = open('db_alt.json', 'w')
tracking = open('tracking.txt', 'w')

# Each individual is read from test_data/<individual>.filter.vcf.
for individual in individuals:
    curr_individual = {}
    print("reading " + individual)
    # NOTE(review): this handle is not closed in the lines shown here.
    vcf_reader = vcf.Reader(
        open('test_data/' + individual + '.filter.vcf', 'r'))

    total_count = 0
    num_duplicates = 0

    for record in vcf_reader:

        # Every ANN entry is a '|'-separated annotation string.
        annotations = record.INFO['ANN']
        for ann in annotations:
            fields = ann.split('|')
            duplicate = False
            # According to SnpEff docs, fields are (1-indexed):
            # 1. allele
            # 2. effect
            # 4. gene name
            # 5. gene ID
Exemple #15
0
 def test_dunder_eq(self):
     """Comparing a call with None must be False, whichever side None is on."""
     reader = vcf.Reader(fh('example-4.0.vcf'))
     # The next() builtin works on Python 2 and 3; the .next() method is
     # only the Python 2 spelling of the iterator protocol.
     var = next(reader)
     example_call = var.samples[0]
     # `==` is deliberate: the equality operator itself is what is exercised.
     self.assertFalse(example_call == None)
     self.assertFalse(None == example_call)
Exemple #16
0
 def testParse(self):
     """bcftools.vcf has one sample and .phased is readable on every call."""
     reader = vcf.Reader(fh('bcftools.vcf'))
     sample_count = len(reader.samples)
     self.assertEqual(sample_count, 1)
     for record in reader:
         for call in record.samples:
             # Attribute access only: reading it must not raise
             call.phased
Exemple #17
0
    def setUp(self):
        """Open tb.vcf.gz in binary mode and record whether pysam was imported."""
        gz_handle = fh('tb.vcf.gz', 'rb')
        self.reader = vcf.Reader(gz_handle)

        # True when vcf.parser holds a pysam module rather than None
        pysam_module = vcf.parser.pysam
        self.run = pysam_module is not None
Exemple #18
0
 def testParse(self):
     """Every record of gonl.chr20.release4.gtc.vcf parses without raising."""
     handle = fh('gonl.chr20.release4.gtc.vcf')
     reader = vcf.Reader(handle)
     # Exhaust the reader; the records themselves are not inspected
     for _record in reader:
         continue
Exemple #19
0
 def testOpenFilename(self):
     """A Reader built from a file name lists the expected samples."""
     vcf_path = self.fp('example-4.0.vcf')
     reader = vcf.Reader(filename=vcf_path)
     self.assertEqual(self.samples, reader.samples)
Exemple #20
0
 def test_contig_line(self):
     """The contig entry for '1' reports a length of 249250621."""
     reader = vcf.Reader(fh('gonl.chr20.release4.gtc.vcf'))
     contig_one = reader.contigs['1']
     self.assertEqual(contig_one.length, 249250621)
Exemple #21
0
 def testOpenFilenameGzipped(self):
     """A Reader built from the tb.vcf.gz file name lists the expected samples."""
     gz_path = self.fp('tb.vcf.gz')
     reader = vcf.Reader(filename=gz_path)
     self.assertEqual(self.samples, reader.samples)
Exemple #22
0
 def test_samples(self):
     """With strict_whitespace=True the reader still reports the expected samples."""
     handle = fh(self.filename)
     self.reader = vcf.Reader(handle, strict_whitespace=True)
     parsed_samples = self.reader.samples
     self.assertEqual(parsed_samples, self.samples)
Exemple #23
0
			ann.append('1/1') # Rank / total
			ann.append('')    # HGVS.c
			ann.append('')    # HGVS.p
			ann.append('')    # cDNA_position
			ann.append('')    # CDS_position
			ann.append('')    # Protein_position
			ann.append(dist)  # Distance to feature
			ann.append('')  # Errors, Warnings or Information messages
			anns.append ('|'.join(ann))
		INFO['ANN'] = anns
		
		record  = vcf.model._Record(CHROM, POS, ID, REF, alts, QUAL, FILTER, INFO, FORMAT, snames)
		record.samples = reader._parse_samples (samples, FORMAT, record)
		return record
			
	reader = vcf.Reader(filename="{{i.infile}}")
	reader.infos["ANN"] = vcf.parser._Info("ANN", 1, "String", "Annotation by ANNOVAR", "", "")
	snames = {v:k for k,v in enumerate(reader.samples)}
	writer = vcf.Writer(open(outfile, 'w'), reader)
	f2conv = "{{o.outfile | prefix}}.variant_function"
	lastvid= ''
	lastr  = []
	with open (f2conv) as f:
		for line in f:
			line = line.strip("\r\n")
			if not line: continue
			
			parts = line.split("\t")
			varid = parts[2] + '|' + parts[3] + '|' + parts[12] + '|' + parts[5]
			
			if lastvid != varid and lastvid:
Exemple #24
0
 def test_num_calls(self):
     """The four per-record genotype counters add up to the number of samples."""
     reader = vcf.Reader(fh('example-4.0.vcf'))
     for var in reader:
         counters = (var.num_hom_ref, var.num_hom_alt,
                     var.num_het, var.num_unknown)
         self.assertEqual(len(var.samples), sum(counters))
    writer = csv.writer(csv_file, delimiter="\t")
    for key, value in used_dict.items():
        writer.writerow([key, value])
    for key, value in file_dict.items():
        writer.writerow([key, value])

########Single_SNP_probability and Multi_SNP_probability (1 hour)############
# Collect the POS of every record of every VCF in file_list, then keep the
# positions whose occurrence count divided by the number of files is >= 0.01.
pos = []
snp1 = []
snp2 = []
allcombination = []
allpos = []
for file in file_list:
    # pos is reset per file; allpos accumulates across all files.
    pos = []
    with open(file) as vcffile:
        vcfReader = vcf.Reader(vcffile)
        for record in vcfReader:
            # Only POS is stored; the chromosome is not part of the key.
            pos.append(record.POS)
            allpos.append(record.POS)
        i = 0

Single_SNP_prob = {}
Single_SNP_proba = {}
Single_SNP_probs = []
Single_SNP_probp = []
# Number of times each position was seen across all files.
counts = Counter(allpos)
Single_SNP_proba = dict(counts)
for element in Single_SNP_proba:
    # Keys of Single_SNP_prob are the positions converted to strings.
    if (float(Single_SNP_proba[element]) / len(file_list)) >= 0.01:
        Single_SNP_prob[str(element)] = (float(Single_SNP_proba[element]) /
                                         len(file_list))
Exemple #26
0
 def test_dunder_eq(self):
     """Comparing a record with None must be False, whichever side None is on."""
     # The next() builtin works on Python 2 and 3; the .next() method is
     # only the Python 2 spelling of the iterator protocol.
     rec = next(vcf.Reader(fh('example-4.0.vcf')))
     # `==` is deliberate: the equality operator itself is what is exercised.
     self.assertFalse(rec == None)
     self.assertFalse(None == rec)
Exemple #27
0
def main():
    """
    Recode the genotypes of a VCF relative to two founder genotypes.

    Reads the founders table (TSV with header CHROM, POS, REF, ALT and one
    genotype column per parent), walks the input VCF and, for each position
    present in that table whose parents are '0/0' and '1/1' (in either
    order), translates every sample genotype to '0', '1', '2' or 'NA'. The
    result is written to ``<prefix>.<chrom>.genos.csv`` with one row per
    sample and one column per retained position.

    Validation uses explicit conditionals, not bare ``assert`` statements,
    so the checks still run under ``python -O``. The process exits through
    ``sys.exit`` when the founders header is invalid, when REF or ALT
    alleles disagree, or when the output table fails its integrity check.

    :return:
    """

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-i',
        '--inVCF',
        required=True,
        help='The input VCF file name including full/relative path')
    parser.add_argument(
        '-f',
        '--founders',
        required=True,
        help=
        'The parental genotypes in TSV format [CHROM, POS, REF, ALT, GT1, GT2]'
    )
    parser.add_argument('-c',
                        '--chrom',
                        required=True,
                        help='specify which chromosome the VCF file is from')
    parser.add_argument('-p',
                        '--prefix',
                        default='out',
                        help='the prefix for the output files')
    parser.add_argument('-o',
                        '--outputDir',
                        default='',
                        help='the name of the output directory')
    parser.add_argument('-v',
                        '--verbose',
                        action='store_true',
                        help='print more information')
    args = parser.parse_args()

    FORMAT = "[%(filename)s:%(lineno)4s - %(funcName)20s() ] %(levelname)10s - %(message)s"
    logLevel = logging.INFO if args.verbose else logging.WARNING
    logging.basicConfig(level=logLevel, format=FORMAT)

    with openIOFile(args.founders) as parentalGenosInput:
        parentalGenosHeader = parentalGenosInput.readline().strip().split()
        logging.info(f'header: {parentalGenosHeader}')

        # An AssertionError raised inside the verifier is still treated as
        # "invalid", as it was when the call sat inside an assert.
        try:
            headerIsValid = verifyParentalGenosFileStructure(
                parentalGenosHeader)
        except AssertionError:
            headerIsValid = False
        if not headerIsValid:
            message = f'the header of the parental genotypes file [{args.founders}] is missing. ' \
                      f'Add a header: CHROM, POS, REF, ALT, Parent1_Geno, Parent2_Geno'
            logging.critical(message)
            sys.exit(message)

        # Columns 5 and 6 of the header name the two parents.
        parents = parentalGenosHeader[4:6]
        logging.info(f'parents: {parents}')
        parentalGenosDict = extractParentGenosForGivenChrom(
            parentalGenosInput, args.chrom, parentalGenosHeader)

    logging.info("parental genotypes have been stored in memory")

    # Used when parent 1 is '0/0' and parent 2 is '1/1'.
    genotypeTranslationDictProper = {
        '0/0': '0',
        '0/1': '1',
        '1/1': '2',
        './.': 'NA'
    }
    # Used when parent 1 is '1/1' and parent 2 is '0/0'.
    genotypeTranslationDictInv = {
        '0/0': '2',
        '0/1': '1',
        '1/1': '0',
        './.': 'NA'
    }

    logging.info("opening the input VCF file...")
    # The input handle is closed as soon as the last record has been read.
    with openIOFile(args.inVCF) as vcfFileInput:
        vcf_reader = vcf.Reader(vcfFileInput)

        nSamples = len(vcf_reader.samples)
        logging.info(f"calculated number of samples: [{nSamples}]")

        # prepare the data structures for the output file
        outputGenosDict = OrderedDict()
        outputGenosDict['positions'] = []
        for sample in vcf_reader.samples:
            outputGenosDict[sample] = []

        logging.info("iterating through the VCF records")

        for i, record in enumerate(vcf_reader, start=1):
            if i % 10000 == 0:
                logging.info(f"processing record # {i}")

            # ensure that the current VCF position is in the parental
            # genotypes table
            positionKey = str(record.POS)
            if positionKey not in parentalGenosDict:
                message = f'position {record.POS} was not found in the founding SNPs'
                logging.warning(message)
                continue
            logging.debug(f'found {record.POS}')
            founderSite = parentalGenosDict[positionKey]

            # check whether the REF allele matches between the VCF and the
            # parental genos table
            if not (founderSite['REF'] == record.REF):
                message = f'REF alleles for position {record.POS} don\'t match in founding SNPs and STITCH VCF'
                logging.critical(message)
                sys.exit(message)

            # check whether the ALT allele matches between the VCF and the
            # parental genos table
            if not (founderSite['ALT'] == record.ALT[0]):
                message = f'ALT alleles for position {record.POS} don\'t match in founding SNPs and STITCH VCF'
                logging.critical(message)
                sys.exit(message)

            # determine which translation dictionary will be used
            firstParentGeno = founderSite[parents[0]]
            if firstParentGeno == '0/0' and founderSite[parents[1]] == '1/1':
                translateGeno = genotypeTranslationDictProper
            elif firstParentGeno == '1/1' and founderSite[parents[1]] == '0/0':
                translateGeno = genotypeTranslationDictInv
            else:
                message = f'An unexpected genotype combination was encountered in the parents at position [{record.POS}]'
                logging.warning(message)
                warnings.warn(message, Warning)
                continue

            outputGenosDict['positions'].append(positionKey)
            # iterate through the samples in the VCF
            # re-code genos as 0, 1, or 2
            for sample in vcf_reader.samples:
                trGeno = translateGeno[record.genotype(sample)['GT']]
                outputGenosDict[sample].append(trGeno)

    logging.debug(pprint.pformat(outputGenosDict))

    # Same AssertionError handling as for the header check above.
    try:
        outputIsValid = verifyOutputGenoIntegrity(outputGenosDict,
                                                  vcf_reader.samples)
    except AssertionError:
        outputIsValid = False
    if not outputIsValid:
        message = 'the output genotype dictionary is not correct'
        logging.critical(message)
        sys.exit(message)

    outputFN = f'{args.prefix}.{args.chrom}.genos.csv'
    outputHeader = ['sample'] + outputGenosDict['positions']
    # The output is closed (and therefore flushed) when the block exits.
    with openIOFile(outputFN, args.outputDir, 'w') as outputFile:
        outputFile.write(','.join(map(str, outputHeader)) + '\n')

        for sample in vcf_reader.samples:
            outputLine = [sample] + outputGenosDict[sample]
            outputFile.write(','.join(map(str, outputLine)) + '\n')
Exemple #28
0
 def test_pickle(self):
     """Every record must compare equal to its own pickle round trip."""
     reader = vcf.Reader(fh('example-4.0.vcf'))
     for var in reader:
         restored = cPickle.loads(cPickle.dumps(var))
         self.assertEqual(restored, var)
def parse_vcf(vcf_file, gbk_file):
    '''
    Given a vcf input file and the gbk of the reference genome, return an html table of identified variants.

    Only the first sample of each record is inspected. Records whose CHROM is
    absent from the parsed GenBank data are reported on stdout and skipped, as
    are records whose first-sample genotype is '.' or '0'.

    :param vcf_file: path of the VCF file (read as latin-1). When the path
        contains "assembled", two extra columns ("InRef", "Fail Others") are
        appended to every row.
    :param gbk_file: GenBank reference, handed to ``parse_gbk``.
    :return: the HTML produced by ``DataFrame.to_html``, with every newline
        followed by 10 spaces of indentation.
    '''
    table_rows = []

    # Keep the VCF open only while records are streamed; the reader is lazy,
    # so the whole iteration has to happen inside the ``with`` block.
    with codecs.open(vcf_file, 'r', 'latin-1') as vcf_handle:
        vcf_reader = vcf.Reader(vcf_handle)
        gbk_dico = parse_gbk(gbk_file)

        # Evaluated once: the file name decides whether self-mapping columns
        # are added to the header and to every row.
        is_assembled = 'assembled' in vcf_file

        # Filters declared in the VCF header, in the reader's order; each one
        # becomes a column of the table.
        filter_names = list(vcf_reader.filters)

        header = [
            "contig", "length", "position", "REF", "ALT", "location", "type",
            "ORF", "gene", "orf_before", "orf_after"
        ]
        header += ['%s' % (vcf_reader.filters[name].id) for name in filter_names]
        if is_assembled:
            header.append("InRef")
            header.append("Fail Others")

        for vcf_record in vcf_reader:
            contig_name = vcf_record.CHROM
            try:
                contig = gbk_dico[contig_name]
            except KeyError:
                print("Missing contig", contig_name)
                continue

            call = vcf_record.samples[0]

            # skip positions with genotype missing or identical to REF
            # (checked before the feature lookup so discarded records cost nothing)
            if call['GT'] in ('.', '0'):
                continue

            variant_feature = search_mutated_feature(vcf_record, gbk_dico)

            if variant_feature["mut_location"] == 'Intergenic':
                orf_before, orf_after = get_neiboring_orf(int(vcf_record.POS),
                                                          contig.features)
            else:
                orf_before, orf_after = '-', '-'

            position = vcf_record.POS

            # REF and ALT with respective depth in parenthesis
            ref = "%s (%s/%s)" % (vcf_record.REF, call['AD'][0], call['DP'])
            first_alt = vcf_record.ALT[0]
            if len(first_alt) == 1:
                alt = "%s (%s/%s)" % (first_alt, call['AD'][1], call['DP'])
            else:
                # longer alleles are summarized by their length
                alt = "%sbp (%s/%s)" % (len(first_alt), call['AD'][1],
                                        call['DP'])

            # Work on a copy so the record itself is not mutated. FILTER may be
            # None when the VCF column is "."; that is treated as "no filter
            # failed" instead of crashing on len(None).
            failed_filters = list(vcf_record.FILTER or [])

            # if any of the test failed, set PASS as failed
            if failed_filters:
                failed_filters.append('PASS')

            filter_status = []
            for filter_name in filter_names:
                failed = filter_name in failed_filters
                if filter_name == 'PASS':
                    filter_status.append('NO' if failed else 'YES')
                else:
                    filter_status.append('-' if failed else '+')

            row = [
                contig_name,
                len(contig), position, ref, alt,
                variant_feature["mut_location"], variant_feature["mut_type"],
                variant_feature["orf_name"], variant_feature["gene"],
                orf_before, orf_after
            ]
            row += filter_status

            #  if comparison to assembled genome, add data about self mapping
            #  (IF A VARIANT IS ALSO IDENTIFIED IN THAT MAPPING, PROBABLY A FALSE POSITIVE)
            if is_assembled:
                # NOTE(review): merged_vcf_records and reference are not defined
                # in this function; they must exist at module scope or this
                # branch raises NameError - confirm against the caller.
                GT, PASS = check_reference_mapping_GT(merged_vcf_records,
                                                      contig_name, position,
                                                      reference)
                row.append(GT)
                row.append(PASS)

            table_rows.append(row)

    df = pandas.DataFrame(table_rows, columns=header)

    # cell content is truncated unless the column width limit is disabled
    try:
        pandas.set_option('display.max_colwidth', None)
    except ValueError:
        # pandas releases before 1.0 only understand the legacy -1 sentinel
        pandas.set_option('display.max_colwidth', -1)

    df_str = df.to_html(index=False,
                        bold_rows=False,
                        classes=["dataTable"],
                        table_id="snps_table",
                        escape=False,
                        border=0)

    return df_str.replace("\n", "\n" + 10 * " ")
Exemple #30
0
    def test_parser(self):
        """Basic tests for the parser.

        Registers TEST_GENOME_SNPS as a freebayes VCF dataset on a fresh
        alignment group, runs parse_alignment_group_vcf() on it, and checks
        the Variant, VariantCallerCommonData, VariantEvidence and
        VariantAlternate rows that exist afterwards.
        """
        VCF_DATATYPE = Dataset.TYPE.VCF_FREEBAYES
        alignment_group = AlignmentGroup.objects.create(
                label='test alignment', reference_genome=self.reference_genome)
        # presumably attaches TEST_GENOME_SNPS to the alignment group as a
        # dataset of type VCF_DATATYPE - confirm against the helper
        copy_and_add_dataset_source(alignment_group, VCF_DATATYPE,
                VCF_DATATYPE, TEST_GENOME_SNPS)

        Chromosome.objects.create(
            reference_genome=self.reference_genome,
            label='Chromosome',
            num_bases=9001)

        # Create experiment sample objects having UIDs that correspond to those
        # in the vcf file. This is a bit "fake" in that in the actual pipeline
        # we will be generating the vcf file from the samples (see the
        # add_groups() stage of the pipeline).
        with open(TEST_GENOME_SNPS) as fh:
            reader = vcf.Reader(fh)
            experiment_sample_uids = reader.samples
        num_experiment_samples = len(experiment_sample_uids)
        for sample_uid in experiment_sample_uids:
            ExperimentSample.objects.create(
                uid=sample_uid,
                project=self.project,
                label='fakename:' + sample_uid
            )

        # Count the number of records in the vcf file for testing.
        record_count = 0
        with open(TEST_GENOME_SNPS) as fh:
            for record in vcf.Reader(fh):
                record_count += 1

        # Parse the vcf
        parse_alignment_group_vcf(alignment_group, VCF_DATATYPE)


        # Everything asserted below is read back from the database.
        variant_list = Variant.objects.filter(
                reference_genome=self.reference_genome)

        # There should be one Variant object for each record.
        self.assertEqual(record_count, len(variant_list))

        # Spot-check a few variants.
        self.assertEqual(1, len(Variant.objects.filter(
                reference_genome=self.reference_genome,
                position=376)))

        v_453 = Variant.objects.get(reference_genome=self.reference_genome,
                position=453)
        self.assertEqual(['G'], v_453.get_alternates())

        # Check false negatives: no Variant may exist at position 454.
        self.assertEqual(0, len(Variant.objects.filter(
                reference_genome=self.reference_genome,
                position=454)))

        # There should be one VariantCallerCommonData object for each record.
        self.assertEqual(record_count,
                len(VariantCallerCommonData.objects.filter(
                        variant__reference_genome=self.reference_genome)))

        # There should also be one VariantEvidence object per Variant x Sample.
        for variant in variant_list:
            # Only the first VariantCallerCommonData of each variant is checked.
            vccd = variant.variantcallercommondata_set.all()[0]
            self.assertEqual(num_experiment_samples,
                    len(vccd.variantevidence_set.all()))

        # Check that alternate data is populated.
        # The VCF record at position 1330, kept verbatim for reference:
        #Chromosome  1330    .   CG  C,GC,AG 126.036 .   AB=0.5,0.5,1;ABP=3.0103,3.0103,7.35324;AC=1,1,1;AF=0.0833333,0.0833333,0.0833333;AN=12;AO=1,1,2;CIGAR=1M1D,2X,1X1M;DP=10;DPRA=1.33333,1.33333,1.33333;EPP=5.18177,5.18177,3.0103;EPPR=4.45795;HWE=-16.5861;LEN=1,2,1;MEANALT=2,2,1;MQM=60,37,48.5;MQMR=40.8333;NS=6;NUMALT=3;ODDS=1.50408;PAIRED=1,0,0.5;PAIREDR=0.166667;RO=6;RPP=5.18177,5.18177,7.35324;RPPR=16.0391;RUN=1,1,1;SAP=5.18177,5.18177,3.0103;SRP=4.45795;TYPE=del,mnp,snp;XAI=0,0.0102041,0.00515464;XAM=0,0.0102041,0.0253649;XAS=0,0,0.0202103;XRI=0.0016835;XRM=0.00835084;XRS=0.00666733;technology.illumina=1,1,1;BVAR GT:DP:RO:QR:AO:QA:GL    .   0/0:1:1:36:0,0,0:0,0,0:0,-0.30103,-3.6,-0.30103,-3.6,-3.6,-0.30103,-3.6,-3.6,-3.6   0/0:2:2:76:0,0,0:0,0,0:0,-0.60206,-7.03,-0.60206,-7.03,-7.03,-0.60206,-7.03,-7.03,-7.03 1/2:2:0:0:1,1,0:108,31,0:-8.645,-3.40103,-3.1,-6.30103,-0.30103,-6,-8.645,-3.40103,-6.30103,-8.645  .   0/3:2:0:0:0,0,2:0,0,73:-6.935,-6.935,-6.935,-6.935,-6.935,-6.935,-0.60206,-0.60206,-0.60206,0   0/0:2:2:72:0,0,0:0,0,0:0,-0.60206,-6.84,-0.60206,-6.84,-6.84,-0.60206,-6.84,-6.84,-6.84 .   0/0:1:1:34:0,0,0:0,0,0:0,-0.30103,-3.4,-0.30103,-3.4,-3.4,-0.30103,-3.4,-3.4,-3.4   .
        v_1330 = Variant.objects.get(reference_genome=self.reference_genome,
                position=1330)
        # Compared as sets, so the order of the alternates does not matter.
        self.assertEqual(set(v_1330.get_alternates()), set(['C', 'GC', 'AG']))
        v_1330_c = VariantAlternate.objects.get(variant=v_1330, alt_value='C')
        # assertTrue on a length: passes only if at least one evidence row exists.
        self.assertTrue(len(v_1330_c.variantevidence_set.all()))
        v_1330_gc = VariantAlternate.objects.get(variant=v_1330, alt_value='GC')
        self.assertTrue(len(v_1330_gc.variantevidence_set.all()))
        # ABP is 3.0103 for both C and GC in the record above, so the stored
        # per-alternate values are expected to match.
        self.assertEqual(v_1330_c.data['INFO_ABP'], v_1330_gc.data['INFO_ABP'])