Example #1
0
    def __apply_filter_step(self, step_dict, entries_list):
        """
        Apply one filter step, defined in class VCFfilters, to VCF/BED entries.

        The output file name is derived from ``self.fname`` by inserting the
        step's order and name before the ``.vcf`` extension; it is stored in
        ``self.out_fname``. Unless ``self.skip`` is set, the method named by
        ``step_dict['method']`` is looked up on a fresh ``VCFfilters``
        instance, run on ``entries_list``, and the resulting records are
        written to that file inside ``self.out_dir``.

        :param step_dict: step description; 'order', 'name' and 'method' are
            read here and the whole dict is forwarded to the filter as
            keyword arguments
        :param entries_list: iterable of records to filter
        :return: list of filtered records, or ``entries_list`` unchanged when
            ``self.skip`` is set
        :raises AttributeError: if ``VCFfilters`` has no method with the
            requested name (an error message is printed first)
        """
        self.out_fname = self.fname.replace(
            ".vcf", ".%02d_%s.vcf" % (step_dict['order'], step_dict['name']))
        if self.skip:
            return entries_list

        # Resolve the filter on its own so that only a genuinely missing
        # method is reported as "not defined"; an AttributeError raised while
        # the filter runs must not be mislabelled.
        try:
            filter_method = getattr(VCFfilters(), step_dict['method'])
        except AttributeError:
            print(u"[Error] Method %s not defined." % step_dict['method'])
            raise

        vcfout_filtered = vcf.VCFWriter(
            open(os.path.join(self.out_dir, self.out_fname), "w"),
            self.vcf_template)
        try:
            entries_list = list(
                filter_method(entries_list,
                              template=self.vcf_template,
                              sample=self.sample,
                              **step_dict))
            for record in entries_list:
                vcfout_filtered.write_record(record)
        finally:
            # Always release the output handle, even if the filter fails.
            vcfout_filtered.close()
        return entries_list
Example #2
0
def filter_by_background(in_vcf, full_vcf, background, data):
    """Filter SV calls also present in background samples.

    Skips filtering of inversions, which are not characterized differently
    between cases and controls in test datasets.

    Records of ``in_vcf`` and ``full_vcf`` are walked in lockstep. A record is
    tagged with the ``InBackground`` filter when the sample's genotype type in
    ``full_vcf`` is 0, or equals the genotype type of any background sample.
    All records, tagged or not, are written to the output.

    :param in_vcf: VCF whose records are written (and possibly tagged)
    :param full_vcf: VCF read in parallel to obtain per-sample genotypes
    :param background: re-iterable collection of background sample data
        objects accepted by ``dd.get_sample_name``
    :param data: sample data object; ``data["config"]`` is used for
        compression and indexing
    :return: result of ``vcfutils.bgzip_and_index`` on the filtered file
    """
    Filter = collections.namedtuple('Filter', ['id', 'desc'])
    back_filter = Filter(id='InBackground',
                         desc='Rejected due to presence in background sample')
    out_file = "%s-filter.vcf" % utils.splitext_plus(in_vcf)[0]
    # out_file already ends in ".vcf", so the compressed result is
    # out_file + ".gz"; probing out_file + ".vcf.gz" could never match and
    # forced a re-run whenever only the compressed file was left on disk.
    if not utils.file_uptodate(out_file, in_vcf) and not utils.file_uptodate(
            out_file + ".gz", in_vcf):
        with file_transaction(data, out_file) as tx_out_file:
            with open(tx_out_file, "w") as out_handle:
                reader = vcf.VCFReader(filename=in_vcf)
                # Register the filter in the header used as writer template.
                reader.filters["InBackground"] = back_filter
                full_reader = vcf.VCFReader(filename=full_vcf)
                writer = vcf.VCFWriter(out_handle, template=reader)
                # Sample names do not change per record; look them up once.
                sample_name = dd.get_sample_name(data)
                background_names = [dd.get_sample_name(x) for x in background]
                for out_rec, rec in zip(reader, full_reader):
                    rec_type = rec.genotype(sample_name).gt_type
                    if rec_type == 0 or any(
                            rec_type == rec.genotype(name).gt_type
                            for name in background_names):
                        out_rec.add_filter("InBackground")
                    writer.write_record(out_rec)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Example #3
0
def _add_reject_flag(in_file, config):
    """Tag every record that is not flagged somatic (SS=2) with REJECT.

    The rewritten VCF is compressed and indexed, the original input is kept
    as ``<in_file>.orig`` and the fixed file is moved into the place of
    ``in_file``. Nothing happens when ``in_file`` does not exist.
    """
    filter_cls = namedtuple('Filter', ['id', 'desc'])
    reject = filter_cls(id='REJECT',
                        desc='Rejected as non-SOMATIC or by quality')
    # NOTE: PyVCF will write an uncompressed VCF
    base, _ = utils.splitext_plus(in_file)
    out_file = "{0}-{1}{2}".format(base, "rejectfix", ".vcf")

    if not utils.file_exists(in_file):
        return

    vcf_in = vcf.VCFReader(filename=in_file)
    # Declare the filter in the header the writer will copy
    vcf_in.filters["REJECT"] = reject
    with file_transaction(config, out_file) as tx_out_file:
        with open(tx_out_file, "wb") as out_handle:
            vcf_out = vcf.VCFWriter(out_handle, template=vcf_in)
            for rec in vcf_in:
                # VarScan encodes SS as a string
                # TODO: Set it as integer when cleaning
                if "SS" in rec.INFO and rec.INFO["SS"] != "2":
                    rec.add_filter("REJECT")
                vcf_out.write_record(rec)

    # Re-compress, then swap the fixed file in for the original
    out_file = bgzip_and_index(out_file, config)
    move_vcf(in_file, "{0}.orig".format(in_file))
    move_vcf(out_file, in_file)
    # Leave a marker at the old output path pointing to the new location
    with open(out_file, "w") as marker:
        marker.write("Moved to {0}".format(in_file))
Example #4
0
    def exclude(entries, distance, exfile, **kwargs):
        """
        Performs exclusion operations with bedtools window -v

        The incoming records are written to a temporary VCF, which is then
        run through ``BedTool.window`` against ``exfile`` with ``v=True``;
        the result is written to a second temporary VCF.

        :param entries: iterable of records; each one is passed to
            ``VCFWriter.write_record``
        :param distance: forwarded to ``BedTool.window`` as ``w``
        :param exfile: file handed to ``BedTool.window``; if ``kwargs``
            contains 'sample' it is first formatted with
            ``SAMPLE=kwargs['sample']``
        :param kwargs: must contain 'template' (used as the VCFWriter
            template); may contain 'sample'
        :return: a ``vcf.VCFReader`` over the temporary output file if the
            window result is non-empty, otherwise an empty list
        """
        # create temporary VCF files
        vcf_temp_in = tempfile.NamedTemporaryFile(
            suffix=".vcf")  # write IN-VCF to disk temporarily
        vcf_temp_out = tempfile.NamedTemporaryFile(
            suffix=".vcf")  # write OUT-VCF to disk temporarily
        vcfin = vcf.VCFWriter(vcf_temp_in, kwargs["template"])
        # NOTE(review): vcfout is never written to or closed below; it only
        # exists as a writer bound to the output temp file -- confirm whether
        # it is still needed.
        vcfout = vcf.VCFWriter(vcf_temp_out, kwargs["template"])

        for record in entries:  # write IN -VCF
            vcfin.write_record(record)
        # flush so the file on disk is complete before BedTool opens it by name
        vcfin.flush()  # Inbetween flushing (avoid clogging)

        # from here on `entries` is rebound to BedTool objects
        entries = BedTool(
            vcf_temp_in.name)  # generate BedTool object from VCF we just wrote
        # format string (e.g. for sample-specific exclude files)
        if 'sample' in kwargs.keys():
            exfile = exfile.format(SAMPLE=kwargs['sample'])

        entries = entries.window(
            exfile, w=distance, v=True,
            output=vcf_temp_out.name)  # apply window operation
        if len(entries) > 0:
            # The reader is handed the temp-file object itself, so the caller
            # reads from that handle after this function returns.
            passed_vcf = vcf.VCFReader(vcf_temp_out)  # return VCF object
        else:
            passed_vcf = []  # return empty list
        vcfin.close()  # close writer, delete temporary file
        # TODO implement closing of vcfout without breaking reading
        return passed_vcf
def filter_file(source, destination):
    """Copy the VCF *source* to *destination* without uncalled records.

    A record is dropped when none of its samples is called. The written file
    is then indexed with ``pysam.tabix_index`` (VCF preset, overwriting any
    existing index) and the value returned by that call is passed back.
    """
    vcf_in = vcf.VCFReader(filename=source)

    with open(destination, "w") as out_handle:
        vcf_out = vcf.VCFWriter(out_handle, vcf_in)
        for rec in vcf_in:
            # keep the record as soon as one sample carries a call
            if any(call.called for call in rec):
                vcf_out.write_record(rec)

    return pysam.tabix_index(destination, preset="vcf", force=True)
Example #6
0
    def add_annotation(self):
        """
        <p>
        Read the input VCF file, add annotations to the #INFO column and write it back to the output VCF file.
        </p>

        Reads ``self.inputFile``, declares the TSSOL, CCURI and SAMPURI INFO
        fields in the header, passes every record through
        ``self.get_annotation`` and writes the result to ``self.outputFile``.
        A throughput line is printed after every 100 records.
        """
        with open(self.inputFile, 'r') as in_handle:
            vcfReader = vcf.Reader(in_handle)
            # How to add info header:
            # <http://www.1000genomes.org/wiki/Analysis/Variant%20Call%20Format/vcf-variant-call-format-version-41>
            vcfReader.infos['TSSOL'] = VcfInfo(
                'TSSOL', vcf_field_counts['A'], 'String',
                'Info indicates whether the variant overlapping with the'
                ' transcription start site(TSS)')
            vcfReader.infos['CCURI'] = VcfInfo(
                'CCURI', vcf_field_counts['A'], 'String',
                'Info includes the URL of the cage cluster to which the'
                ' variant overlapping')
            vcfReader.infos['SAMPURI'] = VcfInfo(
                'SAMPURI', vcf_field_counts['A'], 'String',
                'Info includes the URL of the samples with to which the'
                ' variant overlapping')

            vcfWriter = vcf.VCFWriter(open(self.outputFile, 'w'), vcfReader)
            try:
                cnt_block = 100
                t1 = time.time()
                for cnt, record in enumerate(vcfReader, 1):
                    vcfWriter.write_record(self.get_annotation(record))

                    # Report only after a full block so that the rate really
                    # covers cnt_block records (the first report used to fire
                    # after two records while still dividing by cnt_block).
                    if cnt % cnt_block == 0:
                        elapsed = time.time() - t1
                        # A coarse clock can report zero elapsed time; skip
                        # the report rather than divide by zero.
                        if elapsed > 0:
                            ips = cnt_block / elapsed
                            print("speed: %.2f iters/s = %d iters p/h = %.1f hours/million iters" %
                                  (ips, ips * 3600, 1000000 / ips / 3600))
                        t1 = time.time()
            finally:
                # Close the writer even when annotation fails.
                vcfWriter.close()
Example #7
0
def fix_somatic_calls(in_file, config):
    """Standardize somatic variant output on the SOMATIC INFO flag.

    Records whose ``VT`` INFO value is ``somatic`` receive the SOMATIC flag;
    every other record is tagged with the REJECT filter. The ``VT`` key is
    removed wherever it was present. The rewritten file is compressed,
    indexed and moved into the place of ``in_file``, whose original content
    is kept as ``<in_file>.orig``. Nothing is done if ``in_file`` is missing.
    """
    if vcf is None:
        raise ImportError("Require PyVCF for manipulating cancer VCFs")

    # HACK: Needed to replicate the structure used by PyVCF
    info_cls = namedtuple('Info', ['id', 'num', 'type', 'desc'])
    filter_cls = namedtuple('Filter', ['id', 'desc'])
    somatic_info = info_cls(id='SOMATIC', num=0, type='Flag',
                            desc='Somatic event')
    reject_filter = filter_cls(id='REJECT',
                               desc='Rejected as non-SOMATIC or by quality')
    # NOTE: PyVCF will write an uncompressed VCF
    base, _ = utils.splitext_plus(in_file)
    out_file = "{0}-{1}{2}".format(base, "somaticfix", ".vcf")

    if not utils.file_exists(in_file):
        return

    vcf_in = vcf.VCFReader(filename=in_file)
    # Declare the new INFO field and filter in the header
    vcf_in.infos["SOMATIC"] = somatic_info
    vcf_in.filters["REJECT"] = reject_filter

    # Remove compressed output and index left over from a previous run
    for stale in (out_file + ".gz", out_file + ".gz.tbi"):
        if os.path.exists(stale):
            os.remove(stale)

    with file_transaction(config, out_file) as tx_out_file:
        with open(tx_out_file, "wb") as out_handle:
            vcf_out = vcf.VCFWriter(out_handle, template=vcf_in)
            for rec in vcf_in:
                # Handle FreeBayes
                has_vt = "VT" in rec.INFO
                is_somatic = has_vt and rec.INFO["VT"] == "somatic"
                if is_somatic:
                    rec.add_info("SOMATIC", True)
                if has_vt:
                    # Discard old record
                    del rec.INFO["VT"]
                if not is_somatic:
                    rec.add_filter("REJECT")
                vcf_out.write_record(rec)

    # Re-compress, then swap the fixed file in for the original
    out_file = bgzip_and_index(out_file, config)
    move_vcf(in_file, "{0}.orig".format(in_file))
    move_vcf(out_file, in_file)
    # Leave a marker at the old output path pointing to the new location
    with open(out_file, "w") as marker:
        marker.write("Moved to {0}".format(in_file))
Example #8
0
def main():
    """The main function

    Loads all variants from the input VCF and, for every non-indel variant,
    collects mapping and base qualities of the reads in the BAM file that
    show the reference or the alternate base at the variant position. The
    two groups are compared with a Mann-Whitney U test (only when the
    alternate qualities are lower on average) and the resulting p-values are
    stored Phred-scaled in the INFO fields MB (mapping quality), BB (base
    quality) and CB (both combined via ``fisher_comb``).

    If multiple testing correction is requested, the INFO value named by
    ``args.mtc_tag`` is corrected over all analysed variants, and variants
    selected by the correction get a filter tag appended. Finally all
    variants (or only unfiltered ones with ``args.pass_only``) are written
    to the output VCF.
    """

    parser = cmdline_parser()
    args = parser.parse_args()

    if args.verbose:
        LOG.setLevel(logging.INFO)
    if args.debug:
        LOG.setLevel(logging.DEBUG)

    assert os.path.exists(args.bam), ("BAM file %s does not exist" % args.bam)
    samfh = pysam.Samfile(args.bam)

    # setup vcf_reader
    #
    if args.vcfin == '-':
        vcf_reader = vcf.VCFReader(sys.stdin)
    else:
        vcf_reader = vcf.VCFReader(filename=args.vcfin)

    variants = [r for r in vcf_reader]
    LOG.info("Loaded %d variants" % len(variants))

    # The lower-cased value can never equal 'None'; compare against 'none'
    # so that the "no correction" message is actually reachable.
    use_mtc = args.mtc.lower() != 'none'
    if use_mtc:
        LOG.info("Will use %s for MTC on %s with alpha %f" %
                 (args.mtc, args.mtc_tag, args.mtc_alpha))
    else:
        LOG.info("No multiple testing correction will be done")

    # setup vcf_writer
    #
    if args.vcfout == '-':
        fh_out = sys.stdout
    else:
        if os.path.exists(args.vcfout):
            LOG.fatal(
                "Cowardly refusing to overwrite already existing file %s" %
                (args.vcfout))
            sys.exit(1)

        if args.vcfout[-3:] == '.gz':
            fh_out = gzip.open(args.vcfout, 'w')
        else:
            fh_out = open(args.vcfout, 'w')
    # pyvcf needs template as arg to VCFWriter, whereas LoFreq's vcf clone didn't
    vcf_writer = vcf.VCFWriter(fh_out, vcf_reader, lineterminator=os.linesep)

    pvalues = []
    # Position in `variants` of the variant behind each entry of `pvalues`.
    # Skipped indels contribute no p-value, so the two lists are not aligned.
    pvalue_var_idxs = []
    for (var_no, var) in enumerate(variants):
        if var_no % 500 == 1:
            LOG.info("Computing bias for var %d of %d" %
                     (var_no, len(variants)))

        if 'INDEL' in var.INFO:
            LOG.warn("Skipping unsupported indel variant %s:%d" %
                     (var.CHROM, var.POS))
            continue

        reads = list(
            samfh.fetch(reference=var.CHROM, start=var.POS - 1, end=var.POS))
        LOG.debug("%s %d: %d (unfiltered) reads covering position" %
                  (var.CHROM, var.POS, len(reads)))

        ref_mquals = []
        alt_mquals = []
        ref_bquals = []
        alt_bquals = []

        for r in reads:

            if skip_read(r):
                continue

            # paired flag (0x1) set but proper-pair flag (0x2) not set
            orphan = (r.flag & 0x1) and not (r.flag & 0x2)
            if orphan and not args.use_orphan:
                continue

            if r.mapq < args.min_mq:
                continue

            # locate the variant position (0-based on the reference) on the read
            vpos_on_read = [
                vpos_on_read for (vpos_on_read, vpos_on_ref) in r.aligned_pairs
                if vpos_on_ref == var.POS - 1
            ]
            assert len(vpos_on_read) == 1
            vpos_on_read = vpos_on_read[0]
            if vpos_on_read is None:  # skip deletions
                continue

            b = r.query[vpos_on_read]
            # quality characters are decoded with an ASCII offset of 33
            bq = ord(r.qqual[vpos_on_read]) - 33
            mq = r.mapq

            if bq < args.min_bq:
                continue

            assert len(var.REF) == 1 and len(var.ALT) == 1
            if b.upper() == var.REF[0].upper():
                ref_mquals.append(mq)
                ref_bquals.append(bq)
            elif b.upper() == str(var.ALT[0]).upper():
                alt_mquals.append(mq)
                alt_bquals.append(bq)
            else:
                LOG.debug("Skipping non-ref-alt base %s at %s:%d" %
                          (b, var.CHROM, var.POS))
                continue

        LOG.debug("After filtering at %s:%d: %d ref mquals and %d alt mquals" %
                  (var.CHROM, var.POS, len(ref_mquals), len(alt_mquals)))

        # mannwhitneyu fails if all values the same
        if len(set(ref_mquals).union(alt_mquals)) == 1:
            m_pv = 1.0
        elif len(ref_mquals) == 0 or len(alt_mquals) == 0:
            m_pv = 1.0
        else:
            # compute only if alternate quals are smaller on average
            if mean(alt_mquals) < mean(ref_mquals):
                ustat = mannwhitneyu(ref_mquals, alt_mquals)
                m_pv = ustat[1]
            else:
                m_pv = 1.0

        # same for bqs
        if len(set(ref_bquals).union(alt_bquals)) == 1:
            b_pv = 1.0
        elif len(ref_bquals) == 0 or len(alt_bquals) == 0:
            b_pv = 1.0
        else:
            if mean(alt_bquals) < mean(ref_bquals):
                ustat = mannwhitneyu(ref_bquals, alt_bquals)
                b_pv = ustat[1]
            else:
                b_pv = 1.0

        c_pv = fisher_comb(m_pv, b_pv)

        LOG.debug("%s %d: mb %f bb %f cb %f" %
                  (var.CHROM, var.POS, m_pv, b_pv, c_pv))

        var.INFO['MB'] = prob_to_phredqual(m_pv)
        var.INFO['BB'] = prob_to_phredqual(b_pv)
        var.INFO['CB'] = prob_to_phredqual(c_pv)

        if use_mtc:
            pvalues.append(phredqual_to_prob(int(var.INFO[args.mtc_tag])))
            pvalue_var_idxs.append(var_no)

    if use_mtc:

        ftag = "%s<%f" % (args.mtc, args.mtc_alpha)
        rej_idxs = []
        if args.mtc == 'bonf':
            rej_idxs = [
                i for (i, p) in enumerate(
                    multiple_testing.Bonferroni(pvalues).corrected_pvals)
                if p < args.mtc_alpha
            ]

        elif args.mtc == 'holmbonf':
            # NOTE(review): this branch applies the same Bonferroni correction
            # as 'bonf'. If multiple_testing offers a Holm-Bonferroni
            # implementation it was presumably intended here -- confirm.
            rej_idxs = [
                i for (i, p) in enumerate(
                    multiple_testing.Bonferroni(pvalues).corrected_pvals)
                if p < args.mtc_alpha
            ]

        elif args.mtc == 'fdr':
            rej_idxs = fdr.fdr(pvalues, a=args.mtc_alpha)

        else:
            raise ValueError("unknown MTC method %s" % args.mtc)

        for i in rej_idxs:
            # rej_idxs index into pvalues; translate back to the variant the
            # p-value belongs to before tagging it.
            # pyvcf filter is empty if not set. lofreq's vcf clone was . or PASS
            variants[pvalue_var_idxs[i]].FILTER.append(ftag)

        LOG.info("%d of %d variants didn't pass filter" %
                 (len(rej_idxs), len(variants)))

    # pyvcf doesn't need write_metainfo or write_header
    for var in variants:
        filtered = len(var.FILTER) > 0 and var.FILTER not in [".", "PASS"]
        if args.pass_only and filtered:
            continue
        # LoFreq's vcf clone called this write_rec()
        vcf_writer.write_record(var)

    samfh.close()
    if fh_out != sys.stdout:
        fh_out.close()
Example #9
0
     try:
         self.input_vcf_file = input_vcf_file
         self.reader = vcf.VCFReader(filename=self.input_vcf_file)
     except Exception, e:
         logging.error("Error opening input VCF file: " + str(e))
         raise ValueError("Error opening input VCF file: " + str(e))
 else:
     logging.error("Input VCF file does not exist!")
     raise ValueError("Input VCF file does not exist!")
 # loads writer
 try:
     if output_vcf_file is None or output_vcf_file == "":
         output_vcf = sys.stdout
     else:
         output_vcf = open(output_vcf_file, 'w')
     self.writer = vcf.VCFWriter(output_vcf, self.reader)
 except Exception, e:
     logging.error("Error opening output VCF file: " + str(e))
     raise ValueError("Error opening output VCF file: " + str(e))
 # loads writer for duplicated variants
 try:
     if output_vcf_file is None or output_vcf_file == "":
         output_duplicated_vcf = sys.stderr
     else:
         duplicated_vcf_file = os.path.join(
             os.path.dirname(os.path.realpath(output_vcf_file)),
             os.path.splitext(os.path.basename(output_vcf_file))[0] +
             ".duplicated.vcf")
         output_duplicated_vcf = open(duplicated_vcf_file, 'w')
     self.writer_duplicated = vcf.VCFWriter(output_duplicated_vcf,
                                            self.reader)
def main():
    """The main function

    Streams variants from the input VCF and, for every SNV, counts the reads
    in the BAM file showing the reference or the alternate base at the
    variant position. A variant with enough alternate-allele evidence in the
    BAM (see the criterion quoted in the body) gets ``FILTER_TAG`` appended
    to its filters. All variants are written to the output VCF, except
    tagged ones when ``args.pass_only`` is set. Indels are skipped and not
    written at all.
    """

    parser = cmdline_parser()
    args = parser.parse_args()

    if args.verbose:
        LOG.setLevel(logging.INFO)
    if args.debug:
        LOG.setLevel(logging.DEBUG)

    assert os.path.exists(args.bam), ("BAM file %s does not exist" % args.bam)
    samfh = pysam.Samfile(args.bam)

    # setup vcf_reader
    #
    if args.vcfin[-3:] == '.gz':
        fh_in = gzip.open(args.vcfin)
        compressed = True
    else:
        compressed = False
        if args.vcfin == '-':
            fh_in = sys.stdin
        else:
            fh_in = open(args.vcfin)
    vcf_reader = vcf.VCFReader(fh_in, compressed)

    # setup vcf_writer
    #
    if args.vcfout == '-':
        fh_out = sys.stdout
    else:
        if os.path.exists(args.vcfout):
            LOG.fatal("Cowardly refusing to overwrite already existing"
                      " file %s" % (args.vcfout))
            sys.exit(1)

        if args.vcfout[-3:] == '.gz':
            fh_out = gzip.open(args.vcfout, 'w')
        else:
            fh_out = open(args.vcfout, 'w')

    # pyvcf needs template as arg to VCFWriter, whereas LoFreq's vcf
    # clone didn't
    vcf_writer = vcf.VCFWriter(fh_out, vcf_reader, lineterminator=os.linesep)
    # FIXME should add filter description to header

    for (var_no, var) in enumerate(vcf_reader):
        if var_no % 500 == 1:
            LOG.info("Analyzing variant %d" % (var_no))

        if 'INDEL' in var.INFO:
            LOG.warn("Skipping indel %s:%d" % (var.CHROM, var.POS))
            continue
        if len(var.REF) > 1 or len(var.ALT) > 1:
            LOG.warn("Skipping indel (not tagged as such) %s:%d" %
                     (var.CHROM, var.POS))
            continue

        reads = list(
            samfh.fetch(reference=var.CHROM, start=var.POS - 1, end=var.POS))
        LOG.debug("%s %d: %d (unfiltered) reads covering position" %
                  (var.CHROM, var.POS, len(reads)))

        ref_bquals = []
        alt_bquals = []

        # FIXME huge code overlap with lofreq2_bias.py
        for r in reads:

            if skip_read(r):
                continue

            # determine position on read for variant to then determine
            # the current base and its basequal
            #
            vpos_on_read = [
                vpos_on_read for (vpos_on_read, vpos_on_ref) in r.aligned_pairs
                if vpos_on_ref == var.POS - 1
            ]
            assert len(vpos_on_read) == 1
            vpos_on_read = vpos_on_read[0]
            if vpos_on_read is None:  # skip deletions
                continue

            b = r.query[vpos_on_read]
            # quality characters are decoded with an ASCII offset of 33
            bq = ord(r.qqual[vpos_on_read]) - 33

            assert len(var.REF) == 1 and len(var.ALT) == 1
            if b.upper() == var.REF[0].upper():
                ref_bquals.append(bq)
            elif b.upper() == str(var.ALT[0]).upper():
                alt_bquals.append(bq)
            else:
                LOG.debug("Skipping non-ref-alt base %s at %s:%d" %
                          (b, var.CHROM, var.POS))
                continue

        # " A candidate is rejected if, in the control data, there are
        # (i) >= 2 observations of the alternate allele or they represent
        # >= 3% of the reads; and (ii) their sum of quality scores is >=
        # 20."
        # NOTE(review): the quoted criterion says ">= 20" while the test
        # below uses "> 20" -- confirm which one is intended.
        # FIXME set filter var.INFO['AN'] = True
        print_this_var = True
        num_alt = len(alt_bquals)
        num_ref = len(ref_bquals)
        num_both = num_alt + num_ref
        if num_both == 0:
            LOG.warn("No alt or ref bases for var %s" % var)
        elif (num_alt >= 2 or num_alt / float(num_both) >= 0.03
              ) and sum(alt_bquals) > 20:
            var.FILTER.append(FILTER_TAG)
            if args.pass_only:
                print_this_var = False
        if print_this_var:
            # LoFreq's vcf clone called this write_rec()
            vcf_writer.write_record(var)

    samfh.close()
    # The input handle must be compared against stdin (not stdout);
    # otherwise stdin gets closed when the VCF is read from '-'.
    if fh_in != sys.stdin:
        fh_in.close()
    if fh_out != sys.stdout:
        fh_out.close()
Example #11
0
def makevcf(lMutatationsFile, lVcfTemplate, lMixtureRatio, lJobName):
    """
    Create vcf file according to values in the mutations file
    The header will be picked up from the template vcf file

    One file named ``<lJobName>_contributor<N>.vcf`` is written per entry of
    the mixture ratio. Every row of the mutations file yields one record,
    built from a copy of the first record of the template, unless the
    contributor's alternate allele or allele frequency is '-' or missing.

    :param lMutatationsFile: Mutations csv file
    :param lVcfTemplate: Template vcf file for header
    :param lMixtureRatio: Mixture ratio
    :param lJobName: Jobname
    :return: A dict with paths of vcf files created
    """
    rVcfFilePathDict = {}
    contributorList = lMixtureRatio.split(":")
    NoAlleleSpecifiedChar = '-'

    # Reading in the vcf template file for header and a dummy record.
    # Both are held in memory afterwards, so the file can be closed here.
    with open(lVcfTemplate, 'r') as templateHandle:
        vcfTemplateObj = vcf.Reader(templateHandle)
        dummyRecord = next(vcfTemplateObj)

    # Create a vcf file for each contributor
    for currentContributor, _ in enumerate(contributorList):
        contributorStrRep = 'contributor' + str((currentContributor + 1))
        vcfFileName = lJobName + "_" + contributorStrRep + ".vcf"
        vcfWriteObj = vcf.VCFWriter(open(vcfFileName, 'w'), vcfTemplateObj)
        logging.info('Creating vcf file for %s' % contributorStrRep)

        try:
            with open(lMutatationsFile, 'rU') as csvFileName:
                csvFile = csv.DictReader(csvFileName, dialect=csv.excel)

                for indRow in csvFile:
                    alternateAlleles = indRow['Alternate Alleles'].split('/')
                    alleleFreq = indRow['Allele Frequency'].split('/')
                    try:
                        currentAlternateAllele = alternateAlleles[
                            currentContributor]
                        currentAlleleFreq = alleleFreq[currentContributor]
                    except IndexError:
                        # contributorStrRep is a string, so it must be
                        # formatted with %s (%d made the logging call fail).
                        logging.warning(
                            'No alternate allele or freq given for contributor %s at site %s:%s',
                            contributorStrRep, indRow['Chromosome'],
                            indRow['Position'])
                        continue

                    if (currentAlternateAllele == NoAlleleSpecifiedChar or
                            currentAlleleFreq == NoAlleleSpecifiedChar):
                        continue

                    currentRecord = deepcopy(dummyRecord)
                    currentRecord.CHROM = indRow['Chromosome']
                    currentRecord.POS = indRow['Position']
                    currentRecord.REF = indRow['Reference Allele']
                    currentRecord.ALT = currentAlternateAllele.split(
                        ',')  # pyvcf takes a list for ALT
                    currentRecord.INFO['AF'] = currentAlleleFreq
                    # if HOM then pl=3 else pl=default 2 from template
                    if float(currentAlleleFreq) >= 1.0:
                        currentRecord.INFO['pl'] = 3
                    # Adding support for mutation type i.e. SUBSTITUTE for
                    # SNP, INSERT/DELETE for ins/del. Only the first
                    # alternate allele is examined, as multiple alternate
                    # alleles for one contributor are not expected.
                    refLength = len(currentRecord.REF)
                    altLength = len(currentRecord.ALT[0])
                    if refLength == altLength:
                        currentRecord.INFO['mt'] = 'SUBSTITUTE'
                    elif refLength > altLength:
                        currentRecord.INFO['mt'] = 'DELETE'
                    else:
                        currentRecord.INFO['mt'] = 'INSERT'
                    vcfWriteObj.write_record(currentRecord)
        finally:
            # Close the output even if a row cannot be processed.
            vcfWriteObj.close()

        rVcfFilePathDict[contributorStrRep] = os.path.abspath(vcfFileName)

    return rVcfFilePathDict
    def addTSSInfo(self, vcfInputFile):
        """
        Annotate each variant with a TSSOL INFO entry and write 'output.vcf'.

        Start and end of every record are lifted over with the
        'hg38ToHg19.over.chain.gz' chain. For variants that can be lifted,
        the FANTOM5 SPARQL endpoint is queried with the lifted coordinates
        and the outcome is stored as ``TSSOL`` on the record. Variants
        without a liftover result are written unchanged. Running totals are
        printed after every record.

        :param vcfInputFile: path of the VCF file to read
        """
        with open(vcfInputFile, 'r') as vcf_handle:
            vcf_reader = vcf.Reader(vcf_handle)
            vcf_reader.infos['TSSOL'] = VcfInfo(
                'TSSOL', vcf_field_counts['A'], 'String',
                'Info indicates whether the variant overlapping with the'
                ' transcription start site(TSS)')

            vcf_writer = vcf.VCFWriter(open('output.vcf', 'w'), vcf_reader)
            try:
                query = SPARQLQueries.sparqlQueries()

                totalVar = 0
                tssOLVar = 0

                lo = LiftOver('hg38ToHg19.over.chain.gz')

                for record in vcf_reader:
                    variantStart = record.start
                    variantEnd = record.end
                    variantChromosome = record.CHROM
                    variantSubType = record.var_subtype
                    isOverlapping = False

                    # Adding chr prefix to the chromosome
                    if "chr" not in variantChromosome:
                        variantChromosome = "chr" + str(record.CHROM)

                    # liftover to hg19 (chain file: hg38ToHg19)
                    startHits = lo.convert_coordinate(variantChromosome,
                                                      variantStart)

                    print(variantStart)
                    print(variantEnd)

                    # The end is only converted when the start could be
                    # lifted. A result may be None or an empty list; both
                    # mean "no liftover" and must not be popped from.
                    endHits = None
                    if startHits:
                        endHits = lo.convert_coordinate(variantChromosome,
                                                        variantEnd)

                    if startHits and endHits:
                        # use the last reported hit of each conversion
                        variantChromosomehg19 = startHits[-1][0]
                        variantStarthg19 = startHits[-1][1]
                        variantEndhg19 = endHits[-1][1]

                        # SPARQL query
                        result = query.getTSS(
                            'http://ep.dbcls.jp/fantom5/sparql',
                            variantStarthg19, variantEndhg19,
                            variantChromosomehg19)

                        for row in result:
                            cageStart = sparql.unpack_row(row)[1]

                            # NOTE(review): the un-lifted start is compared
                            # with the cage start here -- confirm that this
                            # is intended.
                            if variantSubType == 'ins':
                                isOverlapping = variantStart > cageStart
                            else:
                                isOverlapping = cageStart > 0
                            if isOverlapping:
                                tssOLVar = tssOLVar + 1
                            # NOTE(review): only the first result row is
                            # ever examined (as before) -- confirm whether
                            # later rows should be checked as well.
                            break

                        totalVar = totalVar + 1
                        record.add_info('TSSOL', [isOverlapping])
                    else:
                        # record.ID may be None, so convert it explicitly
                        print("No liftover found for this pos = " +
                              str(record.ID))

                    vcf_writer.write_record(record)

                    print("No of variants = " + str(totalVar))
                    print("No of tss overlapping variants = " + str(tssOLVar))
            finally:
                # Closing the writer makes sure output.vcf is complete.
                vcf_writer.close()
        self.v1 = vcf.Reader(open(vcf1))
        self.v2 = vcf.Reader(open(vcf2))
        #self.it = itertools.product(self.v1, self.v2)

    def intersect(self):
        """
        Compare every record of ``self.v1`` with every record of ``self.v2``.

        For each pair a line with the 1-based positions ``(i, j)`` of the two
        records is printed; for equal records the start, end and CHROM of the
        left record are appended.

        ``self.v2`` is read into a list once, because a reader can only be
        iterated a single time: iterating it directly inside the outer loop
        compared nothing but the first left record. Both readers are
        consumed by this call.
        """
        rights = list(self.v2)
        for i, left in enumerate(self.v1, 1):
            for j, right in enumerate(rights, 1):
                if left == right:
                    print("(%d, %d) %s %s %s" %
                          (i, j, left.start, left.end, left.CHROM))
                else:
                    print("(%d, %d)" % (i, j))





if __name__ == "__main__":
    # Command line: <first VCF> <second VCF> <output VCF>
    # The class-based variant is disabled:
    #isec = intersection(sys.argv[1], sys.argv[2])
    #isec.intersect()

    first_handle = open(sys.argv[1])
    r1 = vcf.Reader(first_handle)
    second_handle = open(sys.argv[2])
    r2 = vcf.Reader(second_handle)
    # the first reader serves as header template for the output
    out_handle = open(sys.argv[3], 'w')
    w = vcf.VCFWriter(out_handle, r1)
    intersectIter2(r1, r2, w)