Example #1
0
    def run(self):
        """Run the complete pipeline from barcode detection to the chain file.

        The phases run in order: barcode detection, BAM rewrite followed by
        ``pysam.sort("-n", ...)``, consensus building, consensus / haplotype /
        VCF output, and finally the summary statistics and the chain file.
        After each step a line prefixed with ``self.get_time()`` is written to
        ``sys.stderr``.  The helper objects are kept on the instance as
        ``self.bc``, ``self.consensus``, ``self.ovcf`` and ``self.ochain``.
        """

        def log(message):
            # The timestamp is taken at the moment the line is written.
            sys.stderr.write("[%s] %s\n" % (self.get_time(), message))

        # Phase 1 - Detection of BarCode
        self.bc = BarCode(self.bam)
        log("Starting BarCode Analysis ")

        self.bc.simple_approach()
        log("Analyzed BarCodes ")

        self.bc.write_barcodes(self.barcodes)
        log("Wrote BarCodes")

        # Phase 2 - Rewrite BAM
        log("Starting Sort and Rewrite BAM")

        self.bc.load_barcodes(self.barcodes)
        log("Loaded BarCodes")

        # The BAM handle was already used in phase 1, so it is reset before
        # being read again.
        self.bc.bam.reset()
        self.bc.sort_and_rewrite_bam(self.rewritten_bam)
        pysam.sort("-n", self.rewritten_bam,
                   self.rewritten_sorted_bam.replace(".bam", ""))
        log("Sort and Rewrite BAM")

        # Phase 3 - Build Consensus
        self.consensus = Consensus(self.rewritten_sorted_bam, self.ref)
        log("Starting Consensus Building")

        self.consensus.build()
        log("Built and Calculated Consensus")

        self.consensus.infer_consensus(self.consensus_reference)
        log("Inferred Consensus")

        # Phase 4 - Call Variants and Haplotypes
        self.consensus.output_consensus_genomes(self.consensus_genomes)
        log("Output Consensus Genomes")

        self.consensus.output_haplotype_distribution(self.haplotype_distribution)
        log("Output Haplotype Distribution")

        self.ovcf = VCF(self.vcf, crossmap=self.crossmap)
        self.ovcf.get_variants(self.ref.sequence,
                               self.consensus.consensus_genomes)
        self.ovcf.output_vcf(self.ref.sequence)
        log("Output VCF")

        # Phase 5 - Summary Statistics and Chain Files
        # ``with`` guarantees the summary file is closed even if one of the
        # writers raises.
        with open(self.out, "w") as f_out:
            self.consensus.output_consensus_coverage(f_out)
            self.ovcf.output_variants_distribution(f_out)
            self.bc.output_reads_in_barcode_distribution(f_out)
        log("Output Summary Statistics")

        self.ochain = Chain(self.chain)
        self.ochain.output_chain(self.ref,
                                 self.consensus.inferred_consensus,
                                 self.consensus.inferred_structure)
        log("Output Chain File")
Example #2
0
    def as_indel(self, ref_fasta):
        """Format this event as one tab-separated VCF-style indel record.

        ``ref_fasta`` must provide ``fetch(chrom, start, end)``; every fetched
        sequence is upper-cased.  For a ``'del'`` rearrangement REF is the
        sequence fetched for ``breaks[0] - 1 .. breaks[1] - 1`` and ALT the
        sequence fetched for ``breaks[0] - 1 .. breaks[0]``.  For any other
        rearrangement REF is fetched for ``breaks[0] - 1 .. breaks[1]`` and
        ALT is REF followed by the upper-cased novel sequence.

        Returns:
            ``CHROM POS ID REF ALT QUAL FILTER INFO`` joined by tabs, with
            QUAL and FILTER set to ``'.'``.
        """
        name = self.chroms[0]
        # str.lstrip('chr') removes any leading 'c', 'h' or 'r' characters
        # (e.g. 'hs37d5' -> 's37d5'), so only the literal prefix is removed.
        chrom = name[3:] if name.startswith('chr') else name
        start = self.breaks[0]

        if self.rearrangement == 'del':
            ref = ref_fasta.fetch(name, start - 1, self.breaks[1] - 1).upper()
            alt = ref_fasta.fetch(name, start - 1, start).upper()
        else:
            ref = ref_fasta.fetch(name, start - 1, self.breaks[1]).upper()
            alt = ref + self.novel_seq.upper()

        info = {'BKPTID': ','.join(self.contigs)}

        # read support
        if self.final_support is not None:
            info['SPANNING_READS'] = self.support['spanning']

        # somatic
        if self.somatic:
            info['SOMATIC'] = 'SOMATIC'

        # repeat contraction: only reported for deletions with a repeat unit
        if self.rearrangement == 'del' and self.repeat_seq is not None:
            info['REPEAT_SEQ'] = self.repeat_seq
            if self.repeat_num is not None:
                info['REPEAT_NUM'] = self.repeat_num
            if self.repeat_num_change is not None:
                info['REPEAT_NUM_CHANGE'] = self.repeat_num_change

        fields = [chrom, start, self.id, ref, alt, '.', '.',
                  VCF.info_dict_to_str(info)]
        return '\t'.join(map(str, fields))
Example #3
0
def main():
    """Run the VCF conversion steps followed by the Admixture steps.

    Options come from ``ComLine(sys.argv[1:])``.  The VCF is converted and
    passed through ``plink()``, the populations read from the popmap are
    printed, and an ``Admixture`` run is executed on ``vcf_file.prefix``.
    """
    cli = ComLine(sys.argv[1:])
    opts = cli.args

    vcf_file = VCF(opts.vcf, opts.thin, opts.maf,
                   opts.indcov, opts.snpcov, opts.bi)

    # convert to Plink
    vcf_file.convert()

    populations = Popmap(opts.popmap)
    vcf_file.plink()
    vcf_file.print_populations(populations)

    admix_run = Admixture(vcf_file.prefix, opts.np, opts.minK,
                          opts.maxK, opts.rep, opts.cv)
    admix_run.admix()
    admix_run.create_zip()
    admix_run.loglik()
    admix_run.print_cv()
Example #4
0
    def as_indel(self, ref_fasta):
        """Format this event as one tab-separated VCF-style indel record.

        ``ref_fasta`` must provide ``fetch(chrom, start, end)``; every fetched
        sequence is upper-cased.  For a ``'del'`` rearrangement REF is the
        sequence fetched for ``breaks[0] - 1 .. breaks[1] - 1`` and ALT the
        sequence fetched for ``breaks[0] - 1 .. breaks[0]``.  For any other
        rearrangement REF is fetched for ``breaks[0] - 1 .. breaks[1]`` and
        ALT is REF followed by the upper-cased novel sequence.

        Returns:
            ``CHROM POS ID REF ALT QUAL FILTER INFO`` joined by tabs, with
            QUAL and FILTER set to ``'.'``.
        """
        # NOTE: the body is indented with spaces only; the previous mix of
        # tabs and spaces is rejected by Python 3 (TabError).
        name = self.chroms[0]
        # str.lstrip('chr') removes any leading 'c', 'h' or 'r' characters
        # (e.g. 'hs37d5' -> 's37d5'), so only the literal prefix is removed.
        chrom = name[3:] if name.startswith('chr') else name
        start = self.breaks[0]

        if self.rearrangement == 'del':
            ref = ref_fasta.fetch(name, start - 1, self.breaks[1] - 1).upper()
            alt = ref_fasta.fetch(name, start - 1, start).upper()
        else:
            ref = ref_fasta.fetch(name, start - 1, self.breaks[1]).upper()
            alt = ref + self.novel_seq.upper()

        info = {'BKPTID': ','.join(self.contigs)}

        # read support
        if self.final_support is not None:
            info['SPANNING_READS'] = self.support['spanning']

        # somatic
        if self.somatic:
            info['SOMATIC'] = 'SOMATIC'

        # repeat contraction: only reported for deletions with a repeat unit
        if self.rearrangement == 'del' and self.repeat_seq is not None:
            info['REPEAT_SEQ'] = self.repeat_seq
            if self.repeat_num is not None:
                info['REPEAT_NUM'] = self.repeat_num
            if self.repeat_num_change is not None:
                info['REPEAT_NUM_CHANGE'] = self.repeat_num_change

        fields = [chrom, start, self.id, ref, alt, '.', '.',
                  VCF.info_dict_to_str(info)]
        return '\t'.join(map(str, fields))
Example #5
0
    def assemble_consensus_genomes(self):
        """Build the consensus and write every output derived from it.

        In order: consensus building and inference, consensus genomes,
        haplotype distribution, the Quark analysis, the VCF, the summary
        statistics and the chain file.  A line prefixed with
        ``self.get_time()`` goes to ``sys.stderr`` after each step.
        """

        def report(step):
            sys.stderr.write("[%s] %s\n" % (self.get_time(), step))

        # build consensus
        self.consensus = Consensus(self.rewritten_sorted_bam, self.ref)
        report("Starting Consensus Building")

        self.consensus.build(debug=self.debug)
        report("Built and Calculated Consensus")

        self.consensus.infer_consensus(self.consensus_reference)
        report("Inferred Consensus")

        self.consensus.output_consensus_genomes(self.consensus_genomes)
        report("Output Consensus Genomes")

        self.consensus.output_haplotype_distribution(self.haplotype_distribution)
        report("Output Haplotype Distribution")

        # Quark receives the frequency distribution as (key, value) pairs,
        # ordered by descending value.
        self.quark = Quark(self.ref.sequence)
        ranked = sorted(self.consensus.freq_distribution.items(),
                        key=lambda q: q[1],
                        reverse=True)
        self.quark.distance_matrix(ranked)
        self.quark.graph_it()
        self.quark.rank_it(self.rank)
        report("Output Quark Analysis")

        self.ovcf = VCF(self.vcf, crossmap=self.crossmap)
        self.ovcf.get_variants(self.ref.sequence,
                               self.consensus.consensus_genomes)
        self.ovcf.output_vcf(self.ref.sequence)
        report("Output VCF")

        self.summary_statistics()
        report("Output Summary Statistics")

        self.ochain = Chain(self.chain)
        self.ochain.output_chain(self.ref,
                                 self.consensus.inferred_consensus,
                                 self.consensus.inferred_structure)
        report("Output Chain File")
Example #6
0
import sys
import matplotlib.pyplot as plt
from matplotlib_venn import venn2

sys.path.append("/home/zhusitao/ngs-tools/format/")
from vcf import VCF

# Command line: <vcf1> <title1> <vcf2> <title2>
vcf1 = sys.argv[1]
title1 = sys.argv[2]
vcf2 = sys.argv[3]
title2 = sys.argv[4]


def _variant_keys(reader):
    """Return one string key per record yielded by ``reader.readVCF()``.

    CHROM, POS, REF and ALT are joined with a tab so that field boundaries
    stay unambiguous.  Plain concatenation would make, for example,
    chrom "1" / pos "23" collide with chrom "12" / pos "3" and inflate the
    intersection of the two files.
    """
    return ["\t".join((record.CHROM, record.POS, record.REF, record.ALT))
            for record in reader.readVCF()]


v1 = VCF(vcf1)
list1 = _variant_keys(v1)

v2 = VCF(vcf2)
list2 = _variant_keys(v2)

# Distinct variants per file and the variants shared by both.
set1 = set(list1)
set2 = set(list2)
inter = set1 & set2

inter_len = len(inter)
set1_len = len(set1)
set2_len = len(set2)
def main():
    """Load the VCF together with its popmap and write both output files.

    Options come from ``ComLine(sys.argv[1:])``; ``printFile`` and
    ``printPrivate`` both receive the ``out`` option.
    """
    cli = ComLine(sys.argv[1:])
    opts = cli.args

    pops = Popmap(opts.popmap)
    vcf = VCF(opts.vcf, pops)

    vcf.printFile(opts.out)
    vcf.printPrivate(opts.out)
Example #8
0
def _build_arg_parser():
    """Return the ``argparse`` parser describing every command-line option."""
    parser = argparse.ArgumentParser()
    # required option (not positional): the directory to work on
    parser.add_argument('-d',
                        '--directory',
                        '--dir',
                        help='parent directory of the samples i.e. Sample_054',
                        required=True)
    # optional flags and values
    parser.add_argument(
        '-f',
        '--filter',
        action='store_true',
        help=
        'use argument to keep the following columns: CHROM, POS, REF, ALT, genotype info. \n'
        'Note: if option selection, the -d can either be the vcf.gz or the parent directory. '
        'If vcf.gz, it will perform filtering on that file. Else, it will perform filtering on '
        'all the files in the hierarchical directory.')
    parser.add_argument(
        '-m',
        '--merge',
        action='store_true',
        help=
        'use argument to merge all filtered.vcf.gz files in the parent directory'
    )
    parser.add_argument(
        '-o',
        '--output',
        help='directory where to output the filtered or merged files')
    parser.add_argument('-ht',
                        '--homozygous_test',
                        action='store_true',
                        help='use argument to collect homozygous statistics')
    parser.add_argument(
        '-s',
        '--subset',
        action='store_true',
        help='use argument to subset the vcf file based on chromosomes')
    parser.add_argument(
        '-c',
        '--chromosome',
        help=
        'use argument to select the chromosome number on which to subset on')
    parser.add_argument(
        '-n',
        '--number_sites',
        help='use argument to select the number of line on which to subset on',
        type=int)
    parser.add_argument('-p',
                        '--phase',
                        action='store_true',
                        help='use argument to select the phasing test')
    parser.add_argument('-lc',
                        '--list_chromosomes',
                        action='store_true',
                        help='list all the chromosomes in a vcf.gz '
                        'file')
    return parser


def main():
    """Parse the command line and run the requested VCF operation.

    Exactly one branch is taken, in this order of precedence:

    * ``--list_chromosomes``: read the unfiltered files and list chromosomes;
    * ``filter_merge`` true (as returned by ``process_arguments``): read the
      files, then filter when ``--filter`` is set, otherwise merge;
    * ``--subset``: read the filtered files and subset on ``--chromosome``;
    * otherwise: read the filtered files for the chromosome and run the
      homozygous and/or phasing tests that were requested.

    Raises:
        ValueError: if ``--subset`` is given without ``--chromosome``.
    """
    parser = _build_arg_parser()
    args = parser.parse_args()

    working_directory, output_directory, chromosome, filter_merge = process_arguments(
        c_directory=args.directory,
        o_directory=args.output,
        filter_flag=args.filter,
        merge_flag=args.merge,
        chrom=args.chromosome,
        arg_parser=parser)

    # create VCF object
    vcf = VCF()

    if args.list_chromosomes:
        vcf.read_files(c_dir=working_directory, vcf_filtered_file=False)
        vcf.list_chrom(output_dir=output_directory)

    # filtering and merging have to read all the vcf files on the subdirectories of the families
    elif filter_merge:
        # read all the vcf files for that family
        vcf.read_files(c_dir=working_directory)
        # filter columns of the vcf files
        if args.filter:
            vcf.filter(output_dir=output_directory)
        else:
            vcf.merge(output_dir=output_directory)

    # subset, homozygous or phasing tests have to read a vcf file in the parent directory
    elif args.subset:
        # the chromosome is required here because the file is subsetted on it;
        # it is deliberately not passed to read_files for the subset
        if not args.chromosome:
            raise ValueError(
                'Chromosome number must be provided in order to perform subset'
            )

        # read all the vcf files for that family
        vcf.read_files(c_dir=working_directory, vcf_filtered_file=True)
        vcf.subset(chrom=args.chromosome,
                   output_dir=output_directory,
                   n_sites=args.number_sites)

    else:
        # read all the vcf files for that family
        vcf.read_files(c_dir=working_directory,
                       vcf_filtered_file=True,
                       chrom=chromosome)

        if args.homozygous_test:
            # collect homozygous statistics
            vcf.tests(output_dir=output_directory,
                      chrom=chromosome,
                      homozygous_test=True)

        if args.phase:
            vcf.tmp_test(output_dir=output_directory, chrom=chromosome)
Example #9
0
def main():
    """Run the VCF conversion steps followed by the Admixture steps.

    Options come from ``ComLine(sys.argv[1:])``.  The individual lists are
    compared against the popmap before the VCF is converted and passed
    through ``plink()``; populations and individuals are then printed and an
    ``Admixture`` run is executed on ``vcf_file.prefix``.
    """
    cli = ComLine(sys.argv[1:])
    opts = cli.args

    vcf_file = VCF(opts.vcf, opts.thin, opts.maf, opts.mac,
                   opts.indcov, opts.snpcov, opts.bi, opts.remove)

    # convert to Plink
    populations = Popmap(opts.popmap)
    vcf_file.compIndLists(populations)
    vcf_file.convert()
    vcf_file.plink()
    vcf_file.print_populations(populations)
    vcf_file.print_individuals(populations)

    admix_run = Admixture(vcf_file.prefix, opts.np, opts.minK,
                          opts.maxK, opts.rep, opts.cv)
    admix_run.admix()
    admix_run.create_zip()
    admix_run.loglik()
    admix_run.print_cv()
Example #10
0
    def as_sv(self, ref_fasta, id_ext=None, info_ext=None, chrom_ext=None, pos_ext=None):
        """Format this event as one tab-separated VCF-style symbolic SV record.

        Args:
            ref_fasta: object providing ``fetch(chrom, start, end)``; used to
                read the REF sequence at the first breakpoint.
            id_ext: ID to report instead of ``self.id``.
            info_ext: mapping of extra INFO entries; they override the
                computed ones, except that ``SVLEN == 'NA'`` is skipped.
            chrom_ext: chromosome to report instead of ``self.chroms[0]``.
            pos_ext: position to report instead of ``self.breaks[0]``.

        Returns:
            ``CHROM POS ID REF ALT QUAL FILTER INFO`` joined by tabs (QUAL and
            FILTER are ``'.'``), or ``None`` when ``self.rearrangement`` is
            not one of ``'del'``, ``'dup'``, ``'inv'`` or ``'ins'``.
        """
        # NOTE: the body is indented with spaces only; the previous mix of
        # tabs and spaces is rejected by Python 3 (TabError).
        symbolic = {
            'del': ('<DEL>', 'DEL'),
            'dup': ('<DUP:TANDEM>', 'DUP'),
            'inv': ('<INV>', 'INV'),
            'ins': ('<INS>', 'INS'),
        }
        # Unsupported rearrangements have no symbolic allele.  Return None
        # up front instead of failing later on an unbound SV type.
        if self.rearrangement not in symbolic:
            return None
        alt, sv_type = symbolic[self.rearrangement]

        chrom = self.chroms[0] if chrom_ext is None else chrom_ext
        pos = self.breaks[0] if pos_ext is None else pos_ext
        # str.lstrip('chr') removes any leading 'c', 'h' or 'r' characters
        # (e.g. 'hs37d5' -> 's37d5'), so only the literal prefix is removed.
        if chrom.startswith('chr'):
            chrom = chrom[3:]

        # REF is always read at the event's own first breakpoint, even when
        # an external chromosome/position is reported.
        ref = ref_fasta.fetch(self.chroms[0], self.breaks[0] - 1,
                              self.breaks[0]).upper()

        sv_len = self.get_size()
        has_len = isinstance(sv_len, int)
        end = None
        if sv_type == 'INS':
            end = pos
        elif has_len:
            end = pos + sv_len
        if sv_type == 'DEL' and has_len:
            # deletions report a negative length; END is unaffected
            sv_len = -sv_len

        info = {
            'SVTYPE': sv_type,
            'END': end,
            'BKPTID': ','.join(self.contigs),
        }
        if has_len:
            info['SVLEN'] = sv_len

        if sv_type == 'DUP':
            if self.repeat_seq is not None:
                info['REPEAT_SEQ'] = self.repeat_seq
            if self.repeat_num is not None:
                info['REPEAT_NUM'] = self.repeat_num
            if self.repeat_num_change is not None:
                info['REPEAT_NUM_CHANGE'] = self.repeat_num_change

        # read support
        if self.final_support is not None:
            info['SPANNING_READS'] = self.support['spanning']
            if self.support['flanking'] is not None:
                info['FLANKING_PAIRS'] = self.support['flanking']

        # somatic
        if self.somatic:
            info['SOMATIC'] = 'SOMATIC'

        # microhomology
        cipos = None
        homol_len = None
        homol_seq = None
        if self.homol_seq and self.homol_seq[0] != '-':
            homol_seq = self.homol_seq[0].upper()
            homol_len = len(self.homol_seq[0])
            contig_breaks = self.contig_breaks[0]
            # Adjacent contig coordinates (first + 1 == second, e.g. GMAP)
            # carry no CIPOS; overlapping or equal ones (e.g. BWA-mem) do.
            if contig_breaks[0] >= contig_breaks[1]:
                cipos = '0,%d' % homol_len

        if cipos is not None:
            info['CIPOS'] = cipos
        if homol_len is not None:
            info['HOMLEN'] = homol_len
        if homol_seq is not None:
            info['HOMSEQ'] = homol_seq

        # external info - overrides given info
        if info_ext:
            # .items() works on both Python 2 and 3 (iteritems is 2-only)
            for key, value in info_ext.items():
                if key == 'SVLEN' and value == 'NA':
                    continue
                info[key] = value

        sv_id = self.id if id_ext is None else id_ext
        fields = [chrom, pos, sv_id, ref, alt, '.', '.',
                  VCF.info_dict_to_str(info)]
        return '\t'.join(map(str, fields))
Example #11
0
class BaseSeq(Helper):
    """Driver that chains the barcode, consensus, VCF and chain-file steps.

    Each public method runs one stage (or, for ``run``, all of them) and
    writes progress lines prefixed with ``self.get_time()``.  ``get_time`` is
    not defined in this class; presumably it is inherited from ``Helper``.
    """

    def __init__(self, bam, barcodes=None, out=None, ref=None,
                 rewritten_bam=None,
                 consensus_reference=None,
                 consensus_genomes=None,
                 haplotype_distribution=None,
                 vcf=None,
                 chain=None,
                 crossmap=None,
                 export=None,
                 rank=None,
                 debug=None):
        """Store the input/output locations used by the individual stages.

        ``ref`` is wrapped in a ``Reference`` object.  When ``rewritten_bam``
        is given, the sorted BAM name is derived from it by replacing
        ``".bam"`` with ``".sorted.bam"``.  ``debug`` is stored as an int and
        defaults to ``0`` when omitted.
        """
        self.bam = bam
        self.barcodes = barcodes
        self.out = out
        self.ref = Reference(ref)

        self.rewritten_bam = rewritten_bam
        self.rewritten_sorted_bam = rewritten_bam.replace(".bam", ".sorted.bam") if rewritten_bam else None
        self.consensus_reference = consensus_reference
        self.consensus_genomes = consensus_genomes
        self.haplotype_distribution = haplotype_distribution
        self.vcf = vcf
        self.chain = chain
        self.crossmap = crossmap
        self.export = export
        self.rank = rank
        # int(None) raises TypeError, which made the default unusable.
        self.debug = int(debug) if debug is not None else 0

    def get_barcodes(self):
        """Detect barcodes and write ``barcode<TAB>read,read,...`` lines to ``self.out``.

        Barcodes and the reads of each barcode are written in sorted order.
        """
        # simple approach - align, take soft-clipped, and use the arbitrary 20 bases
        # intermediate approach - use the seed and extend approach
        with open(self.out, "w") as out:
            self.bc = BarCode(self.bam)
            self.bc.simple_approach()

            for k, v in sorted(self.bc.barcode_to_read.items()):
                out.write("%s\t%s\n" % (k, ",".join(sorted(v))))

    def error_correction_barcodes(self):
        """Load the barcodes and cluster them; progress goes to ``sys.stdout``."""
        # start analysis
        self.bc = BarCode(self.bam)
        sys.stdout.write("[%s] Starting Error Correction Analysis\n" % (self.get_time(),))

        # load barcodes
        self.bc.load_barcodes(self.barcodes)
        sys.stdout.write("[%s] Loaded BarCodes\n" % (self.get_time(),))

        # cluster barcodes
        self.bc.cluster_barcodes()
        sys.stdout.write("[%s] Clustered BarCodes\n" % (self.get_time(),))

    def filter_barcodes(self, barcode, export="fastq"):
        """Export the reads listed for ``barcode`` in the barcodes file.

        The barcodes file is scanned for the first line whose first
        tab-separated column equals ``barcode``; its second column is split
        on commas into read ids.  If no line matches, an empty id list is
        passed on.
        """
        list_of_ids = []
        with open(self.barcodes, "r") as f:
            for line in f:
                data = line.strip("\r\n").split("\t")
                if barcode == data[0]:
                    list_of_ids = data[1].split(",")
                    break

        self.bc = BarCode(self.bam)
        self.bc.filter_and_export(list_of_ids, self.out, export=export)

    def sort_and_rewrite_bam(self):
        """Load the barcodes, rewrite the BAM and sort it with ``pysam.sort("-n", ...)``."""
        self.bc = BarCode(self.bam)
        sys.stderr.write("[%s] Starting Sort and Rewrite BAM\n" % (self.get_time(),))

        self.bc.load_barcodes(self.barcodes)
        sys.stderr.write("[%s] Loaded BarCodes\n" % (self.get_time(),))

        self.bc.sort_and_rewrite_bam(self.rewritten_bam)
        pysam.sort("-n", self.rewritten_bam, self.rewritten_sorted_bam.replace(".bam", ""))
        sys.stderr.write("[%s] Sort and Rewrite BAM\n" % (self.get_time(),))

    def split_bam_by_barcode(self):
        """Split the BAM into per-barcode outputs via ``BarCode.split_bam_into_barcodes``."""
        self.bc = BarCode(self.bam)
        sys.stderr.write("[%s] Starting procedure to split BAM by barcode\n" % (self.get_time(),))

        self.bc.split_bam_into_barcodes(self.ref, self.out, self.export)
        sys.stderr.write("[%s] Finished splitting BAM by barcode id\n" % (self.get_time(),))

    def assemble_consensus_genomes(self):
        """Build the consensus from the sorted BAM and write all derived outputs.

        In order: consensus building and inference, consensus genomes,
        haplotype distribution, the Quark analysis, the VCF, the summary
        statistics and the chain file.
        """
        # build consensus
        self.consensus = Consensus(self.rewritten_sorted_bam, self.ref)
        sys.stderr.write("[%s] Starting Consensus Building\n" % (self.get_time(),))

        self.consensus.build(debug=self.debug)
        sys.stderr.write("[%s] Built and Calculated Consensus\n" % (self.get_time(),))

        self.consensus.infer_consensus(self.consensus_reference)
        sys.stderr.write("[%s] Inferred Consensus\n" % (self.get_time(),))

        self.consensus.output_consensus_genomes(self.consensus_genomes)
        sys.stderr.write("[%s] Output Consensus Genomes\n" % (self.get_time(),))

        self.consensus.output_haplotype_distribution(self.haplotype_distribution)
        sys.stderr.write("[%s] Output Haplotype Distribution\n" % (self.get_time(),))

        # Quark receives the frequency distribution as (key, value) pairs,
        # ordered by descending value.
        self.quark = Quark(self.ref.sequence)
        self.quark.distance_matrix(sorted(self.consensus.freq_distribution.items(),
                                          key=lambda q: q[1],
                                          reverse=True))
        self.quark.graph_it()
        self.quark.rank_it(self.rank)
        sys.stderr.write("[%s] Output Quark Analysis\n" % (self.get_time(),))

        self.ovcf = VCF(self.vcf, crossmap=self.crossmap)
        self.ovcf.get_variants(self.ref.sequence,
                               self.consensus.consensus_genomes)
        self.ovcf.output_vcf(self.ref.sequence)
        sys.stderr.write("[%s] Output VCF\n" % (self.get_time(),))

        self.summary_statistics()
        sys.stderr.write("[%s] Output Summary Statistics\n" % (self.get_time(),))

        self.ochain = Chain(self.chain)
        self.ochain.output_chain(self.ref,
                                 self.consensus.inferred_consensus,
                                 self.consensus.inferred_structure)
        sys.stderr.write("[%s] Output Chain File\n" % (self.get_time(),))

    def summary_statistics(self):
        """Write consensus coverage, variant and barcode distributions to ``self.out``.

        Requires ``self.consensus`` and ``self.ovcf`` to exist already; the
        barcode object is rebuilt from the barcodes file here.
        """
        # coverage per genome
        # variants per genome
        # estimate PCR and sequencing errors
        # barcode distribution
        # ``with`` guarantees the file is closed even if a writer raises.
        with open(self.out, "w") as f_out:
            self.bc = BarCode(self.bam)  #TEMP
            self.bc.load_barcodes(self.barcodes)  #TEMP

            self.consensus.output_consensus_coverage(f_out)
            self.ovcf.output_variants_distribution(f_out)
            self.bc.output_reads_in_barcode_distribution(f_out)

    def run(self):
        """Run the complete pipeline from barcode detection to the chain file.

        Unlike ``assemble_consensus_genomes``, this path builds the consensus
        without the ``debug`` argument and skips the Quark analysis.
        """
        # Phase 1 - Detection of BarCode
        self.bc = BarCode(self.bam)
        sys.stderr.write("[%s] Starting BarCode Analysis \n" % (self.get_time(),))

        self.bc.simple_approach()
        sys.stderr.write("[%s] Analyzed BarCodes \n" % (self.get_time(),))

        self.bc.write_barcodes(self.barcodes)
        sys.stderr.write("[%s] Wrote BarCodes\n" % (self.get_time(),))

        # Phase 2 - Rewrite BAM
        sys.stderr.write("[%s] Starting Sort and Rewrite BAM\n" % (self.get_time(),))

        self.bc.load_barcodes(self.barcodes)
        sys.stderr.write("[%s] Loaded BarCodes\n" % (self.get_time(),))

        # The BAM handle was already used in phase 1, so it is reset before
        # being read again.
        self.bc.bam.reset()
        self.bc.sort_and_rewrite_bam(self.rewritten_bam)
        pysam.sort("-n", self.rewritten_bam, self.rewritten_sorted_bam.replace(".bam", ""))
        sys.stderr.write("[%s] Sort and Rewrite BAM\n" % (self.get_time(),))

        # Phase 3 - Build Consensus
        self.consensus = Consensus(self.rewritten_sorted_bam, self.ref)
        sys.stderr.write("[%s] Starting Consensus Building\n" % (self.get_time(),))

        self.consensus.build()
        sys.stderr.write("[%s] Built and Calculated Consensus\n" % (self.get_time(),))

        self.consensus.infer_consensus(self.consensus_reference)
        sys.stderr.write("[%s] Inferred Consensus\n" % (self.get_time(),))

        # Phase 4 - Call Variants and Haplotypes
        self.consensus.output_consensus_genomes(self.consensus_genomes)
        sys.stderr.write("[%s] Output Consensus Genomes\n" % (self.get_time(),))

        self.consensus.output_haplotype_distribution(self.haplotype_distribution)
        sys.stderr.write("[%s] Output Haplotype Distribution\n" % (self.get_time(),))

        self.ovcf = VCF(self.vcf, crossmap=self.crossmap)
        self.ovcf.get_variants(self.ref.sequence,
                               self.consensus.consensus_genomes)
        self.ovcf.output_vcf(self.ref.sequence)
        sys.stderr.write("[%s] Output VCF\n" % (self.get_time(),))

        # Phase 5 - Summary Statistics and Chain Files
        with open(self.out, "w") as f_out:
            self.consensus.output_consensus_coverage(f_out)
            self.ovcf.output_variants_distribution(f_out)
            self.bc.output_reads_in_barcode_distribution(f_out)
        sys.stderr.write("[%s] Output Summary Statistics\n" % (self.get_time(),))

        self.ochain = Chain(self.chain)
        self.ochain.output_chain(self.ref,
                                 self.consensus.inferred_consensus,
                                 self.consensus.inferred_structure)
        sys.stderr.write("[%s] Output Chain File\n" % (self.get_time(),))

    def assemble_genomes(self):
        """Placeholder; not implemented."""
        pass

    def assemble_genomes_from_fastq(self):
        """Placeholder; not implemented."""
        pass
Example #12
0
    def as_sv(self,
              ref_fasta,
              id_ext=None,
              info_ext=None,
              chrom_ext=None,
              pos_ext=None):
        """Format this event as one tab-separated VCF-style symbolic SV record.

        Args:
            ref_fasta: object providing ``fetch(chrom, start, end)``; used to
                read the REF sequence at the first breakpoint.
            id_ext: ID to report instead of ``self.id``.
            info_ext: mapping of extra INFO entries; they override the
                computed ones, except that ``SVLEN == 'NA'`` is skipped.
            chrom_ext: chromosome to report instead of ``self.chroms[0]``.
            pos_ext: position to report instead of ``self.breaks[0]``.

        Returns:
            ``CHROM POS ID REF ALT QUAL FILTER INFO`` joined by tabs (QUAL and
            FILTER are ``'.'``), or ``None`` when ``self.rearrangement`` is
            not one of ``'del'``, ``'dup'``, ``'inv'`` or ``'ins'``.
        """
        symbolic = {
            'del': ('<DEL>', 'DEL'),
            'dup': ('<DUP:TANDEM>', 'DUP'),
            'inv': ('<INV>', 'INV'),
            'ins': ('<INS>', 'INS'),
        }
        # Unsupported rearrangements have no symbolic allele.  Return None
        # up front instead of failing later on an unbound SV type.
        if self.rearrangement not in symbolic:
            return None
        alt, sv_type = symbolic[self.rearrangement]

        chrom = self.chroms[0] if chrom_ext is None else chrom_ext
        pos = self.breaks[0] if pos_ext is None else pos_ext
        # str.lstrip('chr') removes any leading 'c', 'h' or 'r' characters
        # (e.g. 'hs37d5' -> 's37d5'), so only the literal prefix is removed.
        if chrom.startswith('chr'):
            chrom = chrom[3:]

        # REF is always read at the event's own first breakpoint, even when
        # an external chromosome/position is reported.
        ref = ref_fasta.fetch(self.chroms[0], self.breaks[0] - 1,
                              self.breaks[0]).upper()

        sv_len = self.get_size()
        has_len = isinstance(sv_len, int)
        end = None
        if sv_type == 'INS':
            end = pos
        elif has_len:
            end = pos + sv_len
        if sv_type == 'DEL' and has_len:
            # deletions report a negative length; END is unaffected
            sv_len = -sv_len

        info = {
            'SVTYPE': sv_type,
            'END': end,
            'BKPTID': ','.join(self.contigs),
        }
        if has_len:
            info['SVLEN'] = sv_len

        if sv_type == 'DUP':
            if self.repeat_seq is not None:
                info['REPEAT_SEQ'] = self.repeat_seq
            if self.repeat_num is not None:
                info['REPEAT_NUM'] = self.repeat_num
            if self.repeat_num_change is not None:
                info['REPEAT_NUM_CHANGE'] = self.repeat_num_change

        # read support
        if self.final_support is not None:
            info['SPANNING_READS'] = self.support['spanning']
            if self.support['flanking'] is not None:
                info['FLANKING_PAIRS'] = self.support['flanking']

        # somatic
        if self.somatic:
            info['SOMATIC'] = 'SOMATIC'

        # microhomology
        cipos = None
        homol_len = None
        homol_seq = None
        if self.homol_seq and self.homol_seq[0] != '-':
            homol_seq = self.homol_seq[0].upper()
            homol_len = len(self.homol_seq[0])
            contig_breaks = self.contig_breaks[0]
            # Adjacent contig coordinates (first + 1 == second, e.g. GMAP)
            # carry no CIPOS; overlapping or equal ones (e.g. BWA-mem) do.
            if contig_breaks[0] >= contig_breaks[1]:
                cipos = '0,%d' % homol_len

        if cipos is not None:
            info['CIPOS'] = cipos
        if homol_len is not None:
            info['HOMLEN'] = homol_len
        if homol_seq is not None:
            info['HOMSEQ'] = homol_seq

        # external info - overrides given info
        if info_ext:
            # .items() works on both Python 2 and 3 (iteritems is 2-only)
            for key, value in info_ext.items():
                if key == 'SVLEN' and value == 'NA':
                    continue
                info[key] = value

        sv_id = self.id if id_ext is None else id_ext
        fields = [chrom, pos, sv_id, ref, alt, '.', '.',
                  VCF.info_dict_to_str(info)]
        return '\t'.join(map(str, fields))
Example #13
0
    def as_breakends(self,
                     ref_fasta,
                     genomic=True,
                     max_novel_seq_len=50,
                     info_ext=None,
                     parids=None,
                     event=None):
        chroms = map(lambda c: c.lstrip('chr'), self.chroms)
        alt_chroms = chroms[:]
        pos = list(self.breaks)
        alt_pos = pos[:]
        # inserted novel sequences
        inserted_seqs = ['', '']
        if self.novel_seq and self.novel_seq != 'NA' and self.novel_seq != '-':
            if len(self.novel_seq) > max_novel_seq_len:
                alt_chroms[0] = '<%s>' % self.contigs[0]
                alt_chroms[1] = '<%s>' % self.contigs[0]
                alt_pos[1] = self.contig_breaks[0][0] + 1
                alt_pos[0] = self.contig_breaks[0][1] - 1
            else:
                if len(self.aligns[0]) == 1:
                    inserted_seqs[0] = self.novel_seq if self.aligns[0][
                        0].strand == '+' else reverse_complement(
                            self.novel_seq)
                    inserted_seqs[1] = self.novel_seq if self.aligns[0][
                        0].strand == '+' else reverse_complement(
                            self.novel_seq)
                else:
                    inserted_seqs[0] = self.novel_seq if self.aligns[0][
                        0].strand == '+' else reverse_complement(
                            self.novel_seq)
                    inserted_seqs[1] = self.novel_seq if self.aligns[0][
                        1].strand == '+' else reverse_complement(
                            self.novel_seq)

# microhomology, cipos
        cipos = None
        homol_len = None
        homol_seq = None
        if self.homol_seq and self.homol_seq[0] != '-' and len(
                self.homol_seq) > 0:
            homol_seq = self.homol_seq[0].upper()
            homol_len = len(self.homol_seq[0])
            contig_breaks = self.contig_breaks[0]
            # e.g. GMAP
            if contig_breaks[0] + 1 == contig_breaks[1]:
                pass
            # e.g. BWA-mem
            elif contig_breaks[0] >= contig_breaks[1]:
                pos[0] -= homol_len
                alt_pos[1] += homol_len
                cipos = '0,%d' % homol_len

        refs = (ref_fasta.fetch(self.chroms[0], self.breaks[0] - 1,
                                self.breaks[0]).upper(),
                ref_fasta.fetch(self.chroms[1], self.breaks[1] - 1,
                                self.breaks[1]).upper())

        ids = ('%s%s' % (self.id, 'a'), '%s%s' % (self.id, 'b'))

        svtype = 'BND' if genomic else 'FND'
        infos = [{
            'SVTYPE': svtype,
            'MATEID': ids[1],
            'EVENTTYPE': self.rearrangement.upper()
        }, {
            'SVTYPE': svtype,
            'MATEID': ids[0],
            'EVENTTYPE': self.rearrangement.upper()
        }]
        if cipos is not None:
            infos[0]['CIPOS'] = cipos
            infos[1]['CIPOS'] = cipos
        if homol_len is not None:
            infos[0]['HOMLEN'] = homol_len
            infos[1]['HOMLEN'] = homol_len
        if homol_seq is not None:
            infos[0]['HOMSEQ'] = homol_seq
            infos[1]['HOMSEQ'] = homol_seq

# read support
        if self.final_support is not None:
            #infos[0]['READSUPPORT'] = self.final_support
            #infos[1]['READSUPPORT'] = self.final_support
            infos[0]['SPANNING_READS'] = self.support['spanning']
            infos[1]['SPANNING_READS'] = self.support['spanning']
            if self.support['flanking'] is not NONE:
                infos[0]['FLANKING_PAIRS'] = self.support['flanking']
                infos[1]['FLANKING_PAIRS'] = self.support['flanking']

        adj_size = self.get_size()
        if type(adj_size) is int:
            infos[0]['SVLEN'] = adj_size
            infos[1]['SVLEN'] = adj_size

# somatic
        if self.somatic:
            infos[0]['SOMATIC'] = 'SOMATIC'
            infos[1]['SOMATIC'] = 'SOMATIC'

        # contig and contig breakpoints
        if self.contigs:
            for i in range(2):
                infos[i]['BKPTID'] = ','.join(self.contigs)

        if self.contig_breaks and len(self.contig_breaks) == len(self.contigs):
            contig_breaks = []
            for bk in self.contig_breaks:
                if len(bk) == 2:
                    contig_breaks.append('%s-%s' % (bk[0], bk[1]))
                else:
                    print 'error'

            if len(contig_breaks) == len(self.contigs):
                for i in range(2):
                    infos[i]['CTG_BKS'] = ','.join(contig_breaks)

# external info - overrides given info
        if info_ext:
            for key, value in info_ext.iteritems():
                if len(value) == 2:
                    infos[0][key] = value[0]
                    infos[1][key] = value[1]

        if self.orients[0] == 'L':
            # LL
            if self.orients[1] == 'L':
                alts = ('%s%s]%s:%s]' %
                        (refs[0], inserted_seqs[0], alt_chroms[1], alt_pos[1]),
                        '%s%s]%s:%s]' %
                        (refs[1], inserted_seqs[1], alt_chroms[0], alt_pos[0]))
            # LR
            else:
                alts = ('%s%s[%s:%s[' %
                        (refs[0], inserted_seqs[0], alt_chroms[1], alt_pos[1]),
                        ']%s:%s]%s%s' %
                        (alt_chroms[0], alt_pos[0], inserted_seqs[1], refs[1]))
        else:
            # RL
            if self.orients[1] == 'L':
                alts = (']%s:%s]%s%s' %
                        (chroms[1], alt_pos[1], inserted_seqs[0], refs[0]),
                        '%s%s[%s:%s[' %
                        (refs[1], inserted_seqs[1], chroms[0], alt_pos[0]))
            # RR
            else:
                alts = ('[%s:%s[%s%s' %
                        (chroms[1], alt_pos[1], inserted_seqs[0], refs[0]),
                        '[%s:%s[%s%s' %
                        (chroms[0], alt_pos[0], inserted_seqs[1], refs[1]))

        breakends = map(
            lambda i: '\t'.join([
                chroms[i],
                str(pos[i]), ids[i], refs[i], alts[i], '.', '.',
                VCF.info_dict_to_str(infos[i])
            ]), range(2))

        return '\n'.join(breakends)
Example #14
0
##indel_exonicfunc.xls
# Column titles for the indel exonic-function table: a 'Sample' column
# followed by one column per category named in the list.
title_indel_exonicfunc = ['Sample','frameshift_deletion','frameshift_insertion','nonframeshift_deletion','nonframeshift_insertion','stoploss','stopgain','unknown']
# Emit the titles as a single tab-separated header line.
# `indel_exonicfunc` is defined outside this excerpt (presumably an open,
# writable file object -- confirm).
indel_exonicfunc.write('\t'.join(title_indel_exonicfunc)+'\n')






# `files` (defined outside this excerpt) is opened and read line by line;
# every line not starting with '#' is used as a path.
# NOTE(review): the handle returned by open() is never closed explicitly, and
# the loop variable `file` shadows the Python 2 builtin of the same name.
for file in open(files, 'r'):
	# Skip lines beginning with '#'.
	if file.startswith('#'):continue
	# Remove surrounding whitespace, including the trailing newline.
	file = file.strip()
	# Sample name = basename of the path, up to (not including) the first '.'.
	sample_name = os.path.basename(file)
	sample_name = sample_name.split('.')[0]

	# Build a VCF object from the listed path and keep the result of filter().
	# What filter() returns is not visible here (presumably the retained SNP
	# records -- confirm).
	myVCF = VCF(file)
	snp = myVCF.filter()
	# Derive the companion indel path by textual substitution.
	# NOTE(review): str.replace rewrites every 'snp' substring in the path,
	# directory components included.
	indel_file = file.replace('snp','indel')
	# `myVCF` is rebound, so from here on it refers to the indel VCF object.
	myVCF = VCF(indel_file)
	indel = myVCF.filter()
	
	"""
	##chr.xls
	chr = myVCF.chr_stat(vcf)
	chromosome.write(sample_name)
	for i in [str(i) for i in range(1,23)]+['X','Y']:
		try:
			chromosome.write('\t'+str(chr[i]))
		except:
			chromosome.write('\t0')
	chromosome.write('\n')
Example #15
0
    def as_breakends(self, ref_fasta, genomic=True, max_novel_seq_len=50,
                     info_ext=None, parids=None, event=None):
        """Render this adjacency as two mated VCF breakend records.

        Args:
            ref_fasta: object providing ``fetch(chrom, start, end)``; it is
                called with ``(break - 1, break)`` to obtain the reference
                base reported for each breakpoint.
            genomic: when true SVTYPE is 'BND', otherwise 'FND'.
            max_novel_seq_len: novel sequence up to this length is written
                inline in ALT; a longer one is not written, and the mate is
                addressed as ``<first contig>:contig position`` instead.
            info_ext: optional mapping of INFO key -> 2-item sequence (one
                value per breakend). Applied last, so it overrides keys
                computed here. Values whose length is not 2 are ignored.
            parids: unused; kept for interface compatibility.
            event: unused; kept for interface compatibility.

        Returns:
            str: two tab-delimited lines joined by a newline (no trailing
            newline). Columns are CHROM, POS, ID, REF, ALT, '.', '.', INFO.
        """
        # Drop a literal 'chr' prefix only.  str.lstrip('chr') removes any
        # leading run of the characters c/h/r and so mangles names such as
        # 'hs37d5' (-> 's37d5').
        chroms = [c[3:] if c.startswith('chr') else c for c in self.chroms]
        alt_chroms = chroms[:]
        pos = list(self.breaks)
        alt_pos = pos[:]

        # inserted novel sequences
        inserted_seqs = ['', '']
        novel_seq = self.novel_seq
        if novel_seq and novel_seq not in ('NA', '-'):
            if len(novel_seq) > max_novel_seq_len:
                # Too long to inline: point both mates at the first contig,
                # using the contig breakpoints as coordinates.
                contig_ref = '<%s>' % self.contigs[0]
                alt_chroms[0] = contig_ref
                alt_chroms[1] = contig_ref
                alt_pos[1] = self.contig_breaks[0][0] + 1
                alt_pos[0] = self.contig_breaks[0][1] - 1
            else:
                aligns = self.aligns[0]
                # With a single alignment both breakends follow its strand;
                # otherwise the second breakend follows the second alignment.
                second = 0 if len(aligns) == 1 else 1
                strands = (aligns[0].strand, aligns[second].strand)
                for i, strand in enumerate(strands):
                    if strand == '+':
                        inserted_seqs[i] = novel_seq
                    else:
                        inserted_seqs[i] = reverse_complement(novel_seq)

        # microhomology, cipos
        cipos = None
        homol_len = None
        homol_seq = None
        if self.homol_seq and self.homol_seq[0] != '-':
            homol_seq = self.homol_seq[0].upper()
            homol_len = len(self.homol_seq[0])
            first_contig_breaks = self.contig_breaks[0]
            # Adjacent contig breaks (start + 1 == end, e.g. GMAP) need no
            # adjustment.  Touching/overlapping breaks (start >= end, e.g.
            # BWA-mem) shift the coordinates by the homology length and
            # report the uncertainty as CIPOS.
            if first_contig_breaks[0] >= first_contig_breaks[1]:
                pos[0] -= homol_len
                alt_pos[1] += homol_len
                cipos = '0,%d' % homol_len

        # One reference base per breakpoint, fetched with the original
        # (un-stripped) chromosome names.
        refs = tuple(
            ref_fasta.fetch(self.chroms[i], self.breaks[i] - 1,
                            self.breaks[i]).upper()
            for i in range(2))

        ids = ('%s%s' % (self.id, 'a'),
               '%s%s' % (self.id, 'b'))

        svtype = 'BND' if genomic else 'FND'
        event_type = self.rearrangement.upper()
        infos = [{'SVTYPE': svtype, 'MATEID': ids[1], 'EVENTTYPE': event_type},
                 {'SVTYPE': svtype, 'MATEID': ids[0], 'EVENTTYPE': event_type}]

        def set_both(key, value):
            # Most INFO fields are identical on the two mates.
            infos[0][key] = value
            infos[1][key] = value

        if cipos is not None:
            set_both('CIPOS', cipos)
        if homol_len is not None:
            set_both('HOMLEN', homol_len)
        if homol_seq is not None:
            set_both('HOMSEQ', homol_seq)

        # read support
        if self.final_support is not None:
            set_both('SPANNING_READS', self.support['spanning'])
            # The original compared against the undefined name NONE, which
            # raised NameError whenever final_support was set.
            if self.support['flanking'] is not None:
                set_both('FLANKING_PAIRS', self.support['flanking'])

        # SVLEN is reported only when get_size() yields a plain int.
        adj_size = self.get_size()
        if type(adj_size) is int:
            set_both('SVLEN', adj_size)

        # somatic
        if self.somatic:
            set_both('SOMATIC', 'SOMATIC')

        # contig and contig breakpoints
        if self.contigs:
            set_both('BKPTID', ','.join(self.contigs))

        if self.contig_breaks and len(self.contig_breaks) == len(self.contigs):
            spans = []
            for bk in self.contig_breaks:
                if len(bk) == 2:
                    spans.append('%s-%s' % (bk[0], bk[1]))
                else:
                    print('error')

            # Emit CTG_BKS only if every contig produced a well-formed span.
            if len(spans) == len(self.contigs):
                set_both('CTG_BKS', ','.join(spans))

        # external info - overrides given info
        if info_ext:
            for key, value in info_ext.items():
                if len(value) == 2:
                    infos[0][key] = value[0]
                    infos[1][key] = value[1]

        def mate_alt(i):
            # ALT string of breakend i.  The bracket direction is set by the
            # mate's orientation (']' for 'L', '[' otherwise); the reference
            # base and inserted sequence go before the mate locus when this
            # breakend's orientation is 'L', after it otherwise.  The mate
            # locus always comes from alt_chroms/alt_pos so that a contig
            # coordinate is never combined with a chromosome name.
            mate = 1 - i
            bracket = ']' if self.orients[mate] == 'L' else '['
            locus = '%s%s:%s%s' % (bracket, alt_chroms[mate], alt_pos[mate],
                                   bracket)
            if self.orients[i] == 'L':
                return '%s%s%s' % (refs[i], inserted_seqs[i], locus)
            return '%s%s%s' % (locus, inserted_seqs[i], refs[i])

        alts = (mate_alt(0), mate_alt(1))

        breakends = [
            '\t'.join([chroms[i], str(pos[i]), ids[i], refs[i], alts[i],
                       '.', '.', VCF.info_dict_to_str(infos[i])])
            for i in range(2)
        ]

        return '\n'.join(breakends)
Example #16
0
def main():
    """Command-line entry point.

    Parses ``sys.argv[1:]`` with ComLine, builds a Phylip object from the
    ``phy`` option and a Popmap object from the ``popmap`` option, then
    passes both, together with the ``out`` option, to VCF.  What VCF does
    with them is not visible here (presumably it writes the output --
    confirm).
    """
    cli = ComLine(sys.argv[1:])
    VCF(Phylip(cli.args.phy), Popmap(cli.args.popmap), cli.args.out)
Example #17
0
##indel_exonicfunc.xls
# Column titles for the indel exonic-function table: a 'Sample' column
# followed by one column per category named in the list.
title_indel_exonicfunc = [
    'Sample', 'frameshift_deletion', 'frameshift_insertion',
    'nonframeshift_deletion', 'nonframeshift_insertion', 'stoploss',
    'stopgain', 'unknown'
]
# Emit the titles as a single tab-separated header line.
# `indel_exonicfunc` is defined outside this excerpt (presumably an open,
# writable file object -- confirm).
indel_exonicfunc.write('\t'.join(title_indel_exonicfunc) + '\n')

# `files` (defined outside this excerpt) is opened and read line by line;
# every line not starting with '#' is used as a path.
# NOTE(review): the handle returned by open() is never closed explicitly, and
# the loop variable `file` shadows the Python 2 builtin of the same name.
for file in open(files, 'r'):
    # Skip lines beginning with '#'.
    if file.startswith('#'): continue
    # Remove surrounding whitespace, including the trailing newline.
    file = file.strip()
    # Sample name = basename of the path, up to (not including) the first '.'.
    sample_name = os.path.basename(file)
    sample_name = sample_name.split('.')[0]

    # Build a VCF object from the listed path and keep the result of filter().
    # What filter() returns is not visible here (presumably the retained SNP
    # records -- confirm).
    myVCF = VCF(file)
    snp = myVCF.filter()
    # Derive the companion indel path by textual substitution.
    # NOTE(review): str.replace rewrites every 'snp' substring in the path,
    # directory components included.
    indel_file = file.replace('snp', 'indel')
    # `myVCF` is rebound, so from here on it refers to the indel VCF object.
    myVCF = VCF(indel_file)
    indel = myVCF.filter()
    # The triple-quoted block below is a bare string expression: it is
    # evaluated and discarded, so the per-chromosome code inside never runs.
    """
	##chr.xls
	chr = myVCF.chr_stat(vcf)
	chromosome.write(sample_name)
	for i in [str(i) for i in range(1,23)]+['X','Y']:
		try:
			chromosome.write('\t'+str(chr[i]))
		except:
			chromosome.write('\t0')
	chromosome.write('\n')
	"""