def main(args):
    """Annotate structural variants with the small variants located inside them.

    Reads the SNV VCF named by ``args.SNVvcf`` into memory, then streams the
    SV VCF named by ``args.SVvcf`` to stdout.  A ``COMPOUND`` INFO header is
    inserted just before the column header line, and every SV record whose
    chromosomes are present in the SNV table gets a ``;COMPOUND=...`` tag
    appended to its INFO column (column 8) when ``retrieve_snps`` reports
    anything for the padded region.

    Args:
        args: namespace with ``SNVvcf`` and ``SVvcf`` (file paths) and
            ``padding`` (number added/subtracted around the breakpoints).
    """
    # SNV[chromosome][position] -> list of {"ref": ..., "alt": ...} records
    SNV = {}
    with open(args.SNVvcf) as snv_file:
        for line in snv_file:
            # skip header lines and blank lines (a blank line has no columns)
            if line.startswith("#") or not line.strip():
                continue
            content = line.rstrip("\r\n").split("\t")
            position = int(content[1])
            SNV.setdefault(content[0], {}).setdefault(position, []).append({
                "ref": content[3],
                "alt": content[4]
            })

    with open(args.SVvcf) as sv_file:
        for line in sv_file:
            if line.startswith("##"):
                print(line.strip())
                continue
            if line.startswith("#"):
                # the column header: declare the new INFO tag right before it
                print(
                    "##INFO=<ID=COMPOUND,Number=1,Type=String,Description=\"prints all snps and snvs found within the SV in the following format chr|pos:ref-alt|pos2:ref2->alt2..\">"
                )
                print(line.strip())
                continue

            content = line.strip().split("\t")
            chrA, posA, chrB, posB, event_type, INFO, FORMAT = readVCF.readVCFLine(
                line)
            if chrA not in SNV or chrB not in SNV:
                # nothing known about one of the chromosomes: pass through
                print(line.strip())
                continue

            snp_tag = ";COMPOUND="
            if chrA == chrB:
                # intrachromosomal event: search the whole padded interval
                snp_tag += retrieve_snps(chrA, posA - args.padding,
                                         posB + args.padding, SNV)
            else:
                # interchromosomal event: search around each breakpoint
                snp_tag += retrieve_snps(chrA, posA - args.padding,
                                         posA + args.padding, SNV)
                snp_tag += "|" + retrieve_snps(chrB, posB - args.padding,
                                               posB + args.padding, SNV)

            # A tag that only holds the chromosome name(s) means no variant
            # was reported for the region, so the record is left untouched.
            empty_tags = (";COMPOUND={}".format(chrA),
                          ";COMPOUND={}|{}".format(chrA, chrB))
            if snp_tag not in empty_tags:
                content[7] += snp_tag
            print("\t".join(content))
def readVCFFile(toRead, chromosomes):
    """Read a VCF file and attach every variant to its chromosome object.

    Args:
        toRead: path of the VCF file.
        chromosomes: iterable of objects exposing ``name`` and
            ``addVariant(chrA, posA, chrB, posB, event_type, description,
            format)``; each record is added to the first object whose
            ``name`` equals the record's first chromosome.

    Returns:
        ``(chromosomes, vcfInfoLines)`` where ``vcfInfoLines`` holds the raw
        ``##`` meta-information lines followed by the column header line, or
        ``None`` when the file does not start with ``##fileformat=`` or the
        column header is missing/too short.
    """
    vcfInfoLines = []
    with open(toRead, 'r') as vcf:
        # The file must open with meta-information lines (prefix "##"),
        # the first of which is the fileformat declaration.
        line = vcf.readline()
        if not line.startswith("##fileformat="):
            print("VCF file is not in correct format")
            return None
        print("VCF file seems ok, continuing read")

        while line.startswith("##"):
            vcfInfoLines.append(line)
            line = vcf.readline()

        # Next comes the column header: prefixed by "#" and holding at least
        # the 8 fixed columns CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO.
        # Both conditions are required; a data record must not pass as header.
        fields = line.rstrip("\r\n").split('\t')
        if not (line.startswith('#') and len(fields) >= 8):
            print("Header columns missing in VCF file")
            return None
        vcfInfoLines.append(line)

        # Everything after the header is a tab-delimited data line.
        for line in vcf:
            if not line.strip():
                continue
            (chrA, posA, chrB, posB, event_type, description,
             variant_format) = readVCF.readVCFLine(line)
            # store the variant in the chromosome matching the CHROM field
            for chromo in chromosomes:
                if chromo.name == chrA:
                    chromo.addVariant(chrA, posA, chrB, posB, event_type,
                                      description, variant_format)
                    break
    return (chromosomes, vcfInfoLines)
help="frequency threshold, more common variants are not printed") parser.add_argument( '--length', type=int, default=10000, help="length threshold, smaller intergenic variants are not printed") args, unknown = parser.parse_known_args() variant_list = [] output = "\"{chrA}\",{chrB},\"{posA}\",\"{posB}\",\"{len}\",\"{var}\",\"{frequency}\",\"{genes}" sample_ids = readVCF.get_sample_ids(args.vcf) for line in open(args.vcf): if not "#" == line[0]: chrA, posA, chrB, posB, event_type, INFO, format = readVCF.readVCFLine( line) # print((chrA, posA, chrB, posB,event_type,INFO,format)) content = line.strip().split("\t") #have a look in the snpeff field eff = True SNPEFF = "" try: SNPEFF = content[7].split("EFF=")[1] except: if ";ANN=" in content[7]: SNPEFF = content[7].split("ANN=")[1] elif ";CSQ=" in content[7]: eff = False SNPEFF = content[7].split("CSQ=")[1] effects = SNPEFF.split(",")
def main(args):
    """Annotate each variant with its occurrence count and frequency in a set of databases.

    The VCF named by ``args.variations`` is read once: header lines are
    echoed to stdout (with ``OCC``/``FRQ`` INFO declarations inserted after
    the last INFO line unless an ``OCC`` declaration is already present) and
    every record is stored as a query.  Each database file is then loaded
    and every query is tested against it with ``isVariationInDB``.  Finally
    the records are printed, ordered by hit count, with
    ``;OCC=<hits>;FRQ=<hits / number of databases>`` appended to the INFO
    column.

    Args:
        args: namespace with ``variations`` (VCF path), ``overlap`` (passed
            on to ``isVariationInDB``), ``db`` (folder searched for ``*.db``
            files) and ``files`` (explicit database list used when ``db`` is
            not set).
    """
    occ_header = "##INFO=<ID=OCC,Number=1,Type=Integer,Description=\"The number of occurances of the event in the database\">\n"
    frq_header = "##INFO=<ID=FRQ,Number=1,Type=Integer,Description=\"The frequency of the event in the database\">\n"

    ratio = args.overlap
    queries = []
    with open(args.variations) as fin:
        noOCCTag = True
        infoFound = False
        outputSource = "Not_specified"
        for line in fin:
            if line.startswith("#") or line.startswith("="):
                # metadata: echo it and remember the producing tool
                meta_line = line.replace("#", "")
                content = meta_line.split("=")
                if content[0] == "source":
                    outputSource = content[1].rstrip().split()[0]

                if content[0] == "INFO":
                    sys.stdout.write(line)
                    infoFound = True
                    # The file already declares OCC: do not declare it again.
                    # (The line carries a "##" prefix and a newline, so it is
                    # matched on the cleaned-up prefix, not by full equality.)
                    if meta_line.startswith("INFO=<ID=OCC,"):
                        noOCCTag = False
                elif noOCCTag and infoFound:
                    # first non-INFO line after the INFO block: add our tags
                    sys.stdout.write(occ_header)
                    sys.stdout.write(frq_header)
                    sys.stdout.write(line)
                    infoFound = False
                else:
                    sys.stdout.write(line)
            else:
                # a record: store it as a query
                chrA, startA, endA, chrB, startB, endB, event_type = readVCF.readVCFLine(
                    outputSource, line)
                # layout: coordinates, event type, hit counter, original line
                queries.append([
                    chrA.replace("chr", ""),
                    int(startA),
                    int(endA),
                    chrB.replace("chr", ""),
                    int(startB),
                    int(endB), event_type, 0, line
                ])

    # query every sample database and count the hits per variant
    if args.db:
        dataBases = glob.glob("{}/*.db".format(os.path.abspath(args.db)))
    else:
        dataBases = args.files

    for sample_db in dataBases:
        # allVariations[chrA][chrB] -> list of the six remaining db columns
        allVariations = {}
        with open(sample_db) as fDB:
            for line in fDB:
                db_entry = line.rstrip().split('\t')
                chrom_a = db_entry[0].replace("chr", "")
                chrom_b = db_entry[1].replace("chr", "")
                allVariations.setdefault(chrom_a, {}).setdefault(
                    chrom_b, []).append([db_entry[i] for i in range(2, 8)])

        for query in queries:
            hit = isVariationInDB(allVariations, query, ratio, args)
            if hit:
                query[7] += 1

    db_count = len(dataBases)
    for query in sorted(queries, key=itemgetter(7)):
        content = query[8].rstrip().split("\t")
        # without any database the frequency is reported as 0 instead of
        # failing on a division by zero
        frequency = query[7] / float(db_count) if db_count else 0.0
        content[7] = "{};OCC={};FRQ={}".format(content[7], query[7], frequency)
        print("\t".join(content))
def main(args):
    """Append occurrence count and frequency of every variant to its VCF line.

    Header lines of ``args.variations`` are echoed to stdout; ``OCC`` and
    ``FRQ`` INFO declarations are inserted after the last INFO line unless
    the file already declares ``OCC``.  Every record is tested against each
    database with ``isVariationInDB`` and finally written, ordered by hit
    count, with ``;OCC=<hits>;FRQ=<hits / number of databases>`` appended to
    the end of the line.

    Args:
        args: namespace with ``variations`` (VCF path), ``overlap`` (passed
            on to ``isVariationInDB``), ``db`` (folder searched for ``*.db``
            files) and ``files`` (explicit database list used when ``db`` is
            not set).
    """
    occ_header = "##INFO=<ID=OCC,Number=1,Type=Integer,Description=\"The number of occurances of the event in the database\">\n"
    frq_header = "##INFO=<ID=FRQ,Number=1,Type=Integer,Description=\"The frequency of the event in the database\">\n"

    ratio = args.overlap
    queries = []
    with open(args.variations) as fin:
        noOCCTag = True
        infoFound = False
        outputSource = "Not_specified"
        for line in fin:
            if not (line.startswith("#") or line.startswith("=")):
                # a record: keep coordinates, type, a hit counter and the line
                chrA, startA, endA, chrB, startB, endB, event_type = readVCF.readVCFLine(
                    outputSource, line)
                queries.append([
                    chrA.replace("chr", ""),
                    int(startA),
                    int(endA),
                    chrB.replace("chr", ""),
                    int(startB),
                    int(endB), event_type, 0, line
                ])
                continue

            # metadata: echo it and remember the producing tool
            meta_line = line.replace("#", "")
            content = meta_line.split("=")
            if content[0] == "source":
                outputSource = content[1].rstrip().split()[0]

            if content[0] == "INFO":
                sys.stdout.write(line)
                infoFound = True
                # An existing OCC declaration must suppress ours.  The raw
                # line has a "##" prefix and a trailing newline, so compare
                # the cleaned-up prefix rather than the whole string.
                if meta_line.startswith("INFO=<ID=OCC,"):
                    noOCCTag = False
            elif noOCCTag and infoFound:
                # first non-INFO line after the INFO block: declare our tags
                sys.stdout.write(occ_header)
                sys.stdout.write(frq_header)
                sys.stdout.write(line)
                infoFound = False
            else:
                sys.stdout.write(line)

    # query every sample database and count the hits per variant
    if args.db:
        dataBases = glob.glob("{}/*.db".format(os.path.abspath(args.db)))
    else:
        dataBases = args.files

    for sample_db in dataBases:
        # allVariations[chrA][chrB] -> list of the six remaining db columns
        allVariations = {}
        with open(sample_db) as fDB:
            for line in fDB:
                db_entry = line.rstrip().split('\t')
                chrom_a = db_entry[0].replace("chr", "")
                chrom_b = db_entry[1].replace("chr", "")
                allVariations.setdefault(chrom_a, {}).setdefault(
                    chrom_b, []).append([db_entry[i] for i in range(2, 8)])

        for query in queries:
            # isVariationInDB hands back the (possibly updated) database
            hit, allVariations = isVariationInDB(allVariations, query, ratio,
                                                 args)
            if hit:
                query[7] += 1

    db_count = len(dataBases)
    for query in sorted(queries, key=itemgetter(7)):
        vcf_entry = query[8].rstrip()
        # without any database the frequency is reported as 0 instead of
        # failing on a division by zero
        frequency = query[7] / float(db_count) if db_count else 0.0
        sys.stdout.write("{};OCC={};FRQ={}\n".format(vcf_entry, query[7],
                                                     frequency))
def main(args):
    """Add a per-sample FORMAT column (GT:PE:RD:SR) to a structural-variant VCF.

    The binned coverage table ``args.tab`` (columns: chromosome, start, end,
    coverage) is loaded and summarised per chromosome with
    ``chromosome_coverage``.  The VCF ``args.vcf`` is then streamed to
    stdout: missing FORMAT declarations are inserted before the column
    header, the header gets ``FORMAT`` and ``args.sample`` columns, and each
    record gets a ``GT:PE:RD:SR`` field.

    Args:
        args: namespace with ``tab`` (coverage file), ``vcf`` (variant file)
            and ``sample`` (name of the added sample column).
    """
    bin_size = 0
    coverage = {}
    # read the coverage file: coverage[chromosome] -> list of bin coverages
    with open(args.tab) as fin:
        for line in fin:
            if line[0] == "#" or not line.strip():
                continue
            content = line.strip().split("\t")
            bin_size = int(content[2]) - int(content[1])
            chromosome = content[0].replace("chr", "").replace("CHR", "")
            coverage.setdefault(chromosome, []).append(float(content[3]))

    # compute the coverage across each chromosome
    chr_cov = chromosome_coverage(bin_size, coverage)

    # FORMAT declarations this tool relies on, in output order
    format_headers = [
        ("RD",
         "##FORMAT=<ID=RD,Number=1,Type=Float,Description=\"The read depth fraction at the variant\">"
         ),
        ("PE",
         "##FORMAT=<ID=PE,Number=1,Type=Integer,Description=\"The number of discordant pairs\">"
         ),
        ("SR",
         "##FORMAT=<ID=SR,Number=1,Type=Integer,Description=\"The number of split reads\">"
         ),
        ("GT",
         "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"),
    ]
    declared = set()  # FORMAT ids already declared by the input file
    format_print = True

    with open(args.vcf) as fin:
        for line in fin:
            if line.startswith("##"):
                for tag, _ in format_headers:
                    if "##FORMAT=<ID={},".format(tag) in line:
                        declared.add(tag)
                print(line.strip())
            elif line.startswith("#"):
                if format_print:
                    # declare whatever the input did not declare itself
                    for tag, header in format_headers:
                        if tag not in declared:
                            print(header)
                    format_print = False
                print(line.strip() + "\tFORMAT\t" + args.sample)
            else:
                # defaults used when the variant cannot be genotyped
                RD = "."
                PE = "0"
                SR = 0
                GT = "./."
                chrA, startA, endA, chrB, startB, endB, event_type = readVCF.readVCFLine(
                    None, line)
                out_line = line.strip()
                try:
                    chromosome_cov = chr_cov[chrA]
                    # coverage is only evaluated for intrachromosomal variants
                    if chrA == chrB:
                        var_cov = mean_coverage(chrA, startA, endA, bin_size,
                                                coverage)
                        RD = var_cov / float(chromosome_cov)
                        if "DUP" in event_type:
                            # The original read `(RD < 1.4) * chr_cov[chrA]`,
                            # which is truthy exactly when RD < 1.4 (the
                            # coverage is non-zero here); spelled out.
                            if RD < 1.4:
                                GT = "0/1"
                        elif "DEL" in event_type:
                            # NOTE(review): RD is already divided by the
                            # chromosome coverage, so scaling the 0.1
                            # threshold by that coverage looks suspicious.
                            # Kept as found - confirm the intended cut-off.
                            if RD > 0.1 * chromosome_cov:
                                GT = "0/1"
                            else:
                                GT = "1/1"
                except Exception:
                    # best effort: unknown chromosome or failing coverage
                    # lookup leaves the defaults in place
                    pass
                RD = str(RD)

                # the number of discordant pairs is taken from the LTE tag;
                # cut at ";" and at the column separator so nothing beyond
                # the value itself ends up in the FORMAT field
                if ";LTE=" in out_line:
                    PE = out_line.split(";LTE=")[1].split(";")[0].split(
                        "\t")[0]

                # generate the format data
                format_line = [GT, PE, RD, str(SR)]
                print(out_line + "\t" + "GT:PE:RD:SR" + "\t" +
                      ":".join(format_line))
def main(args):
    """Genotype the variants of a VCF from binned read-depth data.

    Loads the coverage table ``args.tab`` (columns: chromosome, start, end,
    coverage), derives the per-chromosome coverage via
    ``chromosome_coverage`` and writes ``args.vcf`` to stdout with a
    ``FORMAT`` column and one sample column (``args.sample``) holding
    ``GT:PE:RD:SR``.  FORMAT declarations missing from the input header are
    added in front of the column header line.

    Args:
        args: namespace with ``tab``, ``vcf`` and ``sample`` attributes.
    """
    bin_size = 0
    coverage = {}
    # read the coverage file: coverage[chromosome] -> list of bin coverages
    with open(args.tab) as fin:
        for line in fin:
            if line[0] == "#" or not line.strip():
                continue
            content = line.strip().split("\t")
            bin_size = int(content[2]) - int(content[1])
            chromosome = content[0].replace("chr", "").replace("CHR", "")
            coverage.setdefault(chromosome, []).append(float(content[3]))

    # compute the coverage across each chromosome
    chr_cov = chromosome_coverage(bin_size, coverage)

    # FORMAT declarations this tool relies on, in output order
    format_headers = [
        ("RD",
         "##FORMAT=<ID=RD,Number=1,Type=Float,Description=\"The read depth fraction at the variant\">"
         ),
        ("PE",
         "##FORMAT=<ID=PE,Number=1,Type=Integer,Description=\"The number of discordant pairs\">"
         ),
        ("SR",
         "##FORMAT=<ID=SR,Number=1,Type=Integer,Description=\"The number of split reads\">"
         ),
        ("GT",
         "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"),
    ]
    declared = set()  # FORMAT ids already declared by the input file
    format_print = True

    with open(args.vcf) as fin:
        for line in fin:
            if line.startswith("##"):
                # Remember which of our FORMAT ids the file declares.  The
                # original tested two bare string literals (always true)
                # instead of `literal in line`, so SR was always considered
                # present and GT never was.
                for tag, _ in format_headers:
                    if "##FORMAT=<ID={},".format(tag) in line:
                        declared.add(tag)
                print(line.strip())
                continue

            if line.startswith("#"):
                if format_print:
                    for tag, header in format_headers:
                        if tag not in declared:
                            print(header)
                    format_print = False
                print(line.strip() + "\tFORMAT\t" + args.sample)
                continue

            # defaults used when the variant cannot be genotyped
            RD = "."
            PE = "0"
            SR = 0
            GT = "./."
            chrA, startA, endA, chrB, startB, endB, event_type = readVCF.readVCFLine(
                None, line)
            out_line = line.strip()
            try:
                chromosome_cov = chr_cov[chrA]
                # coverage is only evaluated for intrachromosomal variants
                if chrA == chrB:
                    var_cov = mean_coverage(chrA, startA, endA, bin_size,
                                            coverage)
                    RD = var_cov / float(chromosome_cov)
                    if "DUP" in event_type:
                        # `(RD < 1.4) * chr_cov[chrA]` in the original is
                        # truthy exactly when RD < 1.4, since the coverage
                        # is non-zero at this point.
                        if RD < 1.4:
                            GT = "0/1"
                    elif "DEL" in event_type:
                        # NOTE(review): RD is a fraction of the chromosome
                        # coverage, yet the threshold is scaled by that
                        # coverage.  Kept as found - confirm the cut-off.
                        if RD > 0.1 * chromosome_cov:
                            GT = "0/1"
                        else:
                            GT = "1/1"
            except Exception:
                # best effort: unknown chromosome or failing coverage lookup
                # leaves the defaults in place
                pass
            RD = str(RD)

            # the number of discordant pairs is taken from the LTE tag; cut
            # at ";" and at the column separator so that neither later
            # columns nor the line break leak into the FORMAT field
            if ";LTE=" in out_line:
                PE = out_line.split(";LTE=")[1].split(";")[0].split("\t")[0]

            # generate the format data
            format_line = [GT, PE, RD, str(SR)]
            print(out_line + "\t" + "GT:PE:RD:SR" + "\t" +
                  ":".join(format_line))
def main(args): allVariations = {} tollerance = args.tollerance fixed =0 if args.fixed: fixed =1 for variation_file in [item for sublist in args.variations for item in sublist] : outputSource=None; collapsedVariations = {} # this will contain the SVs of this file, but collapsing those that are close with open(variation_file) as fin: #memorize all variations for line in fin: if line.startswith("#") or line.startswith("="): #find the output source(cnvnator or Findtranslocations line=line.replace("#",""); content=line.split("="); if(content[0] == "source"): outputSource=content[1].rstrip().split()[0]; continue chrA,startA,endA,chrB,startB,endB,event_type =readVCF.readVCFLine(outputSource,line); startA = startA - tollerance if startA < 0: startA = 0 startB = startB - tollerance if startB < 0: startB = 0 current_variation = [chrA, startA , endA + tollerance, chrB, startB, endB + tollerance,event_type] collapsedVariations = populate_DB(collapsedVariations, current_variation, True, 0, fixed ) ##collapse again in order to avoid problems with areas that have become too close one to onther elemnets_before = 0 for chrA in collapsedVariations: for chrB in collapsedVariations[chrA] : for collapsedVariation in collapsedVariations[chrA][chrB]: elemnets_before += 1 elemnets_after = 0 while elemnets_before != elemnets_after: collapsedVariationsFinal = {} elemnets_before = 0 elemnets_after = 0 for chrA in collapsedVariations: for chrB in collapsedVariations[chrA] : for collapsedVariation in collapsedVariations[chrA][chrB]: current_variation = [chrA, collapsedVariation[0], collapsedVariation[1], chrB, collapsedVariation[2], collapsedVariation[3],collapsedVariation[4]] collapsedVariationsFinal = populate_DB(collapsedVariationsFinal, current_variation, True, 0,fixed) elemnets_before += 1 for chrA in collapsedVariationsFinal: for chrB in collapsedVariationsFinal[chrA] : for collapsedVariation in collapsedVariationsFinal[chrA][chrB]: elemnets_after += 1 collapsedVariations.clear() 
collapsedVariations = collapsedVariationsFinal #now populate the DB for chrA in collapsedVariations: for chrB in collapsedVariations[chrA] : for collapsedVariation in collapsedVariations[chrA][chrB]: current_variation = [chrA, collapsedVariation[0], collapsedVariation[1], chrB, collapsedVariation[2], collapsedVariation[3],collapsedVariation[4]] allVariations = populate_DB(allVariations, current_variation, False, 0,fixed) for chrA in allVariations: for chrB in allVariations[chrA] : for event in allVariations[chrA][chrB]: print "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(chrA, chrB, event[0], event[1], event[2], event[3], event[4], event[5])
def main(args): #print "memorizing masked reagions" toBeMaskedElements = {} featureEntries = [] for bed_file in [item for sublist in args.bed_files for item in sublist]: toBeMaskedElements[bed_file] = {} bedlabel = bed_file.split("/")[-1] bedlabel = bedlabel.split(".")[0] featureEntries.append( "##INFO=<ID={0},Number=2,Type=String,Description=\"Genomic features of regions A and B\">\n" .format(bedlabel)) #sys.stdout.write("memorizing {} ...".format(bed_file)) with open(bed_file) as fin: rows = (line.rstrip().split('\t') for line in fin) for row in rows: if not row[0].startswith("#"): try: toBeMaskedElements[bed_file][row[0].replace( "chr", "").replace("Chr", "")].append([row[1], row[2], row[3]]) except: toBeMaskedElements[bed_file][row[0].replace( "chr", "").replace("Chr", "")] = [[row[1], row[2], row[3]]] #sys.stdout.write("done\n") #print "sorting masked regions" for bedFiles in toBeMaskedElements: for chr in toBeMaskedElements[bedFiles]: sorted(toBeMaskedElements[bedFiles][chr], key=itemgetter(1)) #print toBeMaskedElements #UncompressedVariations = []; #CompressedVariations = [] #currentVariation = UncompressedVariations[0] with open(args.variations) as fin: UncompressedVariations = (line.rstrip().split('\t') for line in fin) noFeatureTag = 1 infoFound = 0 for row in UncompressedVariations: #the first charachter is # in the metadata, otherwise it is c,C or a number if (row[0][0] != "#"): chr_1, chr_1_start, chr_1_end, chr_2, chr_2_start, chr_2_end, event_type = readVCF.readVCFLine( outputSource, "\t".join(row)) chr_1_start = int(chr_1_start) chr_1_end = int(chr_1_end) outrow = "\t".join(row) outRow = outrow.replace("\n", "") sys.stdout.write(outRow) ## Chategorise the two breackpoints for bedFile in toBeMaskedElements: bedlabel = bedFile.split("/")[-1] bedlabel = bedlabel.split(".")[0] for j in range(2): if j == 0: variation_chr = chr_1.replace("chr", "").replace( "Chr", "") variation_start = chr_1_start variation_end = chr_1_end else: variation_chr = 
chr_2.replace("chr", "").replace( "Chr", "") variation_start = chr_2_start variation_end = chr_2_end stopSearching = 0 if variation_chr in toBeMaskedElements[bedFile]: categorized = 0 i = 0 max_Size = 0 max_Size_type = "" while i < len(toBeMaskedElements[bedFile] [variation_chr]): repeat_start = int(toBeMaskedElements[bedFile] [variation_chr][i][0]) repeat_end = int(toBeMaskedElements[bedFile] [variation_chr][i][1]) repeat_type = toBeMaskedElements[bedFile][ variation_chr][i][2] i += 1 overlap = 0 if variation_start > repeat_end: stopSearching = 1 elif variation_start <= repeat_start and variation_end >= repeat_end and variation_start < repeat_end: overlap = repeat_end - repeat_start + 1 elif variation_start <= repeat_start and variation_end <= repeat_end and variation_start < repeat_end: overlap = variation_end - repeat_start + 1 elif variation_start >= repeat_start and variation_end >= repeat_end and variation_start < repeat_end: overlap = repeat_end - variation_start + 1 elif variation_start >= repeat_start and variation_end <= repeat_end and variation_start < repeat_end: overlap = variation_end - variation_start + 1 if overlap > max_Size: max_Size = overlap categorized = 1 max_Size_type = "{}".format(repeat_type) if categorized == 0: if (j == 0): sys.stdout.write( ";{0}=NoCategory".format(bedlabel)) else: sys.stdout.write(",NoCategory") else: if (j == 0): sys.stdout.write(";{0}={1}".format( bedlabel, max_Size_type)) else: sys.stdout.write( ",{}".format(max_Size_type)) else: if (j == 0): sys.stdout.write( ";{}=NotPresentInRef".format(bedlabel)) else: sys.stdout.write(",NotPresentInRef") sys.stdout.write("\n") #asasfasfasf else: lookForFilter = row[0].split("=") line = row[0].replace("#", "") content = line.split("=") if (content[0] == "source"): outputSource = content[1].rstrip().split()[0] #the last infotag will be the Feature tag if (lookForFilter[0] != "##INFO" and infoFound == 1): for entry in featureEntries: sys.stdout.write(entry) sys.stdout.write(row[0] + 
"\n") infoFound = 0 elif (lookForFilter[0] == "##INFO"): sys.stdout.write(row[0] + "\n") infoFound = 1 #there should only be one feature tag per vf file if row[0] in featureEntries: featureEntries.remove(row[0]) else: sys.stdout.write("\t".join(row) + "\n") return 0
def main(args): #print "memorizing masked reagions" toBeMaskedElements = {}; featureEntries=[]; for bed_file in [item for sublist in args.bed_files for item in sublist] : toBeMaskedElements[bed_file]={}; bedlabel=bed_file.split("/")[-1] bedlabel=bedlabel.split(".")[0] featureEntries.append("##INFO=<ID={0},Number=2,Type=String,Description=\"Genomic features of regions A and B\">\n".format(bedlabel)) #sys.stdout.write("memorizing {} ...".format(bed_file)) with open(bed_file) as fin: rows = ( line.rstrip().split('\t') for line in fin) for row in rows: if not row[0].startswith("#"): try: toBeMaskedElements[bed_file][row[0].replace("chr","").replace("Chr","")].append([row[1], row[2], row[3]]) except : toBeMaskedElements[bed_file][row[0].replace("chr","").replace("Chr","")] = [[row[1], row[2], row[3]]] #sys.stdout.write("done\n") #print "sorting masked regions" for bedFiles in toBeMaskedElements: for chr in toBeMaskedElements[bedFiles]: sorted(toBeMaskedElements[bedFiles][chr], key=itemgetter(1)) #print toBeMaskedElements #UncompressedVariations = []; #CompressedVariations = [] #currentVariation = UncompressedVariations[0] with open(args.variations) as fin: UncompressedVariations = ( line.rstrip().split('\t') for line in fin) noFeatureTag=1; infoFound=0; for row in UncompressedVariations: #the first charachter is # in the metadata, otherwise it is c,C or a number if(row[0][0] != "#"): chr_1,chr_1_start,chr_1_end,chr_2,chr_2_start,chr_2_end,event_type =readVCF.readVCFLine(outputSource,"\t".join(row)); chr_1_start=int(chr_1_start); chr_1_end=int(chr_1_end); outrow="\t".join(row); outRow=outrow.replace("\n",""); sys.stdout.write(outRow) ## Chategorise the two breackpoints for bedFile in toBeMaskedElements: bedlabel=bedFile.split("/")[-1] bedlabel=bedlabel.split(".")[0] for j in range(2): if j == 0: variation_chr = chr_1.replace("chr","").replace("Chr","") variation_start = chr_1_start variation_end = chr_1_end else: variation_chr = chr_2.replace("chr","").replace("Chr","") 
variation_start = chr_2_start variation_end = chr_2_end stopSearching = 0 if variation_chr in toBeMaskedElements[bedFile]: categorized = 0 i = 0 max_Size = 0 max_Size_type = "" while i < len(toBeMaskedElements[bedFile][variation_chr]) : repeat_start = int(toBeMaskedElements[bedFile][variation_chr][i][0]) repeat_end = int(toBeMaskedElements[bedFile][variation_chr][i][1]) repeat_type = toBeMaskedElements[bedFile][variation_chr][i][2] i += 1 overlap = 0 if variation_start > repeat_end: stopSearching = 1 elif variation_start <= repeat_start and variation_end >= repeat_end and variation_start < repeat_end: overlap = repeat_end - repeat_start + 1 elif variation_start <= repeat_start and variation_end <= repeat_end and variation_start < repeat_end: overlap = variation_end - repeat_start + 1 elif variation_start >= repeat_start and variation_end >= repeat_end and variation_start < repeat_end: overlap = repeat_end - variation_start + 1 elif variation_start >= repeat_start and variation_end <= repeat_end and variation_start < repeat_end: overlap = variation_end - variation_start + 1 if overlap > max_Size: max_Size = overlap categorized = 1 max_Size_type = "{}".format(repeat_type) if categorized == 0: if( j == 0): sys.stdout.write(";{0}=NoCategory".format(bedlabel)) else: sys.stdout.write(",NoCategory") else: if(j == 0): sys.stdout.write(";{0}={1}".format(bedlabel,max_Size_type)) else: sys.stdout.write(",{}".format(max_Size_type)) else: if(j == 0): sys.stdout.write(";{}=NotPresentInRef".format(bedlabel)) else: sys.stdout.write(",NotPresentInRef") sys.stdout.write("\n") #asasfasfasf else: lookForFilter=row[0].split("="); line=row[0].replace("#",""); content=line.split("="); if(content[0] == "source"): outputSource=content[1].rstrip().split()[0]; #the last infotag will be the Feature tag if(lookForFilter[0] !="##INFO" and infoFound==1): for entry in featureEntries: sys.stdout.write(entry); sys.stdout.write(row[0]+"\n"); infoFound=0; elif(lookForFilter[0] == "##INFO"): 
sys.stdout.write(row[0]+"\n"); infoFound=1; #there should only be one feature tag per vf file if row[0] in featureEntries: featureEntries.remove(row[0]) else: sys.stdout.write("\t".join(row)+"\n"); return 0
def _estimate_region(Data, GC_hist, region, args):
    """Estimate the copy number of ``region`` ([chromosome, start, end]).

    Returns ``(CNE, ci, bin_ratio, ref)``: the rounded copy number scaled by
    ``args.plody`` (-1 when negative), the confidence interval string
    (``"(0,0)"`` when the estimate is negative), the ratio of used bins to
    all bins and the reference value reported by
    ``common.regional_cn_est``.
    """
    cn, gc, length, ref, bins, used_bins, bin_list = common.regional_cn_est(
        Data, GC_hist, region, args.Q)
    ci = "(0,0)"
    CNE = int(round(cn * args.plody))
    if CNE < 0:
        CNE = -1
    else:
        # scale a copy of the bins; the list returned by the estimator is
        # left untouched
        scaled_bins = [value * args.plody for value in bin_list]
        SEM = numpy.std(scaled_bins) / numpy.sqrt(used_bins)
        ci = "{},{}".format(round(cn * args.plody - SEM * 3, 2),
                            round(cn * args.plody + SEM * 3, 2))
    if bins == 0:
        # avoid dividing by zero; the ratio then comes out as -used_bins
        bins = -1
    return CNE, ci, used_bins / float(bins), ref


def main(Data, GC_hist, args):
    """Annotate the regions of a VCF or BED file with copy number estimates.

    With ``args.vcf`` set, the VCF is streamed to stdout with four extra
    INFO declarations in front of the column header; intrachromosomal
    records on chromosomes present in ``Data`` get
    ``;AMYCNE=..;BIN_RATIO=..;REFCOV=..;AMYCNECI=..`` appended to the INFO
    column, every other record is passed through unchanged.  Otherwise,
    with ``args.bed`` set, each BED line on a known chromosome is printed
    followed by CN, CI, RATIO and Refcov columns.  Without either input an
    error message is printed.

    Args:
        Data: coverage data keyed by chromosome name (only membership is
            tested here; it is otherwise handed to
            ``common.regional_cn_est``).
        GC_hist: passed through to ``common.regional_cn_est``.
        args: namespace with ``vcf``, ``bed``, ``Q`` and ``plody``.
    """
    if args.vcf:
        with open(args.vcf) as vcf_file:
            for line in vcf_file:
                if line.startswith("##"):
                    print(line.strip())
                elif line.startswith("#"):
                    print(
                        "##INFO=<ID=AMYCNE,Number=1,Type=Float,Description=\"estimated copy number\">"
                    )
                    print(
                        "##INFO=<ID=AMYCNECI,Number=2,Type=Float,Description=\"99% confidence interval around the estimated CN\">"
                    )
                    print(
                        "##INFO=<ID=BIN_RATIO,Number=1,Type=Float,Description=\"ratio of bins used for CN estimation\">"
                    )
                    print(
                        "##INFO=<ID=REFCOV,Number=1,Type=Float,Description=\"mean coverage of the reference bins\">"
                    )
                    print(line.strip())
                else:
                    chrA, posA, chrB, posB, event_type, INFO, FORMAT = readVCF.readVCFLine(
                        line)
                    if chrA == chrB and chrA in Data:
                        CNE, ci, bin_ratio, ref = _estimate_region(
                            Data, GC_hist, [chrA, posA, posB], args)
                        # split the stripped line so the tag is not appended
                        # behind the line break when INFO is the last column
                        content = line.strip().split("\t")
                        content[7] += ";AMYCNE=" + str(
                            CNE) + ";BIN_RATIO=" + str(
                                bin_ratio) + ";REFCOV=" + str(round(
                                    ref, 2)) + ";AMYCNECI=" + ci
                        print("\t".join(content))
                    else:
                        # records that cannot be annotated are kept as they are
                        print(line.strip())
    elif args.bed:
        with open(args.bed) as bed_file:
            for line in bed_file:
                if line.startswith("#"):
                    print("{}\t{}\t{}\t{}\t{}".format(line.strip(), "CN",
                                                      "CI", "RATIO",
                                                      "Refcov"))
                    continue
                if not line.strip():
                    continue
                content = line.strip().split("\t")
                chrA = content[0]
                posA = content[1]
                posB = content[2]
                # fall back to the name without "chr" prefix when needed
                if chrA not in Data:
                    chrA = chrA.replace("chr", "")
                if chrA in Data:
                    CNE, ci, bin_ratio, ref = _estimate_region(
                        Data, GC_hist, [chrA, posA, posB], args)
                    print("{}\t{}\t{}\t{}\t{}".format(line.strip(), CNE, ci,
                                                      bin_ratio,
                                                      str(round(ref, 2))))
    else:
        print("ERROR: no input source, please select vcf or bed")
def extract_splits(args, ws0): if args.bed: input_file = args.bed elif args.vcf: input_file = args.vcf else: print "error: missing bed or vcf" quit() if not args.sample: args.sample = args.bam.split("/")[-1].split(".")[0] if not args.working_dir: args.working_dir = args.bam.split("/")[-1].split(".")[0] if args.repeatmask: conn = sqlite3.connect(args.repeatmask) c = conn.cursor() row = 1 detected_splits = {} i = 0 for line in open(input_file): if line[0] == "#": continue if args.bed: content = line.strip().split() args.chrA = content[0] args.posA = int(content[1]) args.chrB = content[2] args.posB = int(content[3]) args.type = content[4] args.orientationA = "" args.orientationB = "" args.id = str(i) var_id = str(i) args.lengthA = "" args.lengthB = "" args.regionA = "" args.regionB = "" insertion_seq = "" homology_seq = "" args.regionAsegments = () args.regionBsegments = () args.contigSegments = () args.HomologySegments = () args.repeatA = "" args.repeatB = "" i += 1 elif args.vcf: chrA, posA, chrB, posB, event_type, INFO, FORMAT = readVCF.readVCFLine( line) args.chrA = chrA args.posA = int(posA) args.chrB = chrB args.posB = int(posB) args.type = event_type args.orientationA = "" args.orientationB = "" args.id = line.strip().split("\t")[2] var_id = line.strip().split("\t")[2] args.lengthA = "" args.lengthB = "" args.regionA = "" args.regionB = "" insertion_seq = "" homology_seq = "" args.regionAsegments = () args.regionBsegments = () args.contigSegments = () args.HomologySegments = () args.repeatA = "" args.repeatB = "" i += 1 found = FindTranslocations.main(args) splits = 0 bp_homology = "" insertions = "" deletions = "" sucess = False print found if found: wd = os.path.join(args.working_dir, var_id) softclips = os.path.join(wd, "splits.sam") print "python consensus.py {} {} > {}/consensus.fa".format( wd, softclips, wd) os.system("python consensus.py {} {} > {}/consensus.fa".format( wd, softclips, wd)) for line in open("{}/consensus.fa".format(wd)): splits = 
int(line.strip().split()[2]) print splits break try: os.system("bwa mem {} {} > {}".format( args.fa, os.path.join(args.working_dir, var_id, "consensus.fa"), os.path.join(wd, "aligned_consensus.sam"))) args, sucess, contig, bp_homology, homology_seq, insertions, insertion_seq, deletions = retrieve_pos( args, os.path.join(args.working_dir, var_id, "aligned_consensus.sam")) print success except: homology_seq = "WARNING:unable to determine the breakpoint sequence" else: wd = os.path.join(args.working_dir, var_id) os.system("samtools view {} {}:{}-{} > {}/regionA.sam".format( args.bam, args.chrA, args.posA - args.padding, args.posA + args.padding, wd)) os.system("samtools view {} {}:{}-{} > {}/regionB.sam".format( args.bam, args.chrB, args.posB - args.padding, args.posB + args.padding, wd)) os.system( "cat {}/regionA.sam {}/regionB.sam > {}/region.sam".format( wd, wd, wd)) os.system("python bam2fa.py {}/region.sam > {}/region.fq".format( wd, wd)) trials = [20, 60, 90] for k in trials: print "ABYSS -c {} -e {} -k {} -o {}_{}.fa {} > /dev/null 2>&1".format( 1, 10, k, os.path.join(args.working_dir, var_id, "abyss"), k, os.path.join(wd, "region.fq")) os.system( "ABYSS -c {} -e {} -k {} -o {}_{}.fa {} > /dev/null 2>&1". 
format(1, 10, k, os.path.join(args.working_dir, var_id, "abyss"), k, os.path.join(wd, "region.fq"))) os.system("cat {}_20.fa {}_60.fa {}_90.fa > {}".format( os.path.join(args.working_dir, var_id, "abyss"), os.path.join(args.working_dir, var_id, "abyss"), os.path.join(args.working_dir, var_id, "abyss"), os.path.join(args.working_dir, var_id, "abyss.fa"))) if not os.stat(os.path.join(args.working_dir, var_id, "abyss.fa")).st_size == 0: os.system("bwa mem {} {} > {}".format( args.fa, os.path.join(args.working_dir, var_id, "abyss.fa"), os.path.join(wd, "aligned_contig.sam"))) try: args, sucess, contig, bp_homology, homology_seq, insertions, insertion_seq, deletions = retrieve_pos( args, os.path.join(wd, "aligned_contig.sam")) except: homology_seq = "WARNING:unable to determine the breakpoint sequence" if not sucess: contig = "" distanceA = "" distanceB = "" if args.repeatmask: distanceA, args.repeatA = find_repeat(args.chrA, args.posA, c) distanceB, args.repeatB = find_repeat(args.chrB, args.posB, c) snpDistanceA, snpsA = find_snps(args.chrA, args.posA, args.snp_distance, args.bam, wd, args.fa) snpDistanceB, snpsB = find_snps(args.chrB, args.posB, args.snp_distance, args.bam, wd, args.fa) row_content = [ args.sample, var_id, args.type, splits, args.chrA, args.posA, args.orientationA, args.repeatA, distanceA, snpsA, snpDistanceA, args.chrB, args.posB, args.orientationB, args.repeatB, distanceB, snpsB, snpDistanceB, bp_homology, args.HomologySegments, insertions, insertion_seq, args.lengthA, args.lengthB, len(contig), args.regionAsegments, args.regionBsegments, args.contigSegments ] j = 0 for item in row_content: if j in [19, 25, 26, 27]: ws0.write_rich_text(row, j, item) else: ws0.write(row, j, item) j += 1 row += 1