Example #1
def main(args):
    SNV = {}
    for line in open(args.SNVvcf):
        if line[0] == "#":
            continue

        content = line.split("\t")
        content[1] = int(content[1])
        if not content[0] in SNV:
            SNV[content[0]] = {}
        if not content[1] in SNV[content[0]]:
            SNV[content[0]][content[1]] = []
        SNV[content[0]][content[1]].append({
            "ref": content[3],
            "alt": content[4]
        })

    for line in open(args.SVvcf):
        if line[0] == "#" and line[1] == "#":
            print line.strip()
            continue

        elif line[0] == "#":
            print "##INFO=<ID=COMPOUND,Number=1,Type=String,Description=\"prints all snps and snvs found within the SV in the following format chr|pos:ref-alt|pos2:ref2->alt2..\">"
            print line.strip()
            continue

        content = line.strip().split("\t")
        snp_tag = ";COMPOUND="

        chrA, posA, chrB, posB, event_type, INFO, FORMAT = readVCF.readVCFLine(
            line)
        if not chrA in SNV or not chrB in SNV:
            print line.strip()
            continue

        if chrA == chrB:
            snp_tag += retrieve_snps(chrA, posA - args.padding,
                                     posB + args.padding, SNV)
        else:
            snp_tag += retrieve_snps(chrA, posA - args.padding,
                                     posA + args.padding, SNV)
            snp_tag += "|" + retrieve_snps(chrB, posB - args.padding,
                                           posB + args.padding, SNV)

        if not snp_tag == ";COMPOUND={}".format(
                chrA) and not snp_tag == ";COMPOUND={}|{}".format(chrA, chrB):
            content[7] += snp_tag

        print "\t".join(content)
Example #2
def readVCFFile(toRead, chromosomes):
    vcfFileName = toRead
    vcfInfoLines = []
    with open(toRead, 'r') as vcf:

        #The first lines should be a number of meta-information lines, prepended by ##.
        #Should begin with fileformat. Store these. Check first line for correct format.
        line = vcf.readline()
        if (not line.startswith("##fileformat=")):
            print("VCF file is not in correct format")
            return None
        else:
            print("VCF file seems ok, continuing read")
        while (line.startswith("##")):
            vcfInfoLines.append(line)
            line = vcf.readline()

        #A header line prepended by # should follow containing 8 fields, tab-delimited.
        #These are in order CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO. Store in info line list.
        fields = line.split('\t')
        if not (line.startswith('#') and len(fields) == 8):
            print("Header columns missing in VCF file")
            return None
        else:
            vcfInfoLines.append(line)

        #All following lines are tab-delimited data lines.
        #Store variant data in chromosome item corresponding to CHROM field
        numvars = 0
        for line in vcf:
            numvars += 1
            #Feed every data line into readVCF, returning: (chrA,posA,chrB,posB,event_type,description,format)
            (chrA,posA,chrB,posB,event_type,description,format) = readVCF.readVCFLine(line)
            #Iterate through chromosome list to find match to insert data into
            for chromo in chromosomes:
                if chromo.name == chrA:
                    chromo.addVariant(chrA,posA,chrB,posB,event_type,description,format)
                    break
        return (chromosomes,vcfInfoLines)
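
# A minimal usage sketch, assuming a simple Chromosome container that exposes
# the name attribute and the addVariant() signature used above (this class is
# an assumption, not part of the original module):
class Chromosome(object):
    def __init__(self, name):
        self.name = name
        self.variants = []

    def addVariant(self, chrA, posA, chrB, posB, event_type, description, fmt):
        self.variants.append((chrA, posA, chrB, posB, event_type, description, fmt))

# chromosomes, vcfInfoLines = readVCFFile("sample.vcf", [Chromosome("1"), Chromosome("2")])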
Example #3
    help="frequency threshold, more common variants are not printed")
parser.add_argument(
    '--length',
    type=int,
    default=10000,
    help="length threshold, smaller intergenic variants are not printed")

args, unknown = parser.parse_known_args()
variant_list = []

output = "\"{chrA}\",{chrB},\"{posA}\",\"{posB}\",\"{len}\",\"{var}\",\"{frequency}\",\"{genes}"

sample_ids = readVCF.get_sample_ids(args.vcf)
for line in open(args.vcf):
    if not "#" == line[0]:
        chrA, posA, chrB, posB, event_type, INFO, format = readVCF.readVCFLine(
            line)
        #        print((chrA, posA, chrB, posB,event_type,INFO,format))
        content = line.strip().split("\t")

        #have a look in the snpeff field
        eff = True
        SNPEFF = ""
        try:
            SNPEFF = content[7].split("EFF=")[1]
        except IndexError:
            if ";ANN=" in content[7]:
                SNPEFF = content[7].split("ANN=")[1]
            elif ";CSQ=" in content[7]:
                eff = False
                SNPEFF = content[7].split("CSQ=")[1]
        effects = SNPEFF.split(",")
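
        # Each element of `effects` is one annotation record; a rough sketch of
        # how it might be unpacked (the field layout is an assumption: the old
        # snpEff EFF tag uses "effect(impact|...|gene|...)", while ANN/CSQ
        # records are plain pipe-separated):
        for effect in effects:
            if eff and "(" in effect:
                term, rest = effect.split("(", 1)
                fields = rest.rstrip(")").split("|")
            else:
                fields = effect.split("|")
                term = fields[1] if len(fields) > 1 else effect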
Example #4
def main(args):
    #start by loading the variations
    variations = args.variations
    ratio = args.overlap
    queries = []
    with open(variations) as fin:
        noOCCTag = 1
        infoFound = 0
        outputSource = "Not_specified"
        for line in fin:
            #process and output the metadata
            if line.startswith("#") or line.startswith("="):
                #find the output source(cnvnator or Findtranslocations)
                meta_line = line.replace("#", "")
                content = meta_line.split("=")
                if (content[0] == "source"):
                    outputSource = content[1].rstrip().split()[0]

                lookForFilter = meta_line.split("=")
                #the last infotag will be the Feature tag
                if (lookForFilter[0] != "INFO" and noOCCTag
                        and infoFound == 1):
                    sys.stdout.write(
                        "##INFO=<ID=OCC,Number=1,Type=Integer,Description=\"The number of occurrences of the event in the database\">\n"
                    )
                    sys.stdout.write(
                        "##INFO=<ID=FRQ,Number=1,Type=Float,Description=\"The frequency of the event in the database\">\n"
                    )
                    sys.stdout.write(line)
                    infoFound = 0
                    noFeatureTag = 0
                elif (lookForFilter[0] == "INFO"):
                    sys.stdout.write(line)
                    infoFound = 1
                    #there should only be one OCC tag per vcf file
                    if "ID=OCC," in line:
                        noOCCTag = 0
                else:
                    sys.stdout.write(line)
            else:
                #in this case I need to store a query
                chrA, startA, endA, chrB, startB, endB, event_type = readVCF.readVCFLine(
                    outputSource, line)
                current_variation = [
                    chrA.replace("chr", ""),
                    int(startA),
                    int(endA),
                    chrB.replace("chr", ""),
                    int(startB),
                    int(endB), event_type, 0, line
                ]  # plus a counter and the variation
                queries.append(current_variation)
    # at this point queries contains an entry for each variation
    #now query each sample.db present in the given folder and store the occurrences
    if (args.db):
        dataBases = glob.glob("{}/*.db".format(os.path.abspath(args.db)))
    else:
        dataBases = args.files

    for sample_db in dataBases:
        allVariations = {}
        with open(sample_db) as fDB:
            for line in fDB:
                db_entry = (line.rstrip().split('\t'))
                db_entry[0] = db_entry[0].replace("chr", "")
                db_entry[1] = db_entry[1].replace("chr", "")
                if db_entry[0] in allVariations:
                    if db_entry[1] in allVariations[db_entry[0]]:
                        allVariations[db_entry[0]][db_entry[1]].append([
                            db_entry[2], db_entry[3], db_entry[4], db_entry[5],
                            db_entry[6], db_entry[7]
                        ])
                    else:
                        allVariations[db_entry[0]][db_entry[1]] = [[
                            db_entry[2], db_entry[3], db_entry[4], db_entry[5],
                            db_entry[6], db_entry[7]
                        ]]
                else:
                    allVariations[db_entry[0]] = {}
                    allVariations[db_entry[0]][db_entry[1]] = [[
                        db_entry[2], db_entry[3], db_entry[4], db_entry[5],
                        db_entry[6], db_entry[7]
                    ]]

        for query in queries:
            hit = isVariationInDB(allVariations, query, ratio, args)
            if hit:
                query[7] += 1  # found hit

    for query in sorted(queries, key=itemgetter(7)):
        vcf_entry = query[8].rstrip()
        content = vcf_entry.split("\t")
        content[7] = "{};OCC={};FRQ={}".format(
            content[7], query[7], (query[7] / float(len(dataBases))))
        print(("\t").join(content))
Example #5
def main(args):
    #start by loading the variations
    variations = args.variations
    ratio= args.overlap
    queries = []
    with open(variations) as fin:
        noOCCTag=1;
        infoFound=0;
        outputSource="Not_specified"
        for line in fin:
            #process and output the metadata
            if line.startswith("#") or line.startswith("="):
                #find the output source(cnvnator or Findtranslocations)
                meta_line=line.replace("#","");
                content=meta_line.split("=");
                if(content[0] == "source"):
                    outputSource=content[1].rstrip().split()[0];

                lookForFilter=meta_line.split("=");
                #the last infotag will be the Feature tag
                if(lookForFilter[0] != "INFO" and noOCCTag and infoFound==1):
                    sys.stdout.write("##INFO=<ID=OCC,Number=1,Type=Integer,Description=\"The number of occurances of the event in the database\">\n");
                    sys.stdout.write("##INFO=<ID=FRQ,Number=1,Type=Integer,Description=\"The frequency of the event in the database\">\n");
                    sys.stdout.write(line);
                    infoFound=0;noFeatureTag=0;
                elif(lookForFilter[0] == "INFO"):
                    sys.stdout.write(line);
                    infoFound=1;
                    #there should only be one OCC tag per vcf file
                    if "ID=OCC," in line:
                        noOCCTag = 0
                else:
                    sys.stdout.write(line)
            else:
                #in this case I need to store a query
                chrA,startA,endA,chrB,startB,endB,event_type =readVCF.readVCFLine(outputSource, line);
                current_variation = [chrA.replace("chr",""), int(startA), int(endA), chrB.replace("chr",""), int(startB), int(endB),event_type, 0, line] # plus a counter and the variation
                queries.append(current_variation)
    # at this point queries contains an entry for each variation
    #now query each sample.db present in the given folder and store the occurrences
    if(args.db):
        dataBases=glob.glob("{}/*.db".format(os.path.abspath(args.db)))
    else:
        dataBases=args.files

    for sample_db in dataBases:
        allVariations = {}
        with open(sample_db) as fDB:
            for line in fDB:
                db_entry = (line.rstrip().split('\t'))
                db_entry[0] = db_entry[0].replace("chr","")
                db_entry[1] = db_entry[1].replace("chr","")
                if db_entry[0] in allVariations:
                    if db_entry[1] in allVariations[db_entry[0]]:
                        allVariations[db_entry[0]][db_entry[1]].append([db_entry[2], db_entry[3], db_entry[4], db_entry[5], db_entry[6], db_entry[7]])
                    else:
                        allVariations[db_entry[0]][db_entry[1]] = [[db_entry[2], db_entry[3], db_entry[4], db_entry[5], db_entry[6], db_entry[7]]]
                else:
                    allVariations[db_entry[0]] = {}
                    allVariations[db_entry[0]][db_entry[1]] = [[db_entry[2], db_entry[3], db_entry[4], db_entry[5], db_entry[6], db_entry[7]]]

        for query in queries:
            hit,allVariations = isVariationInDB(allVariations, query,ratio,args)
            if hit:
                query[7] += 1 # found hit

    for query in sorted(queries, key=itemgetter(7)):
        vcf_entry = query[8].rstrip()
        sys.stdout.write("{};OCC={};FRQ={}\n".format(vcf_entry, query[7],(query[7]/float(len(dataBases))) ) )
Example #6
def main(args):
    bin_size = 0
    coverage = {}
    #read the coverage file
    with open(args.tab) as fin:
        for line in fin:
            if (line[0] == "#"):
                pass
            else:
                content = line.strip().split("\t")
                bin_size = int(content[2]) - int(content[1])
                if not content[0].replace("chr", "").replace("CHR",
                                                             "") in coverage:
                    coverage[content[0].replace("chr", "").replace(
                        "CHR", "")] = [float(content[3])]
                else:
                    coverage[content[0].replace("chr",
                                                "").replace("CHR", "")].append(
                                                    float(content[3]))

    #compute the coverage across each chromosome
    chr_cov = chromosome_coverage(bin_size, coverage)

    #use the coverage to genotype the CNVS
    RD = 0
    PE = 0
    SR = 0
    GT = 0

    format_print = 1
    with open(args.vcf) as fin:
        for line in fin:
            if (line[0] == "#" and line[1] == "#"):

                if ("#FORMAT=<ID=RD,Number=1,Type=Float,Description=\"The read depth fraction at the variant\">"
                        in line):
                    RD = 1
                elif ("#FORMAT=<ID=PE,Number=1,Type=Integer,Description=\"The number of discordant pairs\">"
                      in line):
                    PE = 1
                elif ("#FORMAT=<ID=PE,Number=1,Type=Integer,Description=\"The number of split reads\">"
                      ):
                    SR = 1
                elif ("#FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"
                      ):
                    GT = 1
                print(line.strip())

            elif (line[0] == "#"):
                out_line = line.strip() + "\tFORMAT\t" + args.sample

                if format_print:
                    if not RD:
                        print(
                            "##FORMAT=<ID=RD,Number=1,Type=Float,Description=\"The read depth fraction at the variant\">"
                        )
                    if not PE:
                        print(
                            "##FORMAT=<ID=PE,Number=1,Type=Integer,Description=\"The number of discordant pairs\">"
                        )
                    if not SR:
                        print(
                            "##FORMAT=<ID=SR,Number=1,Type=Integer,Description=\"The number of split reads\">"
                        )
                    if not GT:
                        print(
                            "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">"
                        )
                    print(out_line)
                    format_print = 0
            else:

                RD = "."
                PE = "0"
                SR = 0
                GT = "./."

                chrA, startA, endA, chrB, startB, endB, event_type = readVCF.readVCFLine(
                    None, line)
                out_line = line.strip()
                try:
                    chrA_cov = chr_cov[chrA]
                    #calculate the coverage across intra chromosomal variants
                    if (chrA == chrB):
                        var_cov = mean_coverage(chrA, startA, endA, bin_size,
                                                coverage)
                        RD = (var_cov / float(chr_cov[chrA]))
                        if ("DUP" in event_type):
                            if RD < 1.4:
                                GT = "0/1"

                        elif ("DEL" in event_type):
                            if (RD > 0.1 * chr_cov[chrA]):
                                GT = "0/1"
                            else:
                                GT = "1/1"
                except:
                    pass

                RD = str(RD)
                #compute the number of discordant pairs
                if (";LTE=" in line):
                    LTE = line.split(";LTE=")[1]
                    LTE = LTE.split(";")[0]
                    PE = LTE
                #generate the format data
                format_line = [GT, PE, RD, str(SR)]
                out_line = out_line + "\t" + "GT:PE:RD:SR" + "\t" + ":".join(
                    format_line)
                print(out_line)
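
# chromosome_coverage and mean_coverage are defined elsewhere; a minimal sketch
# of what they presumably compute from the binned coverage dictionary built
# above (an assumption, not the original code):
def chromosome_coverage(bin_size, coverage):
    return {chrom: sum(bins) / float(len(bins)) for chrom, bins in coverage.items()}

def mean_coverage(chrom, start, end, bin_size, coverage):
    first = int(start) // bin_size
    last = int(end) // bin_size + 1
    bins = coverage[chrom][first:last]
    return sum(bins) / float(len(bins)) if bins else 0.0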
Example #7
def main(args):
    bin_size=0;
    coverage={}
    #read the coverage file
    with open(args.tab) as fin:
        for line in fin:
            if(line[0]=="#"):
                pass
            else:
                content=line.strip().split("\t")
                bin_size= int(content[2]) - int(content[1])
                if not content[0].replace("chr","").replace("CHR","") in coverage:
                    coverage[content[0].replace("chr","").replace("CHR","")]=[float(content[3])]
                else:
                    coverage[content[0].replace("chr","").replace("CHR","")].append(float(content[3]))


    #compute the coverage across each chromosome
    chr_cov=chromosome_coverage(bin_size,coverage);

    #use the coverage to genotype the CNVS
    RD=0;
    PE=0;
    SR=0;
    GT=0;
    
    format_print=1;
    with open(args.vcf) as fin:
        for line in fin:
            if(line[0]=="#" and line[1] == "#" ):
                
                if ("#FORMAT=<ID=RD,Number=1,Type=Float,Description=\"The read depth fraction at the variant\">" in line):
                    RD = 1;
                elif("#FORMAT=<ID=PE,Number=1,Type=Integer,Description=\"The number of discordant pairs\">" in line):
                    PE = 1;
                elif("#FORMAT=<ID=PE,Number=1,Type=Integer,Description=\"The number of split reads\">" ):
                    SR = 1;
                elif("#FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">" ):
                    GT = 1;
                print(line.strip());
                
            elif(line[0] == "#"):
                out_line=line.strip()+"\tFORMAT\t"+args.sample
                
                if format_print:
                    if not RD:
                        print("##FORMAT=<ID=RD,Number=1,Type=Float,Description=\"The read depth fraction at the variant\">");
                    if not PE:
                        print("##FORMAT=<ID=PE,Number=1,Type=Integer,Description=\"The number of discordant pairs\">" );
                    if not SR:
                        print("##FORMAT=<ID=PE,Number=1,Type=Integer,Description=\"The number of split reads\">");
                    if not GT:
                        print("##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">");
                    print(out_line)
                    format_print=0;
            else:

                RD="."
                PE="0";
                SR=0;                    
                GT="./."
                    
                chrA,startA,endA,chrB,startB,endB,event_type =readVCF.readVCFLine(None, line);
                out_line=line.strip();
                try:
                    chrA_cov=chr_cov[chrA];
                    #calculate the coverage across intra chromosomal variants
                    if(chrA == chrB):
                        var_cov=mean_coverage(chrA,startA,endA,bin_size,coverage)
                        RD=( var_cov/float( chr_cov[chrA] ) )
                        if("DUP" in event_type):
                            if RD < 1.4:
                                GT="0/1"
                            
                        elif("DEL" in event_type):
                            if(RD > 0.1*chr_cov[chrA]):
                                GT="0/1"
                            else:
                                GT="1/1"
                except:
                    pass
                        
                RD=str(RD)
                #compute the number of discordant pairs
                if(";LTE=" in line):
                    LTE=line.split(";LTE=")[1];
                    LTE=LTE.split(";")[0]    
                    PE=LTE
                #generate the format data
                format_line=[GT,PE,RD,str(SR)]
                out_line=out_line+"\t"+"GT:PE:RD:SR" + "\t" + ":".join(format_line)
                print(out_line)
Example #8
def main(args):
    allVariations       = {}
    tollerance    = args.tollerance
    fixed =0
    if args.fixed:
        fixed =1
    for variation_file in [item for sublist in args.variations for item in sublist] :
        outputSource=None;
        collapsedVariations = {} # this will contain the SVs of this file, but collapsing those that are close
        with open(variation_file) as fin:
            #memorize all variations
            for line in fin:
                if line.startswith("#") or line.startswith("="):
                    #find the output source (cnvnator or Findtranslocations)
                    line = line.replace("#", "")
                    content = line.split("=")
                    if content[0] == "source":
                        outputSource = content[1].rstrip().split()[0]
                    continue
                
                chrA, startA, endA, chrB, startB, endB, event_type = readVCF.readVCFLine(outputSource, line)

                startA = startA - tollerance
                if startA < 0:
                    startA = 0
                startB = startB - tollerance
                if startB < 0:
                    startB = 0

                current_variation = [chrA, startA , endA + tollerance, chrB, startB, endB + tollerance,event_type]
                collapsedVariations = populate_DB(collapsedVariations, current_variation, True, 0, fixed )

        ##collapse again in order to avoid problems with areas that have become too close to one another
        elements_before = 0
        for chrA in collapsedVariations:
            for chrB in collapsedVariations[chrA]:
                for collapsedVariation in collapsedVariations[chrA][chrB]:
                    elements_before += 1
        elements_after = 0

        while elements_before != elements_after:
            collapsedVariationsFinal = {}
            elements_before = 0
            elements_after = 0
            for chrA in collapsedVariations:
                for chrB in collapsedVariations[chrA]:
                    for collapsedVariation in collapsedVariations[chrA][chrB]:
                        current_variation = [chrA, collapsedVariation[0], collapsedVariation[1], chrB, collapsedVariation[2], collapsedVariation[3], collapsedVariation[4]]
                        collapsedVariationsFinal = populate_DB(collapsedVariationsFinal, current_variation, True, 0, fixed)
                        elements_before += 1
            for chrA in collapsedVariationsFinal:
                for chrB in collapsedVariationsFinal[chrA]:
                    for collapsedVariation in collapsedVariationsFinal[chrA][chrB]:
                        elements_after += 1
            collapsedVariations.clear()
            collapsedVariations = collapsedVariationsFinal


        #now populate the DB
        for chrA in collapsedVariations:
            for chrB in collapsedVariations[chrA] :
                for collapsedVariation in collapsedVariations[chrA][chrB]:
                    current_variation = [chrA, collapsedVariation[0],  collapsedVariation[1], chrB, collapsedVariation[2], collapsedVariation[3],collapsedVariation[4]]
                    allVariations = populate_DB(allVariations, current_variation, False, 0, fixed)

    for chrA in allVariations:
        for chrB in allVariations[chrA] :
            for event in allVariations[chrA][chrB]:
                print "{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(chrA, chrB,  event[0],  event[1],  event[2], event[3], event[4], event[5])
Example #9
def main(args):
    #print "memorizing masked reagions"
    toBeMaskedElements = {}
    featureEntries = []
    for bed_file in [item for sublist in args.bed_files for item in sublist]:
        toBeMaskedElements[bed_file] = {}
        bedlabel = bed_file.split("/")[-1]
        bedlabel = bedlabel.split(".")[0]
        featureEntries.append(
            "##INFO=<ID={0},Number=2,Type=String,Description=\"Genomic features of regions A and B\">\n"
            .format(bedlabel))
        #sys.stdout.write("memorizing {} ...".format(bed_file))
        with open(bed_file) as fin:
            rows = (line.rstrip().split('\t') for line in fin)
            for row in rows:
                if not row[0].startswith("#"):
                    try:
                        toBeMaskedElements[bed_file][row[0].replace(
                            "chr",
                            "").replace("Chr",
                                        "")].append([row[1], row[2], row[3]])

                    except:
                        toBeMaskedElements[bed_file][row[0].replace(
                            "chr",
                            "").replace("Chr",
                                        "")] = [[row[1], row[2], row[3]]]
            #sys.stdout.write("done\n")

    #print "sorting masked regions"
    for bedFiles in toBeMaskedElements:
        for chr in toBeMaskedElements[bedFiles]:
            toBeMaskedElements[bedFiles][chr].sort(key=itemgetter(1))

    #print toBeMaskedElements

    #UncompressedVariations = [];
    #CompressedVariations = []
    #currentVariation     = UncompressedVariations[0]

    with open(args.variations) as fin:

        UncompressedVariations = (line.rstrip().split('\t') for line in fin)
        noFeatureTag = 1
        infoFound = 0
        outputSource = "Not_specified"
        for row in UncompressedVariations:
            #the first character is # in the metadata, otherwise it is c, C or a number
            if (row[0][0] != "#"):
                chr_1, chr_1_start, chr_1_end, chr_2, chr_2_start, chr_2_end, event_type = readVCF.readVCFLine(
                    outputSource, "\t".join(row))
                chr_1_start = int(chr_1_start)
                chr_1_end = int(chr_1_end)
                outrow = "\t".join(row)
                outRow = outrow.replace("\n", "")
                sys.stdout.write(outRow)

                ## Categorise the two breakpoints
                for bedFile in toBeMaskedElements:
                    bedlabel = bedFile.split("/")[-1]
                    bedlabel = bedlabel.split(".")[0]
                    for j in range(2):
                        if j == 0:
                            variation_chr = chr_1.replace("chr", "").replace(
                                "Chr", "")
                            variation_start = chr_1_start
                            variation_end = chr_1_end
                        else:
                            variation_chr = chr_2.replace("chr", "").replace(
                                "Chr", "")
                            variation_start = chr_2_start
                            variation_end = chr_2_end
                            stopSearching = 0

                        if variation_chr in toBeMaskedElements[bedFile]:
                            categorized = 0
                            i = 0
                            max_Size = 0
                            max_Size_type = ""

                            while i < len(toBeMaskedElements[bedFile]
                                          [variation_chr]):
                                repeat_start = int(toBeMaskedElements[bedFile]
                                                   [variation_chr][i][0])
                                repeat_end = int(toBeMaskedElements[bedFile]
                                                 [variation_chr][i][1])
                                repeat_type = toBeMaskedElements[bedFile][
                                    variation_chr][i][2]
                                i += 1
                                overlap = 0
                                if variation_start > repeat_end:
                                    stopSearching = 1
                                elif variation_start <= repeat_start and variation_end >= repeat_end and variation_start < repeat_end:
                                    overlap = repeat_end - repeat_start + 1
                                elif variation_start <= repeat_start and variation_end <= repeat_end and variation_start < repeat_end:
                                    overlap = variation_end - repeat_start + 1
                                elif variation_start >= repeat_start and variation_end >= repeat_end and variation_start < repeat_end:
                                    overlap = repeat_end - variation_start + 1
                                elif variation_start >= repeat_start and variation_end <= repeat_end and variation_start < repeat_end:
                                    overlap = variation_end - variation_start + 1
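                                # Note: the four overlap cases above reduce to
                                #   overlap = min(variation_end, repeat_end) - max(variation_start, repeat_start) + 1
                                # whenever the two intervals actually intersect.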

                                if overlap > max_Size:
                                    max_Size = overlap
                                    categorized = 1
                                    max_Size_type = "{}".format(repeat_type)

                            if categorized == 0:
                                if (j == 0):
                                    sys.stdout.write(
                                        ";{0}=NoCategory".format(bedlabel))
                                else:
                                    sys.stdout.write(",NoCategory")
                            else:
                                if (j == 0):
                                    sys.stdout.write(";{0}={1}".format(
                                        bedlabel, max_Size_type))
                                else:
                                    sys.stdout.write(
                                        ",{}".format(max_Size_type))

                        else:
                            if (j == 0):
                                sys.stdout.write(
                                    ";{}=NotPresentInRef".format(bedlabel))
                            else:
                                sys.stdout.write(",NotPresentInRef")
                sys.stdout.write("\n")
            else:
                lookForFilter = row[0].split("=")

                line = row[0].replace("#", "")
                content = line.split("=")
                if (content[0] == "source"):
                    outputSource = content[1].rstrip().split()[0]

                #the last infotag will be the Feature tag
                if (lookForFilter[0] != "##INFO" and infoFound == 1):
                    for entry in featureEntries:
                        sys.stdout.write(entry)
                    sys.stdout.write(row[0] + "\n")
                    infoFound = 0
                elif (lookForFilter[0] == "##INFO"):
                    sys.stdout.write(row[0] + "\n")
                    infoFound = 1
                    #there should only be one feature tag per vf file
                    if row[0] in featureEntries: featureEntries.remove(row[0])

                else:
                    sys.stdout.write("\t".join(row) + "\n")

    return 0
Example #10
def main(args):
    #print "memorizing masked reagions"
    toBeMaskedElements = {};
    featureEntries=[];
    for bed_file in [item for sublist in args.bed_files for item in sublist] :
        toBeMaskedElements[bed_file]={};
        bedlabel=bed_file.split("/")[-1]
        bedlabel=bedlabel.split(".")[0]
        featureEntries.append("##INFO=<ID={0},Number=2,Type=String,Description=\"Genomic features of regions A and B\">\n".format(bedlabel))
        #sys.stdout.write("memorizing {} ...".format(bed_file))
        with open(bed_file) as fin:
            rows = ( line.rstrip().split('\t') for line in fin)
            for row in rows:
                if not row[0].startswith("#"):
                    try:
                        toBeMaskedElements[bed_file][row[0].replace("chr","").replace("Chr","")].append([row[1], row[2], row[3]])

                    except:
                        toBeMaskedElements[bed_file][row[0].replace("chr","").replace("Chr","")] = [[row[1], row[2], row[3]]]
            #sys.stdout.write("done\n")

    #print "sorting masked regions"
    for bedFiles in toBeMaskedElements:
        for chr in toBeMaskedElements[bedFiles]:
            toBeMaskedElements[bedFiles][chr].sort(key=itemgetter(1))


    #print toBeMaskedElements

    #UncompressedVariations = [];
    #CompressedVariations = []
    #currentVariation     = UncompressedVariations[0]

    with open(args.variations) as fin:

        UncompressedVariations = ( line.rstrip().split('\t') for line in fin)
        noFeatureTag = 1
        infoFound = 0
        outputSource = "Not_specified"
        for row in UncompressedVariations:
            #the first character is # in the metadata, otherwise it is c, C or a number
            if(row[0][0] != "#"):
                chr_1,chr_1_start,chr_1_end,chr_2,chr_2_start,chr_2_end,event_type =readVCF.readVCFLine(outputSource,"\t".join(row));
                chr_1_start=int(chr_1_start);
                chr_1_end=int(chr_1_end);
                outrow="\t".join(row);
                outRow=outrow.replace("\n","");
                sys.stdout.write(outRow)
            
                ## Categorise the two breakpoints
                for bedFile in toBeMaskedElements:
                    bedlabel=bedFile.split("/")[-1]
                    bedlabel=bedlabel.split(".")[0]
                    for j in range(2):
                        if j == 0:
                            variation_chr   = chr_1.replace("chr","").replace("Chr","")
                            variation_start = chr_1_start
                            variation_end   = chr_1_end
                        else:
                            variation_chr   = chr_2.replace("chr","").replace("Chr","")
                            variation_start = chr_2_start
                            variation_end   = chr_2_end
                            stopSearching = 0

                        if variation_chr in toBeMaskedElements[bedFile]:
                            categorized   = 0
                            i = 0
                            max_Size = 0
                            max_Size_type = ""

                            while i < len(toBeMaskedElements[bedFile][variation_chr]) :
                                repeat_start = int(toBeMaskedElements[bedFile][variation_chr][i][0])
                                repeat_end   = int(toBeMaskedElements[bedFile][variation_chr][i][1])
                                repeat_type  = toBeMaskedElements[bedFile][variation_chr][i][2]
                                i += 1
                                overlap = 0
                                if variation_start > repeat_end:
                                    stopSearching = 1
                                elif variation_start <= repeat_start and variation_end >= repeat_end and variation_start < repeat_end:
                                    overlap = repeat_end - repeat_start + 1
                                elif variation_start <= repeat_start and variation_end <= repeat_end and variation_start < repeat_end:
                                    overlap = variation_end - repeat_start + 1
                                elif variation_start >= repeat_start and variation_end >= repeat_end and variation_start < repeat_end:
                                    overlap = repeat_end - variation_start + 1
                                elif variation_start >= repeat_start and variation_end <= repeat_end and variation_start < repeat_end:
                                    overlap = variation_end - variation_start + 1

                                if overlap > max_Size:
                                    max_Size = overlap
                                    categorized = 1
                                    max_Size_type  = "{}".format(repeat_type)
                    
                            if categorized == 0:
                                if( j == 0):
                                    sys.stdout.write(";{0}=NoCategory".format(bedlabel))
                                else:
                                    sys.stdout.write(",NoCategory")
                            else:
                                if(j == 0):
                                    sys.stdout.write(";{0}={1}".format(bedlabel,max_Size_type))
                                else:
                                    sys.stdout.write(",{}".format(max_Size_type))

    
                        else:
                            if(j == 0):
                                sys.stdout.write(";{}=NotPresentInRef".format(bedlabel))
                            else:
                                sys.stdout.write(",NotPresentInRef")
                sys.stdout.write("\n")
            else:
                lookForFilter=row[0].split("=");

                line=row[0].replace("#","");
                content=line.split("=");
                if(content[0] == "source"):
                        outputSource=content[1].rstrip().split()[0];


                #the last infotag will be the Feature tag
                if(lookForFilter[0] !="##INFO" and infoFound==1):
                        for entry in featureEntries:
                            sys.stdout.write(entry);
                        sys.stdout.write(row[0]+"\n");
                        infoFound=0;
                elif(lookForFilter[0] == "##INFO"):
                        sys.stdout.write(row[0]+"\n");
                        infoFound=1;
                        #there should only be one feature tag per vf file
                        if row[0] in featureEntries: featureEntries.remove(row[0])
                                
                else:
                        sys.stdout.write("\t".join(row)+"\n");
            

    return 0
Example #11
def main(Data,GC_hist,args):

    if args.vcf:
        for line in open(args.vcf):
            if line[0] == "#" and line[1] == "#":
                print line.strip()
            
            elif line[0] == "#":
                print "##INFO=<ID={},Number=1,Type=Float,Description=\"estimated copy number\">".format("AMYCNE")
                print "##INFO=<ID={},Number=2,Type=Float,Description=\"99% confidence interval around the estimated CN\">".format("AMYCNECI")
                print "##INFO=<ID={},Number=1,Type=Float,Description=\"ratio of bins used for CN estimation\">".format("BIN_RATIO")
                print "##INFO=<ID={},Number=1,Type=Float,Description=\"mean coverage of the reference bins\">".format("REFCOV")
                print line.strip()
            else:
                chrA,posA,chrB,posB,event_type,INFO,FORMAT = readVCF.readVCFLine(line)
                
                if chrA == chrB:
                    if chrA in Data:
                        cn, gc, length,ref,bins,used_bins,bin_list =common.regional_cn_est( Data ,GC_hist, [chrA,posA,posB],args.Q )
                        ci="(0,0)"
                        CNE = round( cn*args.plody )
                        if CNE < 0:
                            CNE = -1
                        else:    
                            for i in range(0,len(bin_list)):
                                bin_list[i]=bin_list[i]*args.plody
                            SEM=(numpy.std(bin_list)/numpy.sqrt(used_bins))
                            ci="{},{}".format(round(cn*args.plody-SEM*3,2),round(cn*args.plody+SEM*3,2))
                        
                        content=line.split("\t")
                        if bins == 0:
                            bins =-1
                        content[7] += ";AMYCNE=" + str(int(CNE)) + ";BIN_RATIO=" + str(used_bins/float(bins)) + ";REFCOV=" + str(round(ref,2)) + ";AMYCNECI=" + ci
                        new_line="\t".join(content)
                        print new_line.strip()
                else:
                    print line.strip()
                    
    elif args.bed:
        for line in open(args.bed):
            if line[0] == "#":
                print "{}\t{}\t{}\t{}\t{}".format(line.strip(),"CN","CI","RATIO","Refcov")
                continue
            
            content=line.split("\t")
            chrA=content[0]
            chrB=chrA
            posA=content[1]
            posB=content[2]
            if chrA not in Data and chrA.replace("chr", "") in Data:
                chrA = chrA.replace("chr", "")


            if chrA in Data:
                cn, gc, length,ref,bins,used_bins,bin_list =common.regional_cn_est( Data ,GC_hist, [chrA,posA,posB],args.Q )
                ci="(0,0)"
                CNE = int( round( cn*args.plody ) )
                if CNE < 0:
                    CNE = -1
                else:    
                    for i in range(0,len(bin_list)):
                        bin_list[i]=bin_list[i]*args.plody
                    SEM=(numpy.std(bin_list)/numpy.sqrt(used_bins))
                    ci="{},{}".format(round(cn*args.plody-SEM*3,2),round(cn*args.plody+SEM*3,2))
                        
                    content=line.split("\t")
                    if bins == 0:
                        bins =-1
                    print "{}\t{}\t{}\t{}\t{}".format(line.strip(), CNE,ci, used_bins/float(bins), str(round(ref,2)))   
    else:
        print "ERROR: no input source, please select vcf or bed"
Example #12
def extract_splits(args, ws0):
    if args.bed:
        input_file = args.bed
    elif args.vcf:
        input_file = args.vcf
    else:
        print "error: missing bed or vcf"
        quit()

    if not args.sample:
        args.sample = args.bam.split("/")[-1].split(".")[0]
    if not args.working_dir:
        args.working_dir = args.bam.split("/")[-1].split(".")[0]

    if args.repeatmask:
        conn = sqlite3.connect(args.repeatmask)
        c = conn.cursor()

    row = 1
    detected_splits = {}

    i = 0
    for line in open(input_file):
        if line[0] == "#":
            continue

        if args.bed:
            content = line.strip().split()
            args.chrA = content[0]
            args.posA = int(content[1])
            args.chrB = content[2]
            args.posB = int(content[3])
            args.type = content[4]
            args.orientationA = ""
            args.orientationB = ""
            args.id = str(i)
            var_id = str(i)
            args.lengthA = ""
            args.lengthB = ""
            args.regionA = ""
            args.regionB = ""
            insertion_seq = ""
            homology_seq = ""
            args.regionAsegments = ()
            args.regionBsegments = ()
            args.contigSegments = ()
            args.HomologySegments = ()
            args.repeatA = ""
            args.repeatB = ""
            i += 1
        elif args.vcf:
            chrA, posA, chrB, posB, event_type, INFO, FORMAT = readVCF.readVCFLine(
                line)
            args.chrA = chrA
            args.posA = int(posA)
            args.chrB = chrB
            args.posB = int(posB)
            args.type = event_type

            args.orientationA = ""
            args.orientationB = ""
            args.id = line.strip().split("\t")[2]
            var_id = line.strip().split("\t")[2]
            args.lengthA = ""
            args.lengthB = ""
            args.regionA = ""
            args.regionB = ""
            insertion_seq = ""
            homology_seq = ""
            args.regionAsegments = ()
            args.regionBsegments = ()
            args.contigSegments = ()
            args.HomologySegments = ()
            args.repeatA = ""
            args.repeatB = ""
            i += 1

        found = FindTranslocations.main(args)

        splits = 0
        bp_homology = ""
        insertions = ""
        deletions = ""
        contig = ""
        success = False
        print found
        if found:
            wd = os.path.join(args.working_dir, var_id)
            softclips = os.path.join(wd, "splits.sam")
            print "python consensus.py {} {} > {}/consensus.fa".format(
                wd, softclips, wd)
            os.system("python consensus.py {} {} > {}/consensus.fa".format(
                wd, softclips, wd))
            for line in open("{}/consensus.fa".format(wd)):
                splits = int(line.strip().split()[2])
                print splits
                break

            try:
                os.system("bwa mem {} {} > {}".format(
                    args.fa,
                    os.path.join(args.working_dir, var_id, "consensus.fa"),
                    os.path.join(wd, "aligned_consensus.sam")))
                args, success, contig, bp_homology, homology_seq, insertions, insertion_seq, deletions = retrieve_pos(
                    args,
                    os.path.join(args.working_dir, var_id,
                                 "aligned_consensus.sam"))
                print success
            except:
                homology_seq = "WARNING:unable to determine the breakpoint sequence"
        else:
            wd = os.path.join(args.working_dir, var_id)
            os.system("samtools view {} {}:{}-{} > {}/regionA.sam".format(
                args.bam, args.chrA, args.posA - args.padding,
                args.posA + args.padding, wd))
            os.system("samtools view {} {}:{}-{} > {}/regionB.sam".format(
                args.bam, args.chrB, args.posB - args.padding,
                args.posB + args.padding, wd))
            os.system(
                "cat {}/regionA.sam {}/regionB.sam > {}/region.sam".format(
                    wd, wd, wd))
            os.system("python bam2fa.py {}/region.sam > {}/region.fq".format(
                wd, wd))
            trials = [20, 60, 90]
            for k in trials:
                print "ABYSS -c {} -e {} -k {} -o {}_{}.fa {} > /dev/null 2>&1".format(
                    1, 10, k, os.path.join(args.working_dir, var_id, "abyss"),
                    k, os.path.join(wd, "region.fq"))
                os.system(
                    "ABYSS -c {} -e {} -k {} -o {}_{}.fa {} > /dev/null 2>&1".
                    format(1, 10, k,
                           os.path.join(args.working_dir, var_id, "abyss"), k,
                           os.path.join(wd, "region.fq")))
            os.system("cat {}_20.fa {}_60.fa {}_90.fa > {}".format(
                os.path.join(args.working_dir, var_id, "abyss"),
                os.path.join(args.working_dir, var_id, "abyss"),
                os.path.join(args.working_dir, var_id, "abyss"),
                os.path.join(args.working_dir, var_id, "abyss.fa")))

            if not os.stat(os.path.join(args.working_dir, var_id,
                                        "abyss.fa")).st_size == 0:
                os.system("bwa mem {} {} > {}".format(
                    args.fa, os.path.join(args.working_dir, var_id,
                                          "abyss.fa"),
                    os.path.join(wd, "aligned_contig.sam")))
                try:
                    args, success, contig, bp_homology, homology_seq, insertions, insertion_seq, deletions = retrieve_pos(
                        args, os.path.join(wd, "aligned_contig.sam"))
                except:
                    homology_seq = "WARNING:unable to determine the breakpoint sequence"
            if not success:
                contig = ""

        distanceA = ""
        distanceB = ""
        if args.repeatmask:
            distanceA, args.repeatA = find_repeat(args.chrA, args.posA, c)
            distanceB, args.repeatB = find_repeat(args.chrB, args.posB, c)

        snpDistanceA, snpsA = find_snps(args.chrA, args.posA,
                                        args.snp_distance, args.bam, wd,
                                        args.fa)
        snpDistanceB, snpsB = find_snps(args.chrB, args.posB,
                                        args.snp_distance, args.bam, wd,
                                        args.fa)

        row_content = [
            args.sample, var_id, args.type, splits, args.chrA, args.posA,
            args.orientationA, args.repeatA, distanceA, snpsA, snpDistanceA,
            args.chrB, args.posB, args.orientationB, args.repeatB, distanceB,
            snpsB, snpDistanceB, bp_homology, args.HomologySegments,
            insertions, insertion_seq, args.lengthA, args.lengthB,
            len(contig), args.regionAsegments, args.regionBsegments,
            args.contigSegments
        ]
        j = 0
        for item in row_content:

            if j in [19, 25, 26, 27]:
                ws0.write_rich_text(row, j, item)
            else:
                ws0.write(row, j, item)
            j += 1
        row += 1
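
# extract_splits starts writing at row 1, so the caller presumably writes a
# header row first. A minimal sketch (column names are inferred from the
# row_content list above and are assumptions; ws0 is assumed to expose the
# write(row, col, value) method used in the loop):
def write_header(ws0):
    header = ["sample", "variant_id", "type", "splits", "chrA", "posA", "orientationA",
              "repeatA", "repeat_distanceA", "snpsA", "snp_distanceA", "chrB", "posB",
              "orientationB", "repeatB", "repeat_distanceB", "snpsB", "snp_distanceB",
              "bp_homology", "homology_segments", "insertions", "insertion_seq",
              "lengthA", "lengthB", "contig_length", "regionA_segments",
              "regionB_segments", "contig_segments"]
    for col, name in enumerate(header):
        ws0.write(0, col, name)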