Ejemplo n.º 1
0
def rmdup(prog, opts, wd, hitDict, organelle, nucleus):
    """Detect duplicated hits with LAST and filter them out of hitDict.

    extractFlankSeq writes two flank FASTA files into wd; the first one is
    indexed with lastdb and then aligned against its own index with lastal.
    formatResult converts the alignment output into the collection of
    duplicated hits.  When opts.segdup_db is set, the result of checkSegDup
    is united with that collection before filterDups is applied.

    Parameters:
        prog      -- program name, prefixed to verbose messages
        opts      -- options object; the attributes last, verbose,
                     flank_len, dup_coverage and segdup_db are read
        wd        -- working directory for the flank files and the index
        hitDict   -- mapping whose values are sized collections of hits
        organelle -- not used by this function
        nucleus   -- forwarded to extractFlankSeq

    Returns:
        (hitCounts, dupDict, dupCounts).  When extractFlankSeq returns a
        false value no LAST command is run and the result is
        (hitCounts, {}, 0).
    """
    def tally(table):
        # total number of entries over all values of a mapping
        return sum(len(entries) for entries in table.values())

    # shell command templates for the LAST executables
    lastDir = opts.last
    lastdbCmd = lastDir + "/lastdb %s %s %s"
    lastalCmd = lastDir + "/lastal -e%s -j4 -f0 %s %s | grep -v '#'"
    lastexCmd = lastDir + "/lastex -E%s %s.prj %s.prj | sed -n 4p - | cut -f1"

    # working files: the two flank FASTA files and the index prefix
    fwdFasta = wd + "/flankSeq.fa"
    revFasta = wd + "/flankRev.fa"
    indexBase = wd + "/flankIndex"

    if opts.verbose:
        for note in (": filter nuclear duplications",
                     ": compare flanking sequence similarities"):
            print(prog + note)

    # guard: without flank sequences there is nothing to compare
    if not extractFlankSeq(fwdFasta, revFasta, opts.flank_len, hitDict,
                           nucleus):
        return tally(hitDict), {}, 0

    # build the index, derive a score threshold, then self-align the flanks
    threshold = '1e-25'
    commands.getoutput(lastdbCmd % ("-c", indexBase, fwdFasta))
    score = commands.getoutput(lastexCmd % (threshold, indexBase, indexBase))
    # only the score is used afterwards; the second value is discarded
    score, _evalue = evalueSimulation(lastexCmd, lastalCmd, indexBase,
                                      indexBase, revFasta, score, threshold)
    alignment = commands.getoutput(lastalCmd % (score, indexBase, fwdFasta))
    duplicated = formatResult(alignment, opts.dup_coverage)

    # optional crosscheck with the segmental duplication database
    if opts.segdup_db:
        if opts.verbose:
            print(prog + ": crosscheck with segmental duplication database")
        fromDatabase = checkSegDup(opts.segdup_db, hitDict,
                                   opts.dup_coverage)
        duplicated = duplicated.union(fromDatabase)

    # hitDict is counted only after filterDups has run
    # (presumably filterDups removes the duplicates from hitDict -- confirm)
    dupDict = filterDups(duplicated, hitDict)
    return tally(hitDict), dupDict, tally(dupDict)
Ejemplo n.º 2
0
def rmdup(prog, opts, wd, hitDict, organelle, nucleus):
    """Remove nuclear duplications from hitDict and report the counts.

    The flank sequences produced by extractFlankSeq are indexed (lastdb)
    and aligned to that same index (lastal) with a score obtained from
    lastex and evalueSimulation.  formatResult parses the alignments into
    the collection of duplicated hits, which is optionally extended with
    the result of checkSegDup when opts.segdup_db is set, and finally
    handed to filterDups together with hitDict.

    Parameters:
        prog      -- program name used as prefix for verbose output
        opts      -- options object (last, verbose, flank_len,
                     dup_coverage and segdup_db are read)
        wd        -- directory in which the flank files and index live
        hitDict   -- mapping whose values support len()
        organelle -- accepted but not used here
        nucleus   -- handed on to extractFlankSeq

    Returns:
        A tuple (hitCounts, dupDict, dupCounts); dupDict is empty and
        dupCounts is 0 when extractFlankSeq returned a false value.
    """
    # command line format strings
    lastBin = opts.last
    dbTemplate = lastBin + "/lastdb %s %s %s"
    alTemplate = lastBin + "/lastal -e%s -j4 -f0 %s %s | grep -v '#'"
    exTemplate = lastBin + "/lastex -E%s %s.prj %s.prj | sed -n 4p - | cut -f1"

    # flank sequence files and index prefix inside the working directory
    seqPath = wd + "/flankSeq.fa"
    revPath = wd + "/flankRev.fa"
    idxPath = wd + "/flankIndex"

    verbose = opts.verbose
    if verbose:
        print(prog + ": filter nuclear duplications")
        print(prog + ": compare flanking sequence similarities")

    dupDict = {}
    wrote = extractFlankSeq(seqPath, revPath, opts.flank_len, hitDict,
                            nucleus)
    if wrote:
        # compare flanking sequence similarity
        eCutoff = '1e-25'
        commands.getoutput(dbTemplate % ("-c", idxPath, seqPath))
        score = commands.getoutput(exTemplate % (eCutoff, idxPath, idxPath))
        simulated = evalueSimulation(exTemplate, alTemplate, idxPath, idxPath,
                                     revPath, score, eCutoff)
        # a pair is expected back; only its first element is used below
        score, _ = simulated
        rawHits = commands.getoutput(alTemplate % (score, idxPath, seqPath))
        dupSet = formatResult(rawHits, opts.dup_coverage)

        # crosscheck with segmental duplication database
        if opts.segdup_db:
            if opts.verbose:
                print(prog + ": crosscheck with segmental duplication database")
            extraDups = checkSegDup(opts.segdup_db, hitDict,
                                    opts.dup_coverage)
            dupSet = dupSet.union(extraDups)

        # remove duplicate hits
        dupDict = filterDups(dupSet, hitDict)

    # counts are taken last so that they describe the state after filtering
    hitCounts = sum([len(hits) for hits in hitDict.values()])
    dupCounts = sum([len(dups) for dups in dupDict.values()])
    return hitCounts, dupDict, dupCounts
Ejemplo n.º 3
0
def checkGenomes(prog, coord, ref, labels, gens, spes, spesV, opts, wd):
    """Align the bait sequences of the reference against every other genome.

    The entry of spes whose value equals ref identifies the reference
    genome; extractBaitSeq writes its bait sequences to wd, and a lastdb
    "-x" run is made on them.  For each remaining species (in sorted key
    order) the genome is indexed, a score is derived with lastex and
    evalueSimulation, the baits are aligned with lastal, and formatResult
    is called with the species position, the alignment text and the result
    table.

    Parameters:
        prog   -- program name, prefixed to verbose messages
        coord  -- mapping chrom -> iterable of records; items 0 and 1 of a
                  record are comma-separated integer strings
        ref    -- value looked up in spes to find the reference genome
        labels -- not used by this function
        gens   -- mapping species key -> genome file handed to lastdb
        spes   -- mapping species key -> name; NOTE: the reference entry is
                  popped from this mapping (the caller's object is changed)
        spesV  -- mapping species key -> text shown in verbose messages
        opts   -- options object; the attributes last and verbose are read
        wd     -- working directory for the generated files

    Returns:
        dict mapping "chrom:begs:ends" to a list holding one dict per
        remaining species, followed by one integer (summed segment length
        plus 400) per record that produced the key.

    Raises:
        IndexError -- when no entry of spes equals ref
    """
    # command line format strings
    lastDir = opts.last
    lastdbCmd = lastDir + "/lastdb %s %s %s"
    lastalCmd = lastDir + "/lastal -e%s -j4 -f0 %s %s | grep -v '#'"
    lastexCmd = lastDir + "/lastex -E%s %s.prj %s.prj | sed -n 4p - | cut -f1"

    # output prefix of the lastdb -x run and the two bait FASTA files
    baseFreq = wd + "/bBaseFreq"
    baitFasta = wd + "/baitSeq.fa"
    baitRevFasta = wd + "/baitRev.fa"

    # locate the reference, dump its baits, then drop it from the survey
    refKey = [name for name in spes if spes[name] == ref][0]
    extractBaitSeq(baitFasta, baitRevFasta, gens[refKey], coord)
    commands.getoutput(lastdbCmd % ("-x", baseFreq, baitFasta))
    spes.pop(refKey)

    surveyKeys = sorted(spes.keys())

    # one slot per surveyed species plus the padded length of each record
    homolog = {}
    for chrom in coord.keys():
        for record in coord[chrom]:
            begText, endText = record[0], record[1]
            key = ":".join((chrom, begText, endText))
            begs = [int(tok) for tok in begText.split(",")]
            ends = [int(tok) for tok in endText.split(",")]
            span = sum(ends[n] - begs[n] for n in range(len(begs)))
            slots = homolog.setdefault(key, [{} for _ in surveyKeys])
            slots.append(span + 400)

    threshold = '1e-10'
    for pos, speciesKey in enumerate(surveyKeys):
        if opts.verbose:
            print(prog + ": ...... surveying: " + spesV[speciesKey])
        genomeIndex = wd + "/" + spes[speciesKey] + "Index"
        commands.getoutput(lastdbCmd % ("-c", genomeIndex, gens[speciesKey]))
        score = commands.getoutput(lastexCmd % (threshold, genomeIndex,
                                                baseFreq))
        # NOTE(review): the lastdb template is passed as first argument here;
        # confirm against evalueSimulation that lastex is not expected
        score, _evalue = evalueSimulation(lastdbCmd, lastalCmd, genomeIndex,
                                          baseFreq, baitRevFasta, score,
                                          threshold)
        alignment = commands.getoutput(lastalCmd % (score, genomeIndex,
                                                    baitFasta))
        formatResult(pos, alignment, homolog)

    return homolog
Ejemplo n.º 4
0
def checkGenomes(prog, coord, ref, labels, gens, spes, spesV, opts, wd):
    """Survey the non-reference genomes for the reference bait sequences.

    Steps, in order:
      1. find the key of spes whose value equals ref;
      2. write the bait sequences of that genome with extractBaitSeq and
         run lastdb with "-x" on them;
      3. pop the reference key from spes (this changes the caller's
         mapping) and sort the remaining keys;
      4. create, for every record in coord, a result entry made of one
         empty dict per remaining species plus the record length + 400;
      5. per remaining species: index the genome, obtain a score through
         lastex and evalueSimulation, align the baits with lastal and
         pass the output to formatResult.

    Parameters:
        prog   -- program name shown in verbose messages
        coord  -- mapping chrom -> iterable of records whose items 0 and 1
                  are comma-separated integer strings
        ref    -- value identifying the reference inside spes
        labels -- unused in this function
        gens   -- mapping species key -> genome file
        spes   -- mapping species key -> name used for the index file name
        spesV  -- mapping species key -> text for verbose messages
        opts   -- options object (last and verbose are read)
        wd     -- working directory

    Returns:
        The result dict keyed by "chrom:begs:ends".

    Raises:
        IndexError -- if ref does not occur among the values of spes
    """
    def segmentTotal(begText, endText):
        # sum of (end - beg) over the comma-separated segment pairs
        begs = [int(piece) for piece in begText.split(",")]
        ends = [int(piece) for piece in endText.split(",")]
        return sum([ends[n] - begs[n] for n in range(len(begs))])

    # shell templates for the three LAST programs
    lastHome = opts.last
    dbTemplate = lastHome + "/lastdb %s %s %s"
    alTemplate = lastHome + "/lastal -e%s -j4 -f0 %s %s | grep -v '#'"
    exTemplate = lastHome + "/lastex -E%s %s.prj %s.prj | sed -n 4p - | cut -f1"

    # files created inside the working directory
    freqPrefix = wd + "/bBaseFreq"
    seqFile = wd + "/baitSeq.fa"
    revFile = wd + "/baitRev.fa"

    matches = [key for key in spes.keys() if spes[key] == ref]
    reference = matches[0]
    extractBaitSeq(seqFile, revFile, gens[reference], coord)
    commands.getoutput(dbTemplate % ("-x", freqPrefix, seqFile))
    spes.pop(reference)

    others = sorted(spes)

    homolog = {}
    for chrom in coord.keys():
        for item in coord[chrom]:
            entryKey = ":".join([chrom, item[0], item[1]])
            length = segmentTotal(item[0], item[1])
            # the per-species dicts are only installed for a new key
            homolog.setdefault(entryKey, [{} for _ in others])
            homolog[entryKey].append(length + 400)

    for rank, species in enumerate(others):
        if opts.verbose:
            print(prog + ": ...... surveying: " + spesV[species])
        indexPrefix = wd + "/" + spes[species] + "Index"
        commands.getoutput(dbTemplate % ("-c", indexPrefix, gens[species]))
        score = commands.getoutput(exTemplate % ("1e-10", indexPrefix,
                                                 freqPrefix))
        # NOTE(review): lastdb (not lastex) is handed to evalueSimulation as
        # its first argument -- verify this matches the helper's signature
        score, _ = evalueSimulation(dbTemplate, alTemplate, indexPrefix,
                                    freqPrefix, revFile, score, "1e-10")
        hits = commands.getoutput(alTemplate % (score, indexPrefix, seqFile))
        formatResult(rank, hits, homolog)

    return homolog