Beispiel #1
0
def split_files(input_filename='in.fa', split_size=20):
    """
    Split a FASTA file into chunks of at most `split_size` records.

    Chunk i is written to split_<i>/in.fa (the directory is created if it
    does not exist) and in.weights from the current working directory is
    copied into every split directory. split_0 is always created, even if
    the input contains no records.

    Return the list of split directories, in creation order.
    """
    split_dirs = []

    def start_split(index):
        # Create split_<index>, seed it with in.weights and open its in.fa.
        dirname = "split_" + str(index)
        if not os.path.exists(dirname):
            os.makedirs(dirname)
        split_dirs.append(dirname)
        run_external_call("cp in.weights {0}/in.weights".format(dirname))
        return open(os.path.join(dirname, 'in.fa'), 'w')

    index = 0
    count = 0
    out = start_split(index)
    try:
        # Explicitly manage the input handle so it is closed deterministically.
        with open(input_filename) as fasta:
            for r in SeqIO.parse(fasta, 'fasta'):
                if count >= split_size:
                    # Current chunk is full: roll over to the next directory.
                    out.close()
                    count = 0
                    index += 1
                    out = start_split(index)
                out.write(">{0}\n{1}\n".format(r.id, r.seq))
                count += 1
    finally:
        # Closing an already-closed file is a no-op, so this is always safe.
        out.close()
    return split_dirs
Beispiel #2
0
def split_files(input_filename="in.fa", split_size=20):
    """
    Split a FASTA file into chunks of at most `split_size` records.

    Chunk i is written to split_<i>/in.fa (the directory is created if it
    does not exist) and in.weights from the current working directory is
    copied into every split directory. split_0 is always created, even if
    the input contains no records.

    Return the list of split directories, in creation order.
    """
    split_dirs = []

    def open_chunk(chunk_index):
        # Create split_<chunk_index>, copy in.weights into it, open its in.fa.
        chunk_dir = "split_" + str(chunk_index)
        if not os.path.exists(chunk_dir):
            os.makedirs(chunk_dir)
        split_dirs.append(chunk_dir)
        run_external_call("cp in.weights {0}/in.weights".format(chunk_dir))
        return open(os.path.join(chunk_dir, "in.fa"), "w")

    chunk_index = 0
    records_in_chunk = 0
    writer = open_chunk(chunk_index)
    try:
        # Explicitly manage the input handle so it is closed deterministically.
        with open(input_filename) as reader:
            for r in SeqIO.parse(reader, "fasta"):
                if records_in_chunk >= split_size:
                    # Current chunk is full: roll over to the next directory.
                    writer.close()
                    records_in_chunk = 0
                    chunk_index += 1
                    writer = open_chunk(chunk_index)
                writer.write(">{0}\n{1}\n".format(r.id, r.seq))
                records_in_chunk += 1
    finally:
        # Closing an already-closed file is a no-op, so this is always safe.
        writer.close()
    return split_dirs
Beispiel #3
0
def sanity_check_mash_exists():
    """
    Verify that `mash --version` can be run.

    If the call raises, print an installation hint to stderr and exit the
    process with status -1.
    """
    try:
        run_external_call("mash --version")
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C / SystemExit are
        # not misreported as a missing executable.
        print("mash executable does not exist. Please install mash first!",
              file=sys.stderr)
        sys.exit(-1)
Beispiel #4
0
def sanity_check_gmapl_exists():
    """
    Verify that `gmapl --version` can be run.

    GMAP version that comes with smrtanalysis 2.3 is old and does not contain
    gmapl; the user must install a newer version of GMAP to get it.

    If the call raises, print a hint to stderr and exit the process with
    status -1.
    """
    try:
        run_external_call("gmapl --version")
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C / SystemExit are
        # not misreported as a missing executable.
        print("gmapl executable does not exist. "
              "You probably have an old version of GMAP. "
              "Please install a newer version of GMAP and try again.",
              file=sys.stderr)
        sys.exit(-1)
Beispiel #5
0
def sanity_check_gmapl_exists():
    """
    Verify that `gmapl --version` can be run.

    GMAP version that comes with smrtanalysis 2.3 is old and does not contain
    gmapl; the user must install a newer version of GMAP to get it.

    If the call raises, write a hint to stderr and exit the process with
    status -1.
    """
    try:
        run_external_call("gmapl --version")
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C / SystemExit are
        # not misreported as a missing executable.
        # sys.stderr.write behaves the same on Python 2 and 3, unlike the
        # `print >> sys.stderr` statement form.
        sys.stderr.write(
            "gmapl executable does not exist. "
            "You probably have an old version of GMAP. "
            "Please install a newer version of GMAP and try again.\n")
        sys.exit(-1)
Beispiel #6
0
def run_Cogent_on_split_files(split_dirs, depth):
    """
    Run Cogent on pre-split input directories and merge the results.

    1. run Cogent individually on each split directory (directories that
       already contain cogent2.fa are skipped)
    2. combine all cogent2.fa from split directories, pretend they are the
       "INPUT", run Cogent on it inside combined/ (splitting again and
       recursing when the combined input is larger than
       cc_settings.MAX_POST_SPLIT_IN_SIZE and `depth` is below
       cc_settings.MAX_RECUR_DEPTH)
    3. write in.trimmed.fa from the real in.fa and process it against the
       combined result inside post_combined/
    4. link post_combined/cogent2.fa as cogent2.fa and run GMAP against it

    :param split_dirs: split directories, relative to the current directory
    :param depth: current recursion depth

    The working directory is restored to where the call started, including
    when one of the steps raises.
    """
    time1 = time.time()
    olddir = os.getcwd()
    for d in split_dirs:
        os.chdir(d)
        try:
            if os.path.exists('cogent2.fa'):
                # sys.stderr.write works on both Python 2 and 3
                sys.stderr.write("skipping {0} because done already\n".format(d))
                continue
            run_Cogent_on_input()
            # clean up cogent in the split dir
            if os.path.isdir('cogent'):
                cleanup_gmap('cogent')
            if os.path.isdir('cogent2'):
                cleanup_gmap('cogent2')
        finally:
            os.chdir(olddir)

    if os.path.exists('combined'):
        run_external_call("rm -rf combined")
    os.makedirs('combined')
    # now combine all the cogent2 results and pretend they are the "INPUT"
    i = 0
    with open('combined/in.fa', 'w') as f, open('combined/in.weights', 'w') as f2:
        for d in split_dirs:
            with open(os.path.join(d, 'cogent2.fa')) as handle:
                for r in SeqIO.parse(handle, 'fasta'):
                    f.write(">fake_input_path{0}\n{1}\n".format(i, r.seq))
                    f2.write("fake_input_path{0}\t1\n".format(i))
                    i += 1

    os.chdir('combined')
    try:
        if i > cc_settings.MAX_POST_SPLIT_IN_SIZE and depth < cc_settings.MAX_RECUR_DEPTH:
            dirs = split_files(input_filename='in.fa',
                               split_size=cc_settings.MAX_POST_SPLIT_IN_SIZE)
            run_Cogent_on_split_files(dirs, depth + 1)
        run_Cogent_on_input()
    finally:
        os.chdir(olddir)

    # now take the output from combined and run LP against it,
    # using the real input this time
    with open('in.trimmed.fa', 'w') as f, open('in.fa') as handle:
        for r in SeqIO.parse(handle, 'fasta'):
            f.write(">{0}\n{1}\n".format(r.id, trim_ends(str(r.seq))))

    if os.path.exists('post_combined'):
        run_external_call("rm -rf post_combined")
    os.makedirs('post_combined')
    os.chdir('post_combined')
    try:
        run_external_call("ln -s ../combined/cogent2.fa cogent.fa")
        run_external_call("ln -s ../in.weights in.weights")
        run_external_call("ln -s ../in.trimmed.fa in.trimmed.fa")
        run_gmap()
        with open('in.trimmed.fa') as handle:
            seqrecs = list(SeqIO.parse(handle, 'fasta'))
        post_gmap_processing(seqrecs=seqrecs)
    finally:
        os.chdir(olddir)

    # now the result we want is in post_combined/cogent2.fa; link it here and
    # align the trimmed input against it.
    # NOTE: post_gmap_processing() is not run on this final alignment.
    run_external_call("ln -f -s post_combined/cogent2.fa cogent2.fa")
    run_gmap(dbname='cogent2', infile='in.trimmed.fa')

    time4 = time.time()
    log.info("[RUNTIME] Total time in run_Cogent: {0}".format(time4 - time1))
Beispiel #7
0
def run_Cogent_on_split_files(split_dirs, depth):
    """
    Run Cogent on pre-split input directories and merge the results.

    1. run Cogent individually on each split directory (directories that
       already contain cogent2.fa are skipped)
    2. combine all cogent2.fa from split directories, pretend they are the
       "INPUT", run Cogent on it inside combined/ (splitting again and
       recursing when the combined input is larger than
       cc_settings.MAX_POST_SPLIT_IN_SIZE and `depth` is below
       cc_settings.MAX_RECUR_DEPTH)
    3. align the existing in.trimmed.fa against the combined result with
       minimap2 inside post_combined/ and post-process it into cogent2
    4. link post_combined/cogent2.fa as cogent2.fa and run minimap2 on it

    :param split_dirs: split directories, relative to the current directory
    :param depth: current recursion depth

    The working directory is restored to where the call started, including
    when one of the steps raises. Exceptions from run_Cogent_on_input()
    (e.g. CycleDetectedException) propagate unchanged.
    """
    time1 = time.time()
    olddir = os.getcwd()
    for d in split_dirs:
        os.chdir(d)
        try:
            if os.path.exists('cogent2.fa'):
                print("skipping {0} because done already".format(d),
                      file=sys.stderr)
                continue
            run_Cogent_on_input()
        finally:
            # Restore cwd for every outcome; any exception keeps its
            # original instance and traceback instead of being re-created.
            os.chdir(olddir)

    if os.path.exists('combined'):
        run_external_call("rm -rf combined")
    os.makedirs('combined')
    # now combine all the cogent2 results and pretend they are the "INPUT"
    i = 0
    with open('combined/in.trimmed.fa', 'w') as f, \
            open('combined/in.weights', 'w') as f2:
        for d in split_dirs:
            with open(os.path.join(d, 'cogent2.fa')) as handle:
                for r in SeqIO.parse(handle, 'fasta'):
                    f.write(">fake_input_path{0}\n{1}\n".format(i, r.seq))
                    f2.write("fake_input_path{0}\t1\n".format(i))
                    i += 1

    os.chdir('combined')
    try:
        if i > cc_settings.MAX_POST_SPLIT_IN_SIZE and depth < cc_settings.MAX_RECUR_DEPTH:
            dirs = split_files(input_filename='in.trimmed.fa',
                               split_size=cc_settings.MAX_POST_SPLIT_IN_SIZE)
            run_Cogent_on_split_files(dirs, depth + 1)
        run_Cogent_on_input()
    finally:
        os.chdir(olddir)

    if os.path.exists('post_combined'):
        run_external_call("rm -rf post_combined")
    os.makedirs('post_combined')
    os.chdir('post_combined')
    try:
        run_external_call("ln -s ../combined/cogent2.fa cogent.fa")
        run_external_call("ln -s ../in.weights in.weights")
        run_external_call("ln -s ../in.trimmed.fa in.trimmed.fa")
        sam_file = run_minimap2('cogent.fa', 'in.trimmed.fa', 'SAM')
        with open('in.trimmed.fa') as handle:
            seqrecs = list(SeqIO.parse(handle, 'fasta'))
        post_minimap2_processing(
            'cogent.fa',
            sam_file,
            'cogent2',
            seqrecs=seqrecs)
    finally:
        os.chdir(olddir)

    # now the result we want is in post_combined/cogent2.fa; link it here and
    # align the trimmed input against it
    run_external_call("ln -f -s post_combined/cogent2.fa cogent2.fa")
    run_minimap2('cogent2.fa', 'in.trimmed.fa', format='SAM')

    time4 = time.time()
    log.info("[RUNTIME] Total time in run_Cogent: {0}".format(time4 - time1))
Beispiel #8
0
def sanity_check_mash_exists():
    """
    Verify that `mash --version` can be run.

    If the call raises, write an installation hint to stderr and exit the
    process with status -1.
    """
    try:
        run_external_call("mash --version")
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C / SystemExit are
        # not misreported as a missing executable.
        # sys.stderr.write behaves the same on Python 2 and 3, unlike the
        # `print >> sys.stderr` statement form.
        sys.stderr.write(
            "mash executable does not exist. Please install mash first!\n")
        sys.exit(-1)
Beispiel #9
0
def sanity_check_minimap2_exists():
    """
    Verify that `minimap2 --version` can be run.

    If the call raises, write an installation hint to stderr and exit the
    process with status -1.
    """
    try:
        run_external_call("minimap2 --version")
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C / SystemExit are
        # not misreported as a missing executable.
        # sys.stderr.write behaves the same on Python 2 and 3, unlike the
        # `print >> sys.stderr` statement form.
        sys.stderr.write(
            "minimap2 executable does not exist. Please install minimap2 first!\n")
        sys.exit(-1)
Beispiel #10
0
def run_Cogent_on_split_files(split_dirs):
    """
    Run Cogent on pre-split input directories and merge the results.

    1. run Cogent individually on each split directory
    2. combine all cogent2.fa from split directories, pretend they are the
       "INPUT", run Cogent on it inside combined/
    3. write in.trimmed.fa from the real in.fa and process it against the
       combined result inside post_combined/
    4. link post_combined/cogent2.fa as cogent2.fa and run GMAP against it

    :param split_dirs: split directories, relative to the current directory

    The working directory is restored to where the call started, including
    when one of the steps raises.
    """
    time1 = time.time()
    olddir = os.getcwd()
    for d in split_dirs:
        os.chdir(d)
        try:
            run_Cogent_on_input()
        finally:
            os.chdir(olddir)

    if os.path.exists('combined'):
        run_external_call("rm -rf combined")
    os.makedirs('combined')
    # now combine all the cogent2 results and pretend they are the "INPUT"
    i = 0
    with open('combined/in.fa', 'w') as f, open('combined/in.weights', 'w') as f2:
        for d in split_dirs:
            with open(os.path.join(d, 'cogent2.fa')) as handle:
                for r in SeqIO.parse(handle, 'fasta'):
                    f.write(">fake_input_path{0}\n{1}\n".format(i, r.seq))
                    f2.write("fake_input_path{0}\t1\n".format(i))
                    i += 1

    os.chdir('combined')
    try:
        run_Cogent_on_input()
    finally:
        os.chdir(olddir)

    # now take the output from combined and run LP against it,
    # using the real input this time
    with open('in.trimmed.fa', 'w') as f, open('in.fa') as handle:
        for r in SeqIO.parse(handle, 'fasta'):
            f.write(">{0}\n{1}\n".format(r.id, trim_ends(str(r.seq))))

    if os.path.exists('post_combined'):
        run_external_call("rm -rf post_combined")
    os.makedirs('post_combined')
    os.chdir('post_combined')
    try:
        run_external_call("ln -s ../combined/cogent2.fa cogent.fa")
        run_external_call("ln -s ../in.weights in.weights")
        run_external_call("ln -s ../in.trimmed.fa in.trimmed.fa")
        run_gmap()
        with open('in.trimmed.fa') as handle:
            seqrecs = list(SeqIO.parse(handle, 'fasta'))
        post_gmap_processing(seqrecs=seqrecs)
    finally:
        os.chdir(olddir)

    # now the result we want is in post_combined/cogent2.fa; link it here and
    # align the trimmed input against it.
    # NOTE: post_gmap_processing() is not run on this final alignment.
    run_external_call("ln -f -s post_combined/cogent2.fa cogent2.fa")
    run_gmap(dbname='cogent2', infile='in.trimmed.fa')

    time4 = time.time()
    log.info("[RUNTIME] Total time in run_Cogent: {0}".format(time4 - time1))
Beispiel #11
0
def run_Cogent_on_split_files(split_dirs):
    """
    Run Cogent on pre-split input directories and merge the results.

    1. run Cogent individually on each split directory
    2. combine all cogent2.fa from split directories, pretend they are the
       "INPUT", run Cogent on it inside combined/
    3. write in.trimmed.fa from the real in.fa and process it against the
       combined result inside post_combined/
    4. link post_combined/cogent2.fa as cogent2.fa and run GMAP against it

    :param split_dirs: split directories, relative to the current directory

    The working directory is restored to where the call started, including
    when one of the steps raises.
    """
    time1 = time.time()
    start_dir = os.getcwd()
    for d in split_dirs:
        os.chdir(d)
        try:
            run_Cogent_on_input()
        finally:
            os.chdir(start_dir)

    if os.path.exists("combined"):
        run_external_call("rm -rf combined")
    os.makedirs("combined")
    # now combine all the cogent2 results and pretend they are the "INPUT"
    n_written = 0
    with open("combined/in.fa", "w") as fa_out, \
            open("combined/in.weights", "w") as weights_out:
        for d in split_dirs:
            with open(os.path.join(d, "cogent2.fa")) as fa_in:
                for r in SeqIO.parse(fa_in, "fasta"):
                    fa_out.write(">fake_input_path{0}\n{1}\n".format(n_written, r.seq))
                    weights_out.write("fake_input_path{0}\t1\n".format(n_written))
                    n_written += 1

    os.chdir("combined")
    try:
        run_Cogent_on_input()
    finally:
        os.chdir(start_dir)

    # now take the output from combined and run LP against it,
    # using the real input this time
    with open("in.trimmed.fa", "w") as trimmed_out, open("in.fa") as fa_in:
        for r in SeqIO.parse(fa_in, "fasta"):
            trimmed_out.write(">{0}\n{1}\n".format(r.id, trim_ends(str(r.seq))))

    if os.path.exists("post_combined"):
        run_external_call("rm -rf post_combined")
    os.makedirs("post_combined")
    os.chdir("post_combined")
    try:
        run_external_call("ln -s ../combined/cogent2.fa cogent.fa")
        run_external_call("ln -s ../in.weights in.weights")
        run_external_call("ln -s ../in.trimmed.fa in.trimmed.fa")
        run_gmap()
        with open("in.trimmed.fa") as fa_in:
            seqrecs = list(SeqIO.parse(fa_in, "fasta"))
        post_gmap_processing(seqrecs=seqrecs)
    finally:
        os.chdir(start_dir)

    # now the result we want is in post_combined/cogent2.fa; link it here and
    # align the trimmed input against it.
    # NOTE: post_gmap_processing() is not run on this final alignment.
    run_external_call("ln -f -s post_combined/cogent2.fa cogent2.fa")
    run_gmap(dbname="cogent2", infile="in.trimmed.fa")

    time4 = time.time()
    log.info("[RUNTIME] Total time in run_Cogent: {0}".format(time4 - time1))