Exemple #1
0
def check_circular(file, name):
    """Flag circular contigs and write an extended copy of the FASTA file.

    A contig of at least 500 characters is marked circular when its first
    ``k`` characters equal its last ``k`` characters for some ``k`` tried
    from 200 down to 51; the first (largest) matching ``k`` is used.

    Every contig is written to ``<name>_input_with_circ.fasta``.  Circular
    contigs get ``seq[k:5000]`` appended to their sequence.

    Args:
        file: path passed to ``fastaparser.read_fasta``.
        name: prefix of the output file name.

    Returns:
        dict mapping the contig id (first space-delimited word of the
        header with its first character dropped) to
        ``[sequence_length, flag]`` where flag is "+" for circular contigs
        and "-" otherwise.
    """
    contigs = fastaparser.read_fasta(file)
    circular_contigs = {}
    input_fasta = name + "_input_with_circ.fasta"

    with open(input_fasta, 'w') as output:
        for contig in contigs:
            header, seq = contig[0], contig[1]
            contig_id = header.split(" ")[0][1:]

            # Largest k whose prefix equals the suffix; None if not circular.
            overlap = None
            # Contigs shorter than 500 are never tested; for longer ones
            # every k in 51..200 is smaller than the sequence length.
            if len(seq) >= 500:
                for kval in range(200, 50, -1):
                    if seq[:kval] == seq[-kval:]:
                        overlap = kval
                        break

            output.write(header + "\n")
            if overlap is None:
                circular_contigs[contig_id] = [len(seq), "-"]
                output.write(seq + "\n")
            else:
                circular_contigs[contig_id] = [len(seq), "+"]
                # Extend the sequence with what follows the repeated prefix.
                output.write(seq + seq[overlap:5000] + "\n")

    return circular_contigs
Exemple #2
0
def split_and_rename(infile, outdir):
    """Write each record of a FASTA file to its own file inside ``outdir``.

    The output file is named ``<id>.fasta`` where ``<id>`` is the first
    whitespace-delimited word of the record header with its first character
    dropped.  The path of every file is printed before it is written.
    """
    for record in fastaparser.read_fasta(infile):
        record_id = record[0].split()[0][1:]
        target = join(outdir, record_id + ".fasta")
        print(target)
        fastaparser.write_fasta_to_file(target, [record])
Exemple #3
0
def extract_circular_from_file(file, indir, outdir):
    """Copy the circular contigs of ``indir/file`` into ``outdir``.

    A contig of at least 500 characters counts as circular when its first
    ``k`` characters equal its last ``k`` characters for some ``k`` tried
    from 200 down to 51.  Each hit is reported on stdout and the matching
    contigs are written to ``<outdir>/<file without extension>.circular.fasta``.

    Args:
        file: name of the FASTA file inside ``indir``.
        indir: directory that contains ``file``.
        outdir: directory that receives the output file.
    """
    out_file = join(outdir, os.path.splitext(file)[0] + ".circular.fasta")
    # The input directory comes from the ``indir`` parameter; the previous
    # code ignored it and read ``sys.argv[1]`` instead.
    contigs = fastaparser.read_fasta(join(indir, file))
    circulars = []
    for contig in contigs:
        seq = contig[1]
        if len(seq) < 500:
            continue
        # len(seq) >= 500 here, so every k in 51..200 is a proper overlap.
        for kval in range(200, 50, -1):
            if seq[:kval] == seq[-kval:]:
                print(contig[0] + " is circular " + str(kval))
                circulars.append(contig)
                break
    fastaparser.write_fasta_to_file(out_file, circulars)
Exemple #4
0
def parse_mash(contig_file, table):
    """Select contigs that have many close neighbours in a distance table.

    ``table`` is read line by line; whitespace-separated column 0 is a
    contig name, column 1 the name of another sequence and column 2 a
    distance.  Pairs with distance below 0.1 are recorded as similar.
    Contigs are then visited in decreasing order of their number of similar
    sequences; a contig already listed as similar to an earlier one is
    skipped, and a visited contig with more than 10 similar sequences is
    written to ``interesting.fasta`` next to ``contig_file``.

    Args:
        contig_file: path of the FASTA file with the contigs.
        table: path of the distance table.

    Raises:
        KeyError: if column 0 of a close pair names a contig that is not in
            ``contig_file``.
    """
    contigs = fastaparser.read_fasta(contig_file)
    similar_lists = {}
    # First contig seen for every short name, so the selection below does not
    # have to rescan the whole contig list for each hit.
    first_by_name = {}
    for contig in contigs:
        short_name = get_short_name(contig[0])
        similar_lists[short_name] = []
        first_by_name.setdefault(short_name, contig)

    with open(table, 'r') as table_file:
        for line in table_file:
            arr = line.split()
            dist = float(arr[2])
            if dist < 0.1:
                similar_lists[arr[0]].append(arr[1])
    print("processed input")

    to_sort = [[name, len(hits)] for name, hits in similar_lists.items()]
    sorted_similar = sorted(to_sort, key=itemgetter(1), reverse=True)

    outcontigs = []
    used = set()
    for contig_info in sorted_similar:
        name, similar_count = contig_info
        if name in used:
            continue
        used.update(similar_lists[name])
        if similar_count > 10:
            print(contig_info)
            print(similar_lists[name])
            outcontigs.append(first_by_name[name])

    result_f = join(os.path.dirname(contig_file), "interesting.fasta")
    # Drop a stale result from a previous run without going through a shell
    # (the old "rm <path>" broke on paths containing spaces or shell
    # metacharacters).
    if os.path.isfile(result_f):
        os.remove(result_f)
    fastaparser.write_fasta_to_file(result_f, outcontigs)
Exemple #5
0
def make_rectangles_from_genome(options):
    """Build a rectangle set from a genome and dump its derived files.

    Reads the first record of ``options.genome``, builds a ``Graph`` from it
    with ``int(options.k)``, saves the graph, builds a ``RectangleSet`` with
    ``int(options.d)`` and writes, for every diagonal of every rectangle,
    one record to ``paired_genom_contigs_1.fasta`` and one to
    ``paired_genom_contigs_2.fasta``.  Finally the condensed, projected
    graph is written to ``rectangles.fasta``.  All files go to
    ``options.out_dir``.

    Args:
        options: object with attributes ``k``, ``d``, ``genome`` and
            ``out_dir``.
    """
    k = options.k
    ingraph = Graph()
    _, genome = next(fastaparser.read_fasta(options.genome))
    ingraph.make_graph(genome, int(k))
    # The return value is not used here; the call is kept because
    # find_loops may modify the graph (not visible from this function).
    ingraph.find_loops(10, 1000)
    ingraph.save(os.path.join(options.out_dir, "graph"))
    rs = RectangleSet(ingraph, int(options.d))
    rs.filter_without_prd()

    left_fpath = os.path.join(options.out_dir, "paired_genom_contigs_1.fasta")
    right_fpath = os.path.join(options.out_dir, "paired_genom_contigs_2.fasta")
    # Context managers make sure both files are flushed and closed, also
    # when an exception interrupts the loop.
    with open(left_fpath, "w") as f_left, open(right_fpath, "w") as f_right:
        contigs_id = 0
        for _rect_key, rect in rs.rectangles.items():
            for _diag_key, diag in rect.diagonals.items():
                e1 = rect.e1.seq
                e2 = rect.e2.seq
                f_left.write(">" + str(contigs_id) + "/1\n")
                f_left.write(e1[diag.offseta:diag.offsetc])
                f_left.write("\n")
                f_right.write(">" + str(contigs_id) + "/2\n")
                f_right.write(e2[diag.offsetb:diag.offsetd])
                f_right.write("\n")
                contigs_id += 1

    bgraph = rs.bgraph_from_genome()
    bgraph.condense()
    outgraph = bgraph.project(options.out_dir, False)
    with open(os.path.join(options.out_dir, 'rectangles.fasta'), 'w') as out_fasta:
        outgraph.fasta(out_fasta)
Exemple #6
0
def GC_content(contigs_fpath, skip=False):
    """
       Returns percent of GC for assembly and GC distribution: (list of GC%, list of # windows)

       The distribution is computed over non-overlapping windows of 100
       characters; a window with fewer than 50 non-"N" characters is ignored.
       Bin ``i`` of the distribution covers GC % starting at
       ``i * qconfig.GC_bin_size``.

       Args:
           contigs_fpath: path of the FASTA file to analyse.
           skip: when true, the file is not read and the result is
               ``(None, (x, y))`` with an all-zero ``y``.

       Returns:
           ``(total_GC, (GC_distribution_x, GC_distribution_y))``;
           ``total_GC`` is None when the file holds no non-"N" characters.
    """
    bin_size = qconfig.GC_bin_size
    GC_bin_num = int(100 / bin_size) + 1
    GC_distribution_x = [i * bin_size for i in range(0, GC_bin_num)]  # list of X-coordinates, i.e. GC %
    GC_distribution_y = [0] * GC_bin_num  # list of Y-coordinates, i.e. # windows with GC % = x
    if skip:
        return None, (GC_distribution_x, GC_distribution_y)

    total_GC_amount = 0
    total_contig_length = 0
    n = 100  # blocks of length 100
    half_window = n // 2
    for _name, seq_full in fastaparser.read_fasta(contigs_fpath):  # in tuples: (name, seq)
        total_GC_amount += seq_full.count("G") + seq_full.count("C")
        total_contig_length += len(seq_full) - seq_full.count("N")
        # non-overlapping windows
        for window_start in range(0, len(seq_full), n):
            seq = seq_full[window_start:window_start + n]
            # skip block if it has less than half of ACGT letters (it also helps with "ends of contigs")
            ACGT_len = len(seq) - seq.count("N")
            if ACGT_len < half_window:
                continue

            GC_len = seq.count("G") + seq.count("C")
            GC_percent = 100.0 * GC_len / ACGT_len
            # Index by bin number.  The previous code multiplied the bin
            # number by the bin size again, which only hits the right slot
            # when the bin size is 1 and can run past the end of the list
            # for larger bins.
            GC_distribution_y[int(GC_percent / bin_size)] += 1

    if total_contig_length == 0:
        total_GC = None
    else:
        total_GC = total_GC_amount * 100.0 / total_contig_length

    return total_GC, (GC_distribution_x, GC_distribution_y)
Exemple #7
0
def GC_content(contigs_fpath, skip=False):
    """
       Returns percent of GC for assembly and GC distribution: (list of GC%, list of # windows)

       Windows are non-overlapping slices of 100 characters; slices with
       fewer than 50 non-"N" characters do not contribute.  Entry ``i`` of
       the returned Y list counts the windows whose GC % falls into the bin
       that starts at ``i * qconfig.GC_bin_size`` (entry ``i`` of the X list).

       Args:
           contigs_fpath: path of the FASTA file to analyse.
           skip: when true, return ``(None, (x, zeros))`` without reading
               the file.

       Returns:
           ``(total_GC, (GC_distribution_x, GC_distribution_y))`` where
           ``total_GC`` is None if no non-"N" character was seen.
    """
    GC_bin_num = int(100 / qconfig.GC_bin_size) + 1
    GC_distribution_x = [i * qconfig.GC_bin_size for i in range(0, GC_bin_num)]  # list of X-coordinates, i.e. GC %
    GC_distribution_y = [0] * GC_bin_num  # list of Y-coordinates, i.e. # windows with GC % = x
    total_GC = None
    if skip:
        return total_GC, (GC_distribution_x, GC_distribution_y)

    n = 100  # blocks of length 100
    min_ACGT_len = n // 2
    total_GC_amount = 0
    total_contig_length = 0

    for _name, seq_full in fastaparser.read_fasta(contigs_fpath):  # in tuples: (name, seq)
        total_GC_amount += seq_full.count("G") + seq_full.count("C")
        total_contig_length += len(seq_full) - seq_full.count("N")
        # non-overlapping windows
        for offset in range(0, len(seq_full), n):
            seq = seq_full[offset:offset + n]
            # skip block if it has less than half of ACGT letters (it also helps with "ends of contigs")
            ACGT_len = len(seq) - seq.count("N")
            if ACGT_len < min_ACGT_len:
                continue

            GC_len = seq.count("G") + seq.count("C")
            GC_percent = 100.0 * GC_len / ACGT_len
            # The slot is the bin number itself; scaling it back by the bin
            # size (as before) is only correct for a bin size of 1 and
            # overflows the list for coarser bins.
            bin_index = int(GC_percent / qconfig.GC_bin_size)
            GC_distribution_y[bin_index] += 1

    if total_contig_length != 0:
        total_GC = total_GC_amount * 100.0 / total_contig_length

    return total_GC, (GC_distribution_x, GC_distribution_y)
Exemple #8
0
def glue_and_rename(indir, outfile):
    """Re-emit the contigs of selected files in ``indir`` under new names.

    Only files whose name splits into at least four dot-separated parts are
    processed.  Each contig header gets ``" <part0>.<part1>"`` of the file
    name appended; the new name is printed and the renamed contig is handed
    to ``fastaparser.write_fasta_to_file(outfile, ...)`` one at a time.

    NOTE(review): whether the records accumulate in ``outfile`` depends on
    how ``write_fasta_to_file`` opens the file, which is not visible here.
    """
    for fname in os.listdir(indir):
        parts = fname.split('.')
        if len(parts) >= 4:
            name_suffix = " " + parts[0] + "." + parts[1]
            for contig in fastaparser.read_fasta(join(indir, fname)):
                new_name = contig[0] + name_suffix
                print(new_name)
                renamed = zip([new_name], [contig[1]])
                fastaparser.write_fasta_to_file(outfile, renamed)
Exemple #9
0
def found_most_similar(work_dir):
    """Rank the FASTA files of ``work_dir`` by their number of close files.

    Every ``*.fasta`` file is represented by the length of its first
    sequence.  Files are sorted by that length and each file is compared,
    via ``mash dist`` on the matching ``<file>.msh`` sketches, with the
    files inside a sliding window of similar lengths (lower bound 0.8x,
    upper bound 1.2x of the current length).  A distance below 0.2 counts
    as similar.  The ranking is printed, most-connected files first,
    together with a flag telling whether the file was already listed as
    similar to a file printed earlier.

    Args:
        work_dir: directory holding the ``.fasta`` files and their ``.msh``
            sketches.
    """
    contigs_info = []
    for file in os.listdir(work_dir):
        if file.split('.')[-1] == "fasta":
            contigs = fastaparser.read_fasta(join(work_dir, file))
            contigs_info.append([file, len(contigs[0][1])])

    all_sorted = sorted(contigs_info, key=itemgetter(1))
    max_ind = len(all_sorted)
    low_ind = 0
    high_ind = 0
    similar_list = []
    for i in range(max_ind):
        cur_len = all_sorted[i][1]
        first_mash = join(work_dir, all_sorted[i][0] + ".msh")
        # Bounds are tested before indexing: the old order read
        # all_sorted[high_ind] with high_ind == max_ind and raised IndexError.
        while low_ind < max_ind - 1 and all_sorted[low_ind][1] < cur_len * 0.8:
            low_ind += 1
        while high_ind < max_ind and all_sorted[high_ind][1] < cur_len * 1.2:
            high_ind += 1
        if i % 10 == 0:
            print("processing... " + str(i) + " range: " + str(low_ind) +
                  "-" + str(high_ind))
        sim = []
        for j in range(low_ind, high_ind):
            second_mash = join(work_dir, all_sorted[j][0] + ".msh")
            process = subprocess.Popen(
                [mash_bin, 'dist', first_mash, second_mash],
                stdout=subprocess.PIPE)
            stdout = process.communicate()[0]
            # The third whitespace-separated field of the output is the distance.
            dist = float(stdout.split()[2])
            if dist < 0.2:
                sim.append(j)
        similar_list.append([i, len(sim), sim])
        if i % 10 == 0:
            print(len(sim))

    most_similar = sorted(similar_list, key=itemgetter(1), reverse=True)
    for entry in most_similar:
        print(entry)

    used = [False] * max_ind
    for index, sim_count, sim_indices in most_similar:
        # ``used`` is a list and must be indexed; it was called like a
        # function before, which raised TypeError on the first entry.
        print(all_sorted[index][0] + " " + str(sim_count) + " " +
              str(used[index]))
        for j in sim_indices:
            used[j] = True
Exemple #10
0
def extract_not_listed(infasta, list):
    """Write the contigs of ``infasta`` whose id is not in a list file.

    The first whitespace-delimited word of every non-blank line of ``list``
    is an id; a contig is dropped when the first word of its header equals
    ``">" + id``.  The remaining contigs are written to the input path with
    its last six characters replaced by ``.unknown.fasta``.  The number of
    listed ids, of input contigs and of kept contigs is printed.

    Args:
        infasta: path of the input FASTA file (expected to end in a
            six-character extension such as ``.fasta``).
        list: path of the text file with the ids to exclude.
    """
    listed = set()
    with open(list, 'r') as list_file:
        for line in list_file:
            fields = line.split()
            # Blank lines (e.g. a trailing newline) carry no id.
            if fields:
                listed.add(">" + fields[0])
    print(len(listed))

    contigs = fastaparser.read_fasta(infasta)
    print(len(contigs))
    outcontigs = [
        contig for contig in contigs if contig[0].split()[0] not in listed
    ]
    print(len(outcontigs))

    outfasta = infasta[:-6] + ".unknown.fasta"
    # Remove a leftover from an earlier run without invoking a shell, so the
    # path is never interpreted by one.
    if os.path.isfile(outfasta):
        os.remove(outfasta)
    fastaparser.write_fasta_to_file(outfasta, outcontigs)
Exemple #11
0
def break_scaffolds(argv):
    """Break scaffolds at runs of "N" or report the sizes of those runs.

    With two arguments (``argv[1]`` = input FASTA) the sizes of all "N"
    runs are counted and printed in increasing order, followed by the
    number of runs and their average length.

    With four arguments (``argv[2]`` = threshold, ``argv[3]`` = output
    FASTA) every scaffold is cut at each "N" run of at least ``threshold``
    characters; the pieces are named ``<first word of the name>_<number>``
    and written to the output file.

    Args:
        argv: command-line style argument list, ``argv[0]`` being the
            program name.  Any other length prints the usage and exits.
    """
    if len(argv) not in (2, 4):
        print(
            "Usage: " + argv[0] +
            " <input fasta (scaffolds)> (to get stats on sizes of Ns regions)")
        print(
            "Usage: " + argv[0] +
            " <input fasta (scaffolds)> <THRESHOLD> <output fasta (contigs)> (to break contigs on Ns regions of size >= THRESHOLD)"
        )
        sys.exit()

    BREAK_SCAFFOLDS = len(argv) == 4
    N_NUMBER = int(argv[2]) if BREAK_SCAFFOLDS else None

    sizes_of_Ns_regions = dict()
    new_fasta = []
    for name, seq in fastaparser.read_fasta(argv[1]):
        cur_contig_number = 1
        cur_contig_start = 0
        start = seq.find("N")
        while start != -1:
            # Extend the run to its first non-"N" character (or the end).
            end = start + 1
            while (end != len(seq)) and (seq[end] == 'N'):
                end += 1
            run_length = end - start

            if BREAK_SCAFFOLDS:
                if run_length >= N_NUMBER:
                    new_fasta.append(
                        (name.split()[0] + "_" + str(cur_contig_number),
                         seq[cur_contig_start:start]))
                    cur_contig_number += 1
                    cur_contig_start = end
            else:
                sizes_of_Ns_regions[run_length] = \
                    sizes_of_Ns_regions.get(run_length, 0) + 1

            # seq[end] is not "N" (or end is the sequence end), so the next
            # run cannot start before ``end``.
            start = seq.find("N", end)

        if BREAK_SCAFFOLDS:
            # Whatever follows the last cut (the whole scaffold if none).
            new_fasta.append((name.split()[0] + "_" + str(cur_contig_number),
                              seq[cur_contig_start:]))

    if BREAK_SCAFFOLDS:
        fastaparser.write_fasta_to_file(argv[3], new_fasta)
    else:
        total_len = 0
        nruns = 0
        # Iterate the sizes in sorted order; the previous loop unpacked the
        # dict's integer keys as (k, v) pairs and raised TypeError.
        for size in sorted(sizes_of_Ns_regions):
            occurrences = sizes_of_Ns_regions[size]
            total_len += size * occurrences
            nruns += occurrences
            print(str(size) + " " + str(occurrences))
        # No runs at all: report an average of 0.0 instead of dividing by zero.
        avg_len = float(total_len) / nruns if nruns else 0.0
        print("N-runs: " + str(nruns) + ", avg. len: " + str(avg_len))
Exemple #12
0
def do(ref_fpath, contigs_fpaths, output_dirpath, json_output_dir,
       results_dir):
    """Compute basic assembly statistics and store them in the reports.

    For every contigs file the contig lengths and the number of "N"
    characters are collected, then N50/L50, N75/L75 (and NG50/LG50,
    NG75/LG75 when a reference length is known), total length, GC content
    and the rate of N's per 100 kbp are logged and added to the file's
    report.  Lengths and GC distributions are additionally saved as JSON /
    for the HTML report, and plots are drawn, when the corresponding
    options are set.

    Args:
        ref_fpath: path of the reference FASTA file, or a false value.  If
            missing, ``qconfig.estimated_reference_size`` is used as the
            reference length when set.
        contigs_fpaths: paths of the assemblies to analyse.
        output_dirpath: directory for the plots (created if missing).
        json_output_dir: directory for JSON output, or a false value.
        results_dir: directory handed to the HTML saver.
    """
    logger.print_timestamp()
    logger.info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    if ref_fpath:
        reference_length = sum(
            fastaparser.get_lengths_from_fastafile(ref_fpath))
        reference_GC, reference_GC_distribution = GC_content(ref_fpath)

        logger.info('  Reference genome:')
        logger.info('    ' + os.path.basename(ref_fpath) +
                    ', Reference length = ' + str(reference_length) +
                    ', Reference GC % = ' + '%.2f' % reference_GC)
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        logger.info('  Estimated reference length = ' + str(reference_length))

    if reference_length:
        # Saving the reference in JSON
        if json_output_dir:
            json_saver.save_reference_length(json_output_dir, reference_length)

        # Saving for an HTML report
        if qconfig.html_report:
            from libs.html_saver import html_saver
            html_saver.save_reference_length(results_dir, reference_length)

    logger.info('  Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    for index, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info('    ' + qutils.index_to_str(index) + assembly_label)
        list_of_length = []
        number_of_Ns = 0
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    # saving lengths to JSON
    if json_output_dir:
        json_saver.save_contigs_lengths(json_output_dir, contigs_fpaths,
                                        lists_of_lengths)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths,
                                        lists_of_lengths)

    ########################################################################

    logger.info('  Calculating N50 and L50...')

    list_of_GC_distributions = []
    import N50
    for index, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
            zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution = GC_content(contigs_fpath)
        list_of_GC_distributions.append(GC_distribution)
        # NOTE(review): an assembly without any sequence makes this division
        # (and max(lengths_list) below) fail, as in the original code.
        ns_per_100kbp = '%.2f' % (
            float(number_of_Ns) * 100000.0 / float(total_length))
        # An undefined GC content is None; 0.0 is a valid value and must not
        # be treated as "undefined".
        total_GC_str = '%.2f' % total_GC if total_GC is not None else None
        logger.info('    ' + qutils.index_to_str(index) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + (total_GC_str if total_GC_str is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' + ' ' + ns_per_100kbp)

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
        report.add_field(reporting.Fields.TOTALLEN, total_length)
        # Previously ``if total_GC`` turned a GC content of exactly 0.0 into
        # None, contradicting the log line above.
        report.add_field(reporting.Fields.GC, total_GC_str)
        report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
        report.add_field(reporting.Fields.UNCALLED_PERCENT, ns_per_100kbp)
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REFGC, '%.2f' % reference_GC)
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    if json_output_dir:
        json_saver.save_GC_info(json_output_dir, contigs_fpaths,
                                list_of_GC_distributions)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths,
                                list_of_GC_distributions)

    if qconfig.draw_plots:
        import plotter
        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths,
                                output_dirpath + '/cumulative_plot',
                                'Cumulative length')

        ########################################################################
        # Drawing GC content plot...
        # Work on a copy so that adding the reference does not modify the
        # per-assembly list.
        list_of_GC_distributions_with_ref = list(list_of_GC_distributions)
        if ref_fpath:
            list_of_GC_distributions_with_ref.append(reference_GC_distribution)
        plotter.GC_content_plot(ref_fpath, contigs_fpaths,
                                list_of_GC_distributions_with_ref,
                                output_dirpath + '/GC_content_plot')

        ########################################################################
        # Drawing Nx and NGx plots...
        plotter.Nx_plot(contigs_fpaths, lists_of_lengths,
                        output_dirpath + '/Nx_plot', 'Nx', [])
        if reference_length:
            plotter.Nx_plot(
                contigs_fpaths, lists_of_lengths, output_dirpath + '/NGx_plot',
                'NGx', [reference_length for i in range(len(contigs_fpaths))])

    logger.info('Done.')
Exemple #13
0
# Contigs concatenator: joins all contigs of at least ``min_contig``
# characters into one sequence, each followed by ``padding_length`` "N"
# characters, and writes it as a single FASTA record in lines of 60.
# When a third argument is given, the 1-based start and end coordinate of
# every kept contig inside the joined sequence is written to that file.
# ``padding_length`` and ``min_contig`` are defined outside this section.
if len(sys.argv) < 3:
    print('Contigs concatenator: makes one big contig from the assembly')
    print('Usage:  ' + sys.argv[0] + '  INPUT_FILE OUTPUT_FILE [COORDS_FILE]')
    sys.exit(0)

infilename = sys.argv[1]
outfilename = sys.argv[2]
coords = 0  # 0 means "no coordinates file requested"
if len(sys.argv) > 3:
    coords = open(sys.argv[3], 'w')

padding = "N" * padding_length

fasta = fastaparser.read_fasta(infilename)
# Pieces are collected and joined once; repeated ``+=`` on a growing string
# copies the whole sequence for every contig.
seq_parts = []
cur_coord = 1
try:
    for name, seq in fasta:
        if len(seq) >= min_contig:
            if coords != 0:
                coords.write(
                    str(cur_coord) + " " + str(cur_coord + len(seq) - 1) + "\n")
                cur_coord += len(seq) + padding_length
            seq_parts.append(seq)
            seq_parts.append(padding)
finally:
    # Flush and release the coordinates file even if reading fails.
    if coords != 0:
        coords.close()
summary_seq = "".join(seq_parts)

with open(outfilename, 'w') as out:
    out.write(">sum_contig total_length=" + str(len(summary_seq)) + '\n')
    for i in xrange(0, len(summary_seq), 60):
        out.write(summary_seq[i:i + 60] + '\n')
Exemple #14
0
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath, genome_stats_dirpath):
    """Run the genome analyzer over all aligned assemblies.

    Reads the reference chromosomes, loads genes and operons (when files
    are given), processes every assembly with ``process_single_file`` in
    parallel and writes a summary table to
    ``<genome_stats_dirpath>/genome_info.txt``.  Genome fraction,
    duplication ratio and found genes/operons are added to each assembly's
    report; JSON/HTML data and plots are produced when the corresponding
    options are set.

    Args:
        ref_fpath: path of the reference FASTA file.
        aligned_contigs_fpaths: paths of the assemblies to process.
        output_dirpath: directory handed to the HTML saver.
        json_output_dirpath: directory for JSON output, or a false value.
        genes_fpaths: files with gene annotations (may be empty).
        operons_fpaths: files with operon annotations (may be empty).
        detailed_contigs_reports_dirpath: directory that contains the
            ``nucmer_output`` subdirectory.
        genome_stats_dirpath: directory for the summary file and the plots
            (created if missing).
    """
    nucmer_path_dirpath = os.path.join(detailed_contigs_reports_dirpath, 'nucmer_output')

    logger.print_timestamp()
    logger.info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    reference_chromosomes = {}
    genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]
        chr_len = len(seq)
        genome_size += chr_len
        reference_chromosomes[chr_name] = chr_len

    # for cumulative plots:
    files_genes_in_contigs = {}   #  "filename" : [ genes in sorted contigs (see below) ]
    files_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # RESULTS file
    result_fpath = genome_stats_dirpath + '/genome_info.txt'
    # The context manager closes the file also when loading the features or
    # processing an assembly raises.
    with open(result_fpath, 'w') as res_file:
        res_file.write('reference chromosomes:\n')
        for chr_name, chr_len in reference_chromosomes.items():
            res_file.write('\t' + chr_name + ' (' + str(chr_len) + ' bp)\n')
        res_file.write('\n')
        res_file.write('total genome size: ' + str(genome_size) + '\n\n')
        res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
        res_file.write('partial gene/operon min size: ' + str(qconfig.min_gene_overlap) + '\n\n')

        genes_container = FeatureContainer(genes_fpaths, 'gene')
        operons_container = FeatureContainer(operons_fpaths, 'operon')
        for container in [genes_container, operons_container]:
            if not container.fpaths:
                logger.notice('No file with ' + container.kind + 's provided. '
                              'Use the -' + container.kind[0].capitalize() + ' option '
                              'if you want to specify it.', indent='  ')
                continue

            for fpath in container.fpaths:
                container.region_list += genes_parser.get_genes_from_file(fpath, container.kind)

            if len(container.region_list) == 0:
                logger.warning('No ' + container.kind + 's were loaded.', indent='  ')
                res_file.write(container.kind + 's loaded: ' + 'None' + '\n')
            else:
                logger.info('  Loaded ' + str(len(container.region_list)) + ' ' + container.kind + 's')
                res_file.write(container.kind + 's loaded: ' + str(len(container.region_list)) + '\n')
                container.chr_names_dict = chromosomes_names_dict(container.kind, container.region_list, reference_chromosomes.keys())

        for contigs_fpath in aligned_contigs_fpaths:
            report = reporting.get(contigs_fpath)
            if genes_container.fpaths:
                report.add_field(reporting.Fields.REF_GENES, len(genes_container.region_list))
            if operons_container.fpaths:
                report.add_field(reporting.Fields.REF_OPERONS, len(operons_container.region_list))

        # header
        res_file.write('\n\n')
        res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
            % ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial', 'operons', 'partial'))
        res_file.write('%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n'
            % ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
        res_file.write('================================================================================================================\n')

        # process all contig files
        n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
        from joblib import Parallel, delayed
        results_genes_operons_tuples = Parallel(n_jobs=n_jobs)(delayed(process_single_file)(
            contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
            reference_chromosomes, genes_container, operons_container)
            for index, contigs_fpath in enumerate(aligned_contigs_fpaths))

        for contigs_fpath, (results, genes_in_contigs, operons_in_contigs) in zip(aligned_contigs_fpaths, results_genes_operons_tuples):
            assembly_name = qutils.name_from_fpath(contigs_fpath)

            files_genes_in_contigs[contigs_fpath] = genes_in_contigs
            files_operons_in_contigs[contigs_fpath] = operons_in_contigs
            full_found_genes.append(sum(genes_in_contigs))
            full_found_operons.append(sum(operons_in_contigs))

            covered_bp = results["covered_bp"]
            gaps_count = results["gaps_count"]
            genes_full = results[reporting.Fields.GENES + "_full"]
            genes_part = results[reporting.Fields.GENES + "_partial"]
            operons_full = results[reporting.Fields.OPERONS + "_full"]
            operons_part = results[reporting.Fields.OPERONS + "_partial"]

            report = reporting.get(contigs_fpath)
            genome_fraction = float(covered_bp) * 100 / float(genome_size)
            # NOTE(review): the denominator equals covered_bp, so an assembly
            # with no covered bases raises ZeroDivisionError here.
            duplication_ratio = (report.get_field(reporting.Fields.TOTALLEN) +
                                 report.get_field(reporting.Fields.MISINTERNALOVERLAP) +
                                 report.get_field(reporting.Fields.AMBIGUOUSEXTRABASES) -
                                 report.get_field(reporting.Fields.UNALIGNEDBASES)) /\
                                ((genome_fraction / 100.0) * float(genome_size))

            res_file.write('%-25s| %-10s| %-12s| %-10s|'
            % (assembly_name[:24], '%3.5f%%' % genome_fraction, '%1.5f' % duplication_ratio, gaps_count))

            report.add_field(reporting.Fields.MAPPEDGENOME, '%.3f' % genome_fraction)
            report.add_field(reporting.Fields.DUPLICATION_RATIO, '%.3f' % duplication_ratio)
            genome_mapped.append(genome_fraction)

            for (field, full, part) in [(reporting.Fields.GENES, genes_full, genes_part),
                (reporting.Fields.OPERONS, operons_full, operons_part)]:
                if full is None and part is None:
                    res_file.write(' %-10s| %-10s|' % ('-', '-'))
                else:
                    res_file.write(' %-10s| %-10s|' % (full, part))
                    report.add_field(field, '%s + %s part' % (full, part))
            res_file.write('\n')

    if genes_container.region_list:
        ref_genes_num = len(genes_container.region_list)
    else:
        ref_genes_num = None

    if operons_container.region_list:
        ref_operons_num = len(operons_container.region_list)
    else:
        ref_operons_num = None

    # saving json
    if json_output_dirpath:
        if genes_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths, 'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath, aligned_contigs_fpaths, 'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        if genes_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'genes', files_genes_in_contigs, ref_genes_num)
        if operons_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath, aligned_contigs_fpaths, 'operons', files_operons_in_contigs, ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        import plotter
        if genes_container.region_list:
            plotter.genes_operons_plot(len(genes_container.region_list), aligned_contigs_fpaths, files_genes_in_contigs,
                genome_stats_dirpath + '/genes_cumulative_plot', 'genes')
            plotter.histogram(aligned_contigs_fpaths, full_found_genes, genome_stats_dirpath + '/complete_genes_histogram',
                '# complete genes')
        if operons_container.region_list:
            plotter.genes_operons_plot(len(operons_container.region_list), aligned_contigs_fpaths, files_operons_in_contigs,
                genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.histogram(aligned_contigs_fpaths, full_found_operons, genome_stats_dirpath + '/complete_operons_histogram',
                '# complete operons')
        plotter.histogram(aligned_contigs_fpaths, genome_mapped, genome_stats_dirpath + '/genome_fraction_histogram',
            'Genome fraction, %', top_value=100)

    logger.info('Done.')
Exemple #15
0
def process_single_file(contigs_fpath, index, nucmer_path_dirpath, genome_stats_dirpath,
                        reference_chromosomes, genes_container, operons_container):
    """Compute reference coverage, gaps and found genes/operons for one assembly.

    Reads the Nucmer coords file of the assembly ('<assembly_name>.coords', or
    '<assembly_name>.coords.filtered' unless qconfig.use_all_alignments is set),
    marks every reference position covered by an alignment and writes:
      * '<assembly_name>_gaps.txt'    -- uncovered stretches of at least
                                         qconfig.min_gap_size positions;
      * '<assembly_name>_genes.txt'   -- completely found genes;
      * '<assembly_name>_operons.txt' -- completely found operons.
    All output files are created in genome_stats_dirpath; the genes/operons
    files are only written when the corresponding container has regions.

    Args:
        contigs_fpath: path to the FASTA file with the contigs of the assembly.
        index: index of the assembly, used only for the log line prefix.
        nucmer_path_dirpath: directory that holds the coords files.
        genome_stats_dirpath: directory for the output text files.
        reference_chromosomes: dict mapping chromosome name to its length.
        genes_container: object with `region_list` and `chr_names_dict`.
        operons_container: object with `region_list` and `chr_names_dict`.

    Returns:
        A tuple (results, genes_in_contigs, operons_in_contigs):
          * results -- dict with the keys "covered_bp", "gaps_count" and, for
            each of reporting.Fields.GENES / reporting.Fields.OPERONS, the keys
            field + "_full" and field + "_partial" (both None when the
            container has no regions);
          * genes_in_contigs / operons_in_contigs -- lists whose i-th element is
            the number of complete features found in the i-th longest contig.
    """
    assembly_name = qutils.name_from_fpath(contigs_fpath)
    assembly_label = qutils.label_from_fpath(contigs_fpath)
    results = dict()

    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    nucmer_base_fpath = os.path.join(nucmer_path_dirpath, assembly_name + '.coords')
    if qconfig.use_all_alignments:
        nucmer_fpath = nucmer_base_fpath
    else:
        nucmer_fpath = nucmer_base_fpath + '.filtered'

    if not os.path.isfile(nucmer_fpath):
        # NOTE(review): unless logger.error() terminates the run, the open()
        # below still raises for the missing file -- confirm the intended flow.
        logger.error('Nucmer\'s coords file (' + nucmer_fpath + ') not found! Try to restart QUAST.',
            indent='  ')

    # One flag per reference position (1-based, index 0 is unused).
    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.items():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    contig_tuples = fastaparser.read_fasta(contigs_fpath)  # list of FASTA entries (in tuples: name, seq)
    contig_tuples = sorted(contig_tuples, key=lambda contig: len(contig[1]), reverse=True)
    sorted_contigs_names = [name for (name, seq) in contig_tuples]

    genes_in_contigs = [0] * len(sorted_contigs_names) # for cumulative plots: i-th element is the number of genes in i-th contig
    operons_in_contigs = [0] * len(sorted_contigs_names)
    aligned_blocks_by_contig_name = {} # for gene finding: contig_name --> list of AlignedBlock
    for name in sorted_contigs_names:
        aligned_blocks_by_contig_name[name] = []

    # EXAMPLE:
    #    [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
    #=====================================================================================
    #  338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    #  374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    with open(nucmer_fpath, 'r') as coordfile:
        # skipping the header: everything up to and including the '=====' line
        for line in coordfile:
            if line.startswith('='):
                break

        # the second loop continues from where the header loop stopped
        for line in coordfile:
            if line.strip() == '':
                break
            parts = line.split('|')
            ref_coords = parts[0].split()
            s1 = int(ref_coords[0])
            e1 = int(ref_coords[1])
            contig_coords = parts[1].split()
            s2 = int(contig_coords[0])
            e2 = int(contig_coords[1])
            tokens = line.split()
            contig_name = tokens[12].strip()
            chr_name = tokens[11].strip()

            if chr_name not in genome_mapping:
                logger.error("Something went wrong and chromosome names in your coords file (" + nucmer_base_fpath + ") " \
                             "differ from the names in the reference. Try to remove the file and restart QUAST.")

            aligned_blocks_by_contig_name[contig_name].append(AlignedBlock(seqname=chr_name, start=s1, end=e1))
            chr_mapping = genome_mapping[chr_name]
            if s2 == 0 and e2 == 0:  # special case: circular genome, contig starts on the end of a chromosome and ends in the beginning
                for pos in range(s1, len(chr_mapping)):
                    chr_mapping[pos] = 1
                for pos in range(1, e1 + 1):
                    chr_mapping[pos] = 1
            else: #if s1 <= e1:
                for pos in range(s1, e1 + 1):
                    chr_mapping[pos] = 1

    # counting genome coverage and gaps number
    covered_bp = 0
    gaps_count = 0
    gaps_fpath = os.path.join(genome_stats_dirpath, assembly_name + '_gaps.txt')
    with open(gaps_fpath, 'w') as gaps_file:
        for chr_name, chr_len in reference_chromosomes.items():
            gaps_file.write('%s\n' % chr_name)
            chr_mapping = genome_mapping[chr_name]
            cur_gap_size = 0
            for pos in range(1, chr_len + 1):
                if chr_mapping[pos] == 1:
                    if cur_gap_size >= qconfig.min_gap_size:
                        gaps_count += 1
                        gaps_file.write('%s %s\n' % (pos - cur_gap_size, pos - 1))
                    covered_bp += 1
                    cur_gap_size = 0
                else:
                    cur_gap_size += 1

            # a gap that reaches the end of the chromosome
            if cur_gap_size >= qconfig.min_gap_size:
                gaps_count += 1
                gaps_file.write('%s %s\n' % (chr_len - cur_gap_size + 1, chr_len))

    results["covered_bp"] = covered_bp
    results["gaps_count"] = gaps_count

    # finding genes and operons
    for container, feature_in_contigs, field, suffix in [
        (genes_container,
         genes_in_contigs,
         reporting.Fields.GENES,
         '_genes.txt'),

        (operons_container,
         operons_in_contigs,
         reporting.Fields.OPERONS,
         '_operons.txt')]:

        if not container.region_list:
            results[field + "_full"] = None
            results[field + "_partial"] = None
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath, assembly_name + suffix)
        with open(found_fpath, 'w') as found_file:
            found_file.write('%s\t\t%s\t%s\n' % ('ID or #', 'Start', 'End'))
            found_file.write('============================\n')

            # 0 - gene is not found,
            # 1 - gene is found,
            # 2 - part of gene is found
            found_list = [0] * len(container.region_list)
            for i, region in enumerate(container.region_list):
                for contig_id, name in enumerate(sorted_contigs_names):
                    cur_feature_is_found = False
                    for cur_block in aligned_blocks_by_contig_name[name]:
                        if container.chr_names_dict[region.seqname] != cur_block.seqname:
                            continue

                        # computing circular genomes
                        if cur_block.start > cur_block.end:
                            blocks = [AlignedBlock(seqname=cur_block.seqname, start=cur_block.start, end=region.end + 1),
                                      AlignedBlock(seqname=cur_block.seqname, start=1, end=cur_block.end)]
                        else:
                            blocks = [cur_block]

                        for block in blocks:
                            if region.end <= block.start or block.end <= region.start:
                                continue
                            elif block.start <= region.start and region.end <= block.end:
                                if found_list[i] == 2:  # already found as partial gene
                                    total_partial -= 1
                                found_list[i] = 1
                                total_full += 1
                                # a dedicated name keeps the loop index `i` intact
                                feature_id = str(region.id)
                                if feature_id == 'None':
                                    feature_id = '# ' + str(region.number + 1)
                                found_file.write('%s\t\t%d\t%d\n' % (feature_id, region.start, region.end))
                                feature_in_contigs[contig_id] += 1  # inc number of found genes/operons in id-th contig

                                cur_feature_is_found = True
                                break
                            elif found_list[i] == 0 and min(region.end, block.end) - max(region.start, block.start) >= qconfig.min_gene_overlap:
                                found_list[i] = 2
                                total_partial += 1
                        if cur_feature_is_found:
                            break
                    # a completely found feature is counted once: stop scanning contigs
                    if cur_feature_is_found:
                        break

        results[field + "_full"] = total_full
        results[field + "_partial"] = total_partial

    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')
    return results, genes_in_contigs, operons_in_contigs
Exemple #16
0
def do(ref_fpath, aligned_contigs_fpaths, output_dirpath, json_output_dirpath,
       genes_fpaths, operons_fpaths, detailed_contigs_reports_dirpath,
       genome_stats_dirpath):
    """Run the genome analyzer for all aligned assemblies.

    Loads the reference chromosomes and the gene/operon annotations, runs
    process_single_file() for every assembly through joblib, writes the summary
    table to '<genome_stats_dirpath>/genome_info.txt', fills the per-assembly
    reports and finally saves JSON/HTML data and plots when they are enabled.

    Args:
        ref_fpath: path to the reference FASTA file.
        aligned_contigs_fpaths: paths of the assemblies to analyze.
        output_dirpath: directory passed to the HTML saver.
        json_output_dirpath: directory for JSON output; falsy to skip it.
        genes_fpaths: files with gene annotations.
        operons_fpaths: files with operon annotations.
        detailed_contigs_reports_dirpath: directory that contains the
            'nucmer_output' subdirectory.
        genome_stats_dirpath: directory for genome_info.txt and the plots;
            created when it does not exist.

    Returns:
        None.
    """

    nucmer_path_dirpath = os.path.join(detailed_contigs_reports_dirpath,
                                       'nucmer_output')
    from libs import search_references_meta
    if search_references_meta.is_quast_first_run:
        nucmer_path_dirpath = os.path.join(nucmer_path_dirpath, 'raw')

    logger.print_timestamp()
    logger.main_info('Running Genome analyzer...')

    if not os.path.isdir(genome_stats_dirpath):
        os.mkdir(genome_stats_dirpath)

    reference_chromosomes = {}
    genome_size = 0
    for name, seq in fastaparser.read_fasta(ref_fpath):
        chr_name = name.split()[0]
        chr_len = len(seq)
        genome_size += chr_len
        reference_chromosomes[chr_name] = chr_len

    # for cumulative plots:
    files_genes_in_contigs = {}  # "filename" : [ genes in sorted contigs ]
    files_operons_in_contigs = {}

    # for histograms
    genome_mapped = []
    full_found_genes = []
    full_found_operons = []

    # RESULTS file; the context manager closes it on every exit path
    result_fpath = genome_stats_dirpath + '/genome_info.txt'
    with open(result_fpath, 'w') as res_file:
        genes_container = FeatureContainer(genes_fpaths, 'gene')
        operons_container = FeatureContainer(operons_fpaths, 'operon')
        for container in [genes_container, operons_container]:
            if not container.fpaths:
                logger.notice('No file with ' + container.kind + 's provided. '
                              'Use the -' + container.kind[0].capitalize() +
                              ' option '
                              'if you want to specify it.',
                              indent='  ')
                continue

            for fpath in container.fpaths:
                container.region_list += genes_parser.get_genes_from_file(
                    fpath, container.kind)

            if len(container.region_list) == 0:
                logger.warning('No ' + container.kind + 's were loaded.',
                               indent='  ')
                res_file.write(container.kind + 's loaded: ' + 'None' + '\n')
            else:
                logger.info('  Loaded ' + str(len(container.region_list)) +
                            ' ' + container.kind + 's')
                res_file.write(container.kind + 's loaded: ' +
                               str(len(container.region_list)) + '\n')
                container.chr_names_dict = chromosomes_names_dict(
                    container.kind, container.region_list,
                    reference_chromosomes.keys())

        for contigs_fpath in aligned_contigs_fpaths:
            report = reporting.get(contigs_fpath)
            if genes_container.fpaths:
                report.add_field(reporting.Fields.REF_GENES,
                                 len(genes_container.region_list))
            if operons_container.fpaths:
                report.add_field(reporting.Fields.REF_OPERONS,
                                 len(operons_container.region_list))

        # process all contig files
        num_nf_errors = logger._num_nf_errors
        n_jobs = min(len(aligned_contigs_fpaths), qconfig.max_threads)
        from joblib import Parallel, delayed
        process_results = Parallel(n_jobs=n_jobs)(
            delayed(process_single_file)(
                contigs_fpath, index, nucmer_path_dirpath,
                genome_stats_dirpath, reference_chromosomes, genes_container,
                operons_container)
            for index, contigs_fpath in enumerate(aligned_contigs_fpaths))
        num_nf_errors += len([res for res in process_results if res is None])
        logger._num_nf_errors = num_nf_errors

        # Results come back in input order, so pair each one with its assembly
        # BEFORE dropping the failed ones; filtering the results alone would
        # shift every later result onto the wrong assembly.
        successful = [(fpath, res) for fpath, res
                      in zip(aligned_contigs_fpaths, process_results) if res]
        if not successful:
            logger.main_info('Genome analyzer failed for all the assemblies.')
            return
        processed_fpaths = [fpath for fpath, res in successful]
        process_results = [res for fpath, res in successful]

        ref_lengths = [res[0] for res in process_results]
        results_genes_operons_tuples = [res[1] for res in process_results]
        # NOTE(review): ref_lengths_by_contigs is not defined in this function;
        # presumably a module-level dict -- confirm.
        for ref in reference_chromosomes:
            ref_lengths_by_contigs[ref] = [
                lengths[ref] for lengths in ref_lengths
            ]
        res_file.write('reference chromosomes:\n')
        for chr_name, chr_len in reference_chromosomes.items():
            aligned_len = max(ref_lengths_by_contigs[chr_name])
            res_file.write('\t' + chr_name + ' (total length: ' +
                           str(chr_len) + ' bp, maximal covered length: ' +
                           str(aligned_len) + ' bp)\n')
        res_file.write('\n')
        res_file.write('total genome size: ' + str(genome_size) + '\n\n')
        res_file.write('gap min size: ' + str(qconfig.min_gap_size) + '\n')
        res_file.write('partial gene/operon min size: ' +
                       str(qconfig.min_gene_overlap) + '\n\n')
        # header
        res_file.write('\n\n')
        res_file.write(
            '%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n' %
            ('assembly', 'genome', 'duplication', 'gaps', 'genes', 'partial',
             'operons', 'partial'))
        res_file.write(
            '%-25s| %-10s| %-12s| %-10s| %-10s| %-10s| %-10s| %-10s|\n' %
            ('', 'fraction', 'ratio', 'number', '', 'genes', '', 'operons'))
        res_file.write(
            '================================================================================================================\n'
        )

        for contigs_fpath, (results, genes_in_contigs, operons_in_contigs) in zip(
                processed_fpaths, results_genes_operons_tuples):
            assembly_name = qutils.name_from_fpath(contigs_fpath)

            files_genes_in_contigs[contigs_fpath] = genes_in_contigs
            files_operons_in_contigs[contigs_fpath] = operons_in_contigs
            full_found_genes.append(sum(genes_in_contigs))
            full_found_operons.append(sum(operons_in_contigs))

            covered_bp = results["covered_bp"]
            gaps_count = results["gaps_count"]
            genes_full = results[reporting.Fields.GENES + "_full"]
            genes_part = results[reporting.Fields.GENES + "_partial"]
            operons_full = results[reporting.Fields.OPERONS + "_full"]
            operons_part = results[reporting.Fields.OPERONS + "_partial"]

            report = reporting.get(contigs_fpath)
            genome_fraction = float(covered_bp) * 100 / float(genome_size)
            # NOTE(review): raises ZeroDivisionError when covered_bp is 0 --
            # confirm whether such assemblies can reach this point.
            duplication_ratio = (report.get_field(reporting.Fields.TOTALLEN) +
                                 report.get_field(reporting.Fields.MISINTERNALOVERLAP) +
                                 report.get_field(reporting.Fields.AMBIGUOUSEXTRABASES) -
                                 report.get_field(reporting.Fields.UNALIGNEDBASES)) /\
                                ((genome_fraction / 100.0) * float(genome_size))

            res_file.write('%-25s| %-10s| %-12s| %-10s|' %
                           (assembly_name[:24], '%3.5f%%' % genome_fraction,
                            '%1.5f' % duplication_ratio, gaps_count))

            report.add_field(reporting.Fields.MAPPEDGENOME,
                             '%.3f' % genome_fraction)
            report.add_field(reporting.Fields.DUPLICATION_RATIO,
                             '%.3f' % duplication_ratio)
            genome_mapped.append(genome_fraction)

            for (field, full,
                 part) in [(reporting.Fields.GENES, genes_full, genes_part),
                           (reporting.Fields.OPERONS, operons_full,
                            operons_part)]:
                if full is None and part is None:
                    res_file.write(' %-10s| %-10s|' % ('-', '-'))
                else:
                    res_file.write(' %-10s| %-10s|' % (full, part))
                    report.add_field(field, '%s + %s part' % (full, part))
            res_file.write('\n')

    if genes_container.region_list:
        ref_genes_num = len(genes_container.region_list)
    else:
        ref_genes_num = None

    if operons_container.region_list:
        ref_operons_num = len(operons_container.region_list)
    else:
        ref_operons_num = None

    # saving json
    if json_output_dirpath:
        if genes_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath,
                                                processed_fpaths,
                                                'genes',
                                                files_genes_in_contigs,
                                                ref_genes_num)
        if operons_container.region_list:
            json_saver.save_features_in_contigs(json_output_dirpath,
                                                processed_fpaths,
                                                'operons',
                                                files_operons_in_contigs,
                                                ref_operons_num)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        if genes_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath,
                                                processed_fpaths,
                                                'genes',
                                                files_genes_in_contigs,
                                                ref_genes_num)
        if operons_container.region_list:
            html_saver.save_features_in_contigs(output_dirpath,
                                                processed_fpaths,
                                                'operons',
                                                files_operons_in_contigs,
                                                ref_operons_num)

    if qconfig.draw_plots:
        # cumulative plots:
        import plotter
        if genes_container.region_list:
            plotter.genes_operons_plot(
                len(genes_container.region_list), processed_fpaths,
                files_genes_in_contigs,
                genome_stats_dirpath + '/genes_cumulative_plot', 'genes')
            plotter.histogram(
                processed_fpaths, full_found_genes,
                genome_stats_dirpath + '/complete_genes_histogram',
                '# complete genes')
        if operons_container.region_list:
            plotter.genes_operons_plot(
                len(operons_container.region_list), processed_fpaths,
                files_operons_in_contigs,
                genome_stats_dirpath + '/operons_cumulative_plot', 'operons')
            plotter.histogram(
                processed_fpaths, full_found_operons,
                genome_stats_dirpath + '/complete_operons_histogram',
                '# complete operons')
        plotter.histogram(processed_fpaths,
                          genome_mapped,
                          genome_stats_dirpath + '/genome_fraction_histogram',
                          'Genome fraction, %',
                          top_value=100)

    logger.main_info('Done.')
Exemple #17
0
    if last_gap not in trusted_gaps:
        last_gap = (0,0)

    total_gaps = len(chunks) - 1
    if not circular:
        total_gaps += 1
    print ("     Total gaps between aligned regions: " + str(total_gaps) + "; gaps, that have read pairs spanning over: " + str(len(trusted_gaps)))

    trusted_chunks_index = [(0,0) for i in range(0, len(chunks))]
    for i in range(0, len(chunks) - 1):
        if (chunks[i][1], chunks[i + 1][0]) in trusted_gaps:
            trusted_chunks_index[i] = (chunks[i][1], chunks[i + 1][0])

    chunks_file  = os.path.join(output_dir, os.path.splitext(os.path.basename(reference))[0] + "gaps_" + dataset + ".fasta")
    fasta = fastaparser.read_fasta(chunks_file)

    singlef  = os.path.join(output_dir, dataset + "_single_reads.fasta")
    simulate_ideal_by_fasta.simulate_single(singlef, fasta, rl, circular)

    if ins < rl:
        continue

    pairedf  = os.path.join(output_dir, dataset + "_paired_reads.fasta")
    simulate_ideal_by_fasta.simulate_paired(pairedf, fasta, ins, rl, circular)

    gapf  = os.path.join(output_dir, dataset + "_gapped_reads.fasta")
    simulate_ideal_by_fasta.simulate_paired_over_gaps(gapf, fasta, chunks, trusted_chunks_index, last_gap, ref_len, ins, rl, circular)


# total report
Exemple #18
0
def do(ref_fpath, contigs_fpaths, output_dirpath, json_output_dir, results_dir):
    """Compute basic statistics (lengths, N50/L50, GC, N's) for every assembly.

    Fills the per-assembly reports, saves contig lengths and GC data for the
    JSON/HTML outputs when they are enabled, and draws the Nx/NGx, cumulative
    and GC content plots.

    Args:
        ref_fpath: path to the reference FASTA file, or a falsy value; without
            it qconfig.estimated_reference_size is used when set.
        contigs_fpaths: paths of the assemblies to process.
        output_dirpath: directory for the plots; created when it does not exist.
        json_output_dir: directory for JSON output; falsy to skip it.
        results_dir: directory passed to the HTML saver and the Nx plots.

    Returns:
        None.
    """
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    if ref_fpath:
        reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
        reference_GC, reference_GC_distribution = GC_content(ref_fpath)

        logger.info('  Reference genome:')
        logger.info('    ' + os.path.basename(ref_fpath) + ', Reference length = ' + str(reference_length) + ', Reference GC % = ' + '%.2f' % reference_GC)
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        logger.info('  Estimated reference length = ' + str(reference_length))

    if reference_length:
        # Saving the reference in JSON
        if json_output_dir:
            json_saver.save_reference_length(json_output_dir, reference_length)

        # Saving for an HTML report
        if qconfig.html_report:
            from libs.html_saver import html_saver
            html_saver.save_reference_length(results_dir, reference_length)

    logger.info('  Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    for file_index, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info('    ' + qutils.index_to_str(file_index) + assembly_label)
        list_of_length = []
        number_of_Ns = 0
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    num_contigs = max([len(list_of_length) for list_of_length in lists_of_lengths])

    # With too many contigs the saved length lists are compressed: every
    # `multiplicator` consecutive (sorted) lengths are summed into one point.
    multiplicator = 1
    if num_contigs >= (qconfig.max_points*2):
        multiplicator = int(num_contigs/qconfig.max_points)
        max_points = num_contigs // multiplicator
        lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]
        corr_lists_of_lengths = []
        for lengths in lists_of_lengths:
            binned = [sum(lengths[(i - 1) * multiplicator:i * multiplicator])
                      for i in range(1, max_points)
                      if (i * multiplicator) < len(lengths)]
            # everything that did not fit into a full bin goes into the last point
            binned.append(sum(lengths[len(binned) * multiplicator:]))
            corr_lists_of_lengths.append(binned)
    else:
        corr_lists_of_lengths = lists_of_lengths

    # saving lengths to JSON
    if json_output_dir:
        json_saver.save_contigs_lengths(json_output_dir, contigs_fpaths, corr_lists_of_lengths)
        json_saver.save_tick_x(json_output_dir, multiplicator)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info('  Calculating N50 and L50...')

    list_of_GC_distributions = []
    largest_contig = 0
    import N50
    for file_index, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        ng50, lg50 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng75, lg75 = None, None
        if reference_length:
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution = GC_content(contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)

        # Computed separately so that the "undefined" fallback replaces only
        # this value and not the whole log message.
        if total_length != 0:
            ns_per_100_kbp = ' %.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))
        else:
            ns_per_100_kbp = 'undefined'
        logger.info('    ' + qutils.index_to_str(file_index) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + ('%.2f' % total_GC if total_GC is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' + ns_per_100_kbp)

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        if lengths_list:
            report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list))
            largest_contig = max(largest_contig, max(lengths_list))
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.GC, ('%.2f' % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            report.add_field(reporting.Fields.UNCALLED_PERCENT, ('%.2f' % (float(number_of_Ns) * 100000.0 / float(total_length))))
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.REFGC, '%.2f' % reference_GC)
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math
    # NOTE(review): the result depends on the interpreter's `/` semantics for
    # ints (floor division on Python 2) -- left untouched on purpose.
    qconfig.min_difference = math.ceil((largest_contig/1000)/600)  # divide on height of plot

    if json_output_dir:
        json_saver.save_GC_info(json_output_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.html_report and not qconfig.is_combined_ref:
        from libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions)

    import plotter
    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths, output_dirpath + '/Nx_plot', 'Nx', [], json_output_dir=json_output_dir)
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(results_dir, num_contigs > qconfig.max_points, contigs_fpaths, lists_of_lengths, output_dirpath + '/NGx_plot', 'NGx',
                        [reference_length for i in range(len(contigs_fpaths))], json_output_dir=json_output_dir)

    if qconfig.draw_plots:
        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths, output_dirpath + '/cumulative_plot', 'Cumulative length')
        if not qconfig.is_combined_ref:
            ########################################################################
            # Drawing GC content plot...
            # copy, so that appending the reference does not alter the assemblies' list
            list_of_GC_distributions_with_ref = list(list_of_GC_distributions)
            if ref_fpath:
                list_of_GC_distributions_with_ref.append(reference_GC_distribution)
            plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref, output_dirpath + '/GC_content_plot')

    logger.main_info('Done.')
# Scan the report: remember the contig that is currently described and record
# its ID once an extensive misassembly is reported for it.
for line in in_file:
    if line.startswith("	CONTIG:"):
        cur_contig_id = line.split("	CONTIG:")[1].strip()
    if (line.find("Extensive misassembly") != -1) and (cur_contig_id != ""):
        mis_contigs_ids.append(cur_contig_id.split()[0])
        # reset, so that the same contig is recorded only once
        cur_contig_id = ""
    if line.startswith("Analyzing coverage..."):
        break

# printing IDs of misassembled contigs
print("Misassembled contigs:")
for contig_id in mis_contigs_ids:
    print(contig_id)

in_file.close()

# Optional mode (4 command-line arguments): copy the misassembled contigs from
# the FASTA file sys.argv[2] into sys.argv[3], 60 characters per sequence line.
if (len(sys.argv) == 4):
    import fastaparser
    input_contigs = fastaparser.read_fasta(sys.argv[2])
    mis_contigs_ids_set = set(mis_contigs_ids)  # constant-time membership tests

    # the context manager closes the output file even when writing fails
    with open(sys.argv[3], "w") as mis_contigs:
        for (name, seq) in input_contigs:
            # whitespace becomes '_' and all other non-word characters are
            # dropped before comparing with the IDs taken from the report
            corr_name = re.sub(r'\W', '', re.sub(r'\s', '_', name))

            if corr_name in mis_contigs_ids_set:
                mis_contigs.write(name + '\n')
                for i in xrange(0, len(seq), 60):
                    mis_contigs.write(seq[i:i + 60] + '\n')
Exemple #20
0
def do(ref_fpath, contigs_fpaths, output_dirpath, json_output_dir, results_dir):
    """Compute basic assembly statistics and emit reports and plots.

    For every assembly in ``contigs_fpaths`` the contig lengths and the number
    of ``N`` characters are read, N50/L50/N75/L75 (and NG50/LG50/NG75/LG75 when
    a reference length is known) are computed, and the values are stored in the
    assembly's report via ``reporting.get``. Lengths and GC information are
    additionally saved through ``json_saver`` / ``html_saver`` when enabled,
    and the Nx, NGx, cumulative and GC plots are drawn through ``plotter``.

    Args:
        ref_fpath: path to the reference FASTA, or a falsy value when there is
            no reference (``qconfig.estimated_reference_size`` is used then,
            if set).
        contigs_fpaths: paths of the assembly FASTA files.
        output_dirpath: directory for plot files; created if missing.
        json_output_dir: directory for JSON output, or a falsy value to skip it.
        results_dir: directory handed to ``html_saver`` and ``plotter.Nx_plot``.

    Side effects:
        Sets ``qconfig.min_difference`` from the largest contig length.
    """
    logger.print_timestamp()
    logger.main_info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    if ref_fpath:
        reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
        reference_GC, reference_GC_distribution = GC_content(ref_fpath)

        logger.info("  Reference genome:")
        logger.info(
            "    "
            + os.path.basename(ref_fpath)
            + ", Reference length = "
            + str(reference_length)
            + ", Reference GC % = "
            + "%.2f" % reference_GC
        )
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        logger.info("  Estimated reference length = " + str(reference_length))

    if reference_length:
        # Saving the reference in JSON
        if json_output_dir:
            json_saver.save_reference_length(json_output_dir, reference_length)

        # Saving for an HTML report
        if qconfig.html_report:
            from libs.html_saver import html_saver

            html_saver.save_reference_length(results_dir, reference_length)

    logger.info("  Contig files: ")
    lists_of_lengths = []
    numbers_of_Ns = []
    for index, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info("    " + qutils.index_to_str(index) + assembly_label)
        # One pass over the file collects both the lengths and the N count.
        list_of_length = []
        number_of_Ns = 0
        for (name, seq) in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count("N")

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    # max() of an empty sequence raises, so fall back to 0 when no assemblies
    # were supplied.
    num_contigs = max([len(lengths) for lengths in lists_of_lengths]) if lists_of_lengths else 0

    # When there are far more contigs than plottable points, merge runs of
    # `multiplicator` consecutive (sorted) contigs into one summed bin.
    multiplicator = 1
    if num_contigs >= (qconfig.max_points * 2):
        multiplicator = int(num_contigs / qconfig.max_points)
        # Floor division keeps the bin count an int on Python 3 as well.
        num_bins = num_contigs // multiplicator
        lists_of_lengths = [sorted(lengths, reverse=True) for lengths in lists_of_lengths]
        corr_lists_of_lengths = [
            [
                sum(lengths[((i - 1) * multiplicator) : (i * multiplicator)])
                for i in range(1, num_bins)
                if (i * multiplicator) < len(lengths)
            ]
            for lengths in lists_of_lengths
        ]
        # Whatever was not covered by the full bins goes into one tail bin.
        for sorted_lengths, binned in zip(lists_of_lengths, corr_lists_of_lengths):
            binned.append(sum(sorted_lengths[len(binned) * multiplicator :]))
    else:
        corr_lists_of_lengths = lists_of_lengths

    # saving lengths to JSON
    if json_output_dir:
        json_saver.save_contigs_lengths(json_output_dir, contigs_fpaths, corr_lists_of_lengths)
        json_saver.save_tick_x(json_output_dir, multiplicator)

    if qconfig.html_report:
        from libs.html_saver import html_saver

        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, corr_lists_of_lengths)
        html_saver.save_tick_x(results_dir, multiplicator)

    ########################################################################

    logger.info("  Calculating N50 and L50...")

    list_of_GC_distributions = []
    largest_contig = 0
    import N50

    for index, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(
        zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)
    ):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng50, lg50 = None, None
        ng75, lg75 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution = GC_content(contigs_fpath, skip=qconfig.no_gc)
        list_of_GC_distributions.append(GC_distribution)

        # N's per 100 kbp is undefined for an assembly of total length 0;
        # compute it once and reuse it for the log line and the report.
        ns_per_100kbp = None
        if total_length != 0:
            ns_per_100kbp = "%.2f" % (float(number_of_Ns) * 100000.0 / float(total_length))

        # The conditional parts are parenthesised so that only the affected
        # value (not the whole message) is replaced by "undefined".
        logger.info(
            "    "
            + qutils.index_to_str(index)
            + qutils.label_from_fpath(contigs_fpath)
            + ", N50 = "
            + str(n50)
            + ", L50 = "
            + str(l50)
            + ", Total length = "
            + str(total_length)
            + ", GC % = "
            + ("%.2f" % total_GC if total_GC is not None else "undefined")
            + ", # N's per 100 kbp = "
            + (" " + ns_per_100kbp if ns_per_100kbp is not None else "undefined")
        )

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length and not qconfig.is_combined_ref:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        if lengths_list:
            longest = max(lengths_list)
            report.add_field(reporting.Fields.LARGCONTIG, longest)
            largest_contig = max(largest_contig, longest)
            report.add_field(reporting.Fields.TOTALLEN, total_length)
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.GC, ("%.2f" % total_GC if total_GC is not None else None))
            report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
            # Skipped when every contig is empty (division by zero otherwise).
            if ns_per_100kbp is not None:
                report.add_field(reporting.Fields.UNCALLED_PERCENT, ns_per_100kbp)
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            if not qconfig.is_combined_ref:
                report.add_field(reporting.Fields.REFGC, "%.2f" % reference_GC)
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    import math

    qconfig.min_difference = math.ceil((largest_contig / 1000) / 600)  # divide on height of plot

    if json_output_dir:
        json_saver.save_GC_info(json_output_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.html_report and not qconfig.is_combined_ref:
        from libs.html_saver import html_saver

        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions)

    import plotter

    ########################################################################
    # Drawing Nx and NGx plots...
    plotter.Nx_plot(
        results_dir,
        num_contigs > qconfig.max_points,
        contigs_fpaths,
        lists_of_lengths,
        output_dirpath + "/Nx_plot",
        "Nx",
        [],
        json_output_dir=json_output_dir,
    )
    if reference_length and not qconfig.is_combined_ref:
        plotter.Nx_plot(
            results_dir,
            num_contigs > qconfig.max_points,
            contigs_fpaths,
            lists_of_lengths,
            output_dirpath + "/NGx_plot",
            "NGx",
            [reference_length for _ in contigs_fpaths],
            json_output_dir=json_output_dir,
        )

    if qconfig.draw_plots:
        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(
            ref_fpath, contigs_fpaths, lists_of_lengths, output_dirpath + "/cumulative_plot", "Cumulative length"
        )
        if not qconfig.is_combined_ref:
            ########################################################################
            # Drawing GC content plot...
            # Work on a copy so the per-assembly list is not silently extended
            # with the reference distribution.
            list_of_GC_distributions_with_ref = list(list_of_GC_distributions)
            if ref_fpath:
                list_of_GC_distributions_with_ref.append(reference_GC_distribution)
            plotter.GC_content_plot(
                ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref, output_dirpath + "/GC_content_plot"
            )

    logger.main_info("Done.")
#!/usr/bin/python

############################################################################
# Copyright (c) 2015 Saint Petersburg State University
# Copyright (c) 2011-2014 Saint Petersburg Academic University
# All Rights Reserved
# See file LICENSE for details.
############################################################################

# Convert contigs (i.e. a reference) for the experiment of running SPAdes on E. coli MC reads in "IonTorrent" mode
# (every run of repeated nucleotides is collapsed to a single nucleotide).

import sys
import os
import fastaparser

# MAIN
# Reads <input fasta>, collapses every run of identical consecutive characters
# in each sequence to a single character, and writes the result to
# <output fasta>.
if len(sys.argv) < 3:
    print("Usage: " + sys.argv[0] + " <input fasta> <output fasta>")
    sys.exit()

new_fasta = []
for name, seq in fastaparser.read_fasta(sys.argv[1]):
    # Keep a character only when it differs from its predecessor. Collecting
    # into a list and joining once avoids quadratic string concatenation, and
    # an empty sequence yields an empty string instead of raising IndexError.
    collapsed = [base for pos, base in enumerate(seq) if pos == 0 or seq[pos - 1] != base]
    new_fasta.append((name, "".join(collapsed)))

fastaparser.write_fasta_to_file(sys.argv[2], new_fasta)
Exemple #22
0
# MAIN
# Expects: <input fasta> <K or K1,K2,K3> <output_dir>
if len(sys.argv) != 4:
    print("Usage: " + sys.argv[0] + " <input fasta> <K or K1,K2,K3> <output_dir>")
    sys.exit()

# The K argument is either a single integer or a comma-separated list.
k_arg_parts = sys.argv[2].split(',')
if len(k_arg_parts) > 1:
    K_list = map(int, k_arg_parts)
else:
    K_list = [int(sys.argv[2])]

output_dir = os.path.abspath(sys.argv[3])
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

# creating single-entry references and chains
params_subst_dict = dict()
input_fasta = fastaparser.read_fasta(sys.argv[1])
cwd = os.getcwd()
os.chdir(ideal_assembler_bin_dir)
for K in K_list:
    print("Starting with K=" + str(K))
    result_fasta = []
    for id, fasta_entry in enumerate(input_fasta):
        # Shared name stems: one per entry, and one per (entry, K) pair.
        chr_prefix = 'chr_' + str(id)
        out_base = chr_prefix + '_K' + str(K)

        cur_ref_name = os.path.join(output_dir, chr_prefix + '.fasta')
        cur_chain_name = os.path.join(output_dir, out_base + '_chain')
        log_filename = os.path.join(output_dir, out_base + '.log')

        # Each entry becomes its own single-record reference file, with a
        # fresh copy of the chain template next to it.
        fastaparser.write_fasta_to_file(cur_ref_name, [fasta_entry])
        shutil.copy(chain_template, cur_chain_name)

        # Per-run substitutions start from a copy of the shared dictionary.
        cur_params_subst_dict = dict(params_subst_dict)
        cur_params_subst_dict['OUT_BASE'] = out_base
        tmp_dir = os.path.join(ideal_assembler_bin_dir, 'data/cap/cache/env_' + out_base)
        cur_params_subst_dict['REFERENCE'] = cur_ref_name
Exemple #23
0
def _print_timestamp():
    """Print the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    print(
        datetime.datetime.fromtimestamp(
            time.time()).strftime('%Y-%m-%d %H:%M:%S'))


def _read_blast_hits(names_fpath, strip_leading=False):
    """Parse a '*.names' file into {contig id: [float, float, first field]}.

    A line that has a single tab-separated field is treated as a contig
    header; the three values are taken from the line that follows it (fields
    1 and 2 are the part after ':' converted to float, field 0 is kept as is).
    ``strip_leading`` selects ``str.strip`` instead of ``str.rstrip`` for
    cleaning each line.
    """
    with open(names_fpath) as handle:
        if strip_leading:
            rows = [line.strip().split("\t") for line in handle]
        else:
            rows = [line.rstrip().split("\t") for line in handle]

    hits = {}
    # Pair every line with its successor; the last line has none.
    for row, next_row in zip(rows, rows[1:]):
        if len(row) == 1:
            hits[row[0].split()[0]] = [
                float(next_row[1].split(":")[1]),
                float(next_row[2].split(":")[1]), next_row[0]
            ]
    return hits


def _read_first_tokens(names_fpath):
    """Return the set of first whitespace-separated tokens of non-blank lines."""
    with open(names_fpath) as handle:
        return set(line.split()[0] for line in handle if line.split())


def main():
    """Run the full classification pipeline for one input FASTA file.

    Steps, in order: detect circular contigs (``check_circular``), predict
    genes with ``prodigal``, drop genes starting beyond the original contig
    length, search HMM domains with ``hmmsearch``, classify contigs with
    ``naive_bayes``, optionally run ``blastn`` and merge its categories, write
    ``<name>_result_table.csv`` and split the input contigs into per-class
    FASTA files under ``<outdir>/Prediction_results_fasta/``.

    Exits with status 1 when no HMM database is given or when prodigal or
    hmmsearch return a non-zero status.
    """
    args = parse_args(sys.argv[1:])
    base = os.path.basename(args.f)
    name_file = os.path.splitext(base)[0]
    outdir = args.o

    try:
        os.makedirs(outdir)
    except OSError as e:
        # An already existing output directory is fine.
        if e.errno != errno.EEXIST:
            raise

    name = os.path.join(outdir, name_file)

    # Contig IDs in input order: first token of each header, without '>'.
    ids = []
    with open(args.f, "r") as ins:
        for line in ins:
            if line[0] == ">":
                ids.append(line.split()[0][1:])

    if args.hmm:
        hmm = args.hmm
    else:
        print("No HMM database provided")
        sys.exit(1)

    if args.db:
        from parse_blast_xml import parser
        blastdb = args.db

    threads = str(args.t) if args.t else str(20)

    # Check for circular:
    contig_len_circ = check_circular(args.f, name)
    infile_circ = name + "_input_with_circ.fasta"

    # Run gene prediction
    _print_timestamp()

    print("Gene prediction...")
    res = os.system("prodigal -p meta -c -i " + infile_circ + " -a " + name +
                    "_proteins.fa -o " + name + "_genes.fa 2>" + name +
                    "_prodigal.log")
    if res != 0:
        print("Prodigal run failed")
        sys.exit(1)

    # Filter genes predicted over the end of the contig

    proteins = fastaparser.read_fasta(name + "_proteins.fa")
    with open(name + "_proteins_circ.fa", 'w') as protein_output:
        for i in proteins:
            contig_name = i[0].split()[0].rsplit("_", 1)[0][1:]
            gene_start = i[0].split("#")[1]
            if int(gene_start.strip()) < int(
                (contig_len_circ[contig_name][0])):
                protein_output.write(i[0] + "\n")
                protein_output.write(i[1] + "\n")

    # HMM search

    _print_timestamp()
    print("HMM domains prediction...")
    res = os.system("hmmsearch  --noali --cut_nc  -o " + name +
                    "_out_pfam --domtblout " + name + "_domtblout --cpu " +
                    threads + " " + hmm + " " + name + "_proteins_circ.fa")
    if res != 0:
        print("hmmsearch run failed")
        sys.exit(1)

    _print_timestamp()

    print("Parsing...")
    tblout_pfam = name + "_domtblout"

    feature_table = get_table_from_tblout(tblout_pfam)
    feature_table = [i.strip().split(' ', 1) for i in feature_table]

    with open(name + '_feature_table.txt', 'w') as output:
        writer = csv.writer(output, lineterminator='\n')
        writer.writerows(feature_table)

    feature_table_names = [row[0] for row in feature_table]
    feature_table_genes = [row[1] for row in feature_table]

    _print_timestamp()

    print("Classification...")
    k = naive_bayes(feature_table_genes)

    names_result = {}
    for idx in range(0, len(k)):
        names_result[feature_table_names[idx]] = [
            k[idx][0], k[idx][3], feature_table_genes[idx]
        ]

    if args.db:
        #run blast
        _print_timestamp()
        print("Running BLAST...")

        os.system("blastn  -query " + args.f + " -db " + blastdb +
                  " -evalue 0.0001 -outfmt 5 -out " + name +
                  ".xml -num_threads " + threads + " -num_alignments 50")
        _print_timestamp()
        print("Parsing BLAST")
        parser(name + ".xml", outdir)
        _print_timestamp()

        #### add blast results
        # The plasmid file was historically cleaned with strip(), the others
        # with rstrip(); that distinction is kept.
        plasmids_list = _read_blast_hits(name + "_plasmid.names",
                                         strip_leading=True)
        chrom_list = _read_blast_hits(name + "_chromosome.names")
        vir_list = _read_blast_hits(name + "_viruses.names")
        nos_list = _read_first_tokens(name + "_no_significant.names")
        other_list = _read_first_tokens(name + "_other.names")

    final_table = collections.OrderedDict()
    for i in ids:
        length = contig_len_circ[i][0]
        circular = contig_len_circ[i][1]
        # Long or circular contigs are never reported as "too short".
        promoted = (length > 3000) or (circular == "+")

        has_features = i in names_result
        if has_features:
            if names_result[i][0] == "Uncertain - too short" and promoted:
                names_result[i][0] = "Uncertain - viral or bacterial"
            base_row = [
                names_result[i][0], length, circular, names_result[i][1],
                names_result[i][2]
            ]
        else:
            if promoted:
                label = "Uncertain - viral or bacterial"
            else:
                label = "Uncertain - too short"
            base_row = [label, length, circular, "-", "-"]

        if not args.db:
            # Without BLAST, contigs lacking HMM features get a single "-"
            # placeholder column (four columns in total).
            final_table[i] = base_row if has_features else base_row[:4]
            continue

        # Categories are checked in a fixed order; a later match overwrites an
        # earlier one. A contig in no category gets no row at all.
        if i in plasmids_list:
            hit = plasmids_list[i]
            if has_features:
                # Only this combination rounds the two numeric values.
                final_table[i] = base_row + [
                    "Plasmid", round(hit[0], 2), round(hit[1], 2), hit[2]
                ]
            else:
                final_table[i] = base_row + ["Plasmid"] + list(hit)
        if i in chrom_list:
            final_table[i] = base_row + ["Chromosome"] + list(chrom_list[i])
        if i in vir_list:
            final_table[i] = base_row + ["Virus"] + list(vir_list[i])
        if i in nos_list:
            final_table[i] = base_row + ["Non-significant"]
        if i in other_list:
            # other_list holds bare contig IDs, so there are no hit values to
            # append (indexing it by contig ID used to raise TypeError).
            final_table[i] = base_row + ["Other"]

    result_file = name + "_result_table.csv"
    with open(result_file, 'w') as output:
        writer = csv.writer(output, lineterminator='\n')
        for i in final_table:
            writer.writerow([i] + final_table[i])

    fasta_dir = outdir + "/Prediction_results_fasta/"
    if not os.path.exists(fasta_dir):
        os.mkdir(fasta_dir)

    fasta_prefix = fasta_dir + name_file
    with open(fasta_prefix + "_virus.fasta", "w") as vir_file, \
            open(fasta_prefix + "_plasmid.fasta", "w") as plasmid_file, \
            open(fasta_prefix + "_chromosome.fasta", "w") as chrom_file, \
            open(fasta_prefix + "_virus_uncertain.fasta", "w") as vc_file, \
            open(fasta_prefix + "_plasmid_uncertain.fasta", "w") as pc_file:
        contigs = fastaparser.read_fasta(args.f)
        for i in contigs:
            contig_name = i[0].split(" ")[0][1:]
            row = final_table.get(contig_name)
            if row is None:
                # Contig without a result row (e.g. no BLAST category).
                continue

            label = row[0]
            if label == "Virus":
                target = vir_file
            elif label == "Chromosome":
                target = chrom_file
            elif label == "Plasmid":
                # Without -p, plasmids are reported with the chromosomes.
                target = plasmid_file if args.p else chrom_file
            elif label == "Uncertain - viral or bacterial":
                target = vc_file
            elif label == "Uncertain - plasmid or chromosomal":
                target = pc_file if args.p else chrom_file
            else:
                continue

            target.write(i[0] + "\n")
            target.write(i[1] + "\n")

    # Remove the (empty) plasmid outputs only after the files are closed.
    if not args.p:
        os.remove(fasta_prefix + "_plasmid.fasta")
        os.remove(fasta_prefix + "_plasmid_uncertain.fasta")

    print("Done!")
    _print_timestamp()
    print("Verification results can be found in " +
          os.path.abspath(result_file))
Exemple #24
0
############################################################################
# Copyright (c) 2015 Saint Petersburg State University
# Copyright (c) 2011-2014 Saint Petersburg Academic University
# All Rights Reserved
# See file LICENSE for details.
############################################################################

import sys
import os

# Deletes reverse-complementary duplicates and simple duplicates from FASTA-file

sys.path.append(os.path.join(os.path.abspath(sys.path[0]), '..'))

import fastaparser

# Expects one argument (the input FASTA); the de-duplicated records are passed
# to fastaparser.write_fasta.
if len(sys.argv) < 2:
    # Same text as before ("Usage <script> in.fasta > out.fasta"), written in
    # a form that is valid on both Python 2 and Python 3.
    sys.stdout.write('Usage ' + sys.argv[0] + ' in.fasta > out.fasta\n')
    sys.exit(1)

fastafilename = sys.argv[1]
fasta = fastaparser.read_fasta(fastafilename)

# fasta_res maps each kept sequence to its name and is used for duplicate
# look-ups; unique_records keeps the surviving records in input order so the
# output no longer depends on dictionary ordering.
fasta_res = {}
unique_records = []

for name, seq in fasta:
    # A record is a duplicate if the same sequence, or its reverse complement,
    # has already been kept.
    if (seq not in fasta_res) and (fastaparser.rev_comp(seq) not in fasta_res):
        fasta_res[seq] = name
        unique_records.append((name, seq))

fastaparser.write_fasta(unique_records)
Exemple #25
0
def main_utils():
    """Run the whole quality-assessment pipeline for the command-line arguments.

    The function takes no parameters and returns nothing; everything is driven
    by ``sys.argv`` (through ``UtilsPipeline.get_arguments``) and by names that
    are not assigned inside this function and therefore must exist at module
    level: ``logger``, ``rquast_dirpath``, ``PRECISION``, ``rqconfig`` and the
    ``Utils*`` / report / metrics modules.

    Steps, in the order they appear below:

    1. parse arguments and build coverage / alignment thresholds;
    2. with ``args.test`` set, run on the test data and exit the process;
    3. create the output, ``tmp`` and ``logs`` directories and set up logging;
    4. load the reference FASTA and the gene annotation (GTF or gene database);
    5. optionally build the reads coverage object;
    6. load the transcripts and, if no alignment was supplied, align them with
       BLAT or GMAP;
    7. optionally build a BLAST database of annotated isoforms;
    8. for every transcripts file compute metrics and a separated report;
    9. build the comparison and short reports, remove ``tmp`` unless
       ``args.debug`` is set, and finish logging.
    """
    # Script path without its extension.
    # NOTE(review): rfind() returns -1 when there is no '.', in which case the
    # slice drops the last character instead of keeping the whole name.
    program_name = sys.argv[0][:sys.argv[0].rfind('.')]

    # parse running string of main program and get all arguments:
    args = UtilsPipeline.get_arguments()

    WELL_FULLY_COVERAGE_THRESHOLDS = rqconfig.well_fully_coverage_thresholds(
        args.lower_threshold, args.upper_threshold)

    ALIGNMENT_THRESHOLDS = rqconfig.alignment_thresholds()

    # run rnaQUAST on test_data:
    if args.test:
        UtilsPipeline.run_rnaQUAST_on_test_data(args, rquast_dirpath,
                                                program_name)
        # UtilsPipeline.run_rnaQUAST_on_debug_data(args, rquast_dirpath, program_name)
        sys.exit()

    UtilsPipeline.get_abspath_input_data(args)

    # create output directory:
    args.output_dir = UtilsPipeline.create_output_folder(
        args.output_dir, program_name)
    # create temporary directory:
    tmp_dir = UtilsPipeline.create_empty_folder(
        os.path.join(args.output_dir, 'tmp'))
    # create directory for log files:
    log_dir = UtilsPipeline.create_empty_folder(
        os.path.join(args.output_dir, 'logs'))

    # SET LOGGER:
    if args.debug:
        rqconfig.debug = True
        logger.set_up_console_handler(debug=True)
    else:
        logger.set_up_console_handler()
    logger.set_up_file_handler(log_dir)
    logger.print_command_line([os.path.realpath(__file__)] + sys.argv[1:],
                              wrap_after=None)
    logger.start(args.blat, tmp_dir)

    UtilsPipeline.get_input_data_exist_error(args, logger)

    # THREADING:
    args.threads = UtilsPipeline.get_num_threads(args.threads, logger)

    if args.meta:
        logger.info(
            '\nYOU RUN QUALITY ASSESSMENT FOR METATRANSCRIPTOME ASSEMBLIES')

    # GET segregate FILES:
    # References and gene databases are expected to come in equal numbers.
    if args.reference and args.gtf and len(args.reference) != len(args.gtf):
        logger.error('Numbers of references and gene databases are different',
                     exit_with_code=1)

    # From here on args.reference / args.gtf hold whatever get_single_file
    # returns (presumably one path or None - confirm against UtilsPipeline).
    args.reference = \
        UtilsPipeline.get_single_file(args.reference, tmp_dir, 'reference', rqconfig.list_ext_fa, args.meta, logger)

    args.gtf = \
        UtilsPipeline.get_single_file(args.gtf, tmp_dir, 'gene_database', rqconfig.list_ext_gtf, args.meta, logger)

    # READ REFERENCE FROM MULTIFASTA:
    reference_dict = None
    ids_chrs = None
    if args.reference is not None:
        logger.print_timestamp()
        logger.info('Getting reference...')
        reference_dict = UtilsGeneral.list_to_dict(
            fastaparser.read_fasta(args.reference))
        logger.info('Done.')

        # NOTE(review): genome_len is bound only inside this branch, yet it is
        # passed to ReadsCoverage below whenever args.reads_alignment is set;
        # a run without a reference would raise NameError there.
        genome_len = UtilsGeneral.get_genome_len(reference_dict)

        ids_chrs = reference_dict.keys()

        # correction for fasta contained Y, W and etc:
        # for id_chr in ids_chrs:
        #     reference_dict[id_chr] = UtilsGeneral.correct_nucl_seq(reference_dict[id_chr])

    # for strand specific data we store + and - keys in dictionaries and only + for non strand specific data:
    strands = UtilsGeneral.get_strands(args, logger)

    if args.prokaryote:
        type_organism = 'prokaryotes'
    else:
        type_organism = 'eukaryotes'

    # USE ANNOTATION:
    sqlite3_db_genes = None
    sorted_exons_attr = None
    db_genes_metrics = None
    type_genes, type_isoforms, type_exons = \
        UtilsAnnotations.default_type_genes, \
        UtilsAnnotations.default_type_isoforms, \
        UtilsAnnotations.default_type_exons

    if args.gtf is not None or args.gene_db is not None:
        # label_db is the annotation file name cut at its '.db' / '.g' suffix.
        # NOTE(review): rfind() returns -1 when the suffix is absent, which
        # silently drops the last character of the name.
        if args.gene_db is not None:
            gene_db_name = os.path.split(args.gene_db)[1]
            label_db = gene_db_name[:gene_db_name.rfind('.db')]
        else:
            gtf_name = os.path.split(args.gtf)[1]
            label_db = gtf_name[:gtf_name.rfind('.g')]

            if ids_chrs is not None:
                args.gtf = UtilsAnnotations.clear_gtf_by_reference_chr(
                    args.gtf, ids_chrs, tmp_dir, label_db, logger)

        sqlite3_db_genes = \
            UtilsAnnotations.create_sqlite3_db(args.gene_db, args.gtf, label_db,
                                               args.disable_infer_genes, args.disable_infer_transcripts,
                                               args.output_dir, tmp_dir, logger)

        type_genes, type_isoforms, type_exons = \
            UtilsAnnotations.get_type_features(sqlite3_db_genes, UtilsAnnotations.default_type_genes,
                                               UtilsAnnotations.default_type_isoforms,
                                               UtilsAnnotations.default_type_exons, args.prokaryote, logger)

        # if UtilsAnnotations.default_type_exons == type_exons:
        #     type_organism = 'eukaryotes'
        # else:
        #     type_organism = 'prokaryotes'

        db_genes_metrics = GeneDatabaseMetrics.GeneDatabaseMetrics(
            sqlite3_db_genes, type_genes, type_isoforms, logger)

        # The alignment threshold is derived from the longest annotated intron
        # plus a margin of 100.
        ALIGNMENT_THRESHOLDS.ERR_SPACE_TARGET_FAKE_BLAT = db_genes_metrics.max_intron_len + 100
        logger.info(
            '\nSets maximum intron size equal {}. Default is 1500000 bp.\n'.
            format(ALIGNMENT_THRESHOLDS.ERR_SPACE_TARGET_FAKE_BLAT))

        # set exons starts / ends and ids for binning strategy:
        if ids_chrs is not None:
            sorted_exons_attr = \
                SortedExonsAttributes.SortedExonsAttributes(sqlite3_db_genes, type_exons, strands, ids_chrs, reference_dict, logger)

    # Reads coverage is built either from a ready reads alignment, or from raw
    # reads when both a reference and a gene database are available.
    # NOTE(review): when only args.reads_alignment is given, db_genes_metrics
    # may still be None here, so `.tot_isoforms_len` would raise
    # AttributeError (and genome_len may be unbound, see above) - confirm that
    # argument validation rules this combination out.
    reads_coverage = None
    if args.reads_alignment is not None or \
            ((args.single_reads is not None or (args.left_reads is not None and args.right_reads is not None))
             and args.reference is not None and sqlite3_db_genes is not None):
        reads_coverage = \
            ReadsCoverage.ReadsCoverage(args.reads_alignment, args.tophat, args.reference, args.single_reads,
                                        args.left_reads, args.right_reads, reference_dict, sqlite3_db_genes, type_isoforms,
                                        sorted_exons_attr, args.strand_specific, db_genes_metrics.tot_isoforms_len,
                                        genome_len, tmp_dir, args.threads, WELL_FULLY_COVERAGE_THRESHOLDS, logger, log_dir)

    if args.transcripts is not None:
        # GET TRANSCRIPTS:
        transcripts_dicts = []
        for i_transcripts in range(len(args.transcripts)):
            logger.print_timestamp('  ')
            logger.info('  Getting transcripts from {}...'.format(
                args.transcripts[i_transcripts]))
            transcripts_dicts.append(
                UtilsGeneral.list_to_dict(
                    fastaparser.read_fasta(args.transcripts[i_transcripts])))
            logger.info('  Done.')

        # get labels for folders names and names of transcripts in reports:
        all_labels_from_dirs = False
        if args.labels is None:
            args.labels = UtilsPipeline.process_labels(args.transcripts,
                                                       args.labels,
                                                       all_labels_from_dirs)
    else:
        logger.warning('No transcripts. Use --transcripts option.')

    # GET PSL ALIGNMENT FILE:
    # Align only when the user did not supply an alignment; BLAT gets the
    # parsed transcripts, GMAP gets the transcripts file paths.
    if args.alignment is None and args.reference is not None and args.transcripts is not None:
        if args.blat:
            args.alignment = UtilsTools.run_blat(None, args.reference,
                                                 transcripts_dicts,
                                                 args.labels, args.threads,
                                                 tmp_dir, logger, log_dir)
        else:
            args.alignment = UtilsTools.run_gmap(args.reference, genome_len,
                                                 args.transcripts, args.labels,
                                                 args.threads, args.gmap_index,
                                                 tmp_dir, logger, log_dir)

        #if args.fusion_misassemble_analyze:
        #    if not (args.left_reads is not None and args.right_reads is not None):
        #        logger.error('Usage: --left_reads LEFT_READS --right RIGHT_READS for analyse fusions and misassemblies',
        #                     exit_with_code=2, to_stderr=True)
        #        sys.exit(2)

    # FOR MISASSEMBLIES SEARCH:
    # GET DATABASE FOR FA ISOFORMS:
    # args.blast records whether a usable blastn was found; it is reused below
    # to decide whether BLAST-based misassembly metrics are computed.
    args.blast = False
    if args.reference is not None and sqlite3_db_genes is not None and args.alignment is not None:
        # Prefer a blastn binary located next to rnaQUAST, else rely on PATH.
        blastn_run = os.path.join(rqconfig.rnaQUAST_LOCATION, '.', 'blastn')
        if not os.path.isfile(blastn_run):
            blastn_run = "blastn"

        if UtilsGeneral.which(blastn_run) is None:
            logger.warning(
                'blastn not found! Please add blastn to PATH for better MISASSEMBLIES metrics.'
            )
        else:
            args.blast = True

            isoforms_fa_path = os.path.join(tmp_dir,
                                            '{}.isoforms.fa'.format(label_db))
            isoforms_list = UtilsGeneral.dict_to_list(
                UtilsAnnotations.get_fa_isoforms(sqlite3_db_genes,
                                                 type_isoforms, type_exons,
                                                 reference_dict, logger))
            fastaparser.write_fasta(isoforms_fa_path, isoforms_list)

            isoforms_blast_db = UtilsTools.get_blast_db(
                isoforms_fa_path, label_db, tmp_dir, logger, log_dir)

    # LOGGING INPUT DATA:
    logger.print_input_files(args)

    # INITIALIZATION TRANSCRIPTS METRICS AND REPORTS:
    transcripts_metrics = []
    separated_reports = []
    if args.transcripts is not None:
        alignments_reports = []
        blast_alignments = []
        for i_transcripts in range(len(args.transcripts)):
            # INITIALIZE TRANSCRIPTS METRICS:
            #if args.sam_file is not None:
            #    sam_file_tmp = args.sam_file[i_transcripts]
            #else:
            transcripts_metrics.append(
                TranscriptsMetrics.TranscriptsMetrics(
                    args, args.labels[i_transcripts]))

            # INITIALIZE SEPARATED REPORTS:
            separated_reports.append(
                SeparatedReport.SeparatedReport(
                    args.labels[i_transcripts], args.output_dir,
                    transcripts_metrics[i_transcripts],
                    WELL_FULLY_COVERAGE_THRESHOLDS))
            # The bare string literal below is disabled code (a joblib-based
            # parallel variant of this loop); it is evaluated and discarded.
            '''from joblib import Parallel, delayed

            n = len(args.transcripts)
            run_n = n / args.threads
            for i_run in range(run_n):
                tmp = Parallel(n_jobs=args.threads)(delayed(process_one_trascripts_file)(args, i_transcripts, reference_dict, annotation_dict,
                                                                                              annotated_exons, annotated_isoforms, strands, transcripts_metrics,
                                                                                              basic_isoforms_metrics, separated_reports)
                                                         for i_transcripts in range(i_run * args.threads, args.threads * (i_run + 1), 1))
                for i in range(args.threads):
                    i_transcripts = i + i_run * args.threads
                    transcripts_metrics[i_transcripts] = tmp[i][0]
                    separated_reports[i_transcripts] = tmp[i][1]

            if n - run_n * args.threads != 0:
                tmp = Parallel(n_jobs=n - run_n * args.threads)(delayed(process_one_trascripts_file)(args, i_transcripts, reference_dict, annotation_dict,
                                                                                                     annotated_exons, annotated_isoforms, strands, transcripts_metrics,
                                                                                                     basic_isoforms_metrics, separated_reports)
                                                                for i_transcripts in range(run_n * args.threads, n, 1))
                for i in range(n - run_n * args.threads):
                    i_transcripts = i + run_n * args.threads
                    transcripts_metrics[i_transcripts] = tmp[i][0]
                    separated_reports[i_transcripts] = tmp[i][1]'''

            logger.info()
            logger.info('Processing transcripts from {}:'.format(
                args.transcripts[i_transcripts]))

            # blast_alignments always gets one entry per transcripts file so
            # that it can be indexed by i_transcripts.
            if args.blast:
                blast_alignments.append\
                    (UtilsTools.align_transcripts_to_isoforms_by_blastn
                     (args.transcripts[i_transcripts], isoforms_blast_db, tmp_dir, args.labels[i_transcripts], logger, log_dir))
            else:
                blast_alignments.append(None)

            # PROCESS TRANSCRIPTS ALIGNMENTS:
            # NOTE(review): alignments_reports is appended to only inside this
            # branch but indexed by i_transcripts; if the branch were skipped
            # for an earlier file the indices would no longer line up.
            if transcripts_metrics[i_transcripts].simple_metrics is not None:
                # GET FILES WITH ALIGNMENTS REPORTS:
                alignments_reports.append\
                    (UtilsAlignment.AlignmentsReport.get_alignments_report
                     (args.labels[i_transcripts], args.alignment[i_transcripts], blast_alignments[i_transcripts],
                      transcripts_dicts[i_transcripts], tmp_dir, args.min_alignment, logger, ALIGNMENT_THRESHOLDS))

                # UPDATE METRICS BY ASSEMBLED TRANSCRIPTS:
                transcripts_metrics[i_transcripts].processing_assembled_psl_file\
                    (alignments_reports[i_transcripts].blat_report.assembled_psl_file, sorted_exons_attr,
                     args.strand_specific, logger, sqlite3_db_genes, type_isoforms, WELL_FULLY_COVERAGE_THRESHOLDS)

                # UPDATE METRICS BY MISASSEMBLED TRANSCRIPTS:
                # by blat:
                transcripts_metrics[i_transcripts].processing_misassembled_psl_file\
                    (alignments_reports[i_transcripts].blat_report.misassembled_psl_union_file, logger, True)
                # by blast:
                if args.blast:
                    transcripts_metrics[i_transcripts].processing_misassembled_psl_file\
                        (alignments_reports[i_transcripts].blast6_report.misassembled_blast6_union_file, logger, False)

            # GET METRICS:
            transcripts_metrics[i_transcripts].get_transcripts_metrics\
                (args, type_organism, reference_dict, args.transcripts[i_transcripts], transcripts_dicts[i_transcripts],
                 args.labels[i_transcripts], args.threads, sqlite3_db_genes, db_genes_metrics, reads_coverage, logger,
                 tmp_dir, log_dir, WELL_FULLY_COVERAGE_THRESHOLDS, rqconfig.TRANSCRIPT_LENS)

            # GET SEPARATED REPORT:
            separated_reports[i_transcripts].get_separated_report\
                (args, args.labels[i_transcripts], transcripts_dicts[i_transcripts], transcripts_metrics[i_transcripts],
                 db_genes_metrics, reads_coverage, logger, WELL_FULLY_COVERAGE_THRESHOLDS, PRECISION, rqconfig.TRANSCRIPT_LENS)

    # GET COMPARISON REPORT:
    # NOTE(review): `!= 1` is also true when there are no separated reports at
    # all (no transcripts given), so a comparison report is built then too.
    comparison_report = None
    if len(separated_reports) != 1:
        comparison_report = ComparisonReport.ComparisonReport()
        comparison_report.get_comparison_report(
            args, args.output_dir, args.labels, transcripts_metrics,
            db_genes_metrics, reads_coverage, logger,
            WELL_FULLY_COVERAGE_THRESHOLDS, PRECISION,
            rqconfig.TRANSCRIPT_LENS)

    # GET SHORT REPORT:
    short_report = \
        ShortReport.ShortReport(args, db_genes_metrics, transcripts_metrics, args.output_dir, separated_reports,
                                comparison_report, logger, WELL_FULLY_COVERAGE_THRESHOLDS, PRECISION,
                                rqconfig.TRANSCRIPT_LENS)

    # REMOVE TEMPORARY DIRECTORY FROM OUTPUT DIRECTORY:
    # The temporary directory is kept in debug mode.
    if os.path.exists(tmp_dir) and not args.debug:
        logger.debug('Remove temporary directory {}'.format(tmp_dir))
        shutil.rmtree(tmp_dir)
        logger.debug('Done.')

    # LOGGING RESULTS PATHS:
    logger.print_path_results(args, separated_reports, comparison_report,
                              short_report)

    if args.debug:
        UtilsGeneral.profile_memory(args, reference_dict, db_genes_metrics,
                                    transcripts_metrics, separated_reports,
                                    comparison_report, logger)

    # FINISH LOGGING:
    logger.finish_up()
Exemple #26
0
def do(ref_fpath, contigs_fpaths, output_dirpath, json_output_dir, results_dir):
    """Compute basic statistics for every assembly and publish them.

    For each file in ``contigs_fpaths`` the contig lengths and the number of
    'N' characters are collected, then N50/L50, N75/L75 (and the NG/LG
    variants when a reference length is known), total length and GC content
    are stored in the assembly's report obtained from ``reporting.get``.

    Parameters:
        ref_fpath: path to the reference FASTA, or a false value if absent.
            Without it ``qconfig.estimated_reference_size`` is used as the
            reference length when set.
        contigs_fpaths: paths of the assembly FASTA files.
        output_dirpath: directory for the plots; created when missing.
        json_output_dir: when true, lengths and GC info are saved there
            through ``json_saver``.
        results_dir: directory handed to ``html_saver`` when
            ``qconfig.html_report`` is set.

    Returns nothing; results are written to reports, JSON/HTML and plots.
    """
    logger.print_timestamp()
    logger.info("Running Basic statistics processor...")

    if not os.path.isdir(output_dirpath):
        os.mkdir(output_dirpath)

    reference_length = None
    if ref_fpath:
        reference_length = sum(fastaparser.get_lengths_from_fastafile(ref_fpath))
        reference_GC, reference_GC_distribution = GC_content(ref_fpath)

        logger.info('  Reference genome:')
        logger.info('    ' + os.path.basename(ref_fpath) + ', Reference length = ' + str(reference_length) + ', Reference GC % = ' + '%.2f' % reference_GC)
    elif qconfig.estimated_reference_size:
        reference_length = qconfig.estimated_reference_size
        logger.info('  Estimated reference length = ' + str(reference_length))

    if reference_length:
        # Saving the reference in JSON
        if json_output_dir:
            json_saver.save_reference_length(json_output_dir, reference_length)

        # Saving for an HTML report
        if qconfig.html_report:
            from libs.html_saver import html_saver
            html_saver.save_reference_length(results_dir, reference_length)

    logger.info('  Contig files: ')
    lists_of_lengths = []
    numbers_of_Ns = []
    for index, contigs_fpath in enumerate(contigs_fpaths):
        assembly_label = qutils.label_from_fpath(contigs_fpath)

        logger.info('    ' + qutils.index_to_str(index) + assembly_label)
        # One pass over the file yields both the lengths and the N count.
        list_of_length = []
        number_of_Ns = 0
        for _, seq in fastaparser.read_fasta(contigs_fpath):
            list_of_length.append(len(seq))
            number_of_Ns += seq.count('N')

        lists_of_lengths.append(list_of_length)
        numbers_of_Ns.append(number_of_Ns)

    # saving lengths to JSON
    if json_output_dir:
        json_saver.save_contigs_lengths(json_output_dir, contigs_fpaths, lists_of_lengths)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_contigs_lengths(results_dir, contigs_fpaths, lists_of_lengths)

    ########################################################################

    logger.info('  Calculating N50 and L50...')

    list_of_GC_distributions = []
    import N50
    for index, (contigs_fpath, lengths_list, number_of_Ns) in enumerate(zip(contigs_fpaths, lists_of_lengths, numbers_of_Ns)):
        report = reporting.get(contigs_fpath)
        n50, l50 = N50.N50_and_L50(lengths_list)
        n75, l75 = N50.N50_and_L50(lengths_list, 75)
        ng50, lg50 = None, None
        ng75, lg75 = None, None
        if reference_length:
            ng50, lg50 = N50.NG50_and_LG50(lengths_list, reference_length)
            ng75, lg75 = N50.NG50_and_LG50(lengths_list, reference_length, 75)
        total_length = sum(lengths_list)
        total_GC, GC_distribution = GC_content(contigs_fpath)
        list_of_GC_distributions.append(GC_distribution)

        # A GC content of exactly 0.0 is a valid value, so only None means
        # "undefined"; the same formatted value feeds both the log line and
        # the report field.
        if total_GC is not None:
            gc_formatted = '%.2f' % total_GC
        else:
            gc_formatted = None

        # An assembly without any bases has no N's either; report 0 instead of
        # dividing by zero.
        if total_length:
            ns_per_100kbp = float(number_of_Ns) * 100000.0 / float(total_length)
        else:
            ns_per_100kbp = 0.0

        logger.info('    ' + qutils.index_to_str(index) +
                    qutils.label_from_fpath(contigs_fpath) +
                    ', N50 = ' + str(n50) +
                    ', L50 = ' + str(l50) +
                    ', Total length = ' + str(total_length) +
                    ', GC % = ' + (gc_formatted if gc_formatted is not None else 'undefined') +
                    ', # N\'s per 100 kbp = ' + ' %.2f' % ns_per_100kbp)

        report.add_field(reporting.Fields.N50, n50)
        report.add_field(reporting.Fields.L50, l50)
        if reference_length:
            report.add_field(reporting.Fields.NG50, ng50)
            report.add_field(reporting.Fields.LG50, lg50)
        report.add_field(reporting.Fields.N75, n75)
        report.add_field(reporting.Fields.L75, l75)
        if reference_length:
            report.add_field(reporting.Fields.NG75, ng75)
            report.add_field(reporting.Fields.LG75, lg75)
        report.add_field(reporting.Fields.CONTIGS, len(lengths_list))
        # max() raises on an empty sequence, so an empty assembly reports 0.
        report.add_field(reporting.Fields.LARGCONTIG, max(lengths_list) if lengths_list else 0)
        report.add_field(reporting.Fields.TOTALLEN, total_length)
        report.add_field(reporting.Fields.GC, gc_formatted)
        report.add_field(reporting.Fields.UNCALLED, number_of_Ns)
        report.add_field(reporting.Fields.UNCALLED_PERCENT, '%.2f' % ns_per_100kbp)
        if ref_fpath:
            report.add_field(reporting.Fields.REFLEN, int(reference_length))
            report.add_field(reporting.Fields.REFGC, '%.2f' % reference_GC)
        elif reference_length:
            report.add_field(reporting.Fields.ESTREFLEN, int(reference_length))

    if json_output_dir:
        json_saver.save_GC_info(json_output_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.html_report:
        from libs.html_saver import html_saver
        html_saver.save_GC_info(results_dir, contigs_fpaths, list_of_GC_distributions)

    if qconfig.draw_plots:
        import plotter
        ########################################################################
        # Drawing cumulative plot...
        plotter.cumulative_plot(ref_fpath, contigs_fpaths, lists_of_lengths, output_dirpath + '/cumulative_plot', 'Cumulative length')

        ########################################################################
        # Drawing GC content plot...
        # Work on a copy so that appending the reference distribution does not
        # modify list_of_GC_distributions itself.
        list_of_GC_distributions_with_ref = list(list_of_GC_distributions)
        if ref_fpath:
            list_of_GC_distributions_with_ref.append(reference_GC_distribution)
        plotter.GC_content_plot(ref_fpath, contigs_fpaths, list_of_GC_distributions_with_ref, output_dirpath + '/GC_content_plot')

        ########################################################################
        # Drawing Nx and NGx plots...
        plotter.Nx_plot(contigs_fpaths, lists_of_lengths, output_dirpath + '/Nx_plot', 'Nx', [])
        if reference_length:
            plotter.Nx_plot(contigs_fpaths, lists_of_lengths, output_dirpath + '/NGx_plot', 'NGx', [reference_length] * len(contigs_fpaths))

    logger.info('Done.')
Exemple #27
0
def _region_label(region):
    """Return the region's id as text, or '# <number + 1>' when its id is None."""
    region_id = str(region.id)
    if region_id == 'None':
        region_id = '# ' + str(region.number + 1)
    return region_id


def process_single_file(contigs_fpath, index, nucmer_path_dirpath,
                        genome_stats_dirpath, reference_chromosomes,
                        genes_container, operons_container):
    """Compute genome coverage, gaps and found genes/operons for one assembly.

    The Nucmer coords file of the assembly (``<label>.coords`` or, unless
    ``qconfig.use_all_alignments`` is set, ``<label>.coords.filtered``) is
    parsed to mark which reference positions are covered by alignments. From
    that the function writes ``<label>_gaps.txt`` and, for each non-empty
    feature container, ``<label>_genes.txt`` / ``<label>_operons.txt`` into
    ``genome_stats_dirpath``.

    Parameters:
        contigs_fpath: path to the assembly FASTA file.
        index: position of the assembly, used only for log prefixes.
        nucmer_path_dirpath: directory holding the coords files.
        genome_stats_dirpath: directory for the output text files.
        reference_chromosomes: mapping chromosome name -> length.
        genes_container, operons_container: objects exposing ``region_list``
            and ``chr_names_dict``.

    Returns:
        ``None`` when the coords file is missing or names a chromosome that is
        not in ``reference_chromosomes``; otherwise the tuple
        ``(ref_lengths, (results, genes_in_contigs, operons_in_contigs))``
        where ``ref_lengths`` maps chromosome name to the number of covered
        positions.
    """
    assembly_label = qutils.label_from_fpath_for_fname(contigs_fpath)
    results = dict()
    ref_lengths = {}
    logger.info('  ' + qutils.index_to_str(index) + assembly_label)

    nucmer_base_fpath = os.path.join(nucmer_path_dirpath,
                                     assembly_label + '.coords')
    if qconfig.use_all_alignments:
        nucmer_fpath = nucmer_base_fpath
    else:
        nucmer_fpath = nucmer_base_fpath + '.filtered'

    if not os.path.isfile(nucmer_fpath):
        logger.error('Nucmer\'s coords file (' + nucmer_fpath +
                     ') not found! Try to restart QUAST.',
                     indent='  ')
        return None

    # One 0/1 flag per reference position; positions are addressed 1-based,
    # hence the extra leading element.
    genome_mapping = {}
    for chr_name, chr_len in reference_chromosomes.items():
        genome_mapping[chr_name] = [0] * (chr_len + 1)

    # list of FASTA entries (in tuples: name, seq), longest contig first
    contig_tuples = fastaparser.read_fasta(contigs_fpath)
    contig_tuples = sorted(contig_tuples,
                           key=lambda contig: len(contig[1]),
                           reverse=True)
    sorted_contigs_names = [name for (name, seq) in contig_tuples]

    # for cumulative plots: i-th element is the number of genes in i-th contig
    genes_in_contigs = [0] * len(sorted_contigs_names)
    operons_in_contigs = [0] * len(sorted_contigs_names)
    # for gene finding: contig_name --> list of AlignedBlock
    aligned_blocks_by_contig_name = {}
    for name in sorted_contigs_names:
        aligned_blocks_by_contig_name[name] = []

    # EXAMPLE:
    #    [S1]     [E1]  |     [S2]     [E2]  |  [LEN 1]  [LEN 2]  |  [% IDY]  | [TAGS]
    #=====================================================================================
    #  338980   339138  |     2298     2134  |      159      165  |    79.76  | gi|48994873|gb|U00096.2|	NODE_0_length_6088
    #  374145   374355  |     2306     2097  |      211      210  |    85.45  | gi|48994873|gb|U00096.2|	NODE_0_length_6088

    # The `with` block guarantees the coords file is closed on every exit
    # path, including the early return for an unknown chromosome name.
    with open(nucmer_fpath, 'r') as coordfile:
        # Skip the header up to and including the '=====' separator line.
        for line in coordfile:
            if line.startswith('='):
                break

        for line in coordfile:
            if line.strip() == '':
                break
            columns = line.split('|')
            ref_coords = columns[0].split()
            contig_coords = columns[1].split()
            s1 = int(ref_coords[0])
            e1 = int(ref_coords[1])
            s2 = int(contig_coords[0])
            e2 = int(contig_coords[1])
            tokens = line.split()
            contig_name = tokens[12].strip()
            chr_name = tokens[11].strip()

            if chr_name not in genome_mapping:
                logger.error("Something went wrong and chromosome names in your coords file (" + nucmer_base_fpath + ") " \
                             "differ from the names in the reference. Try to remove the file and restart QUAST.")
                return None

            aligned_blocks_by_contig_name[contig_name].append(
                AlignedBlock(seqname=chr_name, start=s1, end=e1))
            chr_mapping = genome_mapping[chr_name]
            if s2 == 0 and e2 == 0:
                # special case: circular genome, contig starts on the end of a
                # chromosome and ends in the beginning
                for i in range(s1, len(chr_mapping)):
                    chr_mapping[i] = 1
                for i in range(1, e1 + 1):
                    chr_mapping[i] = 1
            else:  # if s1 <= e1:
                for i in range(s1, e1 + 1):
                    chr_mapping[i] = 1

    # counting genome coverage and gaps number
    covered_bp = 0
    gaps_count = 0
    gaps_fpath = os.path.join(genome_stats_dirpath,
                              assembly_label + '_gaps.txt')
    with open(gaps_fpath, 'w') as gaps_file:
        for chr_name, chr_len in reference_chromosomes.items():
            gaps_file.write(str(chr_name) + '\n')
            chr_mapping = genome_mapping[chr_name]
            cur_gap_size = 0
            aligned_len = 0
            for i in range(1, chr_len + 1):
                if chr_mapping[i] == 1:
                    # A covered position closes the current gap, if it is
                    # long enough to be reported.
                    if cur_gap_size >= qconfig.min_gap_size:
                        gaps_count += 1
                        gaps_file.write('%d %d\n' % (i - cur_gap_size, i - 1))
                    aligned_len += 1
                    covered_bp += 1
                    cur_gap_size = 0
                else:
                    cur_gap_size += 1
            ref_lengths[chr_name] = aligned_len
            # A gap may run up to the very end of the chromosome.
            if cur_gap_size >= qconfig.min_gap_size:
                gaps_count += 1
                gaps_file.write('%d %d\n' % (chr_len - cur_gap_size + 1, chr_len))

    results["covered_bp"] = covered_bp
    results["gaps_count"] = gaps_count

    # finding genes and operons
    for container, feature_in_contigs, field, suffix in [
        (genes_container, genes_in_contigs, reporting.Fields.GENES,
         '_genes.txt'),
        (operons_container, operons_in_contigs, reporting.Fields.OPERONS,
         '_operons.txt')
    ]:

        if not container.region_list:
            results[field + "_full"] = None
            results[field + "_partial"] = None
            continue

        total_full = 0
        total_partial = 0
        found_fpath = os.path.join(genome_stats_dirpath,
                                   assembly_label + suffix)

        # 0 - gene is not found,
        # 1 - gene is found,
        # 2 - part of gene is found
        found_list = [0] * len(container.region_list)

        with open(found_fpath, 'w') as found_file:
            found_file.write('%s\t\t%s\t%s\t%s\n' % ('ID or #', 'Start', 'End',
                                                     'Type'))
            found_file.write('=========================================\n')

            for i, region in enumerate(container.region_list):
                found_list[i] = 0
                for contig_id, name in enumerate(sorted_contigs_names):
                    cur_feature_is_found = False
                    for cur_block in aligned_blocks_by_contig_name[name]:
                        if container.chr_names_dict[
                                region.seqname] != cur_block.seqname:
                            continue

                        # computing circular genomes: a block whose start lies
                        # after its end is split into two blocks
                        if cur_block.start > cur_block.end:
                            blocks = [
                                AlignedBlock(seqname=cur_block.seqname,
                                             start=cur_block.start,
                                             end=region.end + 1),
                                AlignedBlock(seqname=cur_block.seqname,
                                             start=1,
                                             end=cur_block.end)
                            ]
                        else:
                            blocks = [cur_block]

                        for block in blocks:
                            if region.end <= block.start or block.end <= region.start:
                                continue
                            elif block.start <= region.start and region.end <= block.end:
                                if found_list[i] == 2:
                                    # already found as partial gene
                                    total_partial -= 1
                                found_list[i] = 1
                                total_full += 1
                                found_file.write('%s\t\t%d\t%d\tcomplete\n' % (
                                    _region_label(region), region.start,
                                    region.end))
                                # inc number of found genes/operons in
                                # id-th contig
                                feature_in_contigs[contig_id] += 1

                                cur_feature_is_found = True
                                break
                            elif found_list[i] == 0 and min(
                                    region.end, block.end) - max(
                                        region.start,
                                        block.start) >= qconfig.min_gene_overlap:
                                found_list[i] = 2
                                total_partial += 1
                        if cur_feature_is_found:
                            break
                    if cur_feature_is_found:
                        break
                # adding info about partially found genes/operons
                if found_list[i] == 2:  # partial gene/operon
                    found_file.write('%s\t\t%d\t%d\tpartial\n' % (
                        _region_label(region), region.start, region.end))

        results[field + "_full"] = total_full
        results[field + "_partial"] = total_partial

    logger.info('  ' + qutils.index_to_str(index) + 'Analysis is finished.')

    return ref_lengths, (results, genes_in_contigs, operons_in_contigs)
Exemple #28
0
############################################################################
# Copyright (c) 2015 Saint Petersburg State University
# Copyright (c) 2011-2014 Saint Petersburg Academic University
# All Rights Reserved
# See file LICENSE for details.
############################################################################

import os
import sys
import itertools
import shutil

import fastaparser

########################################################################

if len(sys.argv) != 3:
    # Same text as before; written with write() so the block runs unchanged
    # under both Python 2 and Python 3.
    sys.stdout.write('FASTA-file converter from multi-line reads to one-line ones\n')
    sys.stdout.write('Usage:  ' + sys.argv[0] + '  <input-file> <output-file>\n')
    sys.exit(0)

# The context manager closes the output file even if reading the input or
# writing a record raises.
with open(sys.argv[2], 'w') as out:
    fasta = fastaparser.read_fasta(sys.argv[1])
    for name, seq in fasta:
        # Each record becomes exactly two lines: the header and the sequence.
        out.write(name + '\n')
        out.write(seq + '\n')