Beispiel #1
0
def get_edges_from_gfa(gfa_fpath, output_dirpath, min_edge_len):
    """Write the edge sequences of a GFA graph into a FASTA file.

    The output file is placed in ``output_dirpath`` and is named after the
    FASTA file expected next to the GFA file (``get_filename(gfa_fpath)`` plus
    ``".fasta"``).  Sequences are taken from the ``S`` records of the GFA file;
    when that yields nothing, the sibling FASTA file is copied instead with
    its headers rewritten through ``get_edge_agv_id(get_edge_num(...))``.

    :param gfa_fpath: path to the GFA file; a falsy value short-circuits.
    :param output_dirpath: directory that receives the FASTA file.
    :param min_edge_len: sequences shorter than this are not written when
        extracting from the GFA file (the FASTA fallback copies everything).
    :return: path to the FASTA file, or ``None`` when ``gfa_fpath`` is falsy.
    """
    if not gfa_fpath:
        return None

    input_edges_fpath = join(dirname(gfa_fpath), get_filename(gfa_fpath) + ".fasta")
    edges_fpath = join(output_dirpath, basename(input_edges_fpath))
    if not is_empty_file(gfa_fpath) and not can_reuse(edges_fpath, files_to_check=[gfa_fpath]):
        print("Extracting edge sequences from " + gfa_fpath + "...")
        with open(edges_fpath, "w") as out, open(gfa_fpath) as f:
            for line in f:
                if not line.startswith('S'):
                    continue
                fs = line.strip().split()
                if len(fs) < 3:
                    # Truncated segment record: there is no sequence column to
                    # inspect, so skip it instead of failing on fs[2].
                    continue
                seq_name = fs[1]
                seq = None
                # The sequence is looked up in the third column first and in
                # the fourth one as a fallback.
                if is_acgt_seq(fs[2]):
                    seq = fs[2]
                elif len(fs) >= 4 and is_acgt_seq(fs[3]):
                    seq = fs[3]
                if seq and len(seq) >= min_edge_len:
                    out.write(">%s\n" % get_edge_agv_id(get_edge_num(seq_name)))
                    out.write(seq)
                    out.write("\n")
    # Fallback: nothing was extracted from the GFA file, but a FASTA file with
    # the edges sits next to it -- copy it with normalized headers.
    if is_empty_file(edges_fpath) and not is_empty_file(input_edges_fpath):
        with open(edges_fpath, "w") as out, open(input_edges_fpath) as f:
            for line in f:
                if line.startswith('>'):
                    # Keep only the first whitespace-separated token, without '>'.
                    seq_name = line.strip().split()[0][1:]
                    out.write(">%s\n" % get_edge_agv_id(get_edge_num(seq_name)))
                else:
                    out.write(line)
    return edges_fpath
Beispiel #2
0
def map_edges_to_ref(input_fpath, output_dirpath, reference_fpath, threads):
    """Align edge sequences to the reference with minimap2.

    The alignment is written to ``<output_dirpath>/mapping.paf`` and the
    minimap2 stderr to ``<output_dirpath>/minimap.log``.  Nothing is run when
    no reference is given, when the existing mapping can be reused, or when
    the edge file is empty; failures are reported as warnings only.

    :param input_fpath: FASTA file with the edge sequences.
    :param output_dirpath: directory for the PAF file and the log.
    :param reference_fpath: reference FASTA; falsy disables the alignment.
    :param threads: number of threads passed to minimap2 (``-t``).
    :return: path of the PAF file (returned even if it was not produced).
    """
    mapping_fpath = join(output_dirpath, "mapping.paf")
    if not reference_fpath:
        return mapping_fpath
    if can_reuse(mapping_fpath, files_to_check=[input_fpath, reference_fpath]):
        return mapping_fpath
    if is_empty_file(input_fpath):
        print(
            "Warning! File with edge sequences was not found, failed aligning edges to the reference"
        )
        return mapping_fpath

    print("Aligning graph edges to the reference...")
    cmdline = [
        "minimap2", "-x", "asm20", "--score-N", "0", "-E", "1,0",
        "-N", "200", "-p", "0.5", "-f", "200", "-t",
        str(threads), reference_fpath, input_fpath
    ]
    log_fpath = join(output_dirpath, "minimap.log")
    # Context managers guarantee both files are closed once minimap2 exits;
    # previously the handles were left to the garbage collector.
    with open(mapping_fpath, "w") as mapping_out, open(log_fpath, "w") as log_out:
        return_code = subprocess.call(cmdline, stdout=mapping_out, stderr=log_out)
    if return_code != 0 or is_empty_file(mapping_fpath):
        print(
            "Warning! Minimap2 failed aligning edges to the reference"
        )
    return mapping_fpath
Beispiel #3
0
def parse_canu_unitigs_info(input_dirpath, dict_edges):
    """Annotate edges with coverage and repeat flags from a Canu tigInfo file.

    The file is located with ``find_file_by_pattern(input_dirpath,
    ".unitigs.layout.tigInfo")``.  Its first line is treated as a header that
    must contain the ``sugRept`` and ``coverage`` columns; otherwise nothing is
    changed.  For every row whose edge is present in ``dict_edges`` the
    coverage (truncated to int) is stored in ``.cov`` and ``.repetitive`` is
    set to ``True`` when the ``sugRept`` column equals ``"yes"``.  The same
    values are applied to the edge with the negated number when it is present.

    :param input_dirpath: Canu output directory.
    :param dict_edges: mapping of edge id to edge object; updated in place.
    :return: the same ``dict_edges`` object.
    """
    tiginfo_fpath = find_file_by_pattern(input_dirpath, ".unitigs.layout.tigInfo")
    if is_empty_file(tiginfo_fpath):
        return dict_edges

    with open(tiginfo_fpath) as f:
        lines = iter(f)
        header = next(lines, "").strip().split()
        if "sugRept" not in header or "coverage" not in header:
            return dict_edges
        repeat_col = header.index("sugRept")
        cov_col = header.index("coverage")
        min_fields = max(repeat_col, cov_col) + 1

        for line in lines:
            fs = line.strip().split()
            if len(fs) < min_fields:
                # Blank or truncated row: the needed columns are missing.
                continue
            edge_num = get_edge_num(fs[0])
            edge_id = get_edge_agv_id(edge_num)
            if edge_id not in dict_edges:
                continue
            coverage = int(float(fs[cov_col]))
            is_repeat = fs[repeat_col] == "yes"
            rc_edge_id = get_edge_agv_id(-edge_num)
            for target_id in (edge_id, rc_edge_id):
                # The reverse-complement edge may be absent from dict_edges;
                # skip it rather than raising KeyError.
                if target_id not in dict_edges:
                    continue
                dict_edges[target_id].cov = coverage
                if is_repeat:
                    dict_edges[target_id].repetitive = True
    return dict_edges
Beispiel #4
0
def fastg_to_gfa(input_fpath, output_dirpath, assembler_name):
    """Convert a FASTG file to GFA with the bundled ``gfatools.js`` script.

    The script is run through the ``k8`` binary from ``TOOLS_DIR`` (the
    ``k8-darwin`` build when ``is_osx()`` is true, ``k8-linux`` otherwise).
    The converter sub-command is chosen from the assembler name.

    :param input_fpath: path to the FASTG file.
    :param output_dirpath: directory for the converted file; its name is the
        input basename with ``"fastg"`` replaced by ``"gfa"``.
    :param assembler_name: assembler that produced the FASTG file.
    :return: path to the GFA file, or ``None`` when the conversion produced
        an empty file.
    :raises SystemExit: when the assembler is not supported.
    """
    k8_exec = join(TOOLS_DIR, "k8-darwin" if is_osx() else "k8-linux")
    gfatools_exec = join(TOOLS_DIR, "gfatools.js")
    output_fpath = join(output_dirpath,
                        basename(input_fpath).replace("fastg", "gfa"))

    # Checked in order; the first matching assembler wins.
    converters = (
        (is_abyss, "abyss2gfa"),
        (is_spades, "spades2gfa"),
        (is_sga, "sga2gfa"),
        (is_soap, "soap2gfa"),
        (is_velvet, "velvet2gfa"),
    )
    cmd = None
    for matches, converter in converters:
        if matches(assembler_name):
            cmd = converter
            break
    if not cmd:
        sys.exit(
            "FASTG files produced by " + assembler_name +
            " are not supported. Supported assemblers: " + ' '.join([
                ABYSS_NAME, SGA_NAME, SOAP_NAME, SPADES_NAME, VELVET_NAME
            ]) + " or use files in GFA format.")

    cmdline = [k8_exec, gfatools_exec, cmd, input_fpath]
    # subprocess needs a file object (or descriptor) for stdout, not a path
    # string, so the output file is opened explicitly; both handles are closed
    # as soon as the converter exits.
    with open(output_fpath, "w") as gfa_out, open("/dev/null", "w") as devnull:
        subprocess.call(cmdline, stdout=gfa_out, stderr=devnull)
    if not is_empty_file(output_fpath):
        return output_fpath
    return None
Beispiel #5
0
def parse_flye_assembly_info(input_dirpath, dict_edges):
    """Map Flye contigs to the graph edges they consist of.

    Reads ``assembly_info.txt`` from ``input_dirpath``.  The first line is
    skipped as a header; in every other line the first column is the contig
    name and the last column is a comma-separated edge path.  Edges that are
    present in ``dict_edges`` are laid out one after another starting at 0.

    :param input_dirpath: Flye output directory.
    :param dict_edges: mapping of edge id to edge object (``.length`` is read).
    :return: ``defaultdict(list)`` mapping contig name to a list of
        ``(start, end, edge_id)`` tuples with string coordinates; empty when
        the info file is missing or empty.
    """
    contig_edges = defaultdict(list)
    info_fpath = join(input_dirpath, "assembly_info.txt")
    if is_empty_file(info_fpath):
        print(
            "Warning! Assembly_info.txt is not found, information about contigs will not be provided"
        )
        # Return right away: opening a missing file below would raise instead
        # of honoring the "will not be provided" promise of the warning.
        return contig_edges

    with open(info_fpath) as f:
        for i, line in enumerate(f):
            if i == 0:
                continue  # header line
            fs = line.strip().split()
            if not fs:
                continue  # blank line
            contig = fs[0]
            path = fs[-1]
            start = 0
            for edge_name in path.split(','):
                edge_id = get_edge_agv_id(edge_name)
                if edge_id in dict_edges:
                    edge_len = dict_edges[edge_id].length
                    contig_edges[contig].append(
                        (str(start), str(start + edge_len), edge_id))
                    start += edge_len
    return contig_edges
Beispiel #6
0
def parse_spades_paths(input_dirpath, dict_edges):
    """Map SPAdes scaffolds to the graph edges they consist of.

    Reads ``scaffolds.paths`` from ``input_dirpath``.  Example of a record::

        NODE_1_length_8242890_cov_19.815448
        1893359+,1801779-,1893273-,400678-,1892977+

    A line starting with ``NODE`` opens a scaffold; a line ending with ``'``
    closes the current one, so the path lines that follow it are ignored.
    Every other line is a comma-separated list of edges with a trailing
    ``+``/``-`` orientation sign (``-`` negates the edge number).

    :param input_dirpath: SPAdes output directory.
    :param dict_edges: mapping of edge id to edge object (``.length`` is read).
    :return: ``defaultdict(list)`` mapping scaffold name to a list of
        ``(start, end, edge_id)`` tuples with string coordinates; empty when
        the paths file is missing or empty.
    """
    contig_edges = defaultdict(list)
    paths_fpath = join(input_dirpath, "scaffolds.paths")
    if is_empty_file(paths_fpath):
        print(
            "Warning! %s is not found, information about scaffold paths will not be provided"
            % paths_fpath)
        # Do not fall through to open(): a missing file would raise.
        return contig_edges

    with open(paths_fpath) as f:
        contig = None
        start = 0
        for line in f:
            stripped = line.strip()
            if not stripped:
                continue  # blank lines carry no path information
            if stripped.endswith("'"):
                contig = None
            elif line.startswith("NODE"):
                contig = stripped
                start = 0
            elif contig:
                for edge_name in stripped.replace(';', '').split(','):
                    if not edge_name:
                        # Empty token (e.g. a stray comma): nothing to parse.
                        continue
                    edge_num = int(edge_name[:-1])
                    if edge_name[-1] == '-':
                        edge_num *= -1
                    edge_id = get_edge_agv_id(edge_num)
                    if edge_id in dict_edges:
                        edge_len = dict_edges[edge_id].length
                        contig_edges[contig].append(
                            (str(start), str(start + edge_len), edge_id))
                        start += edge_len
                # Each path line is followed by a 10 bp offset -- presumably
                # the NNNNNNNNNN gap between scaffold segments.
                start += 10
    return contig_edges
Beispiel #7
0
def parse_canu_contigs_info(input_dirpath):
    """Collect length, coverage and edge lists for Canu contigs.

    Two files are looked up in ``input_dirpath`` with ``find_file_by_pattern``:

    * ``unitigs.bed`` -- each row gives contig, start, end and unitig in the
      first four columns and the strand in the last one; the unitig number is
      negated for any strand other than ``+``.
    * ``contigs.layout.tigInfo`` -- a table whose header must contain the
      ``tigLen`` and ``coverage`` columns.

    :param input_dirpath: Canu output directory (falsy disables parsing).
    :return: dict mapping contig id to
        ``{'length': int, 'cov': int, 'mult': 1, 'edges': [str, ...]}``.
        Only contigs found in both files are reported.
    """
    contig_info = dict()
    edges_by_contig = defaultdict(list)
    unitigs_info_fpath = find_file_by_pattern(input_dirpath, "unitigs.bed")
    if input_dirpath and not is_empty_file(unitigs_info_fpath):
        with open(unitigs_info_fpath) as f:
            for line in f:
                fs = line.strip().split()
                if len(fs) < 4:
                    continue  # blank or truncated BED row
                contig, unitig = fs[0], fs[3]
                strand = fs[-1]
                edge_num = get_edge_num(unitig)
                if strand != "+":
                    edge_num = -edge_num
                edges_by_contig[get_canu_id(contig)].append(str(edge_num))

    contigs_info_fpath = find_file_by_pattern(input_dirpath,
                                              "contigs.layout.tigInfo")
    if input_dirpath and not is_empty_file(contigs_info_fpath):
        with open(contigs_info_fpath) as f:
            lines = iter(f)
            header = next(lines, "").strip().split()
            if "tigLen" in header and "coverage" in header:
                len_col = header.index("tigLen")
                cov_col = header.index("coverage")
                min_fields = max(len_col, cov_col) + 1
                for line in lines:
                    fs = line.strip().split()
                    if len(fs) < min_fields:
                        continue  # blank or truncated row
                    length = int(float(fs[len_col]))
                    coverage = int(float(fs[cov_col]))
                    contig_id = get_canu_id(fs[0])
                    if contig_id in edges_by_contig:
                        contig_info[contig_id] = {
                            'length': length,
                            'cov': coverage,
                            'mult': 1
                        }

    for contig_id, edges in edges_by_contig.items():
        # A contig may appear in the BED file without a tigInfo row (or the
        # tigInfo file may be missing altogether); indexing contig_info
        # unconditionally would raise KeyError for it.
        if contig_id in contig_info:
            contig_info[contig_id]['edges'] = edges
    return contig_info
Beispiel #8
0
def parse_assembler_output(assembler_name, input_dirpath, input_fpath,
                           output_dirpath, input_fasta_fpath, min_edge_len):
    """Parse an assembly graph given either as a file or as an output folder.

    When ``input_fpath`` is a non-empty file it is parsed according to its
    extension: FASTG is first converted to GFA, GFA/GFA2 is parsed directly,
    DOT/GV is parsed with the ABySS parser (for ABySS) and otherwise, or if
    that yields nothing, with the Flye parser.  When no graph file is given,
    the assembler output folder ``input_dirpath`` is parsed instead (Canu,
    Flye and SPAdes are handled).

    Before returning, ``start`` and ``end`` of every edge are converted to
    strings.

    :param assembler_name: name of the assembler that produced the data.
    :param input_dirpath: assembler output folder (used without a graph file).
    :param input_fpath: assembly graph file, may be empty/absent.
    :param output_dirpath: directory for generated files.
    :param input_fasta_fpath: FASTA file with edge sequences (DOT/GV input).
    :param min_edge_len: minimum edge length passed to the parsers.
    :return: ``(dict_edges, contig_edges, edges_fpath)``.
    :raises SystemExit: on unsupported input or parsing failure.
    """
    edges_fpath = None
    if not is_empty_file(input_fpath):
        contig_edges = []
        # Remember the user-supplied path: the FASTG converter returns None on
        # failure and the error message still has to name the file.
        graph_fpath = input_fpath
        if input_fpath.endswith("fastg"):
            input_fpath = fastg_to_gfa(input_fpath, output_dirpath,
                                       assembler_name)
        if not input_fpath:
            sys.exit("ERROR! Failed parsing " + graph_fpath + " file.")
        if input_fpath.endswith(("gfa", "gfa2")):
            dict_edges = parse_gfa(input_fpath, min_edge_len)
            edges_fpath = get_edges_from_gfa(input_fpath, output_dirpath,
                                             min_edge_len)
        elif input_fpath.endswith(("dot", "gv")):
            edges_fpath = format_edges_file(input_fasta_fpath, output_dirpath)
            dict_edges = dict()
            if is_abyss(assembler_name):
                dict_edges = parse_abyss_dot(input_fpath, min_edge_len)
            if not dict_edges:
                try:
                    dict_edges = parse_flye_dot(input_fpath, min_edge_len)
                except Exception as e:
                    sys.exit(
                        "ERROR! Failed parsing " + input_fpath + " file.\n"
                        "During parsing the following error has occured: " +
                        str(e) +
                        "\nPlease make sure that you correctly specified the assembler name using -a option. "
                        "DOT files produced by different assemblers can have very different formats.\n"
                        "Examples of input data can be found here https://github.com/almiheenko/AGB/tree/master/test_data"
                    )
        else:
            # Without this branch dict_edges would stay undefined and the
            # function would die with a NameError further down.
            sys.exit(
                "ERROR! Unsupported format of the assembly graph file " +
                input_fpath +
                ". Supported formats: GFA, FASTG, GraphViz (DOT/GV).")
    else:
        if is_canu(assembler_name):
            dict_edges, contig_edges, edges_fpath = parse_canu_output(
                input_dirpath, output_dirpath, min_edge_len)
        elif is_flye(assembler_name):
            dict_edges, contig_edges, edges_fpath = parse_flye_output(
                input_dirpath, output_dirpath, min_edge_len)
        elif is_spades(assembler_name):
            dict_edges, contig_edges, edges_fpath = parse_spades_output(
                input_dirpath, output_dirpath, min_edge_len)
        else:
            sys.exit(
                "Output folder of %s assembler can not be parsed! Supported assemblers: %s. "
                "More assemblers will be added in the next release.\n"
                "To visualize the assembly graph produced by this assembler, "
                "you should manually specify the assembly graph file in GFA/FASTG/GraphViz formats using --graph option "
                "and (optionally) file with edge sequences using --fasta option"
                % (assembler_name, ', '.join(SUPPORTED_ASSEMBLERS)))
    for edge in dict_edges.values():
        edge.start, edge.end = str(edge.start), str(edge.end)
    return dict_edges, contig_edges, edges_fpath
Beispiel #9
0
def format_edges_file(input_fpath, output_dirpath):
    """Copy a FASTA file to ``<output_dirpath>/edges.fasta`` with renamed headers.

    Each header line (starting with ``>``) is replaced by
    ``">" + get_edge_agv_id(get_edge_num(<header text>))``; all other lines
    are copied verbatim.  The copy is skipped when ``can_reuse`` accepts the
    existing output.

    :param input_fpath: source FASTA file.
    :param output_dirpath: directory that receives ``edges.fasta``.
    :return: path to ``edges.fasta``, or ``None`` when the input is empty.
    """
    if is_empty_file(input_fpath):
        return None

    edges_fpath = join(output_dirpath, "edges.fasta")
    if can_reuse(edges_fpath, files_to_check=[input_fpath]):
        return edges_fpath

    with open(input_fpath) as src, open(edges_fpath, "w") as dst:
        for record_line in src:
            if not record_line.startswith('>'):
                dst.write(record_line)
                continue
            new_name = get_edge_agv_id(get_edge_num(record_line[1:]))
            dst.write(">%s\n" % new_name)
    return edges_fpath
Beispiel #10
0
def parse_canu_assembly_info(input_dirpath, dict_edges):
    """Map Canu contigs to the unitigs (graph edges) placed on them.

    Reads the file located by ``find_file_by_pattern(input_dirpath,
    ".unitigs.bed")``; the first four columns of each row are contig, start,
    end and unitig.  Rows whose unitig is not in ``dict_edges`` are ignored.

    :param input_dirpath: Canu output directory.
    :param dict_edges: mapping of edge id to edge object (membership only).
    :return: ``defaultdict(list)`` mapping contig id to a list of
        ``(start, end, edge_id)`` tuples (coordinates as read, i.e. strings);
        empty when the BED file is missing or empty.
    """
    contig_edges = defaultdict(list)
    unitigs_fpath = find_file_by_pattern(input_dirpath, ".unitigs.bed")
    if is_empty_file(unitigs_fpath):
        print(
            "Warning! Unitigs.bed is not found, information about contigs will not be provided"
        )
        # Stop here: open() on a missing (or None) path would raise.
        return contig_edges

    with open(unitigs_fpath) as f:
        for line in f:
            fs = line.strip().split()
            if len(fs) < 4:
                continue  # blank or truncated BED row
            contig, start, end, unitig = fs[:4]
            edge_id = get_edge_agv_id(get_edge_num(unitig))
            if edge_id in dict_edges:
                contig_edges[get_canu_id(contig)].append((start, end, edge_id))
    return contig_edges
Beispiel #11
0
def run(input_fpath, reference_fpath, out_fpath, output_dirpath, threads,
        is_meta):
    """Run QUAST on ``input_fpath`` against the reference, unless reusable.

    QUAST is started with ``--fast --agb --min-contig 0``; ``--large`` is
    added for inputs over 10 MiB or in meta mode, and ``--min-identity 90``
    in meta mode.  Its console output is discarded.

    :param input_fpath: FASTA file with the sequences to evaluate.
    :param reference_fpath: reference genome FASTA.
    :param out_fpath: report file expected from QUAST.
    :param output_dirpath: QUAST output directory (created when missing).
    :param threads: number of threads passed to QUAST.
    :param is_meta: enables the metagenome settings described above.
    :return: ``out_fpath`` when it exists, is non-empty and passes
        ``can_reuse``; otherwise ``None`` (also when QUAST is not found).
    """
    if not exists(output_dirpath):
        os.makedirs(output_dirpath)
    if not can_reuse(out_fpath, files_to_check=[input_fpath, reference_fpath]):
        quast_exec_path = get_path_to_program("quast.py")
        if not quast_exec_path:
            print("QUAST is not found!")
            return None
        cmdline = [quast_exec_path, "--fast", "--agb", input_fpath, "-r", reference_fpath,
                   "-t", str(threads), "-o", output_dirpath, "--min-contig", "0"]
        if getsize(input_fpath) > 10 * 1024 * 1024 or is_meta:
            cmdline.append("--large")
        if is_meta:
            cmdline += ["--min-identity", "90"]
        # One handle serves both streams and is closed when QUAST exits;
        # previously two handles were opened and never closed.
        with open("/dev/null", "w") as devnull:
            subprocess.call(cmdline, stdout=devnull, stderr=devnull)
    if is_empty_file(out_fpath) or not can_reuse(
            out_fpath, files_to_check=[input_fpath, reference_fpath]):
        return None
    return out_fpath
Beispiel #12
0
def parse_canu_output(input_dirpath, output_dirpath, min_edge_len):
    """Parse the assembly graph from a Canu output directory.

    The unitig graph is located with ``find_file_by_pattern(input_dirpath,
    ".unitigs.gfa")``.  A copy with ``bogart.edges`` replaced by ``1.0`` on
    its first line is written to ``output_dirpath`` (via ``sed``) and that
    copy is what gets parsed.

    :param input_dirpath: Canu output directory.
    :param output_dirpath: directory for the edge FASTA and the patched GFA.
    :param min_edge_len: minimum edge length passed to the parsers.
    :return: ``(dict_edges, contig_edges, edges_fpath)``.
    :raises SystemExit: when no GFA file is found.
    """
    raw_gfa_fpath = find_file_by_pattern(input_dirpath, ".unitigs.gfa")
    if not raw_gfa_fpath:
        print("ERROR! GFA file is not found in %s! Please check the options" %
              abspath(input_dirpath))
        sys.exit(1)
    edges_fpath = get_edges_from_gfa(raw_gfa_fpath, output_dirpath,
                                     min_edge_len)
    gfa_fpath = join(output_dirpath, basename(raw_gfa_fpath))
    if is_empty_file(gfa_fpath) or not can_reuse(
            gfa_fpath, files_to_check=[raw_gfa_fpath]):
        # Pass the arguments as a list: building a command string and
        # splitting it again breaks on paths with spaces or quotes.
        sed_cmdline = ["sed", "1s/bogart.edges/1.0/", raw_gfa_fpath]
        with open(gfa_fpath, 'w') as patched_gfa:
            subprocess.call(sed_cmdline, stdout=patched_gfa)
    dict_edges = parse_gfa(gfa_fpath,
                           min_edge_len,
                           input_dirpath,
                           assembler="canu")
    contig_edges = parse_canu_assembly_info(input_dirpath, dict_edges)
    return dict_edges, contig_edges, edges_fpath
Beispiel #13
0
def parse_flye_contigs_info(input_dirpath):
    """Read per-contig statistics from Flye's ``assembly_info.txt``.

    The first line of the file is skipped as a header.  In every other line
    column 0 is the contig name, columns 1 and 2 are parsed as integers
    (length and coverage), column 5 is kept as text (multiplicity) and the
    last column is split on commas into the edge path.

    :param input_dirpath: Flye output directory; a falsy value or an empty
        info file yields an empty result.
    :return: dict mapping contig name to
        ``{'edges': [...], 'length': int, 'cov': int, 'mult': str}``.
    """
    contig_info = dict()
    if not input_dirpath:
        return contig_info
    if is_empty_file(join(input_dirpath, 'assembly_info.txt')):
        return contig_info

    with open(join(input_dirpath, 'assembly_info.txt')) as info_file:
        for line_no, raw_line in enumerate(info_file):
            if line_no == 0:
                continue  # header line
            columns = raw_line.strip().split()
            name = columns[0]
            edge_path = columns[-1]
            raw_length, raw_cov = columns[1], columns[2]
            contig_length, contig_cov = int(raw_length), int(raw_cov)
            contig_mult = columns[5]
            contig_info[name] = dict(
                edges=edge_path.split(','),
                length=contig_length,
                cov=contig_cov,
                mult=contig_mult,
            )
    return contig_info
Beispiel #14
0
def run_quast_analysis(input_fpath,
                       reference_fpath,
                       output_dirpath,
                       json_output_dirpath,
                       threads,
                       contig_edges,
                       dict_edges=None,
                       is_meta=False):
    """Run QUAST and collect misassemblies for edges or contigs.

    With ``dict_edges`` the analysis is edge-based: QUAST results go to
    ``quast_edge_output``, misassembly coordinates are appended to the
    ``errors`` list of the matching edges, alignments are parsed and the
    edges are mapped to the reference.  Without it the analysis is
    contig-based: results go to ``quast_output`` and the misassemblies are
    dumped to ``errors.json``.

    When QUAST produces no report, stub ``reference.json`` and ``errors.json``
    files are written to ``json_output_dirpath``.

    ``contig_edges`` is accepted but not used by this function.

    :return: ``(mapping_info, chrom_names, edge_by_chrom, dict_edges)``; the
        first three items are ``None`` in contig mode and on failure.
    """

    def _inputs_present():
        # Both the sequences and the reference must be non-empty files.
        return not is_empty_file(input_fpath) and not is_empty_file(reference_fpath)

    quast_subdir = "quast_edge_output" if dict_edges else "quast_output"
    quast_output_dir = join(output_dirpath, quast_subdir)

    ms_out_fpath = None
    if _inputs_present():
        report_fpath = get_mis_report_fpath(quast_output_dir, input_fpath)
        ms_out_fpath = run(input_fpath, reference_fpath, report_fpath,
                           quast_output_dir, threads, is_meta)

    if not ms_out_fpath:
        if _inputs_present():
            print(
                "QUAST failed! Make sure you are using the latest version of QUAST"
            )
        seq_kind = "edge" if dict_edges else "contig"
        print("No information about %s mappings to the reference genome" %
              seq_kind)
        # Write empty placeholders for every variable of reference.json.
        with open(join(json_output_dirpath, "reference.json"), 'w') as handle:
            for prefix in ("chrom_lengths=", "edgeMappingInfo=",
                           "chromGaps=", "chromAligns="):
                handle.write(prefix + json.dumps([]) + ";\n")
        with open(join(json_output_dirpath, 'errors.json'), 'w') as handle:
            handle.write("misassembledContigs=[];\n")
        return None, None, None, dict_edges

    # Scan the report: any line that does not start with "Extensive
    # misassembly" is remembered as the id of the current sequence, and the
    # misassembly lines after it are attributed to that sequence.
    misassembled_seqs = defaultdict(list)
    with open(ms_out_fpath) as report:
        seq_id = ''
        for report_line in report:
            if not report_line.startswith("Extensive misassembly"):
                seq_id = report_line.strip()
                continue
            match = re.search(align_pattern, report_line)
            if not match or len(match.groups()) < 4:
                continue
            coords = (match.group('start1'), match.group('end1'),
                      match.group('start2'), match.group('end2'))
            if dict_edges:
                edge_id = get_edge_agv_id(get_edge_num(seq_id))
                dict_edges[edge_id].errors.append(coords)
            else:
                misassembled_seqs[seq_id].append(coords)

    if not dict_edges:
        with open(join(json_output_dirpath, 'errors.json'), 'w') as handle:
            handle.write("misassembledContigs='" +
                         json.dumps(misassembled_seqs) + "';\n")
        return None, None, None, dict_edges

    alignments_fpath = get_alignments_fpath(quast_output_dir, input_fpath)
    parse_alignments(alignments_fpath, json_output_dirpath)
    mapping_fpath = map_edges_to_ref(input_fpath, output_dirpath,
                                     reference_fpath, threads)
    mapping_info, chrom_names, edge_by_chrom = parse_mapping_info(
        mapping_fpath, json_output_dirpath, dict_edges)
    return mapping_info, chrom_names, edge_by_chrom, dict_edges