Code example #1
def extract_unmapped_reads(args, reads2contigs_mapping, unmapped_reads_path,
                           mapping_rate_threshold):
    mapping_rates = calc_mapping_rates(reads2contigs_mapping)
    total_bases = 0
    unmapped_bases = 0

    with open(unmapped_reads_path, "w") as fout:
        for file in args.reads:
            for hdr, sequence in fp.stream_sequence(file):
                total_bases += len(sequence)

                is_unmapped = True
                contigs = mapping_rates.get(hdr)
                if contigs is not None:
                    # The read counts as mapped if it maps well to any contig
                    for mapping_rate in contigs.values():
                        if mapping_rate >= mapping_rate_threshold:
                            is_unmapped = False
                            break

                if is_unmapped:
                    unmapped_bases += len(sequence)
                    fout.write(">{0}\n{1}\n".format(hdr, sequence))

    logger.debug("Unmapped sequence: {0} / {1} ({2})".format(
        unmapped_bases, total_bases,
        float(unmapped_bases) / total_bases))
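
For reference, calc_mapping_rates is not shown above. Below is a minimal sketch of what it might look like, assuming PAF-format alignments and defining a read's mapping rate against a contig as the fraction of the read covered by alignments to that contig; the actual Flye implementation may differ.

from collections import defaultdict

def calc_mapping_rates(paf_path):
    # Hypothetical sketch: per-read, per-contig aligned fraction from a PAF file.
    # Overlapping alignment blocks are summed naively, so a rate can exceed 1.0.
    aligned_len = defaultdict(lambda: defaultdict(int))
    read_len = {}
    with open(paf_path, "r") as f:
        for line in f:
            tokens = line.split("\t")
            read, length = tokens[0], int(tokens[1])
            qry_start, qry_end = int(tokens[2]), int(tokens[3])
            contig = tokens[5]
            read_len[read] = length
            aligned_len[read][contig] += qry_end - qry_start
    return {read: {contig: float(bases) / read_len[read]
                   for contig, bases in contigs.items()}
            for read, contigs in aligned_len.items()}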
Code example #2
def filter_by_coverage(args, stats_in, contigs_in, stats_out, contigs_out):
    """
    Filters out contigs with low coverage
    """
    SUBASM_MIN_COVERAGE = 1
    HARD_MIN_COVERAGE = cfg.vals["hard_minimum_coverage"]
    RELATIVE_MIN_COVERAGE = cfg.vals["relative_minimum_coverage"]

    ctg_stats = {}
    sum_cov = 0
    sum_length = 0

    with open(stats_in, "r") as f:
        for line in f:
            if line.startswith("#"): continue
            tokens = line.split("\t")
            ctg_id, ctg_len, ctg_cov = tokens[0], int(tokens[1]), int(tokens[2])
            ctg_stats[ctg_id] = (ctg_len, ctg_cov)
            sum_cov += ctg_cov * ctg_len
            sum_length += ctg_len

    mean_coverage = int(float(sum_cov) / sum_length)
    coverage_threshold = None
    if args.read_type == "subasm":
        coverage_threshold = SUBASM_MIN_COVERAGE
    elif args.meta:
        coverage_threshold = HARD_MIN_COVERAGE
    else:
        coverage_threshold = int(
            round(float(mean_coverage) / RELATIVE_MIN_COVERAGE))
        coverage_threshold = max(HARD_MIN_COVERAGE, coverage_threshold)
    logger.debug("Mean contig coverage: {0}, selected threshold: {1}".format(
        mean_coverage, coverage_threshold))

    filtered_num = 0
    filtered_seq = 0
    good_fasta = {}
    for hdr, seq in fp.stream_sequence(contigs_in):
        if ctg_stats[hdr][1] >= coverage_threshold:
            good_fasta[hdr] = seq
        else:
            filtered_num += 1
            filtered_seq += ctg_stats[hdr][0]
    logger.debug("Filtered {0} contigs of total length {1}".format(
        filtered_num, filtered_seq))

    fp.write_fasta_dict(good_fasta, contigs_out)
    with open(stats_out, "w") as f:
        f.write("#seq_name\tlength\tcoverage\n")
        for ctg_id in good_fasta:
            f.write("{0}\t{1}\t{2}\n".format(ctg_id, ctg_stats[ctg_id][0],
                                             ctg_stats[ctg_id][1]))
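
A quick worked example of the threshold selection above (the constants and mean coverage are illustrative stand-ins, not Flye's actual configuration values):

HARD_MIN_COVERAGE = 2        # stand-in for cfg.vals["hard_minimum_coverage"]
RELATIVE_MIN_COVERAGE = 5    # stand-in for cfg.vals["relative_minimum_coverage"]

mean_coverage = 47
threshold = max(HARD_MIN_COVERAGE,
                int(round(float(mean_coverage) / RELATIVE_MIN_COVERAGE)))
assert threshold == 9  # contigs below 9x coverage would be filtered out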
Code example #3
File: alignment.py  Project: sebschmi/Flye
def split_into_chunks(fasta_in, chunk_size, fasta_out):
    out_dict = {}
    for header, seq in fp.stream_sequence(fasta_in):
        # Emit len(seq) // chunk_size chunks (at least one); any tail
        # shorter than chunk_size is merged into the final chunk.
        for i in range(max(len(seq) // chunk_size, 1)):
            chunk_hdr = "{0}$chunk_{1}".format(header, i)
            start = i * chunk_size
            end = (i + 1) * chunk_size
            if len(seq) - end < chunk_size:
                end = len(seq)
            out_dict[chunk_hdr] = seq[start:end]

    fp.write_fasta_dict(out_dict, fasta_out)
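
The boundary handling is the subtle part: a tail shorter than chunk_size is merged into the final chunk rather than emitted on its own. A self-contained illustration of that rule, using hypothetical lengths:

def chunk_bounds(seq_len, chunk_size):
    # Reproduces the start/end arithmetic from split_into_chunks above.
    bounds = []
    for i in range(max(seq_len // chunk_size, 1)):
        start, end = i * chunk_size, (i + 1) * chunk_size
        if seq_len - end < chunk_size:
            end = seq_len
        bounds.append((start, end))
    return bounds

assert chunk_bounds(2500, 1000) == [(0, 1000), (1000, 2500)]
assert chunk_bounds(800, 1000) == [(0, 800)]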
Code example #4
File: plasmids.py  Project: zxgsy520/Flye
def assemble_short_plasmids(args, work_dir, contigs_path):
    logger.debug("Extracting unmapped reads")
    reads2contigs_mapping = os.path.join(work_dir, "reads2contigs.paf")
    make_alignment(contigs_path, args.reads, args.threads,
                   work_dir, args.platform, reads2contigs_mapping,
                   reference_mode=True, sam_output=False)

    unmapped_reads_path = os.path.join(work_dir, "unmapped_reads.fasta")
    unmapped.extract_unmapped_reads(args, reads2contigs_mapping,
                                    unmapped_reads_path,
                                    mapping_rate_threshold=0.5)

    logger.debug("Finding self-mappings for unmapped reads")
    unmapped_reads_mapping = os.path.join(work_dir, "unmapped_ava.paf")
    make_alignment(unmapped_reads_path, [unmapped_reads_path], args.threads,
                   work_dir, args.platform, unmapped_reads_mapping,
                   reference_mode=False, sam_output=False)

    logger.debug("Extracting circular reads")
    circular_reads = circular.extract_circular_reads(unmapped_reads_mapping)
    logger.debug("Extracted %d circular reads", len(circular_reads))

    logger.debug("Extracing circular pairs")
    circular_pairs = circular.extract_circular_pairs(unmapped_reads_mapping)
    logger.debug("Extracted %d circular pairs", len(circular_pairs))

    # Extract only the necessary subset of reads (the entire file could be pretty big)
    interesting_reads = {}
    for read in circular_reads:
        interesting_reads[read] = None
    for pair in circular_pairs:
        interesting_reads[pair[0].query] = None
        interesting_reads[pair[0].target] = None
    for hdr, seq in fp.stream_sequence(unmapped_reads_path):
        if hdr in interesting_reads:
            interesting_reads[hdr] = seq

    trimmed_circular_reads = \
        circular.trim_circular_reads(circular_reads, interesting_reads)
    trimmed_circular_pairs = \
        circular.trim_circular_pairs(circular_pairs, interesting_reads)
    trimmed_sequences_path = os.path.join(work_dir, "trimmed_sequences.fasta")
    fp.write_fasta_dict(dict(list(trimmed_circular_reads.items()) +
                             list(trimmed_circular_pairs.items())),
                        trimmed_sequences_path)

    logger.debug("Clustering circular sequences")
    trimmed_sequences_mapping = os.path.join(work_dir, "trimmed.paf")
    make_alignment(trimmed_sequences_path, [trimmed_sequences_path], args.threads,
                   work_dir, args.platform, trimmed_sequences_mapping,
                   reference_mode=False, sam_output=False)

    plasmids = \
        circular.extract_unique_plasmids(trimmed_sequences_mapping,
                                         trimmed_sequences_path)

    plasmids_raw = os.path.join(work_dir, "plasmids_raw.fasta")
    fp.write_fasta_dict(plasmids, plasmids_raw)
    _, polished_stats = \
        pol.polish(plasmids_raw, [unmapped_reads_path], work_dir, 1,
                   args.threads, args.platform, output_progress=False)

    # Extract coverage from the polishing stats
    plasmids_with_coverage = {}
    if os.path.isfile(polished_stats):
        with open(polished_stats, "r") as f:
            for line in f:
                if line.startswith("#"): continue
                tokens = line.strip().split()
                seq_id, coverage = tokens[0], int(tokens[2])
                if coverage > 0:
                    plasmids_with_coverage[seq_id] = plasmids[seq_id], coverage

    logger.info("Added %d extra contigs", len(plasmids_with_coverage))

    # Remove all unnecessary intermediate files
    os.remove(reads2contigs_mapping)
    os.remove(unmapped_reads_path)
    os.remove(unmapped_reads_mapping)
    os.remove(trimmed_sequences_path)
    os.remove(trimmed_sequences_mapping)

    return plasmids_with_coverage
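
A hypothetical invocation sketch; the attribute names on args are inferred from the accesses above (args.reads, args.threads, args.platform), while in Flye itself the args object comes from the main argparse driver:

from argparse import Namespace

args = Namespace(reads=["reads.fastq.gz"], threads=8,
                 platform="nano")  # inferred attributes only
plasmids = assemble_short_plasmids(args, "plasmid_work_dir", "contigs.fasta")
for seq_id, (sequence, coverage) in plasmids.items():
    print(seq_id, len(sequence), coverage)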