Code example #1
0
File: unmapped_reads.py  Project: zeeev/Flye
def calc_mapping_rates(reads2contigs_mapping):
    """Compute mapping rates for every (read, contig) pair in a PAF file.

    Hits are sorted by (query, target) so that all hits of one pair are
    adjacent; the segments of each pair are accumulated and folded into
    a single mapping rate.

    Returns:
        dict: read name -> {contig name -> mapping rate}.
    """
    hits = read_paf(reads2contigs_mapping)
    hits.sort(key=lambda hit: (hit.query, hit.target))

    mapping_rates = dict()

    def _record_group(group_hit, segments):
        # Fold the accumulated segments of one (query, target) group
        # into the result dict.
        rate = calc_mapping_rate(group_hit.query_length, segments)
        mapping_rates.setdefault(group_hit.query,
                                 dict())[group_hit.target] = rate

    current_hit = None
    mapping_segments = []

    for hit in hits:
        if current_hit is None or hit.query != current_hit.query or \
                                  hit.target != current_hit.target:
            if current_hit is not None:
                _record_group(current_hit, mapping_segments)

            current_hit = hit
            mapping_segments = []

        mapping_segments.append(MappingSegment(hit.query_start, hit.query_end))

    # BUG FIX: the final (query, target) group was previously dropped,
    # because groups were only recorded when a new group started.
    if current_hit is not None:
        _record_group(current_hit, mapping_segments)

    return mapping_rates
Code example #2
0
def extract_circular_reads(unmapped_reads_mapping, max_overhang=150):
    """Return {read name: best self-hit} for reads that map circularly.

    For every read with a circular self-hit (per is_circular_read), keep
    the hit with the longest query mapping span.
    """
    best_hits = {}

    for candidate in read_paf(unmapped_reads_mapping):
        if not is_circular_read(candidate, max_overhang):
            continue

        current_best = best_hits.get(candidate.query)
        if (current_best is None
                or candidate.query_mapping_length() >
                current_best.query_mapping_length()):
            best_hits[candidate.query] = candidate
            #logger.debug("\t" + candidate.query)

    return best_hits
Code example #3
0
File: circular_sequences.py  Project: zeeev/Flye
def extract_circular_pairs(unmapped_reads_mapping, max_overhang=300):
    """Find pairs of distinct reads whose mutual hits form a circle.

    A pair is circular when one hit overlaps query-end to target-start
    and a second hit closes the circle (query-start to target-end), both
    within max_overhang bases, and the two hits do not intersect on
    either sequence. Each read is used in at most one reported pair.

    Returns:
        list: two-element lists [overlap_hit, closing_hit].
    """
    hits = read_paf(unmapped_reads_mapping)
    hits.sort(key=lambda hit: (hit.query, hit.target))

    circular_pairs = []
    circular_pair = [None, None]
    previous_hit = None
    has_overlap = False
    is_circular = False

    used_reads = set()

    def _commit_pair():
        # Commit the pair accumulated for the previous (query, target)
        # group, if both the overlap and the closing hit were found.
        if previous_hit is not None and has_overlap and is_circular:
            if mapping_segments_without_intersection(circular_pair):
                circular_pairs.append(circular_pair)
                used_reads.add(circular_pair[0].target)
                used_reads.add(circular_pair[0].query)

    for hit in hits:
        if hit.query == hit.target:
            continue  # a self-hit cannot form a pair

        if hit.query in used_reads or hit.target in used_reads:
            continue  # each read participates in at most one pair

        if previous_hit is None or \
           hit.query != previous_hit.query or \
           hit.target != previous_hit.target:
            _commit_pair()

            circular_pair = [None, None]
            has_overlap = False
            is_circular = False
            previous_hit = hit

        if not has_overlap:
            if hit.query_right_overhang() < max_overhang and \
               hit.target_left_overhang() < max_overhang:
                has_overlap = True
                circular_pair[0] = hit
                continue

        if not is_circular:
            if hit.query_left_overhang() < max_overhang and \
               hit.target_right_overhang() < max_overhang:
                is_circular = True
                circular_pair[1] = hit

    # BUG FIX: the final (query, target) group was never committed,
    # because commits only happened when a new group started.
    _commit_pair()

    return circular_pairs
Code example #4
0
def extract_unique_plasmids(trimmed_reads_mapping,
                            trimmed_reads_path,
                            mapping_rate_threshold=0.8,
                            max_length_difference=500,
                            min_sequence_length=1000):
    """Cluster trimmed reads by mutual mapping rate and return one
    representative sequence per non-trivial cluster.

    Reads whose mapping rate onto each other exceeds
    mapping_rate_threshold are connected in a similarity graph; for
    every connected component with more than one read, the first read's
    sequence is reported if it is at least min_sequence_length long.
    """
    read_names = set()
    for hit in read_paf(trimmed_reads_mapping):
        read_names.add(hit.query)
        read_names.add(hit.target)

    read_names = list(read_names)
    n_reads = len(read_names)
    read2int = dict()
    int2read = dict()

    for index, name in enumerate(read_names):
        read2int[name] = index
        int2read[index] = name

    similarity_graph = [[] for _ in xrange(n_reads)]

    # each hit group stores alignments for one (query, target) pair
    for hit_group in read_paf_grouped(trimmed_reads_mapping):
        first = hit_group[0]
        if first.query == first.target:
            continue

        query_segments = [
            unmapped.MappingSegment(h.query_start, h.query_end)
            for h in hit_group]
        target_segments = [
            unmapped.MappingSegment(h.target_start, h.target_end)
            for h in hit_group]

        query_rate = unmapped.calc_mapping_rate(first.query_length,
                                                query_segments)
        target_rate = unmapped.calc_mapping_rate(first.target_length,
                                                 target_segments)

        if (query_rate > mapping_rate_threshold
                or target_rate > mapping_rate_threshold):
            #abs(query_length - target_length) < max_length_difference:
            vertex_a = read2int[first.query]
            vertex_b = read2int[first.target]
            similarity_graph[vertex_a].append(vertex_b)
            similarity_graph[vertex_b].append(vertex_a)

    connected_components, n_components = \
        utils.find_connected_components(similarity_graph)

    groups = [[] for _ in xrange(n_components)]
    for vertex, component in enumerate(connected_components):
        groups[component].append(int2read[vertex])

    nontrivial_groups = [group for group in groups if len(group) > 1]
    sequences = fp.read_sequence_dict(trimmed_reads_path)
    unique_plasmids = dict()

    for group in nontrivial_groups:
        representative = sequences[group[0]]
        if len(representative) >= min_sequence_length:
            unique_plasmids[group[0]] = representative

    return unique_plasmids
Code example #5
0
File: circular_sequences.py  Project: zeeev/Flye
def extract_unique_plasmids(trimmed_reads_mapping,
                            trimmed_reads_path,
                            mapping_rate_threshold=0.8,
                            max_length_difference=500,
                            min_sequence_length=1000):
    """Cluster trimmed reads by mutual mapping rate and return one
    representative sequence per non-trivial cluster.

    Reads whose mapping rate onto each other exceeds
    mapping_rate_threshold are connected in a similarity graph; for
    every connected component with more than one read, the first read's
    sequence is reported if it is at least min_sequence_length long.
    """
    hits = read_paf(trimmed_reads_mapping)
    trimmed_reads = set()

    for hit in hits:
        trimmed_reads.add(hit.query)
        trimmed_reads.add(hit.target)

    trimmed_reads = list(trimmed_reads)
    n_trimmed_reads = len(trimmed_reads)
    read2int = dict()
    int2read = dict()

    for i in xrange(n_trimmed_reads):
        read2int[trimmed_reads[i]] = i
        int2read[i] = trimmed_reads[i]

    similarity_graph = [[] for _ in xrange(n_trimmed_reads)]
    hits.sort(key=lambda hit: (hit.query, hit.target))

    def _add_edge(group_hit, query_segments, target_segments):
        # Connect query and target in the similarity graph when either
        # read maps well enough onto the other.
        query_rate = unmapped.calc_mapping_rate(group_hit.query_length,
                                                query_segments)
        target_rate = unmapped.calc_mapping_rate(group_hit.target_length,
                                                 target_segments)

        if (query_rate > mapping_rate_threshold
                or target_rate > mapping_rate_threshold):
            #abs(query_length - target_length) < max_length_difference:
            vertex1 = read2int[group_hit.query]
            vertex2 = read2int[group_hit.target]
            similarity_graph[vertex1].append(vertex2)
            similarity_graph[vertex2].append(vertex1)

    current_hit = None
    query_mapping_segments = []
    target_mapping_segments = []
    seq_lengths = {}

    for hit in hits:
        seq_lengths[hit.query] = hit.query_length
        seq_lengths[hit.target] = hit.target_length

        if hit.query == hit.target:
            continue  # self-hits carry no similarity information

        if (current_hit is None or hit.query != current_hit.query
                or hit.target != current_hit.target):
            if current_hit is not None:
                _add_edge(current_hit, query_mapping_segments,
                          target_mapping_segments)

            query_mapping_segments = []
            target_mapping_segments = []
            current_hit = hit

        query_mapping_segments.append(
            unmapped.MappingSegment(hit.query_start, hit.query_end))
        target_mapping_segments.append(
            unmapped.MappingSegment(hit.target_start, hit.target_end))

    # BUG FIX: the final (query, target) group never contributed an
    # edge, because edges were only added when a new group started.
    if current_hit is not None:
        _add_edge(current_hit, query_mapping_segments,
                  target_mapping_segments)

    connected_components, n_components = \
        utils.find_connected_components(similarity_graph)

    groups = [[] for _ in xrange(n_components)]
    for i in xrange(len(connected_components)):
        groups[connected_components[i]].append(int2read[i])

    #for g in groups:
    #    logger.debug("Group {0}".format(len(g)))
    #    for s in g:
    #        logger.debug("\t{0}".format(seq_lengths[s]))

    groups = [group for group in groups if len(group) > 1]
    trimmed_reads_dict = fp.read_sequence_dict(trimmed_reads_path)
    unique_plasmids = dict()

    for group in groups:
        sequence = trimmed_reads_dict[group[0]]
        if len(sequence) >= min_sequence_length:
            unique_plasmids[group[0]] = sequence

    return unique_plasmids