Python get_all_seq_length Beispiele

Programmiersprache: Python

Namespace / Paketname: basic_seq_utilities

Methode / Funktion: get_all_seq_length

Beispiele auf hotexamples.com: 2

Python get_all_seq_length – 2 Beispiele gefunden. Dies sind die am besten bewerteten Python-Beispiele für die Funktion basic_seq_utilities.get_all_seq_length, die aus Open-Source-Projekten extrahiert wurden. Sie können Beispiele bewerten, um die Qualität der Beispiele zu verbessern.

Beispiel #1

Datei anzeigen

Datei: mutant_flanking_region_tools.py Projekt: Jonikas-Lab/Zhang-Patena-2014

def print_base_count_fraction_for_dist(flanking_region_count_list, distance_from_middle, convert_counts=lambda x: x,
                                      ignore_bases_pattern=None, average_both_sides=False):
    """ Return text with the base counts/fractions for the N bases around the middle, over all the flanking regions.

    Despite the name, nothing is printed: the formatted lines are returned as a single string 
     (one line per reported position; empty string if flanking_region_count_list is empty).

    Flanking_region_count_list should be a (seq,count) list; each count is passed through convert_counts 
     before being handed to base_count_dict/base_fraction_dict (presumably used as the weight of that sequence - 
     those helpers are defined elsewhere).  All sequences must have the same, even, length.

    Distance_from_middle is the number of bases to look at on EACH side of the middle.

    Ignore_bases_pattern should be None to count all the bases, or a string giving a base pattern (centered around the middle), 
     in which case only the bases with an N will be counted - so for instance if ignore_bases_pattern is CANN, bases -2 and -1 
      will be ignored (presumed to have been filtered through a pattern that requires them to be CA, although this isn't checked), 
      and only data for bases 1 and 2 (as well as bases before -2 and after 2, depending on distance_from_middle) will be given.
     The pattern must have an even length, no longer than 2*distance_from_middle; a shorter pattern is padded 
      with N on both sides so that it stays centered.

    If average_both_sides is True, bases from the two sides around the middle will be averaged together (after reverse-complementing
     one side, of course): CA|GG will be converted to |GG and |TG (rev-compl of CA|) and the GG and TG treated together for 
     calculating base frequencies/counts.

    Raises ValueError if the flanking region length or the ignore_bases_pattern length is odd, 
     or if ignore_bases_pattern is longer than 2*distance_from_middle.
    """
    # nothing to report for empty input (there are no sequences to derive a common length from either)
    if not flanking_region_count_list:
        return ''
    # explicit tuple instead of zip(*list)[0], which needs at least one element and only works on python2
    all_seqs = tuple(seq for (seq, _) in flanking_region_count_list)
    flanking_region_length = get_all_seq_length(all_seqs)
    if flanking_region_length % 2:              raise ValueError("Flanking region length must be an even number!")
    flank_length = flanking_region_length // 2
    # grab only the flanking region length we're actually interested in, and convert the counts
    local_flanking_region_length = 2*distance_from_middle
    slice_start, slice_end = flank_length-distance_from_middle, flank_length+distance_from_middle
    local_flanking_region_count_list = [(seq[slice_start:slice_end], convert_counts(count)) 
                                        for (seq,count) in flanking_region_count_list]
    # apply ignore_bases_pattern by changing the relevant bases to N
    #  note that it's done this way, instead of just skipping these positions in the final output, 
    #  because that wouldn't work if average_both_sides is True: if ignore_bases_pattern is ANAN, 
    #   the ignore pattern isn't symmetrical around the middle, so half the position 1 bases will be ignored (first N in ANAN), 
    #    and half the position 2 bases will be ignored (second N in ANAN) - there will be no single position with all bases ignored.
    if ignore_bases_pattern is not None:
        if len(ignore_bases_pattern) % 2:       raise ValueError("Ignore_bases_pattern length must be an even number!")
        # both lengths are even here, so the difference splits evenly between the two sides
        padding_length = (local_flanking_region_length - len(ignore_bases_pattern)) // 2
        if padding_length < 0:  raise ValueError("Ignore_bases_pattern is longer than 2*distance_from_middle - probably error!")
        # pad with a POSITIVE number of N's: multiplying a string by a negative number gives '', which would leave 
        #  the pattern too short, and the zip below would then truncate the sequences and mis-align the mask.
        padding = 'N'*padding_length
        full_mask_pattern = padding + ignore_bases_pattern + padding
        def mask_seq(seq, mask_pattern):
            # keep bases where the pattern has N; replace the others (the ignored ones) with N
            return ''.join([(base if if_mask=='N' else 'N') for (base,if_mask) in zip(seq, mask_pattern)])
        local_flanking_region_count_list = [(mask_seq(seq, full_mask_pattern), count) 
                                            for (seq,count) in local_flanking_region_count_list]
    # if average_both_sides, make a new local_flanking_region_count_list that has each half of each sequence separately
    #  (the first half reverse-complemented, so both halves read outward from the middle)
    if average_both_sides:
        half_region_count_list = []
        for flanking_region,count in local_flanking_region_count_list:
            half_region_count_list.append((reverse_complement(flanking_region[:distance_from_middle]), count))
            half_region_count_list.append((flanking_region[distance_from_middle:], count))
        local_flanking_region_count_list = half_region_count_list
        local_flanking_region_length = local_flanking_region_length // 2
    base_count_list_dict = base_count_dict(local_flanking_region_count_list)
    base_fraction_list_dict = base_fraction_dict(local_flanking_region_count_list)
    # for each position in the final flanking regions, give the base fraction/count; 
    #  ignore positions in which there were no non-N bases.
    output_lines = []
    for position in range(local_flanking_region_length):
        if not sum(base_count_list_dict[base][position] for base in NORMAL_DNA_BASES):
            continue
        data = ['%s %.0f%% (%s)'%(base, base_fraction_list_dict[base][position]*100, base_count_list_dict[base][position]) 
                for base in NORMAL_DNA_BASES]
        display_pos = position+1 if average_both_sides else _relative_position_vs_cut(position, distance_from_middle)
        output_lines.append(" - position %s: \t%s\n"%(display_pos, ', \t'.join(data)))
    return ''.join(output_lines)

Beispiel #2

Datei anzeigen

Datei: mutant_flanking_region_tools.py Projekt: Jonikas-Lab/mutant-pools

def filter_flanking_regions_by_pattern(
    flanking_region_count_list,
    pattern,
    either_orientation=True,
    print_info=True,
    category=None,
    meaning_of_seqs="positions",
    meaning_of_counts="counts",
):
    """ Return separate lists of flanking regions that do and don't match given sequence pattern.

    flanking_region_count_list should be a list of (flanking_region, count) pairs (like from grab_flanking_regions_from_mutantfile);
     the two return values (flanking regions that match and don't match the pattern) are the same format.
     For empty input, two empty lists are returned and nothing is printed.

    The pattern should be a sequence string (allowed letters are ACTGN). It's considered to be centered around the cut site;
     the flanking regions likewise.  E.g. if pattern is GNAN, a fl.region of GCAC or TTGCACTT would match, but TTTTGCAC would not.
     A pattern shorter than the flanking regions is padded with N on both sides before matching.
     Any "." characters in a flanking region are replaced with N before matching (and in the returned sequences).

    If either_orientation is True, each flanking region will be tried against the pattern in both the forward and the reverse
     orientation, and the returned flanking region will be in the orientation that matched - e.g. if pattern is GNAN,
     a flanking region of either TTGCACTT or TTCTCCTT would match (forward and rev-compl respectively),
      and the latter would be returned as rev-compl, AAGGAGAA.
     The orientation that is tried first is picked at random for each region (to avoid bias), so for a region
      that matches both ways, and for non-matching regions, the returned orientation can differ between runs.

    If print_info is True, a summary line is printed about what number/percentage matched and didn't, given up to two ways:
     - by flanking region, counting each once, if meaning_of_seqs is not None, and meaning_of_seqs will be used as the description
     - by summed count, if meaning_of_counts is not None, and meaning_of_counts will be used as the description.
     The line is prefixed with category if that is not None.

    Raises ValueError if the flanking region length or pattern length is odd, if the pattern is longer
     than the flanking regions, or if print_info is True but both meaning_of_* arguments are None.
    """
    # always return a (match, nomatch) pair, so that callers can unpack the result for empty input too
    if not flanking_region_count_list:
        return [], []
    # explicit tuple instead of zip(*list)[0], which only works on python2
    all_seqs = tuple(seq for (seq, _) in flanking_region_count_list)
    flanking_region_length = get_all_seq_length(all_seqs)
    if flanking_region_length % 2:
        raise ValueError("Flanking region length must be an even number!")
    if len(pattern) % 2:
        raise ValueError("Pattern length must be an even number!")
    if len(pattern) > flanking_region_length:
        raise ValueError("Pattern cannot be longer than flanking regions!")
    # pad the pattern to match the flanking region length (keep the original for the printed summary)
    orig_pattern = pattern
    if len(pattern) < flanking_region_length:
        # both lengths are even, so the padding splits evenly between the two sides
        padding = "N" * ((flanking_region_length - len(pattern)) // 2)
        pattern = padding + pattern + padding
    # go over all the flanking regions:
    flanking_region_count_list_match, flanking_region_count_list_nomatch = [], []
    for (flanking_region, count) in flanking_region_count_list:
        # if the flanking region is padded with .'s, change them to N's to make check_seq_against_pattern take it
        flanking_region = flanking_region.replace(".", "N")
        # if we're looking at both orientations, then first randomize the orientation to avoid bias
        if either_orientation and random.random() < 0.5:
            flanking_region = reverse_complement(flanking_region)
        # if it matches the pattern, save it as a match and go to the next one
        if check_seq_against_pattern(flanking_region, pattern):
            flanking_region_count_list_match.append((flanking_region, count))
            continue
        # or if its rev-compl matches the pattern and either_orientation is True, save it as a match and go on to the next one;
        if either_orientation:
            flanking_region = reverse_complement(flanking_region)
            if check_seq_against_pattern(flanking_region, pattern):
                flanking_region_count_list_match.append((flanking_region, count))
                continue
        # if it didn't match anywhere, save it as a no-match (in whatever orientation was tried last).
        flanking_region_count_list_nomatch.append((flanking_region, count))
    if print_info:
        if meaning_of_seqs is None and meaning_of_counts is None:
            raise ValueError("To get info printed, at least one of meaning_of_seqs/meaning_of_counts must be not None!")
        print_data = "%smatched %s:  " % ("" if category is None else category + " ", orig_pattern)
        if meaning_of_seqs is not None:
            positions_matched = len(flanking_region_count_list_match)
            positions_unmatched = len(flanking_region_count_list_nomatch)
            positions_all = positions_matched + positions_unmatched
            print_data += "%s, unmatched %s/%s" % (
                general_utilities.value_and_percentages(
                    positions_matched, [positions_all], insert_word=meaning_of_seqs
                ),
                positions_unmatched,
                positions_all,
            )
        if meaning_of_counts is not None:
            # sum over the pairs directly: zip(*data)[1] raises IndexError when a list is empty
            #  (i.e. when everything matched or nothing matched), and only works on python2
            counts_matched = sum(count for (_, count) in flanking_region_count_list_match)
            counts_unmatched = sum(count for (_, count) in flanking_region_count_list_nomatch)
            counts_all = counts_matched + counts_unmatched
            print_data += ";  %s, unmatched %s/%s." % (
                general_utilities.value_and_percentages(counts_matched, [counts_all], insert_word=meaning_of_counts),
                counts_unmatched,
                counts_all,
            )
        # parenthesized single argument: prints the same text under both python2 and python3
        print(print_data)
    return flanking_region_count_list_match, flanking_region_count_list_nomatch