コード例 #1
0
def generate_subfeatures(parent, window_size, other, protein=False):
    """Yield a ``match_part`` SeqFeature for each window of an aligned pair.

    The parent alignment string is cut into consecutive windows of
    ``window_size`` characters.  Each window is compared against the same
    slice of ``other["seq"]`` with ``percent_identity``; windows scoring 0
    are skipped, every other window is yielded as a feature.

    Args:
        parent: mapping with at least ``"seq"`` (aligned sequence, ``-`` for
            gaps) and ``"start"`` (a negative value selects strand -1).
        window_size: number of alignment columns per window.
        other: mapping with at least ``"seq"``, the sequence aligned
            against ``parent``.
        protein: accepted for interface compatibility; not used here.

    Yields:
        SeqFeature objects of type ``match_part`` whose qualifiers carry the
        score and both window sequences.
    """
    log.debug("Generating subfeatures for %s %s %s", parent, window_size, other)

    # Trailing gap characters are dropped so that blocks do not run past the
    # end of the parent (which would also violate the gff3 spec).  A stop
    # codon is appended: it never matches (slightly lowering %identity) but
    # keeps the final block the same length as the others.
    padded = parent["seq"].rstrip("-") + "*"

    for offset in range(0, len(padded), window_size):
        query_block = padded[offset : offset + window_size]
        block_len = len(query_block)
        anchor = abs(parent["start"])

        # Gap characters before this window do not consume parent
        # coordinates, so they are subtracted from the window offset before
        # scaling by 3 (one alignment column -> three nucleotides).
        codon_len = 3
        gaps_before = padded[:offset].count("-")
        nucl_offset = codon_len * (offset - gaps_before)

        log.debug(
            "  I: %s, BS: %s, RWS: %s, RS: %s, IS: %s, C: %s",
            offset,
            query_block,
            block_len,
            anchor,
            nucl_offset,
            gaps_before,
        )

        strand = -1 if parent["start"] < 0 else 1

        subject_block = other["seq"][offset : offset + block_len]
        pid = percent_identity(query_block, subject_block)
        # Windows with 0% identity are not reported.
        if pid == 0:
            continue

        feature_start = anchor + nucl_offset
        feature_end = feature_start + codon_len * block_len
        yield SeqFeature(
            FeatureLocation(feature_start, feature_end),
            type="match_part",
            strand=strand,
            qualifiers={
                "source": "progressiveMauve",
                "score": pid,
                "qseq": query_block,
                "sseq": subject_block,
            },
        )
コード例 #2
0
def convert_xmfa_to_gff3(xmfa_file,
                         fasta_genomes,
                         window_size=3,
                         relative_to="1"):
    """Write sliding-window percent identity tracks relative to one genome.

    For every LCB returned by ``parse_xmfa`` that contains the
    ``relative_to`` sequence plus at least one other sequence, the
    gap-corrected parent is compared against each gap-corrected target in a
    window of ``[i - window_size, i + window_size)`` around each interior
    position ``i``.  Each score is appended as ``"<position>\\t<pid>"`` to
    the target genome's temp file.  Afterwards each temp file (except the
    reference genome's) is closed and handed to ``convert_to_bigwig``, which
    is given an output path under ``out/``.

    Args:
        xmfa_file: passed straight to ``parse_xmfa``.
        fasta_genomes: passed straight to ``_id_tn_dict``; the result is
            used as a mapping from sequence id to a dict with ``"temp"``,
            ``"record_id"`` and ``"len"`` entries.
        window_size: half-width of the comparison window, in columns.
        relative_to: id of the sequence used as the reference.

    Returns:
        None.  Output is produced through the temp files and
        ``convert_to_bigwig``.
    """
    label_convert = _id_tn_dict(fasta_genomes)
    try:
        os.makedirs("out")
    except OSError:
        # Best effort: the directory usually exists already.  Only
        # filesystem errors are ignored here so unrelated bugs still surface.
        pass

    for lcb in parse_xmfa(xmfa_file):
        ids = [seq["id"] for seq in lcb]

        # Doesn't match part of our sequence
        if relative_to not in ids:
            continue

        # Skip LCBs that contain JUST our "relative_to" genome
        if len(ids) == 1:
            continue

        parent = [seq for seq in lcb if seq["id"] == relative_to][0]
        others = [seq for seq in lcb if seq["id"] != relative_to]

        if parent["start"] == 0 and parent["end"] == 0:
            continue

        corrected_parent, corrected_targets = remove_gaps(
            parent["seq"], [other["seq"] for other in others])
        # Store the corrected sequences alongside the originals
        parent["corrected"] = corrected_parent
        for idx, target in enumerate(corrected_targets):
            others[idx]["corrected"] = target

        parent_start = abs(parent["start"])
        # First and last columns are skipped, as in the original loop bounds.
        for i in range(1, len(corrected_parent) - 1):
            left_bound = max(0, i - window_size)
            right_bound = i + window_size
            # The parent window is the same for every target; slice it once.
            parent_window = corrected_parent[left_bound:right_bound]
            for other in others:
                point_pid = percent_identity(
                    parent_window,
                    other["corrected"][left_bound:right_bound],
                )
                label_convert[other["id"]]["temp"].write(
                    "%s\t%s\n" % (parent_start + i, point_pid))

    for key, other in label_convert.items():
        # Ignore self-self
        if key == relative_to:
            continue

        # Close the temp file before it is read back by name.
        other["temp"].close()
        sizes = [(label_convert[relative_to]["record_id"],
                  label_convert[relative_to]["len"])]
        bw_file = os.path.join(
            "out", secure_filename(other["record_id"] + ".bigwig"))

        convert_to_bigwig(other["temp"].name, sizes, bw_file)
コード例 #3
0
ファイル: xmfa2tbl.py プロジェクト: Guiwenting/galaxy-tools
def total_similarity(xmfa_file, sequences=None, dice=False):
    """Print a pairwise similarity matrix for the genomes in an XMFA file.

    For every ordered pair of sequences within each LCB, the pair's
    ``percent_identity`` is weighted by the length of the first sequence's
    LCB region (``end - start + 1``) and accumulated.  The totals are then
    normalized and written to stdout as a tab-separated table.

    Args:
        xmfa_file: passed straight to ``parse_xmfa``.
        sequences: passed straight to ``_id_tn_dict``; must be non-empty.
            The result is used as a mapping from sequence id to a dict
            with ``"id"`` and ``"len"`` entries.
        dice: when true, normalize each total as
            ``2 * total / (len_i + len_j)``; otherwise divide by ``len_i``.

    Returns:
        None.  The table is written to ``sys.stdout``.

    Raises:
        Exception: if ``sequences`` is ``None`` or empty.
    """
    # Reject None and empty collections alike, as the message promises.
    if not sequences:
        raise Exception("Must provide a non-zero number of sequence files")

    label_convert = _id_tn_dict(sequences)
    lcbs = parse_xmfa(xmfa_file)

    # Accumulated, length-weighted similarity keyed by (name_i, name_j).
    table = {}

    for lcb in lcbs:
        # ignore LCBs containing fewer than two sequences (no pairs to compare)
        if len(lcb) < 2:
            continue

        # every ordered pair of sequences in the current LCB
        for i, j in itertools.permutations(range(len(lcb)), 2):
            similarity = percent_identity(lcb[i]['seq'], lcb[j]['seq'])

            i_name = label_convert[lcb[i]['id']]['id']
            j_name = label_convert[lcb[j]['id']]['id']
            # length of sequence i's region in this LCB (inclusive bounds)
            length_seq_lcb = lcb[i]['end'] - (lcb[i]['start'] - 1)
            pair = (i_name, j_name)
            table[pair] = table.get(pair, 0) + length_seq_lcb * similarity

    # finalize total percent similarity by dividing by sequence length(s)
    for i_info in label_convert.values():
        for j_info in label_convert.values():
            pair = (i_info['id'], j_info['id'])
            if pair in table:
                if dice:
                    table[pair] = 2 * table[pair] / (
                        i_info['len'] + j_info['len'])
                else:
                    table[pair] = table[pair] / i_info['len']
            else:
                # pairs never seen together in an LCB
                table[pair] = 0

            # a genome is always reported as 100% similar to itself
            if pair[0] == pair[1]:
                table[pair] = 100

    # print table: header row of names, then one row per genome
    table_keys = sorted(label_convert)
    names = [label_convert[key]['id'] for key in table_keys]

    sys.stdout.write('\t' + '\t'.join(names) + '\n')
    for j in table_keys:
        j_key = label_convert[j]['id']
        sys.stdout.write(j_key)
        for i_key in names:
            sys.stdout.write('\t%0.2f' % table[(i_key, j_key)])
        sys.stdout.write('\n')