def convert_xmfa_to_gff3(xmfa_file,
                         fasta_genomes,
                         window_size=3,
                         relative_to="1"):
    """Write per-position percent-identity BigWig tracks under ``out/``.

    Every genome in the XMFA alignment is compared against the
    ``relative_to`` genome over a sliding window of +/- ``window_size``
    positions; the per-position values are streamed to each genome's
    temp file (provided by ``_id_tn_dict``) and converted to BigWig.
    """
    label_convert = _id_tn_dict(fasta_genomes)
    # Best-effort creation of the output directory; it may already exist.
    try:
        os.makedirs("out")
    except Exception:
        pass

    for lcb_idx, lcb in enumerate(parse_xmfa(xmfa_file)):
        seq_ids = [entry["id"] for entry in lcb]

        # The LCB must contain the reference genome...
        if relative_to not in seq_ids:
            continue

        # ...and at least one other genome to compare against.
        if len(seq_ids) == 1:
            continue

        parent = next(entry for entry in lcb if entry["id"] == relative_to)
        others = [entry for entry in lcb if entry["id"] != relative_to]

        # A (0, 0) placement carries no alignable region for the reference.
        if parent["start"] == 0 and parent["end"] == 0:
            continue

        corrected_parent, corrected_targets = remove_gaps(
            parent["seq"], [entry["seq"] for entry in others])
        # Attach the gap-corrected sequences back onto the LCB entries.
        parent["corrected"] = corrected_parent
        for entry, corrected in zip(others, corrected_targets):
            entry["corrected"] = corrected

        for pos in range(1, len(corrected_parent) - 1):
            # Window bounds depend only on pos, so hoist them out of the
            # per-genome loop; clamp the left edge at zero.
            lo = max(0, pos - window_size)
            hi = pos + window_size
            for entry in others:
                point_pid = percent_identity(
                    parent["corrected"][lo:hi],
                    entry["corrected"][lo:hi],
                )
                entry_track = label_convert[entry["id"]]["temp"]
                entry_track.write(
                    "%s\t%s\n" % (abs(parent["start"]) + pos, point_pid))

    for key, other in label_convert.items():
        # Ignore self-self
        if key == relative_to:
            continue

        other["temp"].close()
        sizes = [(label_convert[relative_to]["record_id"],
                  label_convert[relative_to]["len"])]
        bw_file = os.path.join("out",
                               secure_filename(other["record_id"] + ".bigwig"))

        convert_to_bigwig(other["temp"].name, sizes, bw_file)
def convert_xmfa_to_gff3(xmfa_file, sequences=None, window_size=1000, protein=False):
    """Yield one SeqRecord per genome describing its synteny with the others.

    For every LCB in ``xmfa_file`` and every genome acting as parent, a
    ``match`` feature (with reduced sub-features from
    ``generate_subfeatures``) is appended to that genome's record.

    :param xmfa_file: path or handle consumed by ``parse_xmfa``.
    :param sequences: sequence files passed to ``id_tn_dict`` to build the
        id -> record metadata mapping.
    :param window_size: window size forwarded to ``generate_subfeatures``.
    :param protein: forwarded to ``generate_subfeatures``.
    :returns: generator of ``SeqRecord`` objects, one per genome id.
    """
    label_convert = id_tn_dict(sequences)
    lcbs = parse_xmfa(xmfa_file)

    # Placeholder records ("ACTG" dummy sequence) that only carry features.
    # BUG FIX: Seq expects an alphabet *instance*; the original passed the
    # IUPACUnambiguousDNA class itself.
    parent_records = {
        x: SeqRecord(
            Seq("ACTG", IUPAC.IUPACUnambiguousDNA()), id=label_convert[x]["record_id"]
        )
        for x in label_convert.keys()
    }

    for lcb_idx, lcb in enumerate(lcbs):
        ids = [seq["id"] for seq in lcb]
        # Skip sequences that are JUST a single genome
        if len(ids) == 1:
            continue

        # Each genome takes a turn as the "parent" coordinate system.
        for parent_id in ids:
            parent = [seq for seq in lcb if seq["id"] == parent_id][0]
            others = [seq for seq in lcb if seq["id"] != parent_id]

            # A (0, 0) placement means the parent has no region in this LCB.
            if parent["start"] == 0 and parent["end"] == 0:
                continue

            for o_idx, other in enumerate(others):
                # A feature representing a region of synteny between parent and the given other
                other_feature = SeqFeature(
                    FeatureLocation(parent["start"], parent["end"]),
                    type="match",
                    strand=parent["strand"],
                    qualifiers={
                        "source": "progressiveMauve",
                        "Target": label_convert[other["id"]]["record_id"],
                        "Target_protein": other["comment"],
                        # ID is unique per (LCB, other, parent, other) tuple.
                        "ID": "m_%s_%s_%s_%s"
                        % (
                            lcb_idx,
                            o_idx,
                            label_convert[parent["id"]]["record_id"],
                            label_convert[other["id"]]["record_id"],
                        ),
                        "Name": other["comment"],
                    },
                )
                other_feature.sub_features = []

                # Sub-features are sorted by start, then merged/reduced before
                # being attached to the parent match feature.
                subs = generate_subfeatures(parent, window_size, other, protein=protein)
                for subfeature in reduce_subfeatures(
                    sorted(subs, key=lambda x: x.location.start)
                ):
                    other_feature.sub_features.append(subfeature)

                parent_records[parent["id"]].features.append(other_feature)

    for i in parent_records:
        yield parent_records[i]
# Example #3
# 0
def total_similarity(xmfa_file, sequences=None, dice=False):
    """Print a pairwise percent-similarity matrix for the aligned genomes.

    Each LCB contributes ``percent_identity * lcb_length`` to every ordered
    genome pair it contains; totals are then normalized by genome length
    (or, with ``dice=True``, by the Dice coefficient denominator) and the
    matrix is written to stdout, tab-separated.

    :param xmfa_file: path or handle consumed by ``parse_xmfa``.
    :param sequences: sequence files for ``_id_tn_dict``; required.
    :param dice: use Dice normalization (2*S / (len_i + len_j)) instead of
        dividing by the parent genome length.
    :raises Exception: if ``sequences`` is not provided.
    """
    if sequences is None:
        raise Exception("Must provide a non-zero number of sequence files")

    label_convert = _id_tn_dict(sequences)
    lcbs = parse_xmfa(xmfa_file)

    # Accumulated length-weighted similarity per ordered (i, j) genome pair.
    table = {}

    for lcb in lcbs:
        # Ignore LCBs containing fewer than two sequences -- nothing to
        # compare.  (FIX: the original tested ``== 0``; single-genome LCBs
        # slipped through, contributing nothing only by accident of
        # ``permutations`` being empty.)
        if len(lcb) < 2:
            continue

        # permutations based on num sequences to compare for current LCB
        compare_seqs = list(itertools.permutations(range(0, len(lcb)), 2))
        for permutation in compare_seqs:
            (i, j) = permutation
            similarity = percent_identity(lcb[i]['seq'], lcb[j]['seq'])

            i_name = label_convert[lcb[i]['id']]['id']
            j_name = label_convert[lcb[j]['id']]['id']
            # find length of sequence in LCB (inclusive coordinates)
            length_seq_lcb = lcb[i]['end'] - (lcb[i]['start'] - 1)
            # populate table with normalized similarity value based on length_seq_lcb
            if (i_name, j_name) not in table:
                table[(i_name, j_name)] = 0
            table[(i_name, j_name)] += length_seq_lcb * similarity

    # finalize total percent similarity by dividing by length of parent sequence
    for i in label_convert.keys():
        for j in label_convert.keys():
            i_name = label_convert[i]['id']
            j_name = label_convert[j]['id']
            if (i_name, j_name) in table:
                if dice:
                    table[(i_name, j_name)] = 2 * table[(i_name, j_name)] / (
                        label_convert[i]['len'] + label_convert[j]['len'])
                else:
                    table[(i_name,
                           j_name)] = table[(i_name,
                                             j_name)] / label_convert[i]['len']
            else:
                table[(i_name, j_name)] = 0

            # A genome is 100% similar to itself by definition.
            if i_name == j_name:
                table[(i_name, j_name)] = 100

    # print table: header row of names, then one row per genome.
    names = []
    table_keys = sorted(label_convert.keys())

    for i in table_keys:
        names.append(label_convert[i]['id'])

    sys.stdout.write('\t' + '\t'.join(names) + '\n')
    for j in table_keys:
        j_key = label_convert[j]['id']
        sys.stdout.write(j_key)
        for i in table_keys:
            i_key = label_convert[i]['id']
            sys.stdout.write('\t%0.2f' % table[(i_key, j_key)])
        sys.stdout.write('\n')
# Example #4
# 0
    return new_lcbs


if __name__ == '__main__':
    # CLI entry point: parse an XMFA file, split its LCBs by window/threshold,
    # and write the result back out as XMFA.
    parser = argparse.ArgumentParser(description='Split XMFA alignments',
                                     prog='xmfa2smallerXmfa')
    parser.add_argument('xmfa_file',
                        type=argparse.FileType("r"),
                        help='XMFA File')
    parser.add_argument('--window_size',
                        type=int,
                        default=10,
                        help='Window size for analysis')
    parser.add_argument('--threshold',
                        type=float,
                        default=0.7,
                        help='All genomes must meet N percent similarity')

    args = parser.parse_args()

    # Parse, then split, then serialize.
    parsed_lcbs = xmfa.parse_xmfa(args.xmfa_file)
    smaller_lcbs = split_lcbs(parsed_lcbs,
                              window_size=args.window_size,
                              threshold=args.threshold)
    xmfa.to_xmfa(smaller_lcbs)