Esempio n. 1
0
                log.info("insignificant genomic alignment block %s ..." %
                         ch.id)
                continue
            new_id = "%si%d" % (ch.id, i)
            print >> opt.output, str(ch._replace(id=new_id))
            map(lambda tup: opt.output.write("%d %d %d\n" % tup),
                izip(S, T, Q))
            print >> opt.output, "%d\n" % S[-1]
        except KeyError:
            log.warning("skipping chromosome/contig (%s, %s)" %
                        (a.chrom, b.chrom))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="""EPO alignments (.out) to .chain converter.""",
        epilog="Olgert Denas (Taylor Lab)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument("input", help="File to process.")
    parser.add_argument(
        "--species",
        nargs=2,
        default=["homo_sapiens", "mus_musculus"],
        help=
        "Names of target and query species (respectively) in the alignment.")
    parser.add_argument("--chrsizes",
                        nargs=2,
                        required=True,
                        help="Chromosome sizes for the given species.")
    parser.add_argument("-o",
                        '--output',
Esempio n. 2
0
        data = np.array(data, dtype=elem_t)
    else:
        with open(path) as fd:
            for line in fd:
                cols = line.split()
                data.append(
                    (cols[0], int(cols[1]), int(cols[2]), cols[3],
                     int(cols[4]), cols[5], float(cols[6]), float(cols[7]),
                     float(cols[8]), int(cols[-1]) + int(cols[1])))
        data = np.array(data, dtype=narrowPeak_t)
    return data


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description=__doc__,
        epilog="Adam Diehl (Boyle Lab)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument(
        "input",
        help=
        "Input regions to process. Should be in standard bed format. Only the first four bed fields will be used."
    )
    parser.add_argument(
        "tree",
        help=
        "Tree, in standard Newick format, with or without branch lengths, describing relationships of query and target species to outgroups."
    )
    parser.add_argument(
        "qname",
        help=
Esempio n. 3
0
def loadFeatures(path):
    """Load BED4 features from *path* (all other columns are ignored).

    Every data line is split on whitespace and its first four columns are
    kept as ``(chrom, start, end, name)``, with ``start`` and ``end``
    converted to ``int``. Blank lines and BED header lines (``#`` comments,
    ``track`` and ``browser`` lines) are skipped rather than aborting the
    whole load.

    Args:
        path: Path of the BED file to read.

    Returns:
        A numpy array built from the collected tuples with dtype ``elem_t``.

    Raises:
        IndexError: If a data line has fewer than four columns.
        ValueError: If the start or end column is not an integer.
    """
    log.info("loading from %s ...", path)
    data = []
    with open(path) as fd:
        for line in fd:
            cols = line.split()
            # Blank lines yield no columns; header lines carry no feature.
            if (not cols or cols[0] in ("track", "browser")
                    or cols[0].startswith("#")):
                continue
            data.append((cols[0], int(cols[1]), int(cols[2]), cols[3]))
    return np.array(data, dtype=elem_t)


# Script entry point: the command-line interface is only built when the
# module is executed directly, not when it is imported.
if __name__ == "__main__":
    # The module docstring is reused as the --help description, and
    # ArgumentDefaultsHelpFormatter appends each option's default to its
    # help text.
    parser = argparse.ArgumentParser(
        description=__doc__,
        epilog="Olgert Denas (Taylor Lab)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # One or more input files (nargs='+' collects them into a list).
    parser.add_argument(
        "input",
        nargs='+',
        help=
        "Input to process. If more than a file is specified, all files will be mapped and placed on --output, which should be a directory."
    )
    # Single positional alignment file.
    parser.add_argument("alignment", help="Alignment file (.chain or .pkl)")

    # Output format selector, restricted to the two listed choices.
    parser.add_argument("-f",
                        '--format',
                        choices=("BED4", "BED12"),
                        default="BED4",
                        help="Output format.")
Esempio n. 4
0
def _build_parser():
    """Build and return the command-line argument parser used by main()."""
    parser = argparse.ArgumentParser(
        description=__doc__,
        epilog="Adam Diehl (Boyle Lab)",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument(
        "input",
        help="Input regions to process. Should be in standard bed format. Only the first four bed fields will be used."
    )
    parser.add_argument(
        "tree",
        help="Tree, in standard Newick format, with or without branch lengths, describing relationships of query and target species to outgroups. May be given as a string or file."
    )
    parser.add_argument(
        "qname",
        help="Name of the query species. Regions from this species will be mapped to target species coordinates."
    )
    parser.add_argument(
        "tname",
        help="Name of the target species. Regions from the query species will be mapped to coordinates from this species."
    )
    parser.add_argument(
        "alignments",
        nargs='+',
        help="Alignment files (.chain or .pkl): One for the target species and one per outgroup species. Files should be named according to the convention: qname.tname[...].chain.gz, where qname is the query species name and tname is the name of the target/outgroup species. Names used for qname and tname must match names used in the newick tree."
    )
    # 'stdout' and '-' are aliases for /dev/stdout; argparse also runs the
    # string default through this converter.
    parser.add_argument(
        "-o",
        '--output',
        metavar="FILE",
        default='stdout',
        type=lambda s: "/dev/stdout" if s in ('stdout', '-') else s,
        help="Output file. Default stdout.")
    parser.add_argument(
        "-t",
        '--threshold',
        metavar="FLOAT",
        default=0.0,
        type=float,
        help="Mapping threshold i.e., |elem| * threshold <= |mapped_elem|. Default = 0.0 -- equivalent to accepting a single-base overlap. On the other end of the spectrum, setting this value to 1 is equivalent to only accepting full-length overlaps."
    )
    parser.add_argument(
        '-g',
        '--gap',
        type=int,
        default=-1,
        help="Ignore elements with an insertion/deletion of this or bigger size. Using the default value (-1) will allow gaps of any size."
    )
    parser.add_argument(
        '-v',
        '--verbose',
        type=str,
        choices=list(LOG_LEVELS),
        default='info',
        help='Verbosity level')
    parser.add_argument(
        "-d",
        '--drop_split',
        default=False,
        action='store_true',
        help="If elements span multiple chains, report them as non-mapping. These will then be reported as gains or losses, according to the maximum-parsimony predictions. This is the default mapping behavior for bnMapper. By default, mapGL.pys will follow the mapping convention used by liftOver, whereas the longest mapped alignment is reported for split elements."
    )
    parser.add_argument(
        "-i",
        "--in_format",
        choices=["BED", "narrowPeak"],
        default="BED",
        help="Input file format. (Default: BED)")
    parser.add_argument(
        "-f",
        "--full_labels",
        default=False,
        action='store_true',
        help="Attempt to predict gain/loss events on all branches of the tree, not just query/target branches. Output will include a comma-delimited list of gain/loss events from any/all affected branches."
    )
    parser.add_argument(
        "-n",
        "--no_prune",
        default=False,
        action='store_true',
        help="Do not attempt to disambiguate the root state to resolve ambiguous gain/loss predictions. Instead, label affected features as 'ambiguous'."
    )
    parser.add_argument(
        "-p",
        "--priority",
        type=str,
        choices=["gain", "loss"],
        default="gain",
        help="When resolving ambiguous trees, prioritize sequence gain or sequence loss. This can be thought of as assigning a lower cost to sequence insertions relative to deletions, or vice-versa. When priority='gain', ambiguity is resolved by assigning 0 state to the root node, such that sequence presence on a descendant branch will be interpreted as a gain. When priority='loss', ambiguity is resolved by asssigning state 1 to the root node, such that sequence absence in a descendant node is interpreted as a sequence loss. Default=gain"
    )
    return parser


def _exit_with_error(message):
    """Write *message* to stderr and terminate the process with status 1."""
    sys.stderr.write(message)
    # sys.exit is always available; the bare exit() builtin is injected by
    # the site module and is missing under ``python -S`` or frozen builds.
    sys.exit(1)


def main():
    """Command-line entry point.

    Parses the command line, loads the Newick species tree (from a file if
    ``tree`` names an existing file, otherwise from the string itself),
    validates that the query and target species are leaves of that tree,
    indexes every alignment chain file into an interval tree keyed by the
    chain's target species, and finally hands the loaded input features to
    ``transform_file``.

    Exits with status 1 (after writing a message to stderr) when:
      * the query or target species is not a leaf of the tree,
      * a chain file name does not start with the query species name,
      * a chain file name has no ``.tname`` component,
      * a chain's target species is not a leaf of the tree,
      * fewer chains than ``len(leaves) - 1`` were supplied, or
      * no chain was supplied for the target species.
    """
    opt = _build_parser().parse_args()
    log.setLevel(LOG_LEVELS[opt.verbose])

    # Load up the newick tree
    log.info("Parsing species tree: {}".format(opt.tree))
    if os.path.isfile(opt.tree):
        phylo_full = newick.read(opt.tree)[0]
    else:
        phylo_full = newick.parse_node(opt.tree)
    if opt.full_labels:
        phylo_full.name_internal_nodes()
    log.debug("Full tree:\n{}".format(
        phylo_full.ascii_art(show_internal=False, strict=True)))

    # NOTE: pruning the terminal outgroup (for use in ambiguous cases) is
    # currently disabled, so an empty placeholder is passed downstream.
    # TODO: newick.py lacks a copy function; re-enabling pruning requires
    # either a deep copy or re-parsing the tree from opt.tree.
    phylo_pruned = {}

    # Make sure target and query species are in the tree
    leaves = phylo_full.get_leaf_names()
    if opt.qname not in leaves:
        _exit_with_error(
            "Query species name {} not present in tree: {}. Exiting.\n".format(
                opt.qname, phylo_full.newick))
    if opt.tname not in leaves:
        _exit_with_error(
            "Target species name {} not present in tree: {}. Exiting.\n".
            format(opt.tname, phylo_full.newick))

    # Sanity checks and warnings for odd usages:
    if opt.no_prune and opt.full_labels:
        sys.stderr.write(
            "WARNING: --full_labels requires an unambiguous tree. --no_prune will be ignored.\n"
        )
    if len(leaves) < 4:
        # Single-outgroup phylogeny
        sys.stderr.write(
            "WARNING: Tree-disambiguation requires at least two outgroup species. Forcing --no_prune.\n"
        )
        opt.no_prune = True

    # Load up alignment chains. TREES maps each chain's target/outgroup
    # species name (taken from the chain file name) to a dict holding the
    # loaded chains ("EPO") and their interval index ("TREE").
    TREES = {}
    for chain_path in opt.alignments:
        # File names follow qname.tname[...]: derive both species from it.
        name_parts = os.path.basename(chain_path).split(".")
        if name_parts[0] != opt.qname:
            _exit_with_error(
                "Chain {} does not appear to contain the correct query species. Exiting.\n"
                .format(chain_path))
        if len(name_parts) < 2:
            # Without a second dot-separated field there is no target
            # species to look up (previously an IndexError traceback).
            _exit_with_error(
                "Chain {} is not named qname.tname[...]; cannot determine its target species. Exiting.\n"
                .format(chain_path))
        species = name_parts[1]
        if species not in leaves:
            _exit_with_error(
                "Chain {} target species not present in tree {}. Exiting.\n".
                format(chain_path, phylo_full.newick))

        # loading alignments from the chain/pkl file
        EPO = {ch[0].id: ch for ch in loadChains(chain_path)}

        # create an interval tree based on chain headers (from_species side)
        # for fast feature-to-chain_header searching
        log.info("indexing %d chains ...", len(EPO))
        TREE = GIntervalTree()
        for header, _t, _q in EPO.values():
            TREE.add(header.tName,
                     Interval(header.tStart, header.tEnd, header.id))

        TREES[species] = {"EPO": EPO, "TREE": TREE}

    if len(TREES) < len(leaves) - 1:
        _exit_with_error("Not enough alignments for the given tree!\n")
    if opt.tname not in TREES:
        # The count check above does not guarantee that one of the chains
        # targets opt.tname; fail clearly instead of with a KeyError.
        _exit_with_error(
            "No alignment supplied for target species {}. Exiting.\n".format(
                opt.tname))

    # transform elements
    transform_file(loadFeatures(opt.input, opt, TREES[opt.tname]["TREE"]),
                   opt.output, TREES, leaves, phylo_full, phylo_pruned, opt)