Esempio n. 1
0
def annotate(args):
    r"""
    %prog annotate new.bed old.bed 2> log

    Annotate the `new.bed` with features from `old.bed` for the purpose of
    gene numbering.

    Ambiguity in ID assignment can be resolved by either of the following 2 methods:
    - `alignment`: make use of global sequence alignment score (calculated by `needle`)
    - `overlap`: make use of overlap length (calculated by `intersectBed`)

    Transfer over as many identifiers as possible while following guidelines:
    http://www.arabidopsis.org/portals/nomenclature/guidelines.jsp#editing

    Note: Following RegExp pattern describes the structure of the identifier
    assigned to features in the `new.bed` file.

    new_id_pat = re.compile(r"^\d+\.[cemtx]+\S+")

    Examples: 23231.m312389, 23231.t004898, 23231.tRNA.144
    Adjust the value of `new_id_pat` manually as per your ID naming conventions.
    """
    # The docstring above doubles as the command-line usage text (it is passed
    # to OptionParser below), so its wording is kept verbatim.  It is a raw
    # string so that the regex example keeps its backslashes without relying
    # on invalid escape sequences.
    #
    # args: list of command-line tokens; after option parsing exactly two
    #       positionals must remain (new.bed, old.bed).
    # Side effects: may create `consolidated.bed`, `nw.pairs` / `ovl.scores`,
    #       and writes `<new prefix>.annotated.bed`.  Exits via sys.exit on a
    #       usage error or when the needle scores file is missing.
    from jcvi.utils.grouper import Grouper

    valid_resolve_choices = ["alignment", "overlap"]

    p = OptionParser(annotate.__doc__)
    p.add_option("--resolve", default="alignment",
                 choices=valid_resolve_choices,
                 help="Resolve ID assignment based on a certain metric"
                      " [default: %default]")
    p.add_option("--atg_name", default=False, action="store_true",
                 help="Specify is locus IDs in `new.bed` file follow ATG nomenclature"
                      " [default: %default]")

    g1 = OptionGroup(p, "Optional parameters (alignment):\n"
                        "Use if resolving ambiguities based on sequence `alignment`")
    g1.add_option("--pid", dest="pid", default=35., type="float",
                  help="Percent identity cutoff [default: %default]")
    g1.add_option("--score", dest="score", default=250., type="float",
                  help="Alignment score cutoff [default: %default]")
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters (overlap):\n"
                        "Use if resolving ambiguities based on `overlap` length\n"
                        "Parameters equivalent to `intersectBed`")
    g2.add_option("-f", dest="f", default=0.5, type="float",
                  help="Minimum overlap fraction (0.0 - 1.0) [default: %default]")
    g2.add_option("-r", dest="r", default=False, action="store_true",
                  help="Require fraction overlap to be reciprocal [default: %default]")
    # NOTE(review): default is already True and the action is store_true, so
    # passing -s cannot change the value; kept as-is to preserve the CLI.
    g2.add_option("-s", dest="s", default=True, action="store_true",
                  help="Require same strandedness [default: %default]")
    p.add_option_group(g2)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    nbedfile, obedfile = args
    # Prefix of the new bed file (text before the last "."), used to name the output.
    npf = nbedfile.rsplit(".", 1)[0]

    # Make consolidated.bed (reused if a previous run already produced it)
    cbedfile = "consolidated.bed"
    if not os.path.isfile(cbedfile):
        consolidate(nbedfile, obedfile, cbedfile)
    else:
        logging.warning("`{0}` already exists. Skipping step".format(cbedfile))

    logging.warning("Resolving ID assignment ambiguity based on `{0}`".
                    format(opts.resolve))

    if opts.resolve == "alignment":
        # Get pairs and prompt to run needle
        pairsfile = "nw.pairs"
        scoresfile = "nw.scores"
        if not os.path.isfile(pairsfile):
            get_pairs(cbedfile, pairsfile)
        else:
            logging.warning("`{0}` already exists. Checking for needle output".
                            format(pairsfile))

        # If needle scores do not exist, prompt user to run needle
        if not os.path.isfile(scoresfile):
            logging.error("`{0}` does not exist. Please process {1} using `needle`".
                          format(scoresfile, pairsfile))
            sys.exit()
    else:
        scoresfile = "ovl.scores"
        # Calculate overlap length using intersectBed
        calculate_ovl(nbedfile, obedfile, opts, scoresfile)

    logging.warning("`{0}' exists. Storing scores in memory".
                    format(scoresfile))
    scores = read_scores(scoresfile, opts)

    # Group accessions that share a consolidated record: each consolidated
    # accn is a ";"-separated list whose members are joined into one group.
    cbed = Bed(cbedfile)
    g = Grouper()
    for c in cbed:
        g.join(*c.accn.split(";"))

    # Index the new bed lines by accession for lookup during annotation.
    nbed = Bed(nbedfile)
    nbedline = {}
    for line in nbed:
        nbedline[line.accn] = line

    # Annotate one sequence at a time; `abedline` (accn -> annotated line) and
    # `splits` are threaded through and returned updated by each call.
    abedline = {}
    splits = set()
    for seqid, chrbed in nbed.sub_beds():
        abedline, splits = annotate_chr(seqid, chrbed, g, scores, nbedline,
                                        abedline, opts, splits)

    if splits is not None:
        abedline = process_splits(splits, scores, nbedline, abedline)

    abedfile = npf + ".annotated.bed"
    # Context manager guarantees the handle is closed (and flushed) before the
    # file is handed to `sort`, even if formatting a line raises.
    with open(abedfile, "w") as afh:
        for accn in abedline:
            afh.write(str(abedline[accn]) + "\n")

    # presumably "-i" requests an in-place sort -- confirm against `sort`
    sort([abedfile, "-i"])
Esempio n. 2
0
    # NOTE(review): this is the tail of a definition whose header (and the
    # creation of the parser `p`) is not visible here.
    # The three set_* helpers are methods of the parser object; presumably
    # they register the bed-file, name-stripping and output-file options --
    # confirm against the parser class.
    p.set_beds()
    p.set_stripnames()
    p.set_outfile()

    # Options grouped under the "CoGe-specific options" heading.
    coge_group = OptionGroup(p, "CoGe-specific options")
    coge_group.add_option("--sqlite", help="Write sqlite database")
    coge_group.add_option("--qnote", default="null",
                          help="Query dataset group id")
    coge_group.add_option("--snote", default="null",
                          help="Subject dataset group id")

    # Options grouped under the "Synteny parameters" heading.
    params_group = OptionGroup(p, "Synteny parameters")
    params_group.add_option("--window", type="int", default=40,
                            help="Synteny window size")
    # NOTE(review): the help text speaks of a "number of anchors" while the
    # option is a float defaulting to .1 -- confirm the intended meaning.
    params_group.add_option("--cutoff", type="float", default=.1,
                            help="Minimum number of anchors to call synteny")
    # --scoring only accepts one of these two values.
    supported_scoring = ("collinear", "density")
    params_group.add_option("--scoring", choices=supported_scoring,
                            default="collinear", help="Scoring scheme")

    p.add_option_group(coge_group)
    p.add_option_group(params_group)

    # No argument list is passed; presumably the parser falls back to the
    # process command line, as optparse does -- confirm.
    opts, args = p.parse_args()

    # Exactly one positional argument (the blast file) is required;
    # otherwise print the help text and exit.
    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args
    main(blastfile, p, opts)
Esempio n. 3
0
def prepare(args):
    """
    %prog prepare --rearray_lib=<rearraylibrary> --orig_lib_file=<origlibfile>

    Inferred file names
    ---------------------------------------------
    `lookuptblfile` : rearraylibrary.lookup
    `rearraylibfile`: rearraylibrary.fasta

    Pick sequences from the original library file and the rearrayed library file
    based on the mapping information provided in the `lookuptblfile`.

    # lookuptblfile format: column number (index)
    # 1 (0)          2 (1)          3 (2)         4 (3)        5 (4)        6 (5)
    # source_clone   source_plate   source_well   dest_clone   dest_plate   dest_well

    The 1st and 4th column in the `lookuptblfile` form the pair of clones which
    constitute the elements used for the per-clone assembly.
    """
    # The docstring above is also the usage text given to OptionParser, so it
    # is kept verbatim.
    #
    # args: list of command-line tokens.
    # Side effects: creates the output folder if missing, writes one FASTA
    #       file per lookup-table row into it, and records each written path
    #       in `<rearray_lib>.log`.  Exits via sys.exit when a required
    #       option or input file is missing.
    from operator import itemgetter
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(prepare.__doc__)
    p.add_option("--rearray_lib", default=None,
            help="name of the rearrayed library [default: %default]")
    p.add_option("--orig_lib_file",
            help="fasta file containing reads from the original libraries [default: %default]")

    g = OptionGroup(p, "Optional parameters")
    g.add_option("--output_folder", default="to_assemble",
            help="output folder to write the FASTA files to [default: %default]")
    p.add_option_group(g)

    opts, args = p.parse_args(args)

    if not opts.rearray_lib or not opts.orig_lib_file:
        logging.error("Please specify the required parameters")
        sys.exit(not p.print_help())

    rearraylib, origlibfile = opts.rearray_lib, opts.orig_lib_file

    if not op.isfile(origlibfile):
        logging.error("Original library reads file `{0}` does not exist!".format(origlibfile))
        sys.exit()

    # Both companion files are derived from the rearrayed library name.
    lookuptblfile = rearraylib + '.lookup'
    logging.debug(lookuptblfile)
    if not op.isfile(lookuptblfile):
        logging.error("Lookup table file `{0}` does not exist!".format(lookuptblfile))
        sys.exit()

    rearraylibfile = rearraylib + '.fasta'
    logging.debug(rearraylibfile)
    if not op.isfile(rearraylibfile):
        logging.error("Rearrayed library reads file `{0}` does not exist!".format(rearraylibfile))
        sys.exit()

    origlibFasta = Fasta(origlibfile)
    rearraylibFasta = Fasta(rearraylibfile)

    # Materialise the id lists once; they are rescanned for every lookup row.
    origlibids = list(origlibFasta.iterkeys_ordered())
    rearraylibids = list(rearraylibFasta.iterkeys_ordered())

    if not op.isdir(opts.output_folder):
        logging.warning("Output directory `{0}` missing. Creating it now...".format(opts.output_folder))
        os.makedirs(opts.output_folder)

    logfile = rearraylib + '.log'
    clone_pair = itemgetter(0, 3)  # source_clone, dest_clone columns

    # Context managers close the log, the lookup table and every per-pair
    # FASTA file even when a row fails part-way through.
    with open(logfile, 'w') as log:
        with open(lookuptblfile, 'r') as fp:
            for row in fp:
                # Drop the line terminator so it cannot leak into the last
                # column (and from there into the prefix / file name).
                row = row.rstrip('\r\n')
                if not row:
                    continue  # tolerate blank lines in the lookup table

                origprefix, rearrayprefix = clone_pair(row.split('\t'))
                libpair = origprefix + '_' + rearrayprefix
                outfile = opts.output_folder + '/' + libpair + '.fasta'

                with open(outfile, 'w') as ofp:
                    # re.match anchors at the start of the id; the clone
                    # name is interpreted as a regular expression.
                    for o in origlibids:
                        if re.match(origprefix, o):
                            SeqIO.write(origlibFasta[o], ofp, 'fasta')

                    for r in rearraylibids:
                        if re.match(rearrayprefix, r):
                            SeqIO.write(rearraylibFasta[r], ofp, 'fasta')

                log.write(outfile + "\n")

    logging.debug('Wrote log file `{0}`'.format(logfile))
Esempio n. 4
0
def assemble(args):
    """
    Run `cap3` on a single multi FASTA file containing reads or a folder containing several
    multi FASTA files. Allows for tweaking of `cap3` parameters max_gap_len, ovl_pct_id, etc.
    """
    # The docstring above is also the usage text given to OptionParser, so it
    # is kept verbatim.
    #
    # args: list of command-line tokens.
    # Input selection precedence: --input_file_list, then --input_folder,
    #       then --input_file.
    # Side effects: runs one `cap3` command per existing input file and sends
    #       its output to `<file>.<prefix>.log`.  Exits via sys.exit on
    #       invalid parameter values or missing/empty input.
    p = OptionParser(assemble.__doc__)
    g1 = OptionGroup(p, "Input file options (required)",
            "Note: Please choose from and provide values for one of the following parameters")
    g1.add_option("--input_file", default=None,
            help="input file of reads [default: %default]")
    g1.add_option("--input_folder", default=None,
            help="input folder containing multi FASTA files of reads [default: %default]")
    g1.add_option("--input_file_list", default=None,
            help="list file containing paths to multi FASTA files of reads [default: %default]")
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters",
            "Note: If not specified, `cap3` defaults will be used")
    g2.add_option("-f", "--max_gap_len", default=20, type="int",
            help="maximum gap length in any overlap [default: %default]\n"
                 "Same as cap3 `-f` parameter.")
    g2.add_option("-p", "--ovl_pct_id", default=90, type="int",
            help="overlap percent identity cutoff [default: %default]\n"
                 "Same as cap3 `-p` parameter.")
    g2.add_option("-s", "--ovl_sim_score", default=900, type="int",
            help="overlap similarity score cutoff [default: %default]\n"
                 "Same as cap3 `-s` parameter.")
    g2.add_option("-x", "--prefix", dest="prefix", default="cap3",
            help="prefix string for output file name [default: %default]")
    p.add_option_group(g2)

    # presumably registers the option that surfaces as `opts.extra` below --
    # confirm against the parser class
    p.set_params()

    opts, args = p.parse_args(args)

    # Compare against None explicitly: a plain truthiness guard would treat
    # an explicit 0 as "not given" and let it slip past the range check.
    if opts.max_gap_len is not None and opts.max_gap_len <= 1:
        logging.error("--max_gap_len should be > 1")
        sys.exit()
    elif opts.ovl_pct_id is not None and opts.ovl_pct_id <= 65:
        logging.error("--ovl_pct_id should be > 65")
        sys.exit()
    elif opts.ovl_sim_score is not None and opts.ovl_sim_score <= 250:
        logging.error("--ovl_sim_score should be > 250")
        sys.exit()

    file_list = []
    if opts.input_file_list:
        if not op.isfile(opts.input_file_list):
            logging.error("Input file list {0} does not exist".format(opts.input_file_list))
            sys.exit()
        with open(opts.input_file_list, 'r') as f:
            file_list = f.read().splitlines()
    elif opts.input_folder:
        if not op.isdir(opts.input_folder):
            logging.error("Input folder {0} does not exist".format(opts.input_folder))
            sys.exit()

        # Keep *.fa / *.fasta entries (case-insensitive) and prefix each with
        # the folder, normalised to have no trailing slash.
        folder = opts.input_folder.rstrip('/')
        file_list = [folder + "/" + name
                     for name in os.listdir(opts.input_folder)
                     if name.lower().endswith(('.fa', '.fasta'))]
    elif opts.input_file:
        file_list.append(opts.input_file)
    else:
        logging.error("Please specify one of the options for input files")
        sys.exit(not p.print_help())

    if not file_list:
        logging.warning("List of files to process is empty. Please check your input!")
        sys.exit()

    for fasta_file in file_list:
        if not op.isfile(fasta_file):
            # A missing entry is reported and skipped; the rest still run.
            logging.warning("Input file {0} does not exist".format(fasta_file))
            continue

        # NOTE(review): the file name and prefix are interpolated unquoted
        # into the command string; paths with spaces or shell metacharacters
        # would presumably break or alter the command -- confirm how `sh`
        # executes it.
        cmd = "cap3 {0} -f {1} -p {2} -s {3} -x {4}".format(
            fasta_file, opts.max_gap_len, opts.ovl_pct_id,
            opts.ovl_sim_score, opts.prefix)
        if opts.extra:
            cmd += " {0}".format(opts.extra)
        logfile = "{0}.{1}.log".format(fasta_file, opts.prefix)

        sh(cmd, outfile=logfile)
Esempio n. 5
0
def main():
    """
    %prog bedfile id_mappings

    Takes a bedfile that contains the coordinates of features to plot on the
    chromosomes, and `id_mappings` file that map the ids to certain class. Each
    class will get assigned a unique color. `id_mappings` file is optional (if
    omitted, will not paint the chromosome features, except the centromere).

    The extent of the chromosomes are given by --sizes, which contains
    chr<tab>size, one per line. If not specified, the extent of the chromosomes
    are assumed to be the end for the last feature, which might be an underestimate.
    """
    # The docstring above is also the usage text given to OptionParser, so it
    # is kept verbatim.
    #
    # Flow: declare options -> parse (together with the image options) ->
    # draw onto a full-figure axes -> save as `<bedfile prefix>.<format>`.
    p = OptionParser(main.__doc__)
    p.add_option(
        "--sizes",
        help="FASTA sizes file, which contains chr<tab>size, one per line",
    )

    display_group = OptionGroup(p, "Display accessories")
    display_group.add_option("--title", help="title of the image")
    display_group.add_option(
        "--gauge",
        action="store_true",
        default=False,
        help="draw a gauge with size label",
    )
    p.add_option_group(display_group)

    imagemap_group = OptionGroup(p, "HTML image map")
    imagemap_group.add_option(
        "--imagemap",
        action="store_true",
        default=False,
        help="generate an HTML image map associated with the image",
    )
    imagemap_group.add_option(
        "--winsize",
        type="int",
        default=50000,
        help="if drawing an imagemap, specify the window size (bases) of each map element ",
    )
    p.add_option_group(imagemap_group)

    legend_group = OptionGroup(p, "Color legend")
    legend_group.add_option(
        "--nolegend",
        dest="legend",
        action="store_false",
        default=True,
        help="Do not generate color legend",
    )
    legend_group.add_option(
        "--mergedist",
        type="int",
        default=0,
        help="Merge regions closer than ",
    )
    legend_group.add_option("--empty", help="Write legend for unpainted region")
    p.add_option_group(legend_group)

    opts, args, iopts = p.set_image_options(figsize="6x6", dpi=300)

    # One positional (bedfile) is mandatory, a second (id mappings) optional.
    nargs = len(args)
    if nargs != 1 and nargs != 2:
        sys.exit(p.print_help())

    bedfile = args[0]
    mappingfile = args[1] if nargs == 2 else None

    # A single axes covering the whole figure serves as the drawing canvas.
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    draw_chromosomes(
        root,
        bedfile,
        sizes=opts.sizes,
        iopts=iopts,
        mergedist=opts.mergedist,
        winsize=opts.winsize,
        imagemap=opts.imagemap,
        mappingfile=mappingfile,
        gauge=opts.gauge,
        legend=opts.legend,
        empty=opts.empty,
        title=opts.title,
    )

    # Fix the canvas to the unit square and hide the axis decorations.
    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    # Output name: bedfile with its last extension swapped for the image format.
    figname = bedfile.rsplit(".", 1)[0] + "." + opts.format
    savefig(figname, dpi=iopts.dpi, iopts=iopts)
Esempio n. 6
0
def prepare(args):
    """
    %prog prepare --rearray_lib=<rearraylibrary> --orig_lib_file=<origlibfile>

    Inferred file names
    ---------------------------------------------
    `lookuptblfile` : rearraylibrary.lookup
    `rearraylibfile`: rearraylibrary.fasta

    Pick sequences from the original library file and the rearrayed library file
    based on the mapping information provided in the `lookuptblfile`.

    # lookuptblfile format: column number (index)
    # 1 (0)          2 (1)          3 (2)         4 (3)        5 (4)        6 (5)
    # source_clone   source_plate   source_well   dest_clone   dest_plate   dest_well

    The 1st and 4th column in the `lookuptblfile` form the pair of clones which
    constitute the elements used for the per-clone assembly.
    """
    # The docstring above is also the usage text given to OptionParser, so it
    # is kept verbatim.
    #
    # args: list of command-line tokens.
    # Side effects: creates the output folder if missing, writes one FASTA
    #       file per lookup-table row into it, and records each written path
    #       in `<rearray_lib>.log`.  Exits via sys.exit when a required
    #       option or input file is missing.
    from operator import itemgetter
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(prepare.__doc__)
    p.add_option("--rearray_lib",
                 default=None,
                 help="name of the rearrayed library [default: %default]")
    p.add_option(
        "--orig_lib_file",
        help=
        "fasta file containing reads from the original libraries [default: %default]"
    )

    g = OptionGroup(p, "Optional parameters")
    g.add_option(
        "--output_folder",
        default="to_assemble",
        help="output folder to write the FASTA files to [default: %default]")
    p.add_option_group(g)

    opts, args = p.parse_args(args)

    if not opts.rearray_lib or not opts.orig_lib_file:
        logging.error("Please specify the required parameters")
        sys.exit(not p.print_help())

    rearraylib, origlibfile = opts.rearray_lib, opts.orig_lib_file

    if not op.isfile(origlibfile):
        logging.error(
            "Original library reads file `{0}` does not exist!".format(
                origlibfile))
        sys.exit()

    # Both companion files are derived from the rearrayed library name.
    lookuptblfile = rearraylib + '.lookup'
    logging.debug(lookuptblfile)
    if not op.isfile(lookuptblfile):
        logging.error(
            "Lookup table file `{0}` does not exist!".format(lookuptblfile))
        sys.exit()

    rearraylibfile = rearraylib + '.fasta'
    logging.debug(rearraylibfile)
    if not op.isfile(rearraylibfile):
        logging.error(
            "Rearrayed library reads file `{0}` does not exist!".format(
                rearraylibfile))
        sys.exit()

    origlibFasta = Fasta(origlibfile)
    rearraylibFasta = Fasta(rearraylibfile)

    # Materialise the id lists once; they are rescanned for every lookup row.
    origlibids = list(origlibFasta.iterkeys_ordered())
    rearraylibids = list(rearraylibFasta.iterkeys_ordered())

    if not op.isdir(opts.output_folder):
        logging.warning(
            "Output directory `{0}` missing. Creating it now...".format(
                opts.output_folder))
        os.makedirs(opts.output_folder)

    logfile = rearraylib + '.log'
    clone_pair = itemgetter(0, 3)  # source_clone, dest_clone columns

    # Context managers close the log, the lookup table and every per-pair
    # FASTA file even when a row fails part-way through.
    with open(logfile, 'w') as log:
        with open(lookuptblfile, 'r') as fp:
            for row in fp:
                # Drop the line terminator so it cannot leak into the last
                # column (and from there into the prefix / file name).
                row = row.rstrip('\r\n')
                if not row:
                    continue  # tolerate blank lines in the lookup table

                origprefix, rearrayprefix = clone_pair(row.split('\t'))
                libpair = origprefix + '_' + rearrayprefix
                outfile = opts.output_folder + '/' + libpair + '.fasta'

                with open(outfile, 'w') as ofp:
                    # re.match anchors at the start of the id; the clone
                    # name is interpreted as a regular expression.
                    for o in origlibids:
                        if re.match(origprefix, o):
                            SeqIO.write(origlibFasta[o], ofp, 'fasta')

                    for r in rearraylibids:
                        if re.match(rearrayprefix, r):
                            SeqIO.write(rearraylibFasta[r], ofp, 'fasta')

                log.write(outfile + "\n")

    logging.debug('Wrote log file `{0}`'.format(logfile))
Esempio n. 7
0
def assemble(args):
    """
    Run `cap3` on a single multi FASTA file containing reads or a folder containing several
    multi FASTA files. Allows for tweaking of `cap3` parameters max_gap_len, ovl_pct_id, etc.
    """
    # The docstring above is also the usage text given to OptionParser, so it
    # is kept verbatim.
    #
    # args: list of command-line tokens.
    # Input selection precedence: --input_file_list, then --input_folder,
    #       then --input_file.
    # Side effects: runs one `cap3` command per existing input file and sends
    #       its output to `<file>.<prefix>.log`.  Exits via sys.exit on
    #       invalid parameter values or missing/empty input.
    p = OptionParser(assemble.__doc__)
    g1 = OptionGroup(
        p, "Input file options (required)",
        "Note: Please choose from and provide values for one of the following parameters"
    )
    g1.add_option("--input_file",
                  default=None,
                  help="input file of reads [default: %default]")
    g1.add_option(
        "--input_folder",
        default=None,
        help=
        "input folder containing multi FASTA files of reads [default: %default]"
    )
    g1.add_option(
        "--input_file_list",
        default=None,
        help=
        "list file containing paths to multi FASTA files of reads [default: %default]"
    )
    p.add_option_group(g1)

    g2 = OptionGroup(p, "Optional parameters",
                     "Note: If not specified, `cap3` defaults will be used")
    g2.add_option("-f", "--max_gap_len", default=20, type="int",
                  help="maximum gap length in any overlap [default: %default]\n"
                       "Same as cap3 `-f` parameter.")
    g2.add_option("-p", "--ovl_pct_id", default=90, type="int",
                  help="overlap percent identity cutoff [default: %default]\n"
                       "Same as cap3 `-p` parameter.")
    g2.add_option("-s", "--ovl_sim_score", default=900, type="int",
                  help="overlap similarity score cutoff [default: %default]\n"
                       "Same as cap3 `-s` parameter.")
    g2.add_option(
        "-x",
        "--prefix",
        dest="prefix",
        default="cap3",
        help="prefix string for output file name [default: %default]")
    p.add_option_group(g2)

    # presumably registers the option that surfaces as `opts.extra` below --
    # confirm against the parser class
    p.set_params()

    opts, args = p.parse_args(args)

    # Compare against None explicitly: a plain truthiness guard would treat
    # an explicit 0 as "not given" and let it slip past the range check.
    if opts.max_gap_len is not None and opts.max_gap_len <= 1:
        logging.error("--max_gap_len should be > 1")
        sys.exit()
    elif opts.ovl_pct_id is not None and opts.ovl_pct_id <= 65:
        logging.error("--ovl_pct_id should be > 65")
        sys.exit()
    elif opts.ovl_sim_score is not None and opts.ovl_sim_score <= 250:
        logging.error("--ovl_sim_score should be > 250")
        sys.exit()

    file_list = []
    if opts.input_file_list:
        if not op.isfile(opts.input_file_list):
            logging.error("Input file list {0} does not exist".format(
                opts.input_file_list))
            sys.exit()
        with open(opts.input_file_list, 'r') as f:
            file_list = f.read().splitlines()
    elif opts.input_folder:
        if not op.isdir(opts.input_folder):
            logging.error("Input folder {0} does not exist".format(
                opts.input_folder))
            sys.exit()

        # Keep *.fa / *.fasta entries (case-insensitive) and prefix each with
        # the folder, normalised to have no trailing slash.
        folder = opts.input_folder.rstrip('/')
        file_list = [
            folder + "/" + name
            for name in os.listdir(opts.input_folder)
            if name.lower().endswith(('.fa', '.fasta'))
        ]
    elif opts.input_file:
        file_list.append(opts.input_file)
    else:
        logging.error("Please specify one of the options for input files")
        sys.exit(not p.print_help())

    if not file_list:
        logging.warning(
            "List of files to process is empty. Please check your input!")
        sys.exit()

    for fasta_file in file_list:
        if not op.isfile(fasta_file):
            # A missing entry is reported and skipped; the rest still run.
            logging.warning(
                "Input file {0} does not exist".format(fasta_file))
            continue

        # NOTE(review): the file name and prefix are interpolated unquoted
        # into the command string; paths with spaces or shell metacharacters
        # would presumably break or alter the command -- confirm how `sh`
        # executes it.
        cmd = "cap3 {0} -f {1} -p {2} -s {3} -x {4}".format(
            fasta_file, opts.max_gap_len, opts.ovl_pct_id,
            opts.ovl_sim_score, opts.prefix)
        if opts.extra:
            cmd += " {0}".format(opts.extra)
        logfile = "{0}.{1}.log".format(fasta_file, opts.prefix)

        sh(cmd, outfile=logfile)