# Example 1
def score(args):
    """
    %prog score blastfile query.fasta A.ids

    Add up the scores for each query seq. Go through the lines and for each
    query sequence, add up the scores when subject is in each pile by A.ids.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so implementation notes live in comments instead.
    #
    # args: list of command-line tokens; exactly three positionals are
    # expected (blastfile, fastafile, idsfile). On any other count the help
    # text is printed and the process exits via sys.exit().
    # Output: one "<query>\t<score>" line on stdout per key yielded by
    # Fasta.iterkeys_ordered(); queries without a counted hit print 0.
    p = OptionParser(score.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    # Project imports are deferred until the arguments are known to be valid,
    # so usage errors are reported without loading these modules.
    from jcvi.formats.base import SetFile
    from jcvi.formats.fasta import Fasta

    blastfile, fastafile, idsfile = args
    ids = SetFile(idsfile)

    scores = defaultdict(int)
    for b in Blast(blastfile):
        # Only hits whose subject is listed in the ids file contribute.
        if b.subject not in ids:
            continue
        scores[b.query] += b.score

    logging.debug("A total of {0} ids loaded.".format(len(ids)))

    for s in Fasta(fastafile).iterkeys_ordered():
        # Single-argument print() behaves the same under Python 2 and 3.
        print("\t".join((s, str(scores.get(s, 0)))))
# Example 2
def score(args):
    """
    %prog score blastfile query.fasta A.ids

    Add up the scores for each query seq. Go through the lines and for each
    query sequence, add up the scores when subject is in each pile by A.ids.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so implementation notes live in comments instead.
    #
    # args: list of command-line tokens; exactly three positionals are
    # expected (blastfile, fastafile, idsfile). On any other count the help
    # text is printed and the process exits via sys.exit().
    # Output: one "<query>\t<score>" line on stdout per key yielded by
    # Fasta.iterkeys_ordered(); queries without a counted hit print 0.
    p = OptionParser(score.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    # Project imports are deferred until the arguments are known to be valid,
    # so usage errors are reported without loading these modules.
    from jcvi.formats.base import SetFile
    from jcvi.formats.fasta import Fasta

    blastfile, fastafile, idsfile = args
    ids = SetFile(idsfile)

    scores = defaultdict(int)
    for b in Blast(blastfile):
        # Only hits whose subject is listed in the ids file contribute.
        if b.subject not in ids:
            continue
        scores[b.query] += b.score

    logging.debug("A total of {0} ids loaded.".format(len(ids)))

    for s in Fasta(fastafile).iterkeys_ordered():
        # Single-argument print() behaves the same under Python 2 and 3.
        print("\t".join((s, str(scores.get(s, 0)))))
# Example 3
def script(args):
    """
    %prog script gffile cdna.fasta genome.fasta

    Parse gmap gff and produce script for sim4db to refine.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so implementation notes live in comments instead.
    #
    # args: list of command-line tokens; exactly three positionals are
    # expected (gffile, cdnafasta, genomefasta). On any other count the help
    # text is printed and the process exits via sys.exit().
    # Side effect: writes "<gffile>.script", one line per mRNA feature, of the
    # form "<-f|-r> -e <cdna index> -D <genome index> <start> <end>".
    p = OptionParser(script.__doc__)

    opts, args = p.parse_args(args)
    if len(args) != 3:
        sys.exit(not p.print_help())

    gffile, cdnafasta, genomefasta = args
    scriptfile = gffile + ".script"
    gff = Gff(gffile)

    # Map each sequence name to its 0-based position in the FASTA key order;
    # those positions are what gets written after -e / -D.
    cdnas = Fasta(cdnafasta, lazy=True)
    cdnas = dict((x, i) for (i, x) in enumerate(cdnas.iterkeys_ordered()))
    genomes = Fasta(genomefasta, lazy=True)
    genomes = dict((x, i) for (i, x) in enumerate(genomes.iterkeys_ordered()))

    extra = 50000  # 50-kb region surrounding the locus

    # The context manager guarantees the script file is flushed and closed,
    # even if a lookup below raises.
    with open(scriptfile, "w") as fw:
        for g in gff:
            if g.type != "mRNA":
                continue

            cdna = g.attributes["Name"][0]
            genome = g.seqid
            ci = cdnas[cdna]
            gi = genomes[genome]

            strand = "-r" if g.strand == "-" else "-f"
            # Pad the locus on both sides, clamping the left edge at 0.
            start = max(0, g.start - extra)
            end = g.end + extra
            fw.write("{0} -e {1} -D {2} {3} {4}\n"
                     .format(strand, ci, gi, start, end))
# Example 4
def script(args):
    """
    %prog script gffile cdna.fasta genome.fasta

    Parse gmap gff and produce script for sim4db to refine.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so implementation notes live in comments instead.
    #
    # args: list of command-line tokens; exactly three positionals are
    # expected (gffile, cdnafasta, genomefasta). On any other count the help
    # text is printed and the process exits via sys.exit().
    # Side effect: writes "<gffile>.script", one line per mRNA feature, of the
    # form "<-f|-r> -e <cdna index> -D <genome index> <start> <end>".
    p = OptionParser(script.__doc__)

    opts, args = p.parse_args(args)
    if len(args) != 3:
        sys.exit(not p.print_help())

    gffile, cdnafasta, genomefasta = args
    scriptfile = gffile + ".script"
    gff = Gff(gffile)

    # Map each sequence name to its 0-based position in the FASTA key order;
    # those positions are what gets written after -e / -D.
    cdnas = Fasta(cdnafasta, lazy=True)
    cdnas = dict((x, i) for (i, x) in enumerate(cdnas.iterkeys_ordered()))
    genomes = Fasta(genomefasta, lazy=True)
    genomes = dict((x, i) for (i, x) in enumerate(genomes.iterkeys_ordered()))

    extra = 50000  # 50-kb region surrounding the locus

    # The context manager guarantees the script file is flushed and closed,
    # even if a lookup below raises.
    with open(scriptfile, "w") as fw:
        for g in gff:
            if g.type != "mRNA":
                continue

            cdna = g.attributes["Name"][0]
            genome = g.seqid
            ci = cdnas[cdna]
            gi = genomes[genome]

            strand = "-r" if g.strand == "-" else "-f"
            # Pad the locus on both sides, clamping the left edge at 0.
            start = max(0, g.start - extra)
            end = g.end + extra
            fw.write("{0} -e {1} -D {2} {3} {4}\n"
                     .format(strand, ci, gi, start, end))
# Example 5
def prepare(args):
    """
    %prog prepare --rearray_lib=<rearraylibrary> --orig_lib_file=<origlibfile>

    Inferred file names
    `lookuptblfile` : rearraylibrary.lookup
    `rearraylibfile`: rearraylibrary.fasta

    Pick sequences from the original library file and the rearrayed library file
    based on the mapping information provided in the `lookuptblfile`.

    # lookuptblfile format: column number (index)
    # 1 (0)          2 (1)          3 (2)         4 (3)        5 (4)        6 (5)
    # source_clone   source_plate   source_well   dest_clone   dest_plate   dest_well

    The 1st and 4th column in the `lookuptblfile` form the pair of clones which
    constitute the elements used for the per-clone assembly.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so implementation notes live in comments instead.
    #
    # args: list of command-line tokens. --rearray_lib and --orig_lib_file are
    # required; --output_folder is optional. Exits via sys.exit() when a
    # required option or one of the three input files is missing.
    # Side effects: writes one "<orig>_<rearray>.fasta" per lookup row into the
    # output folder and lists the written paths in "<rearray_lib>.log".
    import os
    from operator import itemgetter

    p = OptionParser(prepare.__doc__)
    p.add_option("--rearray_lib", default=None,
                 help="name of the rearrayed library [default: %default]")
    p.add_option("--orig_lib_file", default=None,
                 help="fasta file containing reads from the original libraries [default: %default]")

    g = OptionGroup(p, "Optional parameters")
    g.add_option("--output_folder", default="to_assemble",
                 help="output folder to write the FASTA files to [default: %default]")
    # The group must be registered, otherwise --output_folder is unknown to
    # the parser and opts.output_folder does not exist.
    p.add_option_group(g)

    opts, args = p.parse_args(args)

    if not opts.rearray_lib or not opts.orig_lib_file:
        logging.error("Please specify the required parameters")
        sys.exit(not p.print_help())

    rearraylib, origlibfile = opts.rearray_lib, opts.orig_lib_file

    # Stop at the first missing input instead of logging and then crashing
    # later when the file is opened.
    if not op.isfile(origlibfile):
        logging.error(
            "Original library reads file `{0}` does not exist!".format(
                origlibfile))
        sys.exit(1)

    lookuptblfile = rearraylib + '.lookup'
    if not op.isfile(lookuptblfile):
        logging.error(
            "Lookup table file `{0}` does not exist!".format(lookuptblfile))
        sys.exit(1)

    rearraylibfile = rearraylib + '.fasta'
    if not op.isfile(rearraylibfile):
        logging.error(
            "Rearrayed library reads file `{0}` does not exist!".format(
                rearraylibfile))
        sys.exit(1)

    # Project import is deferred until the inputs are known to be usable.
    from jcvi.formats.fasta import Fasta, SeqIO

    origlibFasta = Fasta(origlibfile)
    rearraylibFasta = Fasta(rearraylibfile)

    origlibids = list(origlibFasta.iterkeys_ordered())
    rearraylibids = list(rearraylibFasta.iterkeys_ordered())

    if not op.isdir(opts.output_folder):
        logging.warning(
            "Output directory `{0}` missing. Creating it now...".format(
                opts.output_folder))
        # Actually create it, as the message promises.
        os.makedirs(opts.output_folder)

    logfile = rearraylib + '.log'
    with open(logfile, 'w') as log:
        with open(lookuptblfile, 'r') as fp:
            for row in fp:
                if not row.strip():
                    continue  # tolerate blank lines in the lookup table
                # Drop the line terminator so a 4-column row cannot leak a
                # newline into the destination prefix / file name.
                fields = row.rstrip('\r\n').split('\t')
                origprefix, rearrayprefix = itemgetter(0, 3)(fields)
                libpair = origprefix + '_' + rearrayprefix
                outfile = opts.output_folder + '/' + libpair + '.fasta'

                with open(outfile, 'w') as ofp:
                    # NOTE(review): prefixes are used as regex patterns by
                    # re.match, as in the original; confirm they never
                    # contain regex metacharacters.
                    for o in origlibids:
                        if re.match(origprefix, o):
                            SeqIO.write(origlibFasta[o], ofp, 'fasta')

                    for r in rearraylibids:
                        if re.match(rearrayprefix, r):
                            SeqIO.write(rearraylibFasta[r], ofp, 'fasta')

                log.write(outfile + '\n')

    logging.debug('Wrote log file `{0}`'.format(logfile))
# Example 6
def prepare(args):
    """
    %prog prepare --rearray_lib=<rearraylibrary> --orig_lib_file=<origlibfile>

    Inferred file names
    `lookuptblfile` : rearraylibrary.lookup
    `rearraylibfile`: rearraylibrary.fasta

    Pick sequences from the original library file and the rearrayed library file
    based on the mapping information provided in the `lookuptblfile`.

    # lookuptblfile format: column number (index)
    # 1 (0)          2 (1)          3 (2)         4 (3)        5 (4)        6 (5)
    # source_clone   source_plate   source_well   dest_clone   dest_plate   dest_well

    The 1st and 4th column in the `lookuptblfile` form the pair of clones which
    constitute the elements used for the per-clone assembly.
    """
    # NOTE: the docstring above doubles as the usage text handed to
    # OptionParser, so implementation notes live in comments instead.
    #
    # args: list of command-line tokens. --rearray_lib and --orig_lib_file are
    # required; --output_folder is optional. Exits via sys.exit() when a
    # required option or one of the three input files is missing.
    # Side effects: writes one "<orig>_<rearray>.fasta" per lookup row into the
    # output folder and lists the written paths in "<rearray_lib>.log".
    import os
    from operator import itemgetter

    p = OptionParser(prepare.__doc__)
    p.add_option("--rearray_lib", default=None,
            help="name of the rearrayed library [default: %default]")
    p.add_option("--orig_lib_file", default=None,
            help="fasta file containing reads from the original libraries [default: %default]")

    g = OptionGroup(p, "Optional parameters")
    g.add_option("--output_folder", default="to_assemble",
            help="output folder to write the FASTA files to [default: %default]")
    # The group must be registered, otherwise --output_folder is unknown to
    # the parser and opts.output_folder does not exist.
    p.add_option_group(g)

    opts, args = p.parse_args(args)

    if not opts.rearray_lib or not opts.orig_lib_file:
        logging.error("Please specify the required parameters")
        sys.exit(not p.print_help())

    rearraylib, origlibfile = opts.rearray_lib, opts.orig_lib_file

    # Stop at the first missing input instead of logging and then crashing
    # later when the file is opened.
    if not op.isfile(origlibfile):
        logging.error("Original library reads file `{0}` does not exist!".format(origlibfile))
        sys.exit(1)

    lookuptblfile = rearraylib + '.lookup'
    if not op.isfile(lookuptblfile):
        logging.error("Lookup table file `{0}` does not exist!".format(lookuptblfile))
        sys.exit(1)

    rearraylibfile = rearraylib + '.fasta'
    if not op.isfile(rearraylibfile):
        logging.error("Rearrayed library reads file `{0}` does not exist!".format(rearraylibfile))
        sys.exit(1)

    # Project import is deferred until the inputs are known to be usable.
    from jcvi.formats.fasta import Fasta, SeqIO

    origlibFasta = Fasta(origlibfile)
    rearraylibFasta = Fasta(rearraylibfile)

    origlibids = list(origlibFasta.iterkeys_ordered())
    rearraylibids = list(rearraylibFasta.iterkeys_ordered())

    if not op.isdir(opts.output_folder):
        logging.warning("Output directory `{0}` missing. Creating it now...".format(opts.output_folder))
        # Actually create it, as the message promises.
        os.makedirs(opts.output_folder)

    logfile = rearraylib + '.log'
    with open(logfile, 'w') as log:
        with open(lookuptblfile, 'r') as fp:
            for row in fp:
                if not row.strip():
                    continue  # tolerate blank lines in the lookup table
                # Drop the line terminator so a 4-column row cannot leak a
                # newline into the destination prefix / file name.
                fields = row.rstrip('\r\n').split('\t')
                origprefix, rearrayprefix = itemgetter(0, 3)(fields)
                libpair = origprefix + '_' + rearrayprefix
                outfile = opts.output_folder + '/' + libpair + '.fasta'

                with open(outfile, 'w') as ofp:
                    # NOTE(review): prefixes are used as regex patterns by
                    # re.match, as in the original; confirm they never
                    # contain regex metacharacters.
                    for o in origlibids:
                        if re.match(origprefix, o):
                            SeqIO.write(origlibFasta[o], ofp, 'fasta')

                    for r in rearraylibids:
                        if re.match(rearrayprefix, r):
                            SeqIO.write(rearraylibFasta[r], ofp, 'fasta')

                # file.write works identically under Python 2 and 3.
                log.write(outfile + '\n')

    logging.debug('Wrote log file `{0}`'.format(logfile))