Ejemplo n.º 1
0
def buildContigSizes(infile, outfile):
    '''
    Get contig sizes from indexed genome :term:`fasta` files and
    outputs to a text file.
    Parameters
    ----------
    infile : str
      infile is constructed from the `PARAMS` variable to retrieve
      the `genome` :term:`fasta` file
    Returns
    -------
    outfile : str
      outfile is a text format file that contains two columns, matched
      contig name and contig size (in nucleotides).  The output file
      name is defined in `PARAMS: interface_contigs`.
    '''

    prefix = P.snip(infile, ".fasta")
    fasta = IndexedFasta.IndexedFasta(prefix)
    contigs = []

    for contig, size in fasta.getContigSizes(with_synonyms=False).items():
        contigs.append([contig, size])
    df_contig = pd.DataFrame(contigs, columns=['contigs', 'size'])
    df_contig.sort_values('contigs', inplace=True)
    df_contig.to_csv(outfile, sep="\t", header=False, index=False)
Ejemplo n.º 2
0
def writeContigSizes(genome, outfile):
    '''write contig sizes to outfile for UCSC tools.
    '''

    outf = iotools.openFile(outfile, "w")
    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], genome))
    for contig, size in fasta.getContigSizes(with_synonyms=False).items():
        outf.write("%s\t%i\n" % (contig, size))
    outf.close()
Ejemplo n.º 3
0
def get_contig_cluster(infile, outfile):
    """This will generate a contig file of the cluster genome"""

    fasta = IndexedFasta.IndexedFasta(infile)
    contigs = []

    for contig, size in fasta.getContigSizes(with_synonyms=False).items():
        contigs.append([contig, size])
    df_contig = pd.DataFrame(contigs, columns=['contigs', 'size'])
    df_contig.sort_values('contigs', inplace=True)
    df_contig.to_csv(outfile, sep="\t", header=False, index=False)
Ejemplo n.º 4
0
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio,
    MEME is not run on all intervals but only the
    top 10% of intervals (peakval) are used.
    Also, only the segment of 200 bp around the peak
    is used and not the complete interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "meme", outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"],
                     P.get_params()["genome"]))

    tmpdir = P.get_temp_dir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=False,
        masker=P.as_list(P.get_params()['motifs_masker']),
        halfwidth=int(P.get_params()["meme_halfwidth"]),
        maxsize=int(P.get_params()["meme_max_size"]),
        proportion=P.get_params()["meme_proportion"],
        min_sequences=P.get_params()["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        iotools.touch_file(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp -mod %(meme_model)s -nmotifs %(meme_nmotifs)s -oc %(tmpdir)s -maxsize %(meme_max_size)s %(meme_options)s > %(outfile)s.log
        '''
        P.run(statement)

        collectMEMEResults(tmpdir, target_path, outfile)
Ejemplo n.º 5
0
def exportSequencesFromBedFile(infile, outfile, masker=None, mode="intervals"):
    '''export sequences for intervals in :term:`bed`-formatted *infile*
    to :term:`fasta` formatted *outfile*
    '''

    track = P.snip(infile, ".bed.gz")

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"],
                     P.get_params()["genome"]))
    outs = iotools.open_file(outfile, "w")

    ids, seqs = [], []
    for bed in Bed.setName(Bed.iterator(iotools.open_file(infile))):
        lcontig = fasta.getLength(bed.contig)

        if mode == "intervals":
            seqs.append(fasta.getSequence(bed.contig, "+", bed.start, bed.end))
            ids.append("%s_%s %s:%i..%i" %
                       (track, bed.name, bed.contig, bed.start, bed.end))

        elif mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_%s_l %s:%i..%i" %
                       (track, bed.name, bed.contig, start, end))
            seqs.append(fasta.getSequence(bed.contig, "+", start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_%s_r %s:%i..%i" %
                       (track, bed.name, bed.contig, start, end))
            seqs.append(fasta.getSequence(bed.contig, "+", start, end))

    masked = maskSequences(seqs, masker)
    outs.write("\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]))

    outs.close()
Ejemplo n.º 6
0
def buildContigBed(infile, outfile):
    '''
    Gets the contig sizes and co-ordinates from an indexed genome :term:`fasta`
    file and outputs them to :term:`BED` format
    Parameters
    ----------
    infile : str
      infile is constructed from `PARAMS` variable to retrieve
      the `genome` :term:`fasta` file
    Returns
    -------
    outfile : str
      :term:`BED` format file containing contig name, value (0) and contig size
      in nucleotides.  The output file name is defined in
      `PARAMS: interface_contigs_bed`
    '''
    prefix = P.snip(infile, ".fasta")
    fasta = IndexedFasta.IndexedFasta(prefix)
    outs = iotools.open_file(outfile, "w")

    for contig, size in fasta.getContigSizes(with_synonyms=False).items():
        outs.write("%s\t%i\t%i\n" % (contig, 0, size))

    outs.close()
Ejemplo n.º 7
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-g",
                        "--genome-file",
                        dest="genome_file",
                        type=str,
                        help="filename with genome (indexed).")

    parser.add_argument("-w",
                        "--windows-bed-file",
                        dest="filename_windows",
                        type=str,
                        help="gff file with windows to use.")

    parser.add_argument("-d",
                        "--filename-data",
                        dest="filename_data",
                        type=str,
                        help="gff file with data to use.")

    parser.add_argument("--is-gtf",
                        dest="is_gtf",
                        action="store_true",
                        help="filename-data is gtf file")

    parser.add_argument("-f",
                        "--features",
                        dest="features",
                        type=str,
                        action="append",
                        choices=("GC", ),
                        help="features to compute.")

    parser.add_argument("-c",
                        "--decorator",
                        dest="decorator",
                        type=str,
                        choices=("counts", "gc", "gc3", "mean-length",
                                 "median-length", "percent-coverage",
                                 "median-score", "mean-score", "stddev-score",
                                 "min-score", "max-score"),
                        help="decorators to use.")

    parser.add_argument("-e",
                        "--skip-empty",
                        dest="skip_empty",
                        action="store_true",
                        help="skip empty windows.")

    parser.add_argument(
        "-t",
        "--transform=",
        dest="transform",
        type=str,
        choices=("none", "overlap", "complement", "third_codon"),
        help="transform to use when mapping overlapping regions onto window.")

    parser.set_defaults(
        genome_file=None,
        filename_windows=None,
        filename_data=None,
        features=[],
        skip_empty=False,
        decorator="counts",
        transform="none",
        is_gtf=False,
    )

    (args) = E.start(parser)

    #    test_transform_third_codon()

    if not args.filename_windows:
        raise ValueError("please supply a gff file with window information.")

    if args.loglevel >= 1:
        args.stdlog.write("# reading windows...")
        args.stdlog.flush()

    windows = GTF.readAsIntervals(
        GTF.iterator(iotools.open_file(args.filename_windows, "r")))

    if args.loglevel >= 1:
        args.stdlog.write("done\n")
        args.stdlog.flush()

    if args.filename_data:
        if args.loglevel >= 1:
            args.stdlog.write("# reading data...")
            args.stdlog.flush()

        if args.is_gtf:
            gff_data = GTF.readFromFile(
                iotools.open_file(args.filename_data, "r"))
        else:
            gff_data = GTF.readFromFile(
                IOTOols.open_file(args.filename_data, "r"))

        if args.loglevel >= 1:
            args.stdlog.write("done\n")
            args.stdlog.flush()

        data_ranges = GTF.SortPerContig(gff_data)
    else:
        # use windows to compute properties
        # by supplying no data and asking for the complement = original window
        gff_data = None
        data_ranges = None
        args.transform = "complement"

    map_contig2size = {}

    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
        map_contig2size = fasta.getContigSizes()
    else:
        for contig, values in list(windows.items()):
            map_contig2size[contig] = max(lambda x: x[1], values)
        fasta = None

    contigs = list(map_contig2size.keys())
    contigs.sort()

    # proceed contig wise
    noutput_contigs, ncontigs_skipped_windows, ncontigs_skipped_data = 0, 0, 0

    args.stdout.write("\t".join(
        map(str, ("contig", "start", "end", "ngenes", "ntranscripts", "n1",
                  "l1", "n2", "l2", "score", "extra_info"))) + "\n")

    for contig in contigs:

        skip = False
        if contig not in windows:
            ncontigs_skipped_windows += 1
            skip = True

        if data_ranges and contig not in data_ranges:
            ncontigs_skipped_data += 1
            skip = True

        if skip:
            continue

        noutput_contigs += 1
        if data_ranges:
            annotateWindows(
                contig, windows[contig],
                gff_data[data_ranges[contig][0]:data_ranges[contig][1]], fasta,
                args)
        else:
            annotateWindows(contig, windows[contig], [], fasta, args)

    E.info(
        "ninput_windows=%i, noutput_contigs=%i, ninput_contigs=%i, nskipped_windows=%i, nskipped_data=%i"
        % (len(windows), noutput_contigs, len(contigs),
           ncontigs_skipped_windows, ncontigs_skipped_data))

    E.stop()
Ejemplo n.º 8
0
def main(argv=None):
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genomic sequence to retrieve "
                      "sequences from.")

    parser.add_option("-m", "--masker", dest="masker", type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker to mask output sequences "
                      "[%default].")

    parser.add_option("--output-mode", dest="output_mode", type="choice",
                      choices=("intervals", "leftright", "segments"),
                      help="what to output. "
                      "'intervals' generates a single sequence for "
                      "each bed interval. 'leftright' generates two "
                      "sequences, one in each direction, for each bed "
                      "interval. 'segments' can be used to output "
                      "sequence from bed12 files so that sequence only covers "
                      "the segements [%default]")

    parser.add_option("--min-sequence-length", dest="min_length", type="int",
                      help="require a minimum sequence length [%default]")

    parser.add_option("--max-sequence-length", dest="max_length", type="int",
                      help="require a maximum sequence length [%default]")

    parser.add_option(
        "--extend-at", dest="extend_at", type="choice",
        choices=("none", "3", "5", "both", "3only", "5only"),
        help="extend at 3', 5' or both or no ends. If 3only or 5only "
        "are set, only the added sequence is returned [default=%default]")

    parser.add_option(
        "--extend-by", dest="extend_by", type="int",
        help="extend by # bases [default=%default]")

    parser.add_option(
        "--use-strand", dest="ignore_strand",
        action="store_false",
        help="use strand information and return reverse complement "
        "on intervals located on the negative strand. "
        "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        masker=None,
        output_mode="intervals",
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        ignore_strand=True,
    )

    (options, args) = E.start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()
        fasta.setConverter(IndexedFasta.getConverter("zero-both-open"))

    counter = E.Counter()
    ids, seqs = [], []

    E.info("collecting sequences")
    for bed in Bed.setName(Bed.iterator(options.stdin)):
        counter.input += 1

        lcontig = fasta.getLength(bed.contig)

        if options.ignore_strand:
            strand = "+"
        else:
            strand = bed.strand

        if options.output_mode == "segments" and bed.columns == 12:
            ids.append("%s %s:%i..%i (%s) %s %s" %
                       (bed.name, bed.contig, bed.start, bed.end, strand,
                        bed["blockSizes"], bed["blockStarts"]))
            seg_seqs = [fasta.getSequence(bed.contig, strand, start, end)
                        for start, end in bed.toIntervals()]
            seqs.append("".join(seg_seqs))

        elif (options.output_mode == "intervals" or
              options.output_mode == "segments"):
            ids.append("%s %s:%i..%i (%s)" %
                       (bed.name, bed.contig, bed.start, bed.end, strand))
            seqs.append(
                fasta.getSequence(bed.contig, strand, bed.start, bed.end))

        elif options.output_mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_l %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_r %s:%i..%i (%s)" %
                       (bed.name, bed.contig, start, end, strand))
            seqs.append(fasta.getSequence(bed.contig, strand, start, end))

    E.info("collected %i sequences" % len(seqs))

    masked = Masker.maskSequences(seqs, options.masker)
    options.stdout.write(
        "\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]) + "\n")

    E.info("masked %i sequences" % len(seqs))

    counter.output = len(seqs)

    E.info("%s" % counter)

    E.stop()
Ejemplo n.º 9
0
def main(argv=sys.argv):

    parser = E.ArgumentParser(description=__doc__)

    # IMS: new method: extend intervals by set amount
    parser.add_argument("-m",
                        "--method",
                        dest="methods",
                        type=str,
                        action="append",
                        choices=("merge", "filter-genome", "bins", "block",
                                 "sanitize-genome", "shift", "extend",
                                 "filter-names", "rename-chr"),
                        help="method to apply")

    parser.add_argument("--num-bins",
                        dest="num_bins",
                        type=int,
                        help="number of bins into which to merge (used for "
                        "method `bins)")

    parser.add_argument("--bin-edges",
                        dest="bin_edges",
                        type=str,
                        help="bin_edges for binning method")

    parser.add_argument(
        "--binning-method",
        dest="binning_method",
        type=str,
        choices=("equal-bases", "equal-intervals", "equal-range"),
        help="method used for binning (used for method `bins` if no "
        "bin_edges is given)")

    parser.add_argument(
        "--merge-distance",
        dest="merge_distance",
        type=int,
        help="distance in bases over which to merge that are not "
        "directly adjacent")

    parser.add_argument(
        "--merge-min-intervals",
        dest="merge_min_intervals",
        type=int,
        help="only output merged intervals that are build from at least "
        "x intervals")

    parser.add_argument("--merge-by-name",
                        dest="merge_by_name",
                        action="store_true",
                        help="only merge intervals with the same name")

    parser.add_argument(
        "--merge-and-resolve-blocks",
        dest="resolve_blocks",
        action="store_true",
        help="When merging bed12 entrys, should blocks be resolved?")

    parser.add_argument("--merge-stranded",
                        dest="stranded",
                        action="store_true",
                        help="Only merge intervals on the same strand")

    parser.add_argument(
        "--remove-inconsistent-names",
        dest="remove_inconsistent_names",
        action="store_true",
        help="when merging, do not output intervals where the names of "
        "overlapping intervals do not match")

    parser.add_argument("--offset",
                        dest="offset",
                        type=int,
                        help="offset for shifting intervals")

    parser.add_argument("-g",
                        "--genome-file",
                        dest="genome_file",
                        type=str,
                        help="filename with genome.")

    parser.add_argument("-b",
                        "--bam-file",
                        dest="bam_file",
                        type=str,
                        help="bam-formatted filename with genome.")

    parser.add_argument("--filter-names-file",
                        dest="names",
                        type=str,
                        help="list of names to keep. One per line")

    parser.add_argument(
        "--rename-chr-file",
        dest="rename_chr_file",
        type=str,
        help="mapping table between old and new chromosome names."
        "TAB separated 2-column file.")

    parser.set_defaults(methods=[],
                        merge_distance=0,
                        binning_method="equal-bases",
                        merge_by_name=False,
                        genome_file=None,
                        rename_chr_file=None,
                        bam_file=None,
                        num_bins=5,
                        merge_min_intervals=1,
                        bin_edges=None,
                        offset=10000,
                        test=None,
                        extend_distance=1000,
                        remove_inconsistent_names=False,
                        resolve_blocks=False)

    (args) = E.start(parser, add_pipe_options=True)

    contigs = None
    chr_map = None

    # Why provide full indexed genome, when a tsv of contig sizes would do?
    if args.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(args.genome_file)
        contigs = genome_fasta.getContigSizes()

    if args.bam_file:
        samfile = pysam.AlignmentFile(args.bam_file)
        contigs = dict(list(zip(samfile.references, samfile.lengths)))

    if args.rename_chr_file:
        chr_map = {}
        with open(args.rename_chr_file, 'r') as filein:
            reader = csv.reader(filein, delimiter='\t')
            for row in reader:
                if len(row) != 2:
                    raise ValueError(
                        "Mapping table must have exactly two columns")
                chr_map[row[0]] = row[1]
        if not len(chr_map.keys()) > 0:
            raise ValueError("Empty mapping dictionnary")

    processor = Bed.iterator(args.stdin)

    for method in args.methods:
        if method == "filter-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            processor = filterGenome(processor, contigs)
        elif method == "sanitize-genome":
            if not contigs:
                raise ValueError("please supply contig sizes")
            processor = sanitizeGenome(processor, contigs)
        elif method == "merge":
            processor = merge(
                processor,
                args.merge_distance,
                by_name=args.merge_by_name,
                min_intervals=args.merge_min_intervals,
                remove_inconsistent=args.remove_inconsistent_names,
                resolve_blocks=args.resolve_blocks,
                stranded=args.stranded)
        elif method == "bins":
            if args.bin_edges:
                bin_edges = list(map(float, args.bin_edges.split(",")))
                # IMS: check bin edges are valid
                if not (len(bin_edges) == args.num_bins + 1):
                    raise ValueError(
                        "Number of bin edge must be one more than "
                        "number of bins")
            else:
                bin_edges = None
            processor, bin_edges = Bed.binIntervals(processor,
                                                    num_bins=args.num_bins,
                                                    method=args.binning_method,
                                                    bin_edges=bin_edges)
            E.info("# split bed: bin_edges=%s" % (str(bin_edges)))

        elif method == "block":
            processor = Bed.blocked_iterator(processor)
        elif method == "shift":
            # IMS: test that contig sizes are availible
            if not contigs:
                raise ValueError("please supply genome file")
            processor = shiftIntervals(processor, contigs, offset=args.offset)
        # IMS: new method: extend intervals by set amount
        elif method == "extend":
            if not contigs:
                raise ValueError("please supply genome file")
            processor = extendInterval(processor, contigs, args.offset)
        elif method == "filter-names":
            if not args.names:
                raise ValueError("please supply list of names to filter")
            names = [name.strip() for name in open(args.names)]
            processor = filterNames(processor, names)
        elif method == "rename-chr":
            if not chr_map:
                raise ValueError("please supply mapping file")
            processor = renameChromosomes(processor, chr_map)

    noutput = 0
    for bed in processor:
        args.stdout.write(str(bed) + "\n")
        noutput += 1

    E.info("noutput=%i" % (noutput))

    E.stop()
Ejemplo n.º 10
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2psl.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"])

    parser.add_option("--is-gtf", dest="is_gtf", action="store_true",
                      help="input is gtf.")

    parser.add_option("--no-header", dest="with_header", action="store_false",
                      help="do not output BLAT header [default=%default].")

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome.")

    parser.add_option("--queries-tsv-file", dest="input_filename_queries", type="string",
                      help="fasta filename with queries [default=%default].")

    parser.add_option("--allow-duplicates", dest="allow_duplicates", action="store_true",
                      help="""permit duplicate entries. Adjacent exons of a transcript will still be merged [default=%default]."""  )

    parser.set_defaults(is_gtf=False,
                        genome_file=None,
                        with_header=True,
                        allow_duplicates=False,
                        test=None)

    (options, args) = E.start(parser, add_pipe_options=True)

    if options.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        genome_fasta = None

    if options.input_filename_queries:
        queries_fasta = IndexedFasta.IndexedFasta(
            options.input_filename_queries)
    else:
        queries_fasta = None

    ninput, noutput, nskipped = 0, 0, 0

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator_filtered(GTF.iterator(sys.stdin),
                                                                 feature="exon"),
                                           strict=not options.allow_duplicates)
    else:
        iterator = GTF.joined_iterator(GTF.iterator(sys.stdin))

    if options.with_header:
        options.stdout.write(Blat.Match().getHeader() + "\n")

    for gffs in iterator:

        if options.test and ninput >= options.test:
            break

        ninput += 1

        result = alignlib_lite.py_makeAlignmentBlocks()

        xstart = 0

        intervals = Intervals.combine([(gff.start, gff.end) for gff in gffs])

        for start, end in intervals:
            xend = xstart + end - start

            result.addDiagonal(xstart, xend,
                               start - xstart)
            xstart = xend

        entry = Blat.Match()
        entry.mQueryId = gffs[0].transcript_id
        entry.mSbjctId = gffs[0].contig
        entry.strand = gffs[0].strand

        if genome_fasta:
            if entry.mSbjctId in genome_fasta:
                entry.mSbjctLength = genome_fasta.getLength(entry.mSbjctId)
            else:
                entry.mSbjctLength = result.getColTo()

        if queries_fasta:
            if entry.mQueryId in queries_fasta:
                entry.mQueryLength = queries_fasta.getLength(entry.mQueryId)
        else:
            entry.mQueryLength = result.getRowTo()

        entry.fromMap(result)

        options.stdout.write(str(entry) + "\n")
        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.stop()
Ejemplo n.º 11
0
def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-g",
                        "--genome-file",
                        dest="genome_file",
                        type=str,
                        help="filename with genome.")

    parser.add_argument("-q",
                        "--quality-file",
                        dest="quality_file",
                        type=str,
                        help="filename with genomic base quality "
                        "information.")

    parser.add_argument("-b",
                        "--bam-file",
                        dest="bam_files",
                        type=str,
                        metavar="bam",
                        help="filename with read mapping information. "
                        "Multiple files can be submitted in a "
                        "comma-separated list.")

    parser.add_argument("-i",
                        "--bigwig-file",
                        dest="bigwig_file",
                        type=str,
                        metavar="bigwig",
                        help="filename with bigwig information ")

    parser.add_argument("-f",
                        "--gff-file",
                        dest="filename_gff",
                        type=str,
                        action="append",
                        metavar='bed',
                        help="filename with extra gff files. The order "
                        "is important.")

    parser.add_argument("--filename-format",
                        dest="filename_format",
                        type=str,
                        choices=("bed", "gff", "gtf"),
                        help="format of secondary stream.")

    parser.add_argument("--restrict-source",
                        dest="gff_sources",
                        type=str,
                        action="append",
                        help="restrict input to this 'source' in extra "
                        "gff file (for counter: overlap).")

    parser.add_argument("--restrict-feature",
                        dest="gff_features",
                        type=str,
                        action="append",
                        help="restrict input to this 'feature' in extra gff "
                        "file (for counter: overlap).")

    parser.add_argument("-r",
                        "--reporter",
                        dest="reporter",
                        type=str,
                        choices=("genes", "transcripts"),
                        help="report results for 'genes' or 'transcripts' ")

    parser.add_argument("-s",
                        "--section",
                        dest="sections",
                        type=str,
                        action="append",
                        choices=("exons", "introns"),
                        help="select range on which counters will operate ")

    parser.add_argument(
        "-c",
        "--counter",
        dest="counters",
        type=str,
        action="append",
        choices=("bigwig-counts", "binding-pattern", "classifier",
                 "classifier-rnaseq", "classifier-rnaseq-splicing",
                 "classifier-polii", "composition-na", "composition-cpg",
                 "coverage", "distance", "distance-genes", "distance-tss",
                 "length", 'neighbours', "overlap", "overlap-stranded",
                 "overlap-transcripts", "overrun", "position", "proximity",
                 "proximity-exclusive", "proximity-lengthmatched", "quality",
                 "read-coverage", "read-extension", "read-overlap",
                 "read-counts", "read-fullcounts", "readpair-counts",
                 "readpair-fullcounts", "splice", "splice-comparison",
                 "territories"),
        help="select counters to apply to input ")

    parser.add_argument("--add-gtf-source",
                        dest="add_gtf_source",
                        action="store_true",
                        help="add gtf field of source to output ")

    parser.add_argument("--proximal-distance",
                        dest="proximal_distance",
                        type=int,
                        help="distance to be considered proximal to "
                        "an interval.")

    parser.add_argument("--multi-mapping-method",
                        dest="multi_mapping",
                        type=str,
                        choices=('all', 'ignore', 'weight'),
                        help="how to treat multi-mapping reads in "
                        "bam-files. Requires "
                        "the NH flag to be set by the mapper ")

    parser.add_argument("--use-barcodes",
                        dest="use_barcodes",
                        action="store_true",
                        help="Use barcodes to count unique umi's. "
                        "UMI's are specified in the read identifier "
                        "as the last field, where fields are separated "
                        "by underscores, e.g. "
                        "@READ:ILLUMINA:STUFF_NAMINGSTUFF_UMI. "
                        "When true, unique counts are returned. "
                        "Currently only compatible with count-reads")

    parser.add_argument("--sample-probability",
                        dest="sample_probability",
                        type=float,
                        help="Specify the probability of whether any"
                        "given read or read pair in a file bam is counted"
                        "Currently only compatible with count-reads")

    parser.add_argument("--column-prefix",
                        dest="prefixes",
                        type=str,
                        action="append",
                        help="add prefix to column headers - prefixes "
                        "are used in the same order as the counters ")

    parser.add_argument("--library-type",
                        dest="library_type",
                        type=str,
                        choices=("unstranded", "firststrand", "secondstrand",
                                 "fr-unstranded", "fr-firststrand",
                                 "fr-secondstrand"),
                        help="library type of reads in bam file. ")

    parser.add_argument("--min-mapping-quality",
                        dest="minimum_mapping_quality",
                        type=float,
                        help="minimum mapping quality. Reads with a quality "
                        "score of less will be ignored. ")

    parser.set_defaults(genome_file=None,
                        reporter="genes",
                        with_values=True,
                        sections=[],
                        counters=[],
                        filename_gff=[],
                        filename_format=None,
                        gff_features=[],
                        gff_sources=[],
                        add_gtf_source=False,
                        proximal_distance=10000,
                        bam_files=None,
                        multi_mapping='all',
                        library_type='fr-unstranded',
                        prefixes=[],
                        minimum_mapping_quality=0,
                        use_barcodes=False,
                        sample_probability=1.0)

    if not argv:
        argv = sys.argv

    (args) = E.start(parser, add_output_options=True, argv=argv)

    if args.prefixes:
        if len(args.prefixes) != len(args.counters):
            raise ValueError("if any prefix is given, the number of prefixes "
                             "must be the same as the number of counters")

    # get files
    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
    else:
        fasta = None

    if args.quality_file:
        quality = IndexedFasta.IndexedFasta(args.quality_file)
        quality.setTranslator(IndexedFasta.TranslatorBytes())
    else:
        quality = None

    if args.bam_files:
        bam_files = []
        for bamfile in args.bam_files.split(","):
            bam_files.append(pysam.AlignmentFile(bamfile, "rb"))
    else:
        bam_files = None

    if args.bigwig_file:
        bigwig_file = pyBigWig.open(args.bigwig_file)
    else:
        bigwig_file = None

    counters = []

    if not args.sections:
        E.info("counters will use the default section (exons)")
        args.sections.append(None)

    if not args.gff_sources:
        args.gff_sources.append(None)
    if not args.gff_features:
        args.gff_features.append(None)

    cc = E.Counter()

    for n, c in enumerate(args.counters):
        if args.prefixes:
            prefix = args.prefixes[n]
        else:
            prefix = None

        if c == "position":
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterPosition(section=section,
                                                      options=args,
                                                      prefix=prefix))
        elif c == "length":
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterLengths(section=section,
                                                     options=args,
                                                     prefix=prefix))
        elif c == "splice":
            if fasta is None:
                raise ValueError('splice requires a genomic sequence')
            counters.append(
                GeneModelAnalysis.CounterSpliceSites(fasta=fasta,
                                                     prefix=prefix))
        elif c == "quality":
            if fasta is None:
                raise ValueError('quality requires a quality score sequence')
            counters.append(
                GeneModelAnalysis.CounterQuality(fasta=quality, prefix=prefix))
        elif c == "overrun":
            counters.append(
                GeneModelAnalysis.CounterOverrun(
                    filename_gff=args.filename_gff,
                    options=args,
                    prefix=prefix))
        elif c == "read-coverage":
            counters.append(
                GeneModelAnalysis.CounterReadCoverage(bam_files,
                                                      options=args,
                                                      prefix=prefix))
        elif c == "read-extension":
            counters.append(
                GeneModelAnalysis.CounterReadExtension(
                    bam_files,
                    filename_gff=args.filename_gff,
                    options=args,
                    prefix=prefix))
        elif c == "read-overlap":
            counters.append(
                GeneModelAnalysis.CounterReadOverlap(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "read-counts":
            counters.append(
                GeneModelAnalysis.CounterReadCounts(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    use_barcodes=args.use_barcodes,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "read-fullcounts":
            counters.append(
                GeneModelAnalysis.CounterReadCountsFull(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "readpair-counts":
            counters.append(
                GeneModelAnalysis.CounterReadPairCounts(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    library_type=args.library_type,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "readpair-fullcounts":
            counters.append(
                GeneModelAnalysis.CounterReadPairCountsFull(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "bigwig-counts":
            counters.append(
                GeneModelAnalysis.CounterBigwigCounts(bigwig_file,
                                                      options=args,
                                                      prefix=prefix))
        elif c == "splice-comparison":
            if fasta is None:
                raise ValueError('splice-comparison requires a genomic '
                                 'sequence')
            counters.append(
                GeneModelAnalysis.CounterSpliceSiteComparison(
                    fasta=fasta,
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    options=args,
                    prefix=prefix))
        elif c == "composition-na":
            if fasta is None:
                raise ValueError('composition-na requires a genomic sequence')
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterCompositionNucleotides(
                        fasta=fasta,
                        section=section,
                        options=args,
                        prefix=prefix))
        elif c == "composition-cpg":
            if fasta is None:
                raise ValueError('composition-cpg requires a genomic sequence')
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterCompositionCpG(fasta=fasta,
                                                            section=section,
                                                            options=args,
                                                            prefix=prefix))

        elif c in ("overlap", "overlap-stranded", "overlap-transcripts",
                   "proximity", "proximity-exclusive",
                   "proximity-lengthmatched", "neighbours", "territories",
                   "distance", "distance-genes", "distance-tss",
                   "binding-pattern", "coverage"):
            if c == "overlap":
                template = GeneModelAnalysis.CounterOverlap
            if c == "overlap-stranded":
                template = GeneModelAnalysis.CounterOverlapStranded
            elif c == "overlap-transcripts":
                template = GeneModelAnalysis.CounterOverlapTranscripts
            elif c == "proximity":
                template = GeneModelAnalysis.CounterProximity
            elif c == "neighbours":
                template = GeneModelAnalysis.CounterNeighbours
            elif c == "proximity-exclusive":
                template = GeneModelAnalysis.CounterProximityExclusive
            elif c == "proximity-lengthmatched":
                template = GeneModelAnalysis.CounterProximityLengthMatched
            elif c == "territories":
                template = GeneModelAnalysis.CounterTerritories
            elif c == "distance":
                template = GeneModelAnalysis.CounterDistance
            elif c == "distance-genes":
                template = GeneModelAnalysis.CounterDistanceGenes
            elif c == "distance-tss":
                template = GeneModelAnalysis.CounterDistanceTranscriptionStartSites
            elif c == "coverage":
                template = GeneModelAnalysis.CounterCoverage
            elif c == "binding-pattern":
                template = GeneModelAnalysis.CounterBindingPattern

            for section in args.sections:
                for source in args.gff_sources:
                    for feature in args.gff_features:
                        counters.append(
                            template(filename_gff=args.filename_gff,
                                     feature=feature,
                                     source=source,
                                     fasta=fasta,
                                     section=section,
                                     options=args,
                                     prefix=prefix))

        elif c == "classifier":
            counters.append(
                GeneModelAnalysis.Classifier(filename_gff=args.filename_gff,
                                             fasta=fasta,
                                             options=args,
                                             prefix=prefix))

        elif c == "classifier-rnaseq":
            counters.append(
                GeneModelAnalysis.ClassifierRNASeq(
                    filename_gff=args.filename_gff,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))
        elif c == "classifier-rnaseq-splicing":
            counters.append(
                GeneModelAnalysis.ClassifierRNASeqSplicing(
                    filename_gff=args.filename_gff,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))
        elif c == "classifier-polii":
            counters.append(
                GeneModelAnalysis.ClassifierPolII(
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))
        elif c == "binding-pattern":
            counters.append(
                GeneModelAnalysis.CounterBindingPattern(
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))

    if args.reporter == "genes":
        iterator = GTF.flat_gene_iterator
        header = ["gene_id"]
        fheader = lambda x: [x[0].gene_id]
    elif args.reporter == "transcripts":
        iterator = GTF.transcript_iterator
        header = ["transcript_id"]
        fheader = lambda x: [x[0].transcript_id]

    if args.add_gtf_source:
        header.append("source")
        ffields = lambda x: [x[0].source]
    else:
        ffields = lambda x: []

    args.stdout.write("\t".join(header + [x.getHeader()
                                          for x in counters]) + "\n")

    for gffs in iterator(GTF.iterator(args.stdin)):
        cc.input += 1

        for counter in counters:
            counter.update(gffs)

        skip = len([x for x in counters if x.skip]) == len(counters)
        if skip:
            cc.skipped += 1
            continue

        args.stdout.write("\t".join(
            fheader(gffs) + ffields(gffs) +
            [str(counter) for counter in counters]) + "\n")

        cc.output += 1

    E.info("%s" % str(cc))
    for counter in counters:
        E.info("%s\t%s" % (repr(counter), str(counter.counter)))
    E.stop()
Ejemplo n.º 12
0
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-g",
                        "--genome-file",
                        dest="genome_file",
                        type=str,
                        help="filename with genome.")

    parser.add_argument(
        "-b",
        "--bam-file",
        dest="bam_files",
        type=str,
        help="filename with read mapping information. Multiple files can be "
        "submitted in a comma-separated list.")

    parser.add_argument(
        "--control-bam-file",
        dest="control_bam_files",
        type=str,
        help="filename with read mapping information for input/control. "
        "Multiple files can be submitted in a comma-separated list ")

    parser.add_argument("--filename-format",
                        dest="filename_format",
                        type=str,
                        choices=("bed", "gff", "gtf"),
                        help="format of secondary stream.")

    parser.add_argument("-c",
                        "--counter",
                        dest="counters",
                        type=str,
                        action="append",
                        choices=("length", "overlap", "peaks",
                                 "composition-na", "composition-cpg",
                                 "classifier-chipseq", "motif"),
                        help="select counters to apply.")

    parser.add_argument("--motif-sequence",
                        dest="motif_sequence",
                        type=str,
                        help="specify a sequence to search for")

    parser.add_argument(
        "-o",
        "--offset",
        dest="offsets",
        type=int,
        action="append",
        help="tag offsets for tag counting - supply as many as there "
        "are bam-files")

    parser.add_argument(
        "--control-offset",
        dest="control_offsets",
        type=int,
        action="append",
        help="control tag offsets for tag counting - supply as many as "
        "there are bam-files.")

    parser.add_argument(
        "-a",
        "--output-all-fields",
        dest="all_fields",
        action="store_true",
        help="output all fields in original bed file, by default only "
        "the first 4 are output.")

    parser.add_argument(
        "--output-bed-headers",
        dest="bed_headers",
        type=str,
        help="supply ',' separated list of headers for bed component ")

    parser.add_argument(
        "-f",
        "--gff-file",
        dest="filename_gff",
        type=str,
        action="append",
        metavar='bed',
        help="filename with extra gff files. The order is important")

    parser.add_argument(
        "--has-header",
        dest="has_header",
        action="store_true",
        help="bed file with headers. Headers and first columns are "
        "preserved ")

    parser.set_defaults(genome_file=None,
                        counters=[],
                        bam_files=None,
                        offsets=[],
                        control_bam_files=None,
                        control_offsets=[],
                        all_fields=False,
                        filename_format=None,
                        bed_headers=None,
                        filename_gff=[],
                        has_header=False,
                        motif_sequence=None)

    (args) = E.start(parser)

    if args.bed_headers is not None:
        bed_headers = [x.strip() for x in args.bed_headers.split(",")]
        if len(bed_headers) < 3:
            raise ValueError("a bed file needs at least three columns")
    else:
        bed_headers = None

    if args.has_header:
        while 1:
            line = args.stdin.readline()
            if not line:
                E.warn("empty bed file with no header")
                E.stop()
                return
            if not line.startswith("#"):
                break
        bed_headers = line[:-1].split("\t")

    if "motif" in args.counters and not args.motif_sequence:
        raise ValueError("if using motif must specify a motif-sequence")

    # get files
    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
    else:
        fasta = None

    if args.bam_files:
        bam_files = []
        for bamfile in args.bam_files.split(","):
            bam_files.append(pysam.AlignmentFile(bamfile, "rb"))
    else:
        bam_files = None

    if args.control_bam_files:
        control_bam_files = []
        for bamfile in args.control_bam_files.split(","):
            control_bam_files.append(pysam.AlignmentFile(bamfile, "rb"))
    else:
        control_bam_files = None

    counters = []

    for c in args.counters:
        if c == "length":
            counters.append(CounterLength(fasta=fasta, options=args))

        elif c == "overlap":
            counters.append(
                CounterOverlap(filename=args.filename_gff[0],
                               fasta=fasta,
                               options=args))
            del args.filename_gff[0]
        elif c == "peaks":
            counters.append(
                CounterPeaks(bam_files,
                             args.offsets,
                             control_bam_files,
                             args.control_offsets,
                             options=args))
        elif c == "composition-na":
            counters.append(
                CounterCompositionNucleotides(fasta=fasta, options=args))
        elif c == "composition-cpg":
            counters.append(CounterCompositionCpG(fasta=fasta, options=args))
        elif c == "classifier-chipseq":
            counters.append(
                ClassifierChIPSeq(filename_gff=args.filename_gff,
                                  fasta=fasta,
                                  options=args,
                                  prefix=None))
            del args.filename_gff[0]

        elif c == "motif":
            counters.append(
                CounterMotif(fasta=fasta, motif=args.motif_sequence))

    extra_fields = None

    for bed in Bed.iterator(args.stdin):

        if extra_fields is None:

            # output explicitely given headers
            if bed_headers:
                if len(bed_headers) > bed.columns:
                    raise ValueError(
                        "insufficient columns (%i, expected %i) in %s" %
                        (bed.columns, len(bed_headers), str(bed)))

            else:
                bed_headers = Bed.Headers[:bed.columns]

            args.stdout.write("\t".join(bed_headers))
            args.stdout.write("\t" +
                              "\t".join([x.getHeader()
                                         for x in counters]) + "\n")

            extra_fields = list(range(len(bed_headers) - 3))

        for counter in counters:
            counter.update(bed)

        if args.all_fields:
            args.stdout.write(str(bed))
        else:
            args.stdout.write("\t".join(
                [bed.contig, str(bed.start),
                 str(bed.end)] + [bed.fields[x] for x in extra_fields]))
        for counter in counters:
            args.stdout.write("\t%s" % str(counter))

        args.stdout.write("\n")

    E.stop()
Ejemplo n.º 13
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: "
                            "$Id: gff2coverage.py 2781 2009-09-10 11:33:14Z "
                            "andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default]")

    parser.add_option("-f",
                      "--features",
                      dest="features",
                      type="string",
                      action="append",
                      help="features to collect "
                      "[default=%default]")

    parser.add_option("-w",
                      "--window-size",
                      dest="window_size",
                      type="int",
                      help="window size in bp for histogram computation. "
                      "Determines the bin size.  "
                      "[default=%default]")

    parser.add_option("-b",
                      "--num-bins",
                      dest="num_bins",
                      type="int",
                      help="number of bins for histogram computation "
                      "if window size is not given. "
                      "[default=%default]")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=(
                          "genomic",
                          "histogram",
                      ),
                      help="methods to apply. "
                      "[default=%default]")

    parser.set_defaults(
        genome_file=None,
        window_size=None,
        num_bins=1000,
        value_format="%6.4f",
        features=[],
        method="genomic",
    )

    (options, args) = E.start(parser, add_output_options=True)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.method == "histogram":

        gff = GTF.readFromFile(options.stdin)

        gff.sort(key=lambda x: (x.contig, x.start))

        chunk = []
        last_contig = None

        for entry in gff:

            if last_contig != entry.contig:
                processChunk(last_contig, chunk, options, fasta)
                last_contig = entry.contig
                chunk = []

            chunk.append(entry)

        processChunk(last_contig, chunk, options, fasta)

    elif options.method == "genomic":
        intervals = collections.defaultdict(int)
        bases = collections.defaultdict(int)
        total = 0
        for entry in GTF.iterator(options.stdin):
            intervals[(entry.contig, entry.source, entry.feature)] += 1
            bases[(entry.contig, entry.source,
                   entry.feature)] += entry.end - entry.start
            total += entry.end - entry.start

        options.stdout.write("contig\tsource\tfeature\tintervals\tbases")
        if fasta:
            options.stdout.write(
                "\tpercent_coverage\ttotal_percent_coverage\n")
        else:
            options.stdout.write("\n")

        total_genome_size = sum(
            fasta.getContigSizes(with_synonyms=False).values())

        for key in sorted(intervals.keys()):
            nbases = bases[key]
            nintervals = intervals[key]
            contig, source, feature = key
            options.stdout.write("\t".join(
                ("\t".join(key), str(nintervals), str(nbases))))
            if fasta:
                options.stdout.write(
                    "\t%f" % (100.0 * float(nbases) / fasta.getLength(contig)))
                options.stdout.write(
                    "\t%f\n" % (100.0 * float(nbases) / total_genome_size))
            else:
                options.stdout.write("\n")

    E.stop()
Ejemplo n.º 14
0
def writeSequencesForIntervals(track,
                               filename,
                               dbhandle,
                               full=False,
                               halfwidth=None,
                               maxsize=None,
                               proportion=None,
                               masker=[],
                               offset=0,
                               shuffled=False,
                               num_sequences=None,
                               min_sequences=None,
                               order="peakval",
                               shift=None):
    '''build a sequence set for motif discovery. Intervals are taken from
    the table <track>_intervals in the database *dbhandle* and save to
    *filename* in :term:`fasta` format.

    If num_shuffles is set, shuffled copies are created as well with
    the shuffled number appended to the filename.

    The sequences are masked before shuffling (is this appropriate?)

    If *full* is set, the whole intervals will be output, otherwise
    only the region around the peak given by *halfwidth*

    If *maxsize* is set, the output is truncated at *maxsize* characters
    in order to create jobs that take too long.

    If proportion is set, only the top *proportion* intervals are output
    (sorted by peakval).

    If *num_sequences* is set, the first *num_sequences* will be used.

    *masker* can be a combination of
        * dust, dustmasker: apply dustmasker
        * softmask: mask softmasked genomic regions

    *order* is the order by which peaks should be sorted. Possible
    values are 'peakval' (peak value, descending order), 'score' (peak
    score, descending order)

    If *shift* is set, intervals will be shifted. ``leftright``
    creates two intervals on the left and right of the actual
    interval. The intervals will be centered around the mid-point and
    truncated the same way as the main intervals.

    '''

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"],
                     P.get_params()["genome"]))

    if order == "peakval":
        orderby = " ORDER BY peakval DESC"
    elif order == "max":
        orderby = " ORDER BY score DESC"
    else:
        raise ValueError(
            "Unknown value passed as order parameter, check your ini file")

    tablename = "%s_intervals" % P.tablequote(track)
    statement = '''SELECT contig, start, end, interval_id, peakcenter
    FROM %(tablename)s
    ''' % locals() + orderby

    cc = dbhandle.execute(statement)
    data = cc.fetchall()
    cc.close()

    if proportion:
        cutoff = int(len(data) * proportion) + 1
        if min_sequences:
            cutoff = max(cutoff, min_sequences)
    elif num_sequences:
        cutoff = num_sequences
    else:
        cutoff = len(data)
        L.info(
            "writeSequencesForIntervals %s: using at most %i sequences for pattern finding"
            % (track, cutoff))

    data = data[:cutoff]

    L.info("writeSequencesForIntervals %s: masker=%s" % (track, str(masker)))

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"],
                     P.get_params()["genome"]))

    # modify the ranges
    if shift:
        if shift == "leftright":
            new_data = [(contig, start - (end - start), start,
                         str(interval_id) + "_left", peakcenter)
                        for contig, start, end, interval_id, peakcenter in data
                        ]
            new_data.extend([
                (contig, end, end + (end - start), str(interval_id) + "_right",
                 peakcenter)
                for contig, start, end, interval_id, peakcenter in data
            ])
        data = new_data

    if halfwidth:
        # center around peakcenter, add halfwidth on either side
        data = [(contig, peakcenter - halfwidth, peakcenter + halfwidth,
                 interval_id)
                for contig, start, end, interval_id, peakcenter in data]
    else:
        # remove peakcenter
        data = [(contig, start, end, interval_id)
                for contig, start, end, interval_id, peakcenter in data]

    # get the sequences - cut at number of nucleotides
    sequences = []
    current_size, nseq = 0, 0
    new_data = []
    for contig, start, end, interval_id in data:
        lcontig = fasta.getLength(contig)
        start, end = max(0, start + offset), min(end + offset, lcontig)
        if start >= end:
            L.info(
                "writeSequencesForIntervals %s: sequence %s is empty: start=%i, end=%i, offset=%i - ignored"
                % (track, id, start, end, offset))
            continue
        seq = fasta.getSequence(contig, "+", start, end)
        sequences.append(seq)
        new_data.append((start, end, interval_id, contig))
        current_size += len(seq)
        if maxsize and current_size >= maxsize:
            L.info(
                "writeSequencesForIntervals %s: maximum size (%i) reached - only %i sequences output (%i ignored)"
                % (track, maxsize, nseq, len(data) - nseq))
            break
        nseq += 1

    data = new_data

    if shuffled:
        # note that shuffling is done on the unmasked sequences
        # Otherwise N's would be interspersed with real sequence
        # messing up motif finding unfairly. Instead, masking is
        # done on the shuffled sequence.
        sequences = [list(x) for x in sequences]
        for sequence in sequences:
            random.shuffle(sequence)
        sequences = maskSequences(["".join(x) for x in sequences], masker)

    c = E.Counter()
    outs = iotools.open_file(filename, "w")
    for masker in masker:
        if masker not in ("unmasked", "none", None):
            sequences = maskSequences(sequences, masker)

    for sequence, d in zip(sequences, data):
        c.input += 1
        if len(sequence) == 0:
            c.empty += 1
            continue
        start, end, id, contig = d
        id = "%s_%s %s:%i-%i" % (track, str(id), contig, start, end)
        outs.write(">%s\n%s\n" % (id, sequence))
        c.output += 1
    outs.close()

    E.info("%s" % c)

    return c.output
Ejemplo n.º 15
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-g",
                        "--genome-file",
                        dest="genome_file",
                        type=str,
                        help="filename with genome")

    parser.add_argument(
        "-i",
        "--ignore-missing",
        dest="ignore_missing",
        action="store_true",
        help="Ignore transcripts on contigs that are not in the genome-file.")

    parser.add_argument(
        "--min-intron-length",
        dest="min_intron_length",
        type=int,
        help=
        "minimum intron length. If the distance between two consecutive exons is smaller, the region will be marked 'unknown"
    )

    parser.add_argument("-m",
                        "--method",
                        dest="method",
                        type=str,
                        choices=["full"],
                        help="method to apply")

    parser.set_defaults(
        genome_file=None,
        flank=1000,
        max_frameshift_length=4,
        min_intron_length=30,
        ignore_missing=False,
        restrict_source=None,
        method="full",
        report_step=1000,
    )

    # add common options (-h/--help, ...) and parse command line
    (args) = E.start(parser, argv=argv, add_output_options=True)

    if not args.genome_file:
        raise ValueError("an indexed genome is required.")

    fasta = IndexedFasta.IndexedFasta(args.genome_file)

    iterator = GTF.transcript_iterator(GTF.iterator(args.stdin))

    annotateGenome(iterator, fasta, args)

    # write footer and output benchmark information.
    E.stop()
Ejemplo n.º 16
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-g",
                        "--genome-file",
                        dest="genome_file",
                        type=str,
                        help="filename with genome.")

    parser.add_argument("-f",
                        "--features",
                        dest="features",
                        type=str,
                        help="feature to collect.")

    parser.add_argument("-i",
                        "--files",
                        dest="files",
                        action="append",
                        help="use multiple annotations.")

    parser.add_argument(
        "-a",
        "--annotations",
        dest="annotations",
        type=str,
        help=
        "aggregate name for annotations if only single file is provided from STDIN."
    )

    parser.add_argument("--map-tsv-file",
                        dest="input_filename_map",
                        type=str,
                        help="filename with a map of gene_ids to categories.")

    parser.add_argument("-l",
                        "--max-length",
                        dest="max_length",
                        type=str,
                        help="maximum segment length.")

    parser.add_argument("-m",
                        "--merge-overlapping",
                        dest="merge",
                        action="store_true",
                        help="merge overlapping bed segments.")

    parser.add_argument("-s",
                        "--section",
                        dest="section",
                        type=str,
                        choices=("segments", "annotations", "workspace"),
                        help="annotator section.")

    parser.add_argument(
        "--subset",
        dest="subsets",
        type=str,
        action="append",
        help=
        "add filenames to delimit subsets within the gff files. The syntax is filename.gff,label,filename.ids."
    )

    parser.set_defaults(
        genome_file=None,
        feature=None,
        remove_random=True,
        section="segments",
        annotations="annotations",
        max_length=100000,
        files=[],
        subsets=[],
        input_filename_map=None,
        merge=False,
    )

    (args, unknown) = E.start(parser, unknowns=True)

    args.files += unknown
    if len(args.files) == 0:
        args.files.append("-")
    args.files = list(
        itertools.chain(*[re.split("[,; ]+", x) for x in args.files]))

    if args.subsets:
        subsets = collections.defaultdict(list)
        for s in args.subsets:
            filename_gff, label, filename_ids = s.split(",")
            subsets[filename_gff].append((label, filename_ids))
        args.subsets = subsets

    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
    else:
        fasta = None

    if args.section == "segments":
        prefix = "##Segs"
    elif args.section == "annotations":
        prefix = "##Id"
    elif args.section == "workspace":
        prefix = "##Work"
    else:
        raise ValueError("unknown section %s" % args.section)

    if args.max_length:
        max_length = args.max_length
    else:
        max_length = 0

    ninput, ntracks, ncontigs, nsegments, ndiscarded = 0, 0, 0, 0, 0

    if args.section in ("annotations"):
        contigs = set()
        it = itertools.groupby(Bed.iterator(args.stdin),
                               key=lambda x: x.track["name"])

        map_track2segments = {}
        for track, beds in it:
            ntracks += 1
            map_track2segments[track] = []
            first_segment = nsegments

            beds = list(beds)

            if args.merge:
                beds = Bed.merge(beds)

            for bed in beds:
                contig, start, end = bed.contig, bed.start, bed.end

                if args.remove_random and "random" in contig:
                    continue

                if max_length > 0 and end - start > max_length:
                    ndiscarded += 1
                    continue

                contigs.add(contig)
                map_track2segments[track].append(nsegments)
                args.stdout.write("%s\t%i\t%s\t(%i,%i)\n" %
                                  (prefix, nsegments, contig, start, end))
                nsegments += 1

            args.stdout.write("##Ann\t%s\t%s\n" % (track, "\t".join(
                ["%i" % x for x in range(first_segment, nsegments)])))
            E.info("track %s: annotated with %i segments" %
                   (track, nsegments - first_segment))

        ncontigs = len(contigs)
        E.info(
            "ninput=%i, ntracks=%i, ncontigs=%i, nsegments=%i, ndiscarded=%i" %
            (ninput, ntracks, ncontigs, nsegments, ndiscarded))

    E.stop()
Ejemplo n.º 17
0
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-e",
        "--extract",
        dest="extract",
        type="string",
        help="extract region for testing purposes. Format is "
        "contig:strand:from:to. "
        "The default coordinates are 0-based "
        "open/closed coordinates on both strands, but can be changed "
        "by --input-format. "
        "For example, 'chr1:+:10:12' will return "
        "bases 11 and 12 on chr1. Elements from the end of the "
        "string can be omitted. For example, 'chr1' will return "
        "all of chromosome 'chr1'.")

    input_format_choices = ("one-forward-open", "zero-both-open")
    parser.add_option("-i",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=input_format_choices,
                      help="coordinate format of input. Valid choices are "
                      "%s. See --extract. [default=%%default]." %
                      ", ".join(input_format_choices))

    parser.add_option(
        "-s",
        "--synonyms",
        dest="synonyms",
        type="string",
        help="list of synonyms. This is a comma separated with list "
        "of equivalence relations. For example, chrM=chrMT "
        "means that chrMT will refer to chrM and either "
        "can be used to retrieve a sequence "
        "[default=%default]")

    group = E.OptionGroup(parser, "Bencharking options")
    group.add_option("-b",
                     "--benchmark",
                     dest="benchmark",
                     action="store_true",
                     help="benchmark time for read access "
                     "[default=%default].")
    group.add_option("--benchmark-num-iterations",
                     dest="benchmark_num_iterations",
                     type="int",
                     help="number of iterations for benchmark "
                     "[default=%default].")
    group.add_option("--benchmark-fragment-size",
                     dest="benchmark_fragment_size",
                     type="int",
                     help="benchmark: fragment size [default=%default].")
    parser.add_option_group(group)

    group = E.OptionGroup(parser, "Validation options")
    group.add_option("--verify",
                     dest="verify",
                     type="string",
                     help="verify against other database [default=%default].")

    group.add_option("--verify-iterations",
                     dest="verify_num_iterations",
                     type="int",
                     help="number of iterations for verification "
                     "[default=%default].")
    parser.add_option_group(group)

    file_format_choices = ("fasta", "auto", "fasta.gz", "tar", "tar.gz")
    parser.add_option("--file-format",
                      dest="file_format",
                      type="choice",
                      choices=file_format_choices,
                      help="file format of input. Supply if data comes "
                      "from stdin "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(file_format_choices))

    parser.add_option("-a",
                      "--clean-sequence",
                      dest="clean_sequence",
                      action="store_true",
                      help="remove X/x from DNA sequences - they cause "
                      "errors in exonerate [default=%default].")

    parser.add_option("--allow-duplicates",
                      dest="allow_duplicates",
                      action="store_true",
                      help="allow duplicate identifiers. Further occurances "
                      "of an identifier are suffixed by an '_%i' "
                      "[default=%default].")

    parser.add_option("--regex-identifier",
                      dest="regex_identifier",
                      type="string",
                      help="regular expression for extracting the "
                      "identifier from fasta description line "
                      "[default=%default].")

    parser.add_option("--force-output",
                      dest="force",
                      action="store_true",
                      help="force overwriting of existing files "
                      "[default=%default].")

    translator_choices = ("solexa", "phred", "bytes", "range200")
    parser.add_option("-t",
                      "--translator",
                      dest="translator",
                      type="choice",
                      choices=translator_choices,
                      help="translate numerical quality scores. "
                      "Valid choices are %s [default=%%default]." %
                      ", ".join(translator_choices))

    group = E.OptionGroup(parser, 'Compression options')
    compression_choices = ("lzo", "zlib", "gzip", "dictzip", "bzip2", "debug")
    group.add_option("-c",
                     "--compression",
                     dest="compression",
                     type="choice",
                     choices=compression_choices,
                     help="compress database, using specified compression "
                     "method. "
                     "Valid choices are %s, but depend on availability on the "
                     "system "
                     "[default=%%default]." % ", ".join(compression_choices))

    group.add_option("--random-access-points",
                     dest="random_access_points",
                     type="int",
                     help="set random access points every # number "
                     "of nucleotides for block compression schemes "
                     "[default=%default].")

    group.add_option(
        "--compress-index",
        dest="compress_index",
        action="store_true",
        help="compress index. The default is to use a plain-text, "
        "human-readable index [default=%default].")

    parser.add_option_group(group)

    parser.set_defaults(extract=None,
                        input_format="zero-both-open",
                        benchmark_fragment_size=1000,
                        benchmark_num_iterations=1000000,
                        benchmark=False,
                        compression=None,
                        random_access_points=0,
                        synonyms=None,
                        verify=None,
                        verify_num_iterations=100000,
                        verify_fragment_size=100,
                        clean_sequence=False,
                        allow_duplicates=False,
                        regex_identifier=None,
                        compress_index=False,
                        file_format="auto",
                        force=False,
                        translator=None)

    (options, args) = E.start(parser)

    if options.synonyms:
        synonyms = {}
        for x in options.synonyms.split(","):
            a, b = x.split("=")
            a = a.strip()
            b = b.strip()
            if a not in synonyms:
                synonyms[a] = []
            synonyms[a].append(b)
    else:
        synonyms = None

    if options.translator:
        if options.translator == "phred":
            options.translator = IndexedFasta.TranslatorPhred()
        elif options.translator == "solexa":
            options.translator = IndexedFasta.TranslatorSolexa()
        elif options.translator == "bytes":
            options.translator = IndexedFasta.TranslatorBytes()
        elif options.translator == "range200":
            options.translator = IndexedFasta.TranslatorRange200()
        else:
            raise ValueError("unknown translator %s" % options.translator)

    if options.extract:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.setTranslator(options.translator)
        converter = IndexedFasta.getConverter(options.input_format)

        contig, strand, start, end = IndexedFasta.parseCoordinates(
            options.extract)
        sequence = fasta.getSequence(contig,
                                     strand,
                                     start,
                                     end,
                                     converter=converter)
        options.stdout.write(">%s\n%s\n" % (options.extract, sequence))

    elif options.benchmark:
        import timeit
        timer = timeit.Timer(
            stmt="IndexedFasta.benchmarkRandomFragment(fasta=fasta, size=%i)" %
            (options.benchmark_fragment_size),
            setup="from cgat import IndexedFasta\n"
            "fasta=IndexedFasta.IndexedFasta('%s')" % (args[0]))

        t = timer.timeit(number=options.benchmark_num_iterations)
        options.stdout.write("iter\tsize\ttime\n")
        options.stdout.write("%i\t%i\t%i\n" %
                             (options.benchmark_num_iterations,
                              options.benchmark_fragment_size, t))

    elif options.verify:
        fasta1 = IndexedFasta.IndexedFasta(args[0])
        fasta2 = IndexedFasta.IndexedFasta(options.verify)
        nerrors1 = IndexedFasta.verify(fasta1,
                                       fasta2,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors1))
        nerrors2 = IndexedFasta.verify(fasta2,
                                       fasta1,
                                       options.verify_num_iterations,
                                       options.verify_fragment_size,
                                       stdout=options.stdout)
        options.stdout.write("errors=%i\n" % (nerrors2))
    elif options.compress_index:
        fasta = IndexedFasta.IndexedFasta(args[0])
        fasta.compressIndex()
    else:
        if options.loglevel >= 1:
            options.stdlog.write("# creating database %s\n" % args[0])
            options.stdlog.write("# indexing the following files: \n# %s\n" %
                                 (" \n# ".join(args[1:])))
            options.stdlog.flush()

            if synonyms:
                options.stdlog.write("# Applying the following synonyms:\n")
                for k, v in list(synonyms.items()):
                    options.stdlog.write("# %s=%s\n" % (k, ",".join(v)))
                options.stdlog.flush()
        if len(args) < 2:
            print(globals()["__doc__"])
            sys.exit(1)

        iterator = IndexedFasta.MultipleFastaIterator(
            args[1:],
            regex_identifier=options.regex_identifier,
            format=options.file_format)

        IndexedFasta.createDatabase(
            args[0],
            iterator,
            synonyms=synonyms,
            random_access_points=options.random_access_points,
            compression=options.compression,
            clean_sequence=options.clean_sequence,
            allow_duplicates=options.allow_duplicates,
            translator=options.translator,
            force=options.force)

    E.stop()
# coding: utf-8

# In[2]:

import sys

from cgat import Bed
from cgat import IndexedFasta
from cgatcore import iotools
from cgat import Genomics


# In[3]:


genome = IndexedFasta.IndexedFasta("/shared/sudlab1/General/mirror/genomes/plain/hg38.fasta")


# In[7]:


bedfile = Bed.iterator(iotools.open_file(sys.argv[1]))
splice_site_dict = dict()
outfile = iotools.open_file(sys.argv[2], "w")
for utron in bedfile:
    
    ss5_sequence = genome.getSequence(utron.contig, "+", utron.start, utron.start+2)
    ss3_sequence = genome.getSequence(utron.contig, "+", utron.end-2, utron.end)
    if utron.strand == "+":
        splice_site_dict[utron.name] = (ss5_sequence, ss3_sequence)
        if ":" in utron.name:
Ejemplo n.º 19
0
def main(argv=sys.argv):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-m",
                        "--method",
                        dest="methods",
                        type=str,
                        action="append",
                        choices=("threshold", "stddev-above-mean",
                                 "multiple-of-mean"),
                        help="method to apply")

    parser.add_argument("-g",
                        "--genome-file",
                        dest="genome_file",
                        type=str,
                        help="filename with genome.")

    parser.add_argument("-t",
                        "--threshold",
                        dest="threshold",
                        type=float,
                        help="threshold to apply")

    parser.add_argument("-i",
                        "--bigwig-file",
                        dest="bigwig_file",
                        type=str,
                        metavar="bigwig",
                        help="filename with bigwig information.")

    parser.set_defaults(methods=[],
                        genome_file=None,
                        threshold=10,
                        max_distance=0)

    (args) = E.start(parser, add_pipe_options=True)

    if args.bigwig_file:
        bigwig_file = pyBigWig.open(args.bigwig_file)
    else:
        bigwig_file = None

    if args.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(args.genome_file)
        contigs = genome_fasta.getContigSizes()

    for method in args.methods:
        if method == "threshold":
            if not contigs:
                raise ValueError("please supply contig sizes")
            if not bigwig_file:
                raise NotImplementedError(
                    "threshold not implemented for wig files")
            processor = applyThreshold(bigwig_file,
                                       genome_fasta,
                                       threshold=args.threshold,
                                       max_distance=args.max_distance)
        elif method == "stddev-above-mean":
            if not contigs:
                raise ValueError("please supply contig sizes")
            if not bigwig_file:
                raise NotImplementedError(
                    "threshold not implemented for wig files")
            summary = getBigwigSummary(args.bigwig_file)
            threshold = summary.mean + args.threshold * summary.std
            E.info("applying threshold %f: mean=%f, std=%f" %
                   (threshold, summary.mean, summary.std))
            processor = applyThreshold(bigwig_file,
                                       genome_fasta,
                                       threshold=threshold,
                                       max_distance=args.max_distance)

        elif method == "multiple-of-mean":
            if not contigs:
                raise ValueError("please supply contig sizes")
            if not bigwig_file:
                raise NotImplementedError(
                    "threshold not implemented for wig files")
            summary = getBigwigSummary(args.bigwig_file)
            threshold = summary.mean * args.threshold
            E.info("applying threshold %f: mean=%f, std=%f" %
                   (threshold, summary.mean, summary.std))
            processor = applyThreshold(bigwig_file,
                                       genome_fasta,
                                       threshold=threshold,
                                       max_distance=args.max_distance)

    outfile = args.stdout

    outfile.write("".join(["%s\t%i\t%i\n" % x for x in processor]))
    outfile.write("\n")

    E.stop()
Ejemplo n.º 20
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--is-gtf",
                      dest="is_gtf",
                      action="store_true",
                      help="input is gtf instead of gff.")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-m",
                      "--merge-adjacent",
                      dest="merge",
                      action="store_true",
                      help="merge adjacent intervals with the same attributes."
                      " [default=%default]")

    parser.add_option("-e",
                      "--feature",
                      dest="feature",
                      type="string",
                      help="filter by a feature, for example 'exon', 'CDS'."
                      " If set to the empty string, all entries are output "
                      "[%default].")

    parser.add_option("-f",
                      "--maskregions-bed-file",
                      dest="filename_masks",
                      type="string",
                      metavar="gff",
                      help="mask sequences with regions given in gff file "
                      "[%default].")

    parser.add_option("--remove-masked-regions",
                      dest="remove_masked_regions",
                      action="store_true",
                      help="remove regions instead of masking [%default].")

    parser.add_option("--min-interval-length",
                      dest="min_length",
                      type="int",
                      help="set minimum length for sequences output "
                      "[%default]")

    parser.add_option("--max-length",
                      dest="max_length",
                      type="int",
                      help="set maximum length for sequences output "
                      "[%default]")

    parser.add_option("--extend-at",
                      dest="extend_at",
                      type="choice",
                      choices=("none", "3", "5", "both", "3only", "5only"),
                      help="extend at no end, 3', 5' or both ends. If "
                      "3only or 5only are set, only the added sequence "
                      "is returned [default=%default]")

    parser.add_option("--header-attributes",
                      dest="header_attr",
                      action="store_true",
                      help="add GFF entry attributes to the FASTA record"
                      " header section")

    parser.add_option("--extend-by",
                      dest="extend_by",
                      type="int",
                      help="extend by # bases [default=%default]")

    parser.add_option("--extend-with",
                      dest="extend_with",
                      type="string",
                      help="extend using base [default=%default]")

    parser.add_option("--masker",
                      dest="masker",
                      type="choice",
                      choices=("dust", "dustmasker", "softmask", "none"),
                      help="apply masker [%default].")

    parser.add_option("--fold-at",
                      dest="fold_at",
                      type="int",
                      help="fold sequence every n bases[%default].")

    parser.add_option(
        "--fasta-name-attribute",
        dest="naming_attribute",
        type="string",
        help="use attribute to name fasta entry. Currently only compatable"
        " with gff format [%default].")

    parser.set_defaults(
        is_gtf=False,
        genome_file=None,
        merge=False,
        feature=None,
        filename_masks=None,
        remove_masked_regions=False,
        min_length=0,
        max_length=0,
        extend_at=None,
        extend_by=100,
        extend_with=None,
        masker=None,
        fold_at=None,
        naming_attribute=False,
        header_attr=False,
    )

    (options, args) = E.start(parser)

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contigs = fasta.getContigSizes()

    if options.is_gtf:
        iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))
    else:
        gffs = GTF.iterator(options.stdin)
        if options.merge:
            iterator = GTF.joined_iterator(gffs)
        else:
            iterator = GTF.chunk_iterator(gffs)

    masks = None
    if options.filename_masks:
        masks = {}
        with iotools.open_file(options.filename_masks, "r") as infile:
            e = GTF.readAsIntervals(GTF.iterator(infile))

        # convert intervals to intersectors
        for contig in list(e.keys()):
            intersector = quicksect.IntervalTree()
            for start, end in e[contig]:
                intersector.add(start, end)
            masks[contig] = intersector

    ninput, noutput, nmasked, nskipped_masked = 0, 0, 0, 0
    nskipped_length = 0
    nskipped_noexons = 0

    feature = options.feature

    # iterator is a list containing groups (lists) of features.
    # Each group of features have in common the same transcript ID, in case of
    # GTF files.
    for ichunk in iterator:

        ninput += 1

        if feature:
            chunk = [x for x in ichunk if x.feature == feature]
        else:
            chunk = ichunk

        if len(chunk) == 0:
            nskipped_noexons += 1
            E.info("no features in entry from "
                   "%s:%i..%i - %s" % (ichunk[0].contig, ichunk[0].start,
                                       ichunk[0].end, str(ichunk[0])))
            continue

        contig, strand = chunk[0].contig, chunk[0].strand

        if options.is_gtf:
            name = chunk[0].transcript_id
        else:
            if options.naming_attribute:
                attr_dict = {
                    x.split("=")[0]: x.split("=")[1]
                    for x in chunk[0].attributes.split(";")
                }
                name = attr_dict[options.naming_attribute]
            else:
                name = str(chunk[0].attributes)

        lcontig = contigs[contig]
        positive = Genomics.IsPositiveStrand(strand)
        intervals = [(x.start, x.end) for x in chunk]
        intervals.sort()

        if masks:
            if contig in masks:
                masked_regions = []
                for start, end in intervals:
                    masked_regions += [(x.start, x.end)
                                       for x in masks[contig].find(
                                           quicksect.Interval(start, end))]

                masked_regions = Intervals.combine(masked_regions)
                if len(masked_regions):
                    nmasked += 1

                if options.remove_masked_regions:
                    intervals = Intervals.truncate(intervals, masked_regions)
                else:
                    raise NotImplementedError("unimplemented")

                if len(intervals) == 0:
                    nskipped_masked += 1
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# skipped because fully masked: "
                            "%s: regions=%s masks=%s\n" %
                            (name, str([(x.start, x.end)
                                        for x in chunk]), masked_regions))
                    continue

        out = intervals

        if options.extend_at and not options.extend_with:
            if options.extend_at == "5only":
                intervals = [(max(0, intervals[0][0] - options.extend_by),
                              intervals[0][0])]
            elif options.extend_at == "3only":
                intervals = [(intervals[-1][1],
                              min(lcontig,
                                  intervals[-1][1] + options.extend_by))]
            else:
                if options.extend_at in ("5", "both"):
                    intervals[0] = (max(0,
                                        intervals[0][0] - options.extend_by),
                                    intervals[0][1])
                if options.extend_at in ("3", "both"):
                    intervals[-1] = (intervals[-1][0],
                                     min(lcontig,
                                         intervals[-1][1] + options.extend_by))

        if not positive:
            intervals = [(lcontig - x[1], lcontig - x[0])
                         for x in intervals[::-1]]
            out.reverse()

        s = [
            fasta.getSequence(contig, strand, start, end)
            for start, end in intervals
        ]
        # IMS: allow for masking of sequences
        s = Masker.maskSequences(s, options.masker)
        l = sum([len(x) for x in s])
        if (l < options.min_length
                or (options.max_length and l > options.max_length)):
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write("# skipped because length out of bounds "
                                     "%s: regions=%s len=%i\n" %
                                     (name, str(intervals), l))
                continue

        if options.extend_at and options.extend_with:
            extension = "".join((options.extend_with, ) * options.extend_by)

            if options.extend_at in ("5", "both"):
                s[1] = extension + s[1]
            if options.extend_at in ("3", "both"):
                s[-1] = s[-1] + extension

        if options.fold_at:
            n = options.fold_at
            s = "".join(s)
            seq = "\n".join([s[i:i + n] for i in range(0, len(s), n)])
        else:
            seq = "\n".join(s)

        if options.header_attr:
            attributes = " ".join(
                [":".join([ax, ay]) for ax, ay in chunk[0].asDict().items()])
            options.stdout.write(
                ">%s %s:%s:%s feature:%s %s\n%s\n" %
                (name, contig, strand, ";".join(
                    ["%i-%i" % x
                     for x in out]), chunk[0].feature, attributes, seq))
        else:
            options.stdout.write(
                ">%s %s:%s:%s\n%s\n" %
                (name, contig, strand, ";".join(["%i-%i" % x
                                                 for x in out]), seq))

        noutput += 1

    E.info("ninput=%i, noutput=%i, nmasked=%i, nskipped_noexons=%i, "
           "nskipped_masked=%i, nskipped_length=%i" %
           (ninput, noutput, nmasked, nskipped_noexons, nskipped_masked,
            nskipped_length))

    E.stop()
Ejemplo n.º 21
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-g",
                        "--genome-file",
                        dest="genome_file",
                        type=str,
                        help="filename with genome.")

    parser.add_argument("--remove-regex",
                        dest="remove_regex",
                        type=str,
                        help="regular expression of contigs to remove.")

    parser.add_argument("-e",
                        "--gff-file",
                        dest="gff_file",
                        type=str,
                        help="gff file to use for getting contig sizes.")

    parser.add_argument(
        "-f",
        "--fixed-width-windows",
        dest="fixed_width_windows",
        type=str,
        help="fixed width windows. Supply the window size as a "
        "parameter. Optionally supply an offset.")

    parser.set_defaults(
        genome_file=None,
        remove_regex=None,
        fixed_windows=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (args) = E.start(parser, argv=argv)

    if args.remove_regex:
        remove_regex = re.compile(args.remove_regex)
    else:
        remove_regex = None

    if args.fixed_width_windows:
        v = list(map(int, args.fixed_width_windows.split(",")))
        if len(v) == 2:
            window_size, window_increment = v
        elif len(v) == 1:
            window_size, window_increment = v[0], v[0]
        else:
            raise ValueError(
                "could not parse window size '%s': should be size[,increment]"
                % args.fixed_width_windows)

    if args.gff_file:
        infile = iotools.open_file(args.gff_file, "r")
        gff = GTF.readFromFile(infile)
        infile.close()
        for g in gff:
            try:
                map_contig2size[g.mName] = max(map_contig2size[g.mName], g.end)
            except ValueError:
                map_contig2size[g.mName] = g.end

    else:
        gff = None

    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
        map_contig2size = fasta.getContigSizes(with_synonyms=False)
    else:
        fasta = None

    if map_contig2size is None:
        raise ValueError("no source of contig sizes supplied")

    # do sth
    counter = E.Counter()

    for contig, size in list(map_contig2size.items()):
        size = int(size)
        counter.input += 1

        if remove_regex and remove_regex.search(contig):
            counter.skipped += 1
            continue

        if args.fixed_width_windows:
            for x in range(0, size, window_increment):
                if x + window_size > size:
                    continue
                args.stdout.write("%s\t%i\t%i\n" %
                                  (contig, x, min(size, x + window_size)))
                counter.windows += 1
        else:
            args.stdout.write("%s\t%i\t%i\n" % (contig, 0, size))
            counter.windows += 1

        counter.output += 1

    E.info(str(counter))

    # write footer and output benchmark information.
    E.stop()
from cgat import IndexedFasta
from cgatcore import iotools
from cgat import Genomics

from cgatcore import expriment as E

parser = E.OptionParser(version="%prog version: $1.0$",
                           usage=globals()["__doc__"])
parser.add_option("-g", "--genome", dest="genome",
                  help="index fasta genome sequence")
parser.add_option("-O", "--per-utron-out", dest="outfile",
                  help="File name for output file that will contain one row"
                       "per entry in the input")
options, args = E.start(parser, sys.argv)
                  
genome = IndexedFasta.IndexedFasta(options["genome"])


bedfile = Bed.iterator(options.stdin)
splice_site_dict = dict()
outfile = iotools.open_file(options["outfile"], "w")
outfile.write("\t".join("transcript_id",
                        "strand",
                        "ss5",
                        "ss3",
                        "contig",
                        "splice_site_start",
                        "splice_site_end",
                        "utron_size"))
for utron in bedfile:
    
Ejemplo n.º 23
0
def buildUngappedContigBed(infile, outfiles):
    '''
    Constructs :term:`BED` format files containing both gapped and ungapped
    contig sizes from an index genome :term:`fasta` file.

    Parameters
    ----------
    infile: str
      infile is constructed from `PARAMS` variable to retrieve
      the `genome` :term:`fasta` file

    assembly_gaps_min_size: int
      `PARAMS` - the minimum size (in nucleotides) for an assembly gap

    Returns
    -------
    outfiles: list
      two separate :term:`BED` format output files containing the contig sizes
      for contigs with and without gaps.  The names are defined
      in the `PARAMS` `interface_contigs_ungapped_bed` and
      `interface_gaps_bed` parameters.
    '''

    prefix = P.snip(infile, ".fasta")
    fasta = IndexedFasta.IndexedFasta(prefix)
    outs_nogap = iotools.open_file(outfiles[0], "w")
    outs_gap = iotools.open_file(outfiles[1], "w")
    min_gap_size = PARAMS["assembly_gaps_min_size"]

    for contig, size in fasta.getContigSizes(with_synonyms=False).items():

        seq = fasta.getSequence(contig)

        def gapped_regions(seq):
            is_gap = seq[0] == "N"
            last = 0
            for x, c in enumerate(seq):
                if c == "N":
                    if not is_gap:
                        last = x
                        is_gap = True
                else:
                    if is_gap:
                        yield (last, x)
                        last = x
                        is_gap = False
            if is_gap:
                yield last, size

        last_end = 0
        for start, end in gapped_regions(seq):
            if end - start < min_gap_size:
                continue

            if last_end != 0:
                outs_nogap.write("%s\t%i\t%i\n" % (contig, last_end, start))
            outs_gap.write("%s\t%i\t%i\n" % (contig, start, end))
            last_end = end

        if last_end < size:
            outs_nogap.write("%s\t%i\t%i\n" % (contig, last_end, size))

    outs_nogap.close()
    outs_gap.close()
Ejemplo n.º 24
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id",
                            usage=globals()["__doc__"])

    parser.add_option("-u",
                      "--ucsc-genome",
                      dest="ucsc_genome",
                      type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("--extend",
                      dest="extension",
                      type="int",
                      help="extend tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--shift-size",
                      dest="shift",
                      type="int",
                      help="shift tags by this number of bases "
                      "[default=%default].")

    parser.add_option("--window-size",
                      dest="window_size",
                      type="int",
                      help="window size to be used in the analysis"
                      "[default=%default].")

    parser.add_option("--saturation-iterations",
                      dest="saturation_iterations",
                      type="int",
                      help="iterations for saturation analysis "
                      "[default=%default].")

    parser.add_option("-t",
                      "--toolset",
                      dest="toolset",
                      type="choice",
                      action="append",
                      choices=("saturation", "coverage", "enrichment", "dmr",
                               "rms", "rpm", "all", "convert"),
                      help="actions to perform [default=%default].")

    parser.add_option("-w",
                      "--bigwig-file",
                      dest="bigwig",
                      action="store_true",
                      help="store wig files as bigwig files - requires a "
                      "genome file [default=%default]")

    parser.add_option("--treatment",
                      dest="treatment_files",
                      type="string",
                      action="append",
                      help="BAM files for treatment. At least one is required "
                      "[%default]")

    parser.add_option("--control",
                      dest="control_files",
                      type="string",
                      action="append",
                      help="BAM files for control for differential "
                      "methylation analysis. Optional [%default].")

    parser.add_option("--input",
                      dest="input_files",
                      type="string",
                      action="append",
                      help="BAM files for input correction. "
                      "Optional [%default].")

    parser.add_option("--is-not-medip",
                      dest="is_medip",
                      action="store_false",
                      help="data is not MeDIP data and is not expected "
                      "to fit the calibration model. No CpG "
                      "density normalized rms data is computed"
                      "[default=%default].")

    parser.add_option("--output-rdata",
                      dest="output_rdata",
                      action="store_true",
                      help="in dmr analysis, write R session to file. "
                      "The file name "
                      "is given by --ouptut-filename-pattern [%default].")

    parser.add_option("--rdata-file",
                      dest="input_rdata",
                      type="string",
                      help="in dmr analysis, read saved R session from "
                      "file. This can be used to apply different "
                      "filters [%default]")

    parser.add_option("--fdr-threshold",
                      dest="fdr_threshold",
                      type="float",
                      help="FDR threshold to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--fdr-method",
                      dest="fdr_method",
                      type="choice",
                      choices=("bonferroni", "BH", "holm", "hochberg",
                               "hommel", "BY", "fdr", "none"),
                      help="FDR method to apply for selecting DMR "
                      "[default=%default].")

    parser.add_option("--bwa",
                      dest="bwa",
                      action="store_true",
                      help="alignment generated with bwa"
                      "[default=%default].")

    parser.add_option("--unique",
                      dest="unique",
                      type="float",
                      help="Threshold p-value to determine which read pile\
                      ups are the result of PCR overamplification"
                      "[default=%default].")

    parser.add_option("--chroms",
                      dest="chroms",
                      type="str",
                      help="Comma delimited list of chromosomes to include"
                      "[default=%default].")

    parser.set_defaults(input_format="bam",
                        ucsc_genome="Hsapiens.UCSC.hg19",
                        genome_file=None,
                        extend=0,
                        shift=0,
                        window_size=300,
                        saturation_iterations=10,
                        toolset=[],
                        bigwig=False,
                        treatment_files=[],
                        control_files=[],
                        input_files=[],
                        output_rdata=False,
                        input_rdata=None,
                        is_medip=True,
                        fdr_threshold=0.1,
                        fdr_method="BH",
                        bwa=False,
                        unique=0.001,
                        chroms=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if "convert" in options.toolset:

        results = []
        for line in csv.DictReader(options.stdin, dialect="excel-tab"):
            if line['edgeR.p.value'] == "NA":
                continue

            # assumes only a single treatment/control
            treatment_name = options.treatment_files[0]
            control_name = options.control_files[0]
            status = "OK"
            try:
                results.append(
                    Expression.GeneExpressionResult._make((
                        "%s:%i-%i" %
                        (line['chr'], int(line['start']), int(line['stop'])),
                        treatment_name,
                        float(line['MSets1.rpkm.mean']),
                        0,
                        control_name,
                        float(line['MSets2.rpkm.mean']),
                        0,
                        float(line['edgeR.p.value']),
                        float(line['edgeR.adj.p.value']),
                        float(line['edgeR.logFC']),
                        math.pow(2.0, float(line['edgeR.logFC'])),
                        float(line['edgeR.logFC']),  # no transform
                        ["0", "1"][float(line['edgeR.adj.p.value']) <
                                   options.fdr_threshold],
                        status)))
            except ValueError as msg:
                raise ValueError("parsing error %s in line: %s" % (msg, line))

        Expression.writeExpressionResults(options.stdout, results)
        return

    if len(options.treatment_files) < 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting bigwig")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    if len(options.toolset) == 0:
        options.toolset = ["all"]

    do_all = "all" in options.toolset

    if options.chroms is None:
        chrstring = ""
    else:
        chroms = options.chroms.split(",")
        chrstring = ' chr.select=c(\"%s\"), ' % '\",\"'.join(chroms)
    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.%s' % options.ucsc_genome
    R.library(genome_file)

    window_size = options.window_size
    extend = options.extend
    shift = options.shift
    saturation_iterations = options.saturation_iterations

    uniq = float(options.unique)

    if options.bwa is True:
        BWA = "TRUE"
    else:
        BWA = "FALSE"

    if "saturation" in options.toolset or do_all:
        E.info("saturation analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''sr = MEDIPS.saturation(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            window_size=%(window_size)i,
            uniq=%(uniq)s,
            nit = %(saturation_iterations)i,
            paired = %(paired)s,
            bwa = %(BWA)s,
            %(chrstring)s
            nrit = 1)''' % locals())

            R.png(E.get_output_file("%s_saturation.png" % fn))
            R('''MEDIPS.plotSaturation(sr)''')
            R('''dev.off()''')
            R('''write.table(sr$estimation, file ='%s', sep='\t')''' %
              E.get_output_file("%s_saturation_estimation.tsv" % fn))

            outfile = iotools.open_file(
                E.get_output_file("%s_saturation.tsv" % fn), "w")
            outfile.write("category\tvalues\n")
            outfile.write("estimated_correlation\t%s\n" %
                          ",".join(["%f" % x for x in R('''sr$maxEstCor''')]))
            outfile.write("true_correlation\t%s\n" %
                          ",".join(["%f" % x for x in R('''sr$maxTruCor''')]))
            outfile.write("nreads\t%s\n" %
                          ",".join(["%i" % x
                                    for x in R('''sr$numberReads''')]))
            outfile.close()

    if "coverage" in options.toolset or do_all:
        E.info("CpG coverage analysis")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''cr = MEDIPS.seqCoverage(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            pattern='CG',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            R.png(E.get_output_file("%s_cpg_coverage_pie.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "pie", cov.level = c(0, 1, 2, 3, 4, 5))''')
            R('''dev.off()''')

            R.png(E.get_output_file("%s_cpg_coverage_hist.png" % fn))
            R('''MEDIPS.plotSeqCoverage(seqCoverageObj=cr,
            type = "hist", t=15)''')
            R('''dev.off()''')

            # note: this file is large
            R('''write.table(cr$cov.res, file=gzfile('%s','w'),
            sep='\t')''' %
              E.get_output_file("%s_saturation_coveredpos.tsv.gz" % fn))

    if 'enrichment' in options.toolset or do_all:
        E.info("CpG enrichment analysis")
        outfile = iotools.open_file(E.get_output_file("enrichment.tsv.gz"),
                                    "w")
        slotnames = (("regions.CG", "regions_CG",
                      "%i"), ("regions.C", "regions_C",
                              "%s"), ("regions.G", "regions_G", "%f"),
                     ("regions.relH", "regions_relH",
                      "%i"), ("regions.GoGe", "regions_GoGe",
                              "%i"), ("genome.CG", "genome_CG",
                                      "%s"), ("genome.C", "genome_C", "%s"),
                     ("genome.G", "genome_G", "%i"), ("genome.relH",
                                                      "genome_relH", "%i"),
                     ("enrichment.score.relH", "enrichment_relH", "%s"),
                     ("enrichment.score.GoGe", "enrichment_GoGe", "%s"))

        outfile.write("\t".join(['sample'] + [x[1] for x in slotnames]) + "\n")
        for fn in options.treatment_files + options.control_files:
            paired = isPaired(fn)
            R('''ce = MEDIPS.CpGenrich(
            file='%(fn)s',
            BSgenome='%(genome_file)s',
            shift=%(shift)i,
            extend=%(extend)i,
            paired=%(paired)s,
            bwa=%(BWA)s,
            %(chrstring)s
            uniq=%(uniq)s)''' % locals())

            outfile.write("%s" % fn)
            for slotname, label, pattern in slotnames:
                value = tuple(R('''ce$%s''' % slotname))
                if len(value) == 0:
                    value = ""
                outfile.write("\t%s" % pattern % value[0])
            outfile.write("\n")
        outfile.close()

    if options.input_rdata:
        E.info("reading R session info from '%s'" % options.input_rdata)
        R('''load('%s')''' % options.input_rdata)

    else:
        if "dmr" in options.toolset or "correlation" in options.toolset \
           or do_all:
            # build four sets
            for x, fn in enumerate(options.treatment_files):
                paired = isPaired(fn)
                E.info("loading '%s'" % fn)
                R('''treatment_R%(x)i = MEDIPS.createSet(
                file='%(fn)s',
                BSgenome='%(genome_file)s',
                shift=%(shift)i,
                extend=%(extend)i,
                window_size=%(window_size)i,
                paired=%(paired)s,
                bwa=%(BWA)s,
                %(chrstring)s
                uniq=%(uniq)s)''' % locals())
            R('''treatment_set = c(%s)''' % ",".join([
                "treatment_R%i" % x
                for x in range(len(options.treatment_files))
            ]))

            if options.control_files:
                for x, fn in enumerate(options.control_files):
                    paired = isPaired(fn)
                    E.info("loading '%s'" % fn)
                    R('''control_R%(x)i = MEDIPS.createSet(
                    file='%(fn)s',
                    BSgenome='%(genome_file)s',
                    shift=%(shift)i,
                    extend=%(extend)i,
                    window_size=%(window_size)i,
                    paired=%(paired)s,
                    bwa=%(BWA)s,
                    %(chrstring)s
                    uniq=%(uniq)s)''' % locals())
                R('''control_set = c(%s)''' % ",".join([
                    "control_R%i" % x
                    for x in range(len(options.control_files))
                ]))

            # build coupling vector
            R('''CS = MEDIPS.couplingVector(pattern="CG",
            refObj = treatment_set[[1]])''')

            if "correlation" in options.toolset or do_all:
                R('''cor.matrix = MEDIPS.correlation(
                c(treatment_set, control_set))''')

                R('''write.table(cor.matrix,
                file='%s',
                sep="\t")''' % E.get_output_file("correlation"))

            if "dmr" in options.toolset or do_all:
                # Data that does not fit the model causes
                # "Error in 1:max_signal_index : argument of length 0"
                # The advice is to set MeDIP=FALSE
                # See: http://comments.gmane.org/
                # gmane.science.biology.informatics.conductor/52319

                if options.is_medip:
                    medip = "TRUE"
                else:
                    medip = "FALSE"
                fdr_method = options.fdr_method

                E.info("applying test for differential methylation")
                R('''meth = MEDIPS.meth(
                MSet1 = treatment_set,
                MSet2 = control_set,
                CSet = CS,
                ISet1 = NULL,
                ISet2 = NULL,
                p.adj = "%(fdr_method)s",
                diff.method = "edgeR",
                MeDIP = %(medip)s,
                CNV = F,
                minRowSum = 1)''' % locals())

                # Note: several Gb in size
                # Output full methylation data table
                R('''write.table(meth,
                file=gzfile('%s', 'w'),
                sep="\t",
                row.names=F,
                quote=F)''' % E.get_output_file("data.tsv.gz"))

                # save R session
                if options.output_rdata:
                    R('''save.image(file='%s', safe=FALSE)''' %
                      E.get_output_file("session.RData"))

    # DMR analysis - test for windows and output
    if "dmr" in options.toolset:

        E.info("selecting differentially methylated windows")

        # test windows for differential methylation
        fdr_threshold = options.fdr_threshold
        R('''tested = MEDIPS.selectSig(meth,
        adj=T,
        ratio=NULL,
        p.value=%(fdr_threshold)f,
        bg.counts=NULL,
        CNV=F)''' % locals())

        R('''write.table(tested,
        file=gzfile('%s', 'w'),
        sep="\t",
        quote=F)''' % E.get_output_file("significant_windows.gz"))

        # select gain and merge adjacent windows
        try:
            R('''gain = tested[which(tested[, grep("logFC", colnames(tested))] > 0),];
            gain_merged = MEDIPS.mergeFrames(frames=gain, distance=1)''')
            E.info('gain output: %s, merged: %s' %
                   (str(R('''dim(gain)''')), str(R('''dim(gain_merged)'''))))
            R('''of=gzfile('%s', 'w');
            write.table(gain_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=FALSE,
            col.names=FALSE); close(of)''' % E.get_output_file("gain.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute gain windows: msg=%s" % msg)
        # select loss and merge adjacent windows
        try:
            R('''loss = tested[which(tested[, grep("logFC", colnames(tested))] < 0),];
            loss_merged = MEDIPS.mergeFrames(frames=loss, distance=1)''')
            E.info('loss output: %s, merged: %s' %
                   (str(R('''dim(loss)''')), str(R('''dim(loss_merged)'''))))

            R('''of=gzfile('%s', 'w');
            write.table(loss_merged,
            file=of,
            sep="\t",
            quote=F,
            row.names=F,
            col.names=F); close(of)''' % E.get_output_file("loss.bed.gz"))
        except rpy2.rinterface.RRuntimeError as msg:
            E.warn("could not compute loss windows: msg=%s" % msg)

    # if "rpm" in options.toolset or do_all:
    #     outputfile = E.get_output_file("rpm.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = T, descr = "rpm")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # if "rms" in options.toolset or do_all:
    #     outputfile = E.get_output_file("rms.wig")
    #     R('''MEDIPS.exportWIG(file = '%(outputfile)s',
    #     data = CONTROL.SET, raw = F, descr = "rms")''' %
    #       locals())
    #     if options.bigwig:
    #         bigwig(outputfile, contig_sizes)
    #     else:
    #         compress(outputfile)

    # write footer and output benchmark information.
    E.stop()
Ejemplo n.º 25
0
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "-m",
        "--method",
        dest="method",
        type=str,
        choices=("add-flank", "add-upstream-flank", "add-downstream-flank",
                 "crop", "crop-unique", "complement-groups", "combine-groups",
                 "filter-range", "join-features", "merge-features", "sanitize",
                 "to-forward-coordinates", "to-forward-strand", "rename-chr"),
        help="method to apply ")

    parser.add_argument("--ignore-strand",
                        dest="ignore_strand",
                        help="ignore strand information.",
                        action="store_true")

    parser.add_argument("--is-gtf",
                        dest="is_gtf",
                        action="store_true",
                        help="input will be treated as gtf.")

    parser.add_argument("-c",
                        "--contigs-tsv-file",
                        dest="input_filename_contigs",
                        type=str,
                        help="filename with contig lengths.")

    parser.add_argument(
        "--agp-file",
        dest="input_filename_agp",
        type=str,
        help="agp file to map coordinates from contigs to scaffolds.")

    parser.add_argument("-g",
                        "--genome-file",
                        dest="genome_file",
                        type=str,
                        help="filename with genome.")

    parser.add_argument("--crop-gff-file",
                        dest="filename_crop_gff",
                        type=str,
                        help="GFF/GTF file to crop against.")

    parser.add_argument(
        "--group-field",
        dest="group_field",
        type=str,
        help="""gff field/attribute to group by such as gene_id, "
        "transcript_id, ... .""")

    parser.add_argument(
        "--filter-range",
        dest="filter_range",
        type=str,
        help="extract all elements overlapping a range. A range is "
        "specified by eithor 'contig:from..to', 'contig:+:from..to', "
        "or 'from,to' .")

    parser.add_argument("--sanitize-method",
                        dest="sanitize_method",
                        type=str,
                        choices=("ucsc", "ensembl", "genome"),
                        help="method to use for sanitizing chromosome names. "
                        ".")

    parser.add_argument(
        "--flank-method",
        dest="flank_method",
        type=str,
        choices=("add", "extend"),
        help="method to use for adding flanks. ``extend`` will "
        "extend existing features, while ``add`` will add new features. "
        ".")

    parser.add_argument("--skip-missing",
                        dest="skip_missing",
                        action="store_true",
                        help="skip entries on missing contigs. Otherwise an "
                        "exception is raised .")

    parser.add_argument(
        "--contig-pattern",
        dest="contig_pattern",
        type=str,
        help="a comma separated list of regular expressions specifying "
        "contigs to be removed when running method sanitize .")

    parser.add_argument(
        "--assembly-report",
        dest="assembly_report",
        type=str,
        help="path to assembly report file which allows mapping of "
        "ensembl to ucsc contigs when running method sanitize .")

    parser.add_argument(
        "--assembly-report-hasids",
        dest="assembly_report_hasIDs",
        type=int,
        help="path to assembly report file which allows mapping of "
        "ensembl to ucsc contigs when running method sanitize .")

    parser.add_argument(
        "--assembly-report-ucsccol",
        dest="assembly_report_ucsccol",
        type=int,
        help="column in the assembly report containing ucsc contig ids"
        ".")

    parser.add_argument(
        "--assembly-report-ensemblcol",
        dest="assembly_report_ensemblcol",
        type=int,
        help="column in the assembly report containing ensembl contig ids")

    parser.add_argument(
        "--assembly-extras",
        dest="assembly_extras",
        type=str,
        help="additional mismatches between gtf and fasta to fix when"
        "sanitizing the genome .")

    parser.add_argument("--extension-upstream",
                        dest="extension_upstream",
                        type=float,
                        help="extension for upstream end .")

    parser.add_argument("--extension-downstream",
                        dest="extension_downstream",
                        type=float,
                        help="extension for downstream end .")

    parser.add_argument("--min-distance",
                        dest="min_distance",
                        type=int,
                        help="minimum distance of features to merge/join .")

    parser.add_argument("--max-distance",
                        dest="max_distance",
                        type=int,
                        help="maximum distance of features to merge/join .")

    parser.add_argument("--min-features",
                        dest="min_features",
                        type=int,
                        help="minimum number of features to merge/join .")

    parser.add_argument("--max-features",
                        dest="max_features",
                        type=int,
                        help="maximum number of features to merge/join .")

    parser.add_argument(
        "--rename-chr-file",
        dest="rename_chr_file",
        type=str,
        help="mapping table between old and new chromosome names."
        "TAB separated 2-column file.")

    parser.set_defaults(input_filename_contigs=False,
                        filename_crop_gff=None,
                        input_filename_agp=False,
                        genome_file=None,
                        rename_chr_file=None,
                        add_up_flank=None,
                        add_down_flank=None,
                        complement_groups=False,
                        crop=None,
                        crop_unique=False,
                        ignore_strand=False,
                        filter_range=None,
                        min_distance=0,
                        max_distance=0,
                        min_features=1,
                        max_features=0,
                        extension_upstream=1000,
                        extension_downstream=1000,
                        sanitize_method="ucsc",
                        flank_method="add",
                        output_format="%06i",
                        skip_missing=False,
                        is_gtf=False,
                        group_field=None,
                        contig_pattern=None,
                        assembly_report=None,
                        assembly_report_hasIDs=1,
                        assembly_report_ensemblcol=4,
                        assembly_report_ucsccol=9,
                        assembly_extras=None)

    (args) = E.start(parser, argv=argv)

    contigs = None
    genome_fasta = None
    chr_map = None

    if args.input_filename_contigs:
        contigs = Genomics.readContigSizes(
            iotools.open_file(args.input_filename_contigs, "r"))

    if args.genome_file:
        genome_fasta = IndexedFasta.IndexedFasta(args.genome_file)
        contigs = genome_fasta.getContigSizes()

    if args.rename_chr_file:
        chr_map = {}
        with open(args.rename_chr_file, 'r') as filein:
            reader = csv.reader(filein, delimiter='\t')
            for row in reader:
                if len(row) != 2:
                    raise ValueError(
                        "Mapping table must have exactly two columns")
                chr_map[row[0]] = row[1]
        if not len(chr_map.keys()) > 0:
            raise ValueError("Empty mapping dictionnary")

    if args.assembly_report:
        df = pd.read_csv(args.assembly_report,
                         comment="#",
                         header=None,
                         sep="\t")
        # fixes naming inconsistency in assembly report: ensembl chromosome
        # contigs found in columnn 0, ensembl unassigned contigs found in
        # column 4.
        if args.assembly_report_hasIDs == 1:
            ucsccol = args.assembly_report_ucsccol
            ensemblcol = args.assembly_report_ensemblcol
            df.loc[df[1] == "assembled-molecule",
                   ensemblcol] = df.loc[df[1] == "assembled-molecule", 0]
            if args.sanitize_method == "ucsc":
                assembly_dict = df.set_index(ensemblcol)[ucsccol].to_dict()
            elif args.sanitize_method == "ensembl":
                assembly_dict = df.set_index(ucsccol)[ensemblcol].to_dict()
            else:
                raise ValueError(''' When using assembly report,
                please specify sanitize method as either
                "ucsc" or "ensembl" to specify direction of conversion
                ''')
        else:
            assembly_dict = {}
        if args.assembly_extras is not None:
            assembly_extras = args.assembly_extras.split(",")
            for item in assembly_extras:
                item = item.split("-")
                assembly_dict[item[0]] = item[1]

    if args.method in ("forward_coordinates", "forward_strand",
                       "add-flank", "add-upstream-flank",
                       "add-downstream-flank") \
       and not contigs:
        raise ValueError("inverting coordinates requires genome file")

    if args.input_filename_agp:
        agp = AGP.AGP()
        agp.readFromFile(iotools.open_file(args.input_filename_agp, "r"))
    else:
        agp = None

    gffs = GTF.iterator(args.stdin)

    if args.method in ("add-upstream-flank", "add-downstream-flank",
                       "add-flank"):

        add_upstream_flank = "add-upstream-flank" == args.method
        add_downstream_flank = "add-downstream-flank" == args.method
        if args.method == "add-flank":
            add_upstream_flank = add_downstream_flank = True

        upstream_flank = int(args.extension_upstream)
        downstream_flank = int(args.extension_downstream)
        extend_flank = args.flank_method == "extend"

        if args.is_gtf:
            iterator = GTF.flat_gene_iterator(gffs)
        else:
            iterator = GTF.joined_iterator(gffs, args.group_field)

        for chunk in iterator:
            is_positive = Genomics.IsPositiveStrand(chunk[0].strand)
            chunk.sort(key=lambda x: (x.contig, x.start))
            lcontig = contigs[chunk[0].contig]

            if extend_flank:
                if add_upstream_flank:
                    if is_positive:
                        chunk[0].start = max(0,
                                             chunk[0].start - upstream_flank)
                    else:
                        chunk[-1].end = min(lcontig,
                                            chunk[-1].end + upstream_flank)
                if add_downstream_flank:
                    if is_positive:
                        chunk[-1].end = min(lcontig,
                                            chunk[-1].end + downstream_flank)
                    else:
                        chunk[0].start = max(0,
                                             chunk[0].start - downstream_flank)
            else:
                if add_upstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - upstream_flank)
                        chunk.insert(0, gff)
                    else:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + upstream_flank)
                        chunk.append(gff)
                    gff.feature = "5-Flank"
                    gff.mMethod = "gff2gff"
                if add_downstream_flank:
                    gff = GTF.Entry()
                    if is_positive:
                        gff.copy(chunk[-1])
                        gff.start = gff.end
                        gff.end = min(lcontig, gff.end + downstream_flank)
                        chunk.append(gff)
                    else:
                        gff.copy(chunk[0])
                        gff.end = gff.start
                        gff.start = max(0, gff.start - downstream_flank)
                        chunk.insert(0, gff)
                    gff.feature = "3-Flank"
                    gff.mMethod = "gff2gff"

            if not is_positive:
                chunk.reverse()

            for gff in chunk:
                args.stdout.write(str(gff) + "\n")

    elif args.method == "complement-groups":

        iterator = GTF.joined_iterator(gffs, group_field=args.group_field)

        for chunk in iterator:
            if args.is_gtf:
                chunk = [x for x in chunk if x.feature == "exon"]
                if len(chunk) == 0:
                    continue
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.start = x.end
            x.feature = "intron"
            for c in chunk[1:]:
                x.end = c.start
                args.stdout.write(str(x) + "\n")
                x.start = c.end

    elif args.method == "combine-groups":

        iterator = GTF.joined_iterator(gffs, group_field=args.group_field)

        for chunk in iterator:
            chunk.sort(key=lambda x: (x.contig, x.start))
            x = GTF.Entry()
            x.copy(chunk[0])
            x.end = chunk[-1].end
            x.feature = "segment"
            args.stdout.write(str(x) + "\n")

    elif args.method == "join-features":
        for gff in combineGFF(gffs,
                              min_distance=args.min_distance,
                              max_distance=args.max_distance,
                              min_features=args.min_features,
                              max_features=args.max_features,
                              merge=False,
                              output_format=args.output_format):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "merge-features":
        for gff in combineGFF(gffs,
                              min_distance=args.min_distance,
                              max_distance=args.max_distance,
                              min_features=args.min_features,
                              max_features=args.max_features,
                              merge=True,
                              output_format=args.output_format):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "crop":
        for gff in cropGFF(gffs, args.filename_crop_gff):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "crop-unique":
        for gff in cropGFFUnique(gffs):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "filter-range":

        contig, strand, interval = None, None, None
        try:
            contig, strand, start, sep, end = re.match(
                "(\S+):(\S+):(\d+)(\.\.|-)(\d+)", args.filter_range).groups()
        except AttributeError:
            pass

        if not contig:
            try:
                contig, start, sep, end = re.match("(\S+):(\d+)(\.\.|-)(\d+)",
                                                   args.filter_range).groups()
                strand = None
            except AttributeError:
                pass

        if not contig:
            try:
                start, end = re.match("(\d+)(\.\.|\,|\-)(\d+)",
                                      args.filter_range).groups()
            except AttributeError:
                raise "can not parse range %s" % args.filter_range
            contig = None
            strand = None

        if start:
            interval = (int(start), int(end))
        else:
            interval = None

        E.debug("filter: contig=%s, strand=%s, interval=%s" %
                (str(contig), str(strand), str(interval)))

        for gff in GTF.iterator_filtered(gffs,
                                         contig=contig,
                                         strand=strand,
                                         interval=interval):
            args.stdout.write(str(gff) + "\n")

    elif args.method == "sanitize":

        def assemblyReport(id):
            if id in assembly_dict.keys():
                id = assembly_dict[id]
            # if not in dict, the contig name is forced
            # into the desired convention, this is helpful user
            # modified gff files that contain additional contigs
            elif args.sanitize_method == "ucsc":
                if not id.startswith("contig") and not id.startswith("chr"):
                    id = "chr%s" % id
            elif args.sanitize_method == "ensembl":
                if id.startswith("contig"):
                    return id[len("contig"):]
                elif id.startswith("chr"):
                    return id[len("chr"):]
            return id

        if args.sanitize_method == "genome":
            if genome_fasta is None:
                raise ValueError("please specify --genome-file= when using "
                                 "--sanitize-method=genome")
            f = genome_fasta.getToken
        else:
            if args.assembly_report is None:
                raise ValueError(
                    "please specify --assembly-report= when using "
                    "--sanitize-method=ucsc or ensembl")
            f = assemblyReport

        skipped_contigs = collections.defaultdict(int)
        outofrange_contigs = collections.defaultdict(int)
        filtered_contigs = collections.defaultdict(int)

        for gff in gffs:
            try:
                gff.contig = f(gff.contig)
            except KeyError:
                if args.skip_missing:
                    skipped_contigs[gff.contig] += 1
                    continue
                else:
                    raise

            if genome_fasta:
                lcontig = genome_fasta.getLength(gff.contig)
                if lcontig < gff.end:
                    outofrange_contigs[gff.contig] += 1
                    continue

            if args.contig_pattern:
                to_remove = [
                    re.compile(x) for x in args.contig_pattern.split(",")
                ]
                if any([x.search(gff.contig) for x in to_remove]):
                    filtered_contigs[gff.contig] += 1
                    continue

            args.stdout.write(str(gff) + "\n")

        if skipped_contigs:
            E.info("skipped %i entries on %i contigs: %s" %
                   (sum(skipped_contigs.values()),
                    len(list(skipped_contigs.keys())), str(skipped_contigs)))

        if outofrange_contigs:
            E.warn(
                "skipped %i entries on %i contigs because they are out of range: %s"
                % (sum(outofrange_contigs.values()),
                   len(list(
                       outofrange_contigs.keys())), str(outofrange_contigs)))

        if filtered_contigs:
            E.info("filtered out %i entries on %i contigs: %s" %
                   (sum(filtered_contigs.values()),
                    len(list(filtered_contigs.keys())), str(filtered_contigs)))

    elif args.method == "rename-chr":
        if not chr_map:
            raise ValueError("please supply mapping file")

        for gff in renameChromosomes(gffs, chr_map):
            args.stdout.write(str(gff) + "\n")

    else:

        for gff in gffs:

            if args.method == "forward_coordinates":
                gff.invert(contigs[gff.contig])

            if args.method == "forward_strand":
                gff.invert(contigs[gff.contig])
                gff.strand = "+"

            if agp:
                # note: this works only with forward coordinates
                gff.contig, gff.start, gff.end = agp.mapLocation(
                    gff.contig, gff.start, gff.end)

            args.stdout.write(str(gff) + "\n")

    E.stop()
Ejemplo n.º 26
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-g",
        "--genome-file",
        dest="genome_file",
        type="string",
        help="filename with Samtools indexed genome [default=%default].")
    parser.add_option("-w",
                      "--window-size",
                      dest="window",
                      type="int",
                      help="Window size for tiling [default=%default].")
    parser.add_option("-s",
                      "--shift-size",
                      dest="shift",
                      type="int",
                      help="Window shift for tiling [default=%default].")

    parser.set_defaults(
        genome_file=None,
        window=1000,
        shift=1000,
        output_file=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    ninput, nunchanged, nchanged = 0, 0, 0

    # Open input file
    E.info("Opening input file: %s" % options.genome_file)
    fasta = IndexedFasta.IndexedFasta(options.genome_file)
    contigs = fasta.getContigSizes(with_synonyms=False)

    # Open output file
    bed = options.stdout
    shift = options.shift

    # Loop over input files and convert to soft clipped
    nwindows = 0
    ncontigs = 0
    for contig, stop in contigs.items():
        ncontigs += 1
        i = 0
        while (i < stop):
            j = min(i + options.window, stop)
            #bed.write( """%(contig)s\t%(i)i\t%(j)i\t%(contig)s:%(i)i..%(j)i\n""" % locals() )
            bed.write("""%(contig)s\t%(i)i\t%(j)i\n""" % locals())
            nwindows += 1
            i += shift

    # Report statistics
    E.info("ncontigs=%i, nwindows=%i" % (ncontigs, nwindows))

    # write footer and output benchmark information.
    E.stop()
Ejemplo n.º 27
0
def main(argv=None):

    if not argv:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-g",
                        "--genome-file",
                        dest="genome_file",
                        type=str,
                        help="filename with genome.")

    parser.add_argument("-i",
                        "--ignore-missing",
                        dest="ignore_missing",
                        action="store_true",
                        help="Ignore transcripts on contigs that are not "
                        "in the genome-file.")

    parser.add_argument("-s",
                        "--restrict-source",
                        dest="restrict_source",
                        type=str,
                        choices=("protein_coding", "pseudogene", "lncRNA"),
                        help="restrict input by source.")

    parser.add_argument("-m",
                        "--method",
                        dest="method",
                        type=str,
                        choices=(
                            "full",
                            "genome",
                            "exons",
                            "promotors",
                            "tts",
                            "regulons",
                            "tts-regulons",
                            "genes",
                            "territories",
                            "tss-territories",
                            "great-domains",
                        ),
                        help="method for defining segments.")

    parser.add_argument("-r",
                        "--territory-extension",
                        dest="radius",
                        type=int,
                        help="radius of a territory.")

    parser.add_argument("-f",
                        "--flank-size",
                        dest="flank",
                        type=int,
                        help="size of the flanking region next to a gene.")

    parser.add_argument(
        "--flank-increment-size",
        dest="increment",
        type=int,
        help="size of increment in flank in genestructure annotation ")

    parser.add_argument("-p",
                        "--promotor-size",
                        dest="promotor",
                        type=int,
                        help="size of a promotor region.")

    parser.add_argument("-u",
                        "--upstream-extension",
                        dest="upstream",
                        type=int,
                        help="size of region upstream of tss.")

    parser.add_argument("-d",
                        "--downstream-extension",
                        dest="downstream",
                        type=int,
                        help="size of region downstream of tss.")

    parser.add_argument("--gene-detail",
                        dest="detail",
                        type=str,
                        choices=("introns+exons", "exons", "introns"),
                        help="level of detail for gene structure annotation ")

    parser.add_argument("--merge-overlapping-promotors",
                        dest="merge_promotors",
                        action="store_true",
                        help="merge overlapping promotors.")

    parser.add_argument(
        "--min-intron-length",
        dest="min_intron_length",
        type=int,
        help="minimum intron length. If the distance between two "
        "consecutive exons is smaller, the region will be marked "
        "'unknown'.")

    parser.add_argument(
        "--is-unsorted",
        dest="is_sorted",
        action="store_false",
        help="sort input before processing. Otherwise, the input is assumed "
        "to be sorted.")

    parser.set_defaults(
        genome_file=None,
        flank=1000,
        increment=1000,
        max_frameshift_length=4,
        min_intron_length=30,
        ignore_missing=False,
        restrict_source=None,
        method="genome",
        radius=50000,
        promotor=5000,
        merge_promotors=False,
        upstream=5000,
        downstream=5000,
        detail="exons",
        is_sorted=True,
    )

    (args) = E.start(parser)

    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
    else:
        raise ValueError("please specify a --genome-file")

    if not args.restrict_source:
        iterator = GTF.iterator(args.stdin)

    elif args.restrict_source:
        iterator = GTF.iterator_filtered(GTF.iterator(args.stdin),
                                         source=args.restrict_source)

    # elif options.method in ("promotors", "tts", "regulons"):
    #     iterator = GTF.iterator_filtered( GTF.iterator(options.stdin), source = "protein_coding")
    # else:
    #     iterator = GTF.iterator(options.stdin)

    if not args.is_sorted:
        iterator = GTF.iterator_sorted(iterator, sort_order="position")

    if args.method == "full" or args.method == "genome":
        segmentor = annotateGenome(iterator, fasta, args)
    elif args.method == "territories":
        segmentor = buildTerritories(iterator, fasta, 'gene', args)
    elif args.method == "tss-territories":
        segmentor = buildTerritories(iterator, fasta, 'tss', args)
    elif args.method == "exons":
        segmentor = annotateExons(iterator, fasta, args)
    elif args.method == "promotors":
        segmentor = annotatePromoters(iterator, fasta, args)
    elif args.method == "regulons":
        segmentor = annotateRegulons(iterator, fasta, True, args)
    elif args.method == "tts-regulons":
        segmentor = annotateRegulons(iterator, fasta, False, args)
    elif args.method == "tts":
        segmentor = annotateTTS(iterator, fasta, args)
    elif args.method == "genes":
        segmentor = annotateGenes(iterator, fasta, args)
    elif args.method == "great-domains":
        segmentor = annotateGREATDomains(iterator, fasta, args)

    E.stop()
Ejemplo n.º 28
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: gtf2fasta.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option(
        "-i",
        "--ignore-missing",
        dest="ignore_missing",
        action="store_true",
        help=
        "Ignore transcripts on contigs that are not in the genome-file [default=%default]."
    )

    parser.add_option(
        "--min-intron-length",
        dest="min_intron_length",
        type="int",
        help=
        "minimum intron length. If the distance between two consecutive exons is smaller, the region will be marked 'unknown' [default=%default]."
    )

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("full", ),
                      help="method to apply [default=%default].")

    parser.set_defaults(
        genome_file=None,
        flank=1000,
        max_frameshift_length=4,
        min_intron_length=30,
        ignore_missing=False,
        restrict_source=None,
        method="full",
        report_step=1000,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if not options.genome_file:
        raise ValueError("an indexed genome is required.")

    fasta = IndexedFasta.IndexedFasta(options.genome_file)

    iterator = GTF.transcript_iterator(GTF.iterator(options.stdin))

    annotateGenome(iterator, fasta, options)

    # write footer and output benchmark information.
    E.stop()
Ejemplo n.º 29
0
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument(
        "-g", "--genome-file", dest="genome_file", type=str,
        help="filename with genome.")

    parser.add_argument(
        "-a", "--aggregate-by", dest="aggregate", type=str,
        choices=("name", "contig", "track", "none"),
        help="aggregate counts by feature.")

    parser.add_argument(
        "-p", "--add-percent", dest="add_percent", action="store_true",
        help="add percentages.")

    parser.set_defaults(
        genome_file=None,
        aggregate="none",
        add_percent=False,
    )

    (args) = E.start(parser, argv)

    # get files
    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
    else:
        if args.add_percent:
            raise ValueError("--add-percent option requires --genome-file")
        fasta = None

    if args.add_percent and not args.aggregate == "contig":
        raise NotImplementedError(
            "--add-percent option requires --aggregate=contig")

    counts = collections.defaultdict(Counter)
    total = Counter()
    output_totals = True

    if args.aggregate == "track":
        keyf = lambda x: x.track
    elif args.aggregate == "name":
        keyf = lambda x: x.name
    elif args.aggregate == "contig":
        keyf = lambda x: x.contig
    else:
        keyf = lambda x: "all"
        output_totals = False

    for bed in Bed.iterator(args.stdin):
        counts[keyf(bed)].add(bed)
        total.add(bed)

    outf = args.stdout

    key = "track"
    if args.add_percent:
        outf.write("%s\t%s\n" % (key, "\t".join(Counter.headers_percent)))
    else:
        outf.write("%s\t%s\n" % (key, "\t".join(Counter.headers)))

    total_bases = 0
    for key, count in sorted(counts.items()):
        if args.add_percent:
            total_bases += fasta.getLength(key)
            count.setSize(fasta.getLength(key))

        outf.write("%s\t%s\n" % (key, str(count)))

    if output_totals:
        if args.add_percent:
            count.setSize(total_bases)
        outf.write("%s\t%s\n" % ("total", str(total)))
    E.stop()