Example #1
def applyThreshold(infile, fasta, threshold, max_distance=0):
    '''apply threshold to a wig file writing a
    bed-formatted file as output.'''

    c = E.Counter()

    for contig, size in list(
            fasta.getContigSizes(with_synonyms=False).items()):
        c.contigs += 1

        E.debug("processing %s" % contig)

        last_start, last_end = -1, 0

        for start, end, value in block_iterator(infile, contig, size):
            d = start - last_end
            if (d > 0 or value < threshold):
                if last_start >= 0:
                    yield contig, last_start, last_end
                    c.intervals += 1
                last_start = -1
            elif last_start < 0 and value >= threshold:
                last_start = start

            last_end = end

        if last_start >= 0:
            yield contig, last_start, end
            c.intervals += 1

        c.output += 1

    E.info(str(c))
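A minimal sketch of how this generator might be driven, assuming an open wig handle, an indexed fasta object and the block_iterator helper from the surrounding module; the output file name is a placeholder:

# hypothetical driver: stream intervals above the threshold into a BED file
with open("above_threshold.bed", "w") as bed_out:
    for contig, start, end in applyThreshold(wig_handle, fasta, threshold=10):
        bed_out.write("%s\t%i\t%i\n" % (contig, start, end))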
Example #2
def checkRequirementsFromAllModules():

    all_modules = sys.modules
    counter = E.Counter()
    results = []
    for module in list(sys.modules.keys()):
        if all_modules[module] is not None:
            results.extend(
                checkRequirementsFromModule(all_modules[module], counter))
    return counter, results
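E.Counter is the common thread in these examples: missing attributes read as 0, increments accumulate, and printing the counter gives a key=value summary. The real class appears to come from the CGAT/cgatcore experiment module (imported as E throughout) and also offers methods such as asTable() used further down; the stand-in below only illustrates the pattern and is not that implementation:

import collections


class CounterSketch(collections.defaultdict):
    """illustrative stand-in: unknown attributes start at 0 and can be incremented."""

    def __init__(self):
        super().__init__(int)

    def __getattr__(self, key):
        return self[key]

    def __setattr__(self, key, value):
        self[key] = value

    def __str__(self):
        return ", ".join("%s=%i" % (k, v) for k, v in self.items())


c = CounterSketch()
c.input += 1
c.input += 1
c.output += 1
print(c)   # input=2, output=1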
Example #3
def ReadGene2GOFromFile(infile, synonyms={}, obsolete={}):
    """reads GO mappings for all go_types from a
    file.

    If *synonyms* is given, goids in *synonyms* will be translated.
    Terms in *obsolete* will be discarded.

    returns two maps: gene2go maps genes to go categories
    and go2info maps go categories to information.
    """

    gene2gos = {}
    go2infos = {}
    c = E.Counter()

    for line in infile:
        if line[0] == "#":
            continue
        try:
            go_type, gene_id, goid, description, evidence = line[:-1].split(
                "\t")
        except ValueError as msg:
            raise ValueError("parsing error in line '%s': %s" %
                             (line[:-1], msg))
        if go_type == "go_type":
            continue

        c.input += 1

        if goid in synonyms:
            c.synonyms += 1
            goid = synonyms[goid]

        if goid in obsolete:
            c.obsolete += 1
            continue

        gm = GOMatch(goid, go_type, description, evidence)
        gi = GOInfo(goid, go_type, description)
        if go_type not in gene2gos:
            gene2gos[go_type] = {}
            go2infos[go_type] = {}

        gene2go = gene2gos[go_type]
        go2info = go2infos[go_type]

        if gene_id not in gene2go:
            gene2go[gene_id] = []
        gene2go[gene_id].append(gm)
        go2info[goid] = gi
        c.output += 1

    E.debug("read gene2go assignments: %s" % str(c))

    return gene2gos, go2infos
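The parser expects five tab-separated columns per line (go_type, gene_id, goid, description, evidence), skips comment and header lines, and groups assignments by go_type. A hypothetical call on an in-memory table, assuming GOMatch, GOInfo and E are available from the surrounding module:

import io

table = io.StringIO(
    "go_type\tgene_id\tgoid\tdescription\tevidence\n"
    "biol_process\tENSG0001\tGO:0008150\tbiological_process\tIEA\n")

gene2gos, go2infos = ReadGene2GOFromFile(table)
# gene2gos["biol_process"]["ENSG0001"] -> list with one GOMatch
# go2infos["biol_process"]["GO:0008150"] -> GOInfo for that term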
Example #4
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("script", "module"),
                      help="type of tests to create [%default].")

    parser.set_defaults(method="script")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) == 0:
        raise ValueError(
            "setup_test.py requires one or more command line arguments")

    targetdir = os.path.dirname(__file__)

    counter = E.Counter()

    for arg in args:
        counter.input += 1
        script_dirname, basename = os.path.split(arg)

        dirname = os.path.join(targetdir, basename)

        if os.path.exists(dirname):
            E.warn("%s already exists - skipping" % basename)
            counter.skipped += 1
            continue

        os.mkdir(dirname)

        with open(os.path.join(dirname, "tests.yaml"), "w") as outf:
            outf.write(YAML_TEMPLATE)

        counter.created += 1

    E.info("%s" % str(counter))

    # write footer and output benchmark information.
    E.Stop()
Example #5
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--input-fastq-file",
                      dest="input_fastq_file",
                      type="string",
                      help="input fastq file. "
                      "[%default]")

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      action="append",
                      type="choice",
                      choices=("length", ),
                      help="methods to apply [%default]")

    parser.set_defaults(
        methods=[],
        input_fastq_file=None,
    )

    (options, args) = E.start(parser, argv)

    if len(args) == 1:
        options.input_fastq_file = args[0]

    if options.input_fastq_file is None:
        raise ValueError("missing input fastq file")

    counter = E.Counter()

    # note: complete rewrite with Counters, currently only length
    if options.methods != ["length"]:
        raise NotImplementedError()

    with pysam.FastqFile(options.input_fastq_file) as inf:

        for read in inf:
            counter.input += 1
            options.stdout.write(
                "\t".join(map(str, (read.name, len(read.sequence)))) + "\n")

            counter.output += 1

    E.info(counter)
    E.stop()
Example #6
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-n",
                      "--dry-run",
                      dest="dry_run",
                      action="store_true",
                      help="dry run, do not delete any files [%default]")

    parser.set_defaults(dry_run=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    filenames = args

    c = E.Counter()
    for filename in filenames:
        c.checked += 1
        if os.path.exists(filename + ".log"):
            if IOTools.isComplete(filename + ".log"):
                c.complete += 1
                continue

        if IOTools.isComplete(filename):
            c.complete += 1
            continue

        c.incomplete += 1
        E.info('deleting %s' % filename)
        if options.dry_run:
            continue
        os.unlink(filename)
        c.deleted += 1

    E.info(c)

    # write footer and output benchmark information.
    E.Stop()
Example #7
def clean(files, logfile):
    '''clean up files given by glob expressions.

    Files are cleaned up by zapping, i.e. the files are set to size
    0. Links to files are replaced with place-holders.

    Information about the original file is written to `logfile`.

    Arguments
    ---------
    files : list
        List of glob expressions of files to clean up.
    logfile : string
        Filename of logfile.

    '''
    fields = ('st_atime', 'st_blksize', 'st_blocks', 'st_ctime', 'st_dev',
              'st_gid', 'st_ino', 'st_mode', 'st_mtime', 'st_nlink', 'st_rdev',
              'st_size', 'st_uid')

    dry_run = PARAMS.get("dryrun", False)

    if not dry_run:
        if not os.path.exists(logfile):
            outfile = IOTools.open_file(logfile, "w")
            outfile.write("filename\tzapped\tlinkdest\t%s\n" %
                          "\t".join(fields))
        else:
            outfile = IOTools.open_file(logfile, "a")

    c = E.Counter()
    for fn in files:
        c.files += 1
        if not dry_run:
            stat, linkdest = IOTools.zap_file(fn)
            if stat is not None:
                c.zapped += 1
                if linkdest is not None:
                    c.links += 1
                outfile.write(
                    "%s\t%s\t%s\t%s\n" %
                    (fn, time.asctime(time.localtime(time.time())), linkdest,
                     "\t".join([str(getattr(stat, x)) for x in fields])))

    get_logger().info("zapped: %s" % (c))
    if not dry_run:
        outfile.close()

    return c
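A hypothetical invocation, assuming the surrounding pipeline provides PARAMS, IOTools and the zap_file helper used above; the glob pattern and log name are placeholders:

import glob

# zap all sorted BAM files of a finished stage, keeping a record of their metadata
clean(glob.glob("*.sorted.bam"), "zapped_files.log")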
Example #8
    def pair_iterator(test_vcf, truth_vcf, contig):
        counter = E.Counter()
        test_iter = test_vcf.fetch(contig)
        truth_iter = truth_vcf.fetch(contig)

        test_record = next(test_iter)
        truth_record = next(truth_iter)
        try:
            while 1:
                if test_record.pos < truth_record.pos:
                    test_record = next(test_iter)
                    continue

                elif test_record.pos > truth_record.pos:
                    truth_record = next(truth_iter)
                    continue

                elif len(test_record.alts) > 1:
                    counter.skip_multiallelic_test += 1
                    test_record = next(test_iter)
                    continue

                elif len(truth_record.alts) > 1:
                    counter.skip_multiallelic_truth += 1
                    truth_record = next(truth_iter)
                    continue

                elif test_record.alts != truth_record.alts:
                    counter.skip_genotype_difference += 1
                    test_record = next(test_iter)
                    truth_record = next(truth_iter)
                    continue

                if test_record.ref != truth_record.ref:
                    # todo: deal with indels
                    raise ValueError("mismatching reference bases at position "
                                     "{}:{}".format(test_record.chrom,
                                                    test_record.pos))

                yield test_record, truth_record
                test_record = next(test_iter)
                truth_record = next(truth_iter)

        except StopIteration:
            pass

        E.debug(str(counter))
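A sketch of how this generator might be driven from its enclosing scope, assuming two position-sorted, tabix-indexed VCFs opened with pysam (the file names are placeholders):

import pysam

test_vcf = pysam.VariantFile("test.vcf.gz")
truth_vcf = pysam.VariantFile("truth.vcf.gz")

for test_record, truth_record in pair_iterator(test_vcf, truth_vcf, "chr1"):
    # records are matched by position and share a single, identical alt allele
    print(test_record.pos, test_record.ref, test_record.alts[0])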
Example #9
def read_and_randomize_rows(infile, options):
    """read table from stdin and randomize rows, keeping header."""

    c = E.Counter()
    if options.has_headers:
        keep_header = 1
    else:
        keep_header = 0
    for x in range(keep_header):
        c.header += 1
        options.stdout.write(infile.readline())

    lines = infile.readlines()
    c.lines_input = len(lines)
    random.shuffle(lines)
    options.stdout.write("".join(lines))
    c.lines_output = len(lines)
    E.info(c)
Example #10
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-k",
                      "--keep-header",
                      dest="keep_header",
                      type="int",
                      help="randomize, but keep header in place [%default]")

    parser.set_defaults(keep_header=0)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    inf = options.stdin
    outf = options.stdout
    c = E.Counter()
    for x in range(options.keep_header):
        c.header += 1
        outf.write(inf.readline())

    lines = inf.readlines()
    c.lines_input = len(lines)
    random.shuffle(lines)
    for line in lines:
        outf.write(line)
    c.lines_output = len(lines)

    E.info(c)

    # write footer and output benchmark information.
    E.stop()
Example #11
def read_vcf_positions_into_dataframe(filename, filters=None):

    vcf_in = pysam.VariantFile(filename)

    if filters is None:
        filters = []

    pass_filter = False
    snp_filter = False

    for f in filters:
        if f == "PASS":
            pass_filter = True
        elif f == "SNP":
            snp_filter = True

    records = []
    c = E.Counter()
    for record in vcf_in:
        c.input += 1
        f = record.filter.keys()
        if pass_filter and "PASS" not in f and "." not in f:
            c.removed_pass_filter += 1
            continue
        if snp_filter:
            is_snp = (len(record.ref) == 1 and len(record.alts) == 1
                      and len(record.alts[0]) == 1)
            if not is_snp:
                c.removed_snp_filter += 1
                continue

        c.output += 1
        records.append((record.chrom, record.pos))

    df = pandas.DataFrame.from_records(records, columns=["chrom", "pos"])

    E.info("{}: {}".format(filename, c))

    return df
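For example, restricting the positions to biallelic SNPs that pass filtering might look like this (the file name is a placeholder):

df = read_vcf_positions_into_dataframe("sample.vcf.gz", filters=["PASS", "SNP"])
print(df.head())   # DataFrame with columns "chrom" and "pos"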
Example #12
def buildMRBed(infile, outfile):
    '''output bed6 file with methylated regions.

    All regions are output, even the insignificant ones.

    The score is the log fold change.
    '''

    outf = IOTools.openFile(outfile, "w")
    c = E.Counter()
    for row in csv.DictReader(IOTools.openFile(infile), dialect="excel-tab"):
        c.input += 1

        contig, start, end = re.match(r"(.*):(\d+)-(\d+)",
                                      row["interval_id"]).groups()
        c.output += 1
        outf.write("\t".join((contig, start, end, str(c.input),
                              row["lfold"])) + "\n")

    outf.close()

    E.info("%s" % str(c))
Example #13
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-d",
                      "--database",
                      dest="database",
                      type="string",
                      help="bioconductor database to use [default=%default].")

    parser.add_option("-m",
                      "--mapping",
                      dest="database",
                      type="string",
                      help="bioconductor mapping to use [default=%default].")

    parser.add_option(
        "-g",
        "--gtf-file",
        dest="filename_gtf",
        type="string",
        help="filename with the gene set in gtf format [default=%default].")

    parser.set_defaults(
        database="mouse4302.db",
        mapping="ENSEMBL",
        filename_gtf=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    prefix = options.database[:-len(".db")]

    mapping_probeset2gene = prefix + options.mapping
    mapping_probeset2loc = prefix + "CHRLOC"

    probeset2gene = getProbeset2Gene(database=options.database, )

    probeset2location = getProbeset2Location(database=options.database, )

    # gtf = GTF.readAndIndex(
    #     GTF.iterator( IOTools.open_file( options.filename_gtf ) ) )

    counts = E.Counter()

    outfile_notfound = IOTools.open_file("notfound.table", "w")

    options.stdout.write("probeset_id\tgene_id\tngenes\n")

    for probeset, locations in probeset2location.items():
        counts.probesets += 1
        gene_ids = probeset2gene[probeset]
        if len(gene_ids) == 0:
            counts.notfound += 1
            continue

        for gene_id in gene_ids:
            options.stdout.write("%s\t%s\t%i\n" %
                                 (probeset, gene_id, len(gene_ids)))
        counts.output += 1

    E.info("%s" % str(counts))

    # write footer and output benchmark information.
    E.stop()
Example #14
def annotateGenome(iterator, fasta, options, default_code=DEFAULT_CODE):
    """annotate a genome given by the indexed *fasta* file and 
    an iterator over gtf annotations.
    """

    annotations = {}
    contig_sizes = fasta.getContigSizes(with_synonyms=False)
    E.info("allocating memory for %i contigs and %i bytes" %
           (len(contig_sizes),
            sum(contig_sizes.values()) * array.array("B").itemsize))
    # AString.AString( "a").itemsize ))

    for contig, size in list(contig_sizes.items()):
        E.debug("allocating %s: %i bases" % (contig, size))
        # annotations[contig] = AString.AString( default_code * size )
        # annotations[contig] = array.array("", default_code * size)
        # Go to list for py3 compatibility, patch
        annotations[contig] = [default_code] * size

    E.info("allocated memory for %i contigs" % len(fasta))

    counter = E.Counter()

    # output splice junctions
    outfile_junctions = E.openOutputFile("junctions")
    outfile_junctions.write(
        "contig\tstrand\tpos1\tpos2\tframe\tgene_id\ttranscript_id\n")
    for gtfs in iterator:

        counter.input += 1

        if counter.input % options.report_step == 0:
            E.info("iteration %i" % counter.input)

        try:
            contig = fasta.getToken(gtfs[0].contig)
        except KeyError as msg:
            E.warn("contig %s not found - annotation ignored" % gtfs[0].contig)
            counter.skipped_contig += 1
            continue

        lcontig = fasta.getLength(contig)

        # make sure that exons are sorted by coordinate
        gtfs.sort(key=lambda x: x.start)

        is_positive = Genomics.IsPositiveStrand(gtfs[0].strand)
        source = gtfs[0].source

        # process non-coding data
        if source in MAP_ENSEMBL:
            code = MAP_ENSEMBL[source]

            intervals = [(x.start, x.end) for x in gtfs]
            addSegments(annotations[contig], intervals, is_positive, code)

        elif source == "protein_coding":

            # collect exons for utr
            exons = [(x.start, x.end) for x in gtfs if x.feature == "exon"]
            cds = [(x.start, x.end) for x in gtfs if x.feature == "CDS"]
            if len(cds) == 0:
                counter.skipped_transcripts += 1
                E.warn("protein-coding transcript %s without CDS - skipped" %
                       gtfs[0].transcript_id)
                continue

            exons = Intervals.truncate(exons, cds)
            start, end = cds[0][0], cds[-1][1]

            UTR5 = [x for x in exons if x[1] < start]
            UTR3 = [x for x in exons if x[0] >= end]

            if not is_positive:
                UTR5, UTR3 = UTR3, UTR5
                splice_code = "S"
            else:
                splice_code = "s"

            addSegments(annotations[contig], UTR5, is_positive, "u")

            addIntrons(annotations[contig], UTR5, is_positive,
                       options.max_frameshift_length)

            addSegments(annotations[contig], UTR3, is_positive, "v")

            addIntrons(annotations[contig], UTR3, is_positive,
                       options.max_frameshift_length)

            # output CDS according to frame
            addCDS(annotations[contig],
                   [x for x in gtfs if x.feature == "CDS"], is_positive)

            # add introns between CDS
            addIntrons(annotations[contig], cds, is_positive,
                       options.max_frameshift_length)

            # output splice junctions
            cds = [x for x in gtfs if x.feature == "CDS"]

            # apply corrections for 1-past end coordinates
            # to point between residues within CDS
            if is_positive:
                ender = lambda x: x.end - 1
                starter = lambda x: x.start
                out_positive = "+"
            else:
                ender = lambda x: lcontig - x.start - 1
                starter = lambda x: lcontig - x.end
                out_positive = "-"
                cds.reverse()

            end = ender(cds[0])
            for c in cds[1:]:
                start = starter(c)
                outfile_junctions.write("%s\t%s\t%i\t%i\t%s\t%s\t%s\n" % (
                    contig,
                    out_positive,
                    end,
                    start,
                    c.frame,
                    c.gene_id,
                    c.transcript_id,
                ))
                end = ender(c)

    E.info("finished reading genes: %s" % str(counter))

    outfile_junctions.close()

    E.info("started counting")
    outfile = E.openOutputFile("counts")
    outputCounts(outfile, annotations)
    outfile.close()

    E.info("started output")
    for k in sorted(annotations.keys()):
        # options.stdout.write(">%s\n%s\n" % (k, annotations[k].tostring()))
        options.stdout.write(">%s\n%s\n" % (k, "".join(annotations[k])))
Example #15
def loadMACS(infile, outfile, bamfile, tablename=None):
    '''load MACS results in *tablename*

    This method loads only positive peaks. It filters peaks by p-value,
    q-value and fold change and loads the diagnostic data and
    re-calculates peakcenter, peakval, ... using the supplied bamfile.

    If *tablename* is not given, it will be :file:`<track>_intervals`
    where track is derived from ``infile`` and assumed to end
    in :file:`.macs`.

    This method creates two optional additional files:

    * if the file :file:`<track>_diag.xls` is present, load MACS 
      diagnostic data into the table :file:`<track>_macsdiag`.

    * if the file :file:`<track>_model.r` is present, call R to
      create a MACS peak-shift plot and save it as :file:`<track>_model.pdf`
      in the :file:`export/MACS` directory.

    This method creates :file:`<outfile>.tsv.gz` with the results
    of the filtering.
    '''

    track = P.snip(os.path.basename(infile), ".macs")
    folder = os.path.dirname(infile)
    if len(folder) > 0:
        infilename = folder + "/" + track + "_peaks.xls"
        filename_diag = folder + "/" + track + "_diag.xls"
        filename_r = folder + "/" + track + "_model.r"
        filename_rlog = folder + "/" + track + ".r.log"
        filename_pdf = track + "_model.pdf"
    else:
        infilename = track + "_peaks.xls"
        filename_diag = track + "_diag.xls"
        filename_r = track + "_model.r"
        filename_rlog = track + ".r.log"
        filename_pdf = track + "_model.pdf"

    if not os.path.exists(infilename):
        E.warn("could not find %s" % infilename)
        P.touch(outfile)
        return

    # create plot by calling R
    if os.path.exists(filename_r):
        if len(folder) > 0:
            statement = '''R --vanilla < %(filename_r)s > %(filename_rlog)s; mv %(filename_pdf)s %(folder)s/%(filename_pdf)s; '''
        else:
            statement = '''R --vanilla < %(filename_r)s > %(filename_rlog)s; '''
        P.run()

    # filter peaks
    shift = getPeakShiftFromMacs(infile)
    assert shift is not None, "could not determine peak shift from MACS file %s" % infile

    E.info("%s: found peak shift of %i" % (track, shift))

    samfiles = [pysam.Samfile(bamfile, "rb")]
    offsets = [shift / 2]

    outtemp = P.getTempFile(".")
    tmpfilename = outtemp.name

    outtemp.write("\t".join((
        "interval_id",
        "contig", "start", "end",
        "npeaks", "peakcenter",
        "length",
        "avgval",
        "peakval",
        "nprobes",
        "pvalue", "fold", "qvalue",
        "macs_summit", "macs_nprobes",
    )) + "\n")
    id = 0

    # get thresholds
    max_qvalue = float(PARAMS["macs_max_qvalue"])
    # min, as it is -10log10
    min_pvalue = float(PARAMS["macs_min_pvalue"])

    counter = E.Counter()
    with IOTools.openFile(infilename, "r") as ins:
        for peak in WrapperMACS.iteratePeaks(ins):

            if peak.fdr > max_qvalue:
                counter.removed_qvalue += 1
                continue
            elif peak.pvalue < min_pvalue:
                counter.removed_pvalue += 1
                continue

            assert peak.start < peak.end

            npeaks, peakcenter, length, avgval, peakval, nreads = countPeaks(
                peak.contig, peak.start, peak.end, samfiles, offsets)

            outtemp.write("\t".join(map(str, (
                id, peak.contig, peak.start, peak.end,
                npeaks, peakcenter, length, avgval, peakval, nreads,
                peak.pvalue, peak.fold, peak.fdr,
                peak.start + peak.summit - 1,
                peak.tags))) + "\n")
            id += 1
            counter.output += 1

    outtemp.close()

    # output filtering summary
    outf = IOTools.openFile("%s.tsv.gz" % outfile, "w")
    outf.write("category\tcounts\n")
    outf.write("%s\n" % counter.asTable())
    outf.close()

    E.info("%s filtering: %s" % (track, str(counter)))
    if counter.output == 0:
        E.warn("%s: no peaks found" % track)

    # load data into table
    if tablename is None:
        tablename = "%s_macs_intervals" % track
    statement = '''cgat csv2db %(csv2db_options)s 
                       --allow-empty-file
                       --add-index=interval_id 
                       --add-index=contig,start
                       --table=%(tablename)s 
                   < %(tmpfilename)s > %(outfile)s '''
    P.run()
    os.unlink(tmpfilename)

    # load diagnostic data
    if os.path.exists(filename_diag):

        tablename = "%s_macsdiag" % track
        statement = '''
        cat %(filename_diag)s 
        | sed "s/FC range.*/fc\\tnpeaks\\tp90\\tp80\\tp70\\tp60\\tp50\\tp40\\tp30\\tp20/" 
        | cgat csv2db %(csv2db_options)s 
                  --map=fc:str 
                  --table=%(tablename)s 
        >> %(outfile)s
        '''
        P.run()
Example #16
def loadIntervalsFromBed(bedfile, track, outfile,
                         bamfiles, offsets):
    '''load intervals from :term:`bed` formatted files into database.

    Re-evaluate the intervals by counting reads within
    the interval. In contrast to the initial pipeline, the
    genome is not binned. In particular, the meaning of the
    columns in the table changes to:

    nProbes: number of reads in interval
    PeakCenter: position with maximum number of reads in interval
    AvgVal: average coverage within interval

    '''

    tmpfile = P.getTempFile(".")

    headers = ("AvgVal", "DisttoStart", "GeneList", "Length", "PeakCenter", "PeakVal", "Position",
               "interval_id", "nCpGs", "nGenes", "nPeaks", "nProbes", "nPromoters", "contig", "start", "end")

    tmpfile.write("\t".join(headers) + "\n")

    avgval, contig, disttostart, end, genelist, length, peakcenter, peakval, position, start, interval_id, ncpgs, ngenes, npeaks, nprobes, npromoters = \
        0, "", 0, 0, "", 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

    mlength = int(PARAMS["calling_merge_min_interval_length"])

    c = E.Counter()

    # count tags
    for bed in Bed.iterator(IOTools.openFile(bedfile, "r")):

        c.input += 1

        if "name" not in bed:
            bed.name = c.input

        # remove very short intervals
        if bed.end - bed.start < mlength:
            c.skipped_length += 1
            continue

        if replicates:
            npeaks, peakcenter, length, avgval, peakval, nprobes = \
                PipelineChipseq.countPeaks(
                    bed.contig, bed.start, bed.end,
                    samfiles, offsets)

            # nreads can be 0 if the intervals overlap only slightly
            # and due to the binning, no reads are actually in the
            # overlap region.  However, most of these intervals should
            # be small and have already be deleted via the
            # merge_min_interval_length cutoff.  do not output
            # intervals without reads.
            if nprobes == 0:
                c.skipped_reads += 1

        else:
            npeaks, peakcenter, length, avgval, peakval, nprobes = (
                1,
                bed.start +
                (bed.end - bed.start) // 2,
                bed.end - bed.start,
                1,
                1,
                1)

        c.output += 1
        tmpfile.write("\t".join(map(
            str,
            (avgval, disttostart, genelist, length,
             peakcenter, peakval, position, bed.name,
             ncpgs, ngenes, npeaks, nprobes, npromoters,
             bed.contig, bed.start, bed.end))) + "\n")

    if c.output == 0:
        E.warn("%s - no intervals")

    tmpfile.close()

    tmpfilename = tmpfile.name
    tablename = "%s_intervals" % track.asTable()

    statement = '''
    cgat csv2db %(csv2db_options)s
    --allow-empty-file
    --add-index=interval_id
    --table=%(tablename)s
    < %(tmpfilename)s
    > %(outfile)s
    '''

    P.run()
    os.unlink(tmpfile.name)

    E.info("%s\n" % str(c))
Example #17
def loadZinba(infile, outfile, bamfile,
              tablename=None,
              controlfile=None):
    '''load Zinba results in *tablename*

    This method loads only positive peaks. It filters peaks by p-value,
    q-value and fold change and loads the diagnostic data and
    re-calculates peakcenter, peakval, ... using the supplied bamfile.

    If *tablename* is not given, it will be :file:`<track>_intervals`
    where track is derived from ``infile`` and assumed to end
    in :file:`.zinba`.

    If no peaks were predicted, an empty table is created.

    This method creates :file:`<outfile>.tsv.gz` with the results
    of the filtering.

    This method uses the refined peak locations.

    Zinba peaks can be overlapping. This method does not merge
    overlapping intervals.

    Zinba calls peaks in regions where there are many reads inside
    the control. Thus this method applies a filtering step 
    removing all intervals in which there is a peak of
    more than readlength / 2 height in the control.

    .. note:

       Zinba calls peaks that are overlapping.

    '''

    track = P.snip(os.path.basename(infile), ".zinba")
    folder = os.path.dirname(infile)

    infilename = infile + ".peaks"

    outtemp = P.getTempFile(".")
    tmpfilename = outtemp.name

    outtemp.write("\t".join((
        "interval_id",
        "contig", "start", "end",
        "npeaks", "peakcenter",
        "length",
        "avgval",
        "peakval",
        "nprobes",
        "pvalue", "fold", "qvalue",
        "macs_summit", "macs_nprobes",
    )) + "\n")

    counter = E.Counter()

    if not os.path.exists(infilename):
        E.warn("could not find %s" % infilename)
    elif IOTools.isEmpty(infilename):
        E.warn("no data in %s" % infilename)
    else:
        # filter peaks
        shift = getPeakShiftFromZinba(infile)
        assert shift is not None, \
            "could not determine peak shift from Zinba file %s" % infile

        E.info("%s: found peak shift of %i" % (track, shift))

        samfiles = [pysam.Samfile(bamfile, "rb")]
        offsets = [shift / 2]

        if controlfile:
            controlfiles = [pysam.Samfile(controlfile, "rb")]
            readlength = BamTools.estimateTagSize(controlfile)
            control_max_peakval = readlength // 2
            E.info("removing intervals in which control has peak higher than %i reads" %
                   control_max_peakval)
        else:
            controlfiles = None

        id = 0

        # get thresholds
        max_qvalue = float(PARAMS["zinba_fdr_threshold"])

        with IOTools.openFile(infilename, "r") as ins:
            for peak in WrapperZinba.iteratePeaks(ins):

                # filter by qvalue
                if peak.fdr > max_qvalue:
                    counter.removed_qvalue += 1
                    continue

                assert peak.refined_start < peak.refined_end

                # filter by control
                if controlfiles:
                    npeaks, peakcenter, length, avgval, peakval, nreads = countPeaks(peak.contig,
                                                                                     peak.refined_start,
                                                                                     peak.refined_end,
                                                                                     controlfiles,
                                                                                     offsets)

                    if peakval > control_max_peakval:
                        counter.removed_control += 1
                        continue

                # output peak
                npeaks, peakcenter, length, avgval, peakval, nreads = countPeaks(peak.contig,
                                                                                 peak.refined_start,
                                                                                 peak.refined_end,
                                                                                 samfiles,
                                                                                 offsets)

                outtemp.write("\t".join(map(str, (
                    id, peak.contig, peak.refined_start, peak.refined_end,
                    npeaks, peakcenter, length, avgval, peakval, nreads,
                    1.0 - peak.posterior, 1.0, peak.fdr,
                    peak.refined_start + peak.summit - 1,
                    peak.height))) + "\n")
                id += 1
                counter.output += 1

    outtemp.close()

    # output filtering summary
    outf = IOTools.openFile("%s.tsv.gz" % outfile, "w")
    outf.write("category\tcounts\n")
    outf.write("%s\n" % counter.asTable())
    outf.close()

    E.info("%s filtering: %s" % (track, str(counter)))
    if counter.output == 0:
        E.warn("%s: no peaks found" % track)

    # load data into table
    if tablename is None:
        tablename = "%s_intervals" % track

    statement = '''
    cgat csv2db %(csv2db_options)s 
              --allow-empty-file
              --add-index=interval_id 
              --add-index=contig,start
              --table=%(tablename)s 
    < %(tmpfilename)s 
    > %(outfile)s
    '''

    P.run()

    os.unlink(tmpfilename)
Example #18
def aggregateWindowsTagCounts(infiles, outfile, regex="(.*)\..*"):
    '''aggregate output from several ``bedtools coverage`` results.

    ``bedtools coverage`` outputs the following columns for a bed4
    file::

    1 Contig
    2 Start
    3 Stop
    4 Name
    5 The number of features in A that overlapped (by at least one
      base pair) the B interval.
    6 The number of bases in B that had non-zero coverage from features in A.
    7 The length of the entry in B.
    8 The fraction of bases in B that had non-zero coverage from
      features in A.

    This method autodetects the number of columns in the :term:`infiles`
    and selects:

    * bed4: use column 5
    * bed6: use column 7
    * bed12: use column 13

    Arguments
    ---------
    infiles : list
        Input filenames with the output from ``bedtools coverage``
    outfile : string
        Output filename in :term:`tsv` format.
    regex : string
        Regular expression used to extract the track name from the
        filename.  The default removes any suffix.

    '''

    # get bed format
    bed_columns = Bed.getNumColumns(infiles[0])
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    src = " ".join([
        """<( zcat %s |
              awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}')""" %
        (x, column) for x in infiles
    ])
    tmpfile = P.get_temp_filename(".")
    statement = '''paste %(src)s > %(tmpfile)s'''
    P.run(statement)

    # build track names
    tracks = [
        re.search(regex, os.path.basename(x)).groups()[0] for x in infiles
    ]

    outf = IOTools.open_file(outfile, "w")
    outf.write("interval_id\t%s\n" % "\t".join(tracks))

    # filter for uniqueness - keys with the same value as the
    # previous line will be ignored.
    last_gene = None
    c = E.Counter()
    for line in open(tmpfile, "r"):
        c.input += 1
        data = line[:-1].split("\t")
        genes = list(set([data[x] for x in range(0, len(data), 2)]))
        values = [int(data[x]) for x in range(1, len(data), 2)]

        assert len(genes) == 1, \
            "paste command failed, wrong number of genes per line: '%s'" % line
        if genes[0] == last_gene:
            c.duplicates += 1
            continue
        c.output += 1
        outf.write("%s\t%s\n" % (genes[0], "\t".join(map(str, values))))
        last_gene = genes[0]

    outf.close()

    os.unlink(tmpfile)

    E.info("aggregateWindowsTagCounts: %s" % c)
Example #19
def main(argv=None):
    """script main.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("bedgraph", "wiggle", "bigbed", "bigwig",
                               "bed"),
                      help="output format [default=%default]")

    parser.add_option("-s",
                      "--shift-size",
                      dest="shift",
                      type="int",
                      help="shift reads by a certain amount (ChIP-Seq) "
                      "[%default]")

    parser.add_option("-e",
                      "--extend",
                      dest="extend",
                      type="int",
                      help="extend reads by a certain amount "
                      "(ChIP-Seq) [%default]")

    parser.add_option("-p",
                      "--wiggle-span",
                      dest="span",
                      type="int",
                      help="span of a window in wiggle tracks "
                      "[%default]")

    parser.add_option("-m",
                      "--merge-pairs",
                      dest="merge_pairs",
                      action="store_true",
                      help="merge paired-ended reads into a single "
                      "bed interval [default=%default].")

    parser.add_option("--scale-base",
                      dest="scale_base",
                      type="float",
                      help="number of reads/pairs to scale bigwig file to. "
                      "The default is to scale to 1M reads "
                      "[default=%default]")

    parser.add_option("--scale-method",
                      dest="scale_method",
                      type="choice",
                      choices=(
                          "none",
                          "reads",
                      ),
                      help="scale bigwig output. 'reads' will normalize by "
                      "the total number reads in the bam file that are used "
                      "to construct the bigwig file. If --merge-pairs is used "
                      "the number of pairs output will be used for "
                      "normalization. 'none' will not scale the bigwig file"
                      "[default=%default]")

    parser.add_option("--max-insert-size",
                      dest="max_insert_size",
                      type="int",
                      help="only merge if insert size less that "
                      "# bases. 0 turns of this filter "
                      "[default=%default].")

    parser.add_option("--min-insert-size",
                      dest="min_insert_size",
                      type="int",
                      help="only merge paired-end reads if they are "
                      "at least # bases apart. "
                      "0 turns of this filter. [default=%default]")

    parser.set_defaults(
        samfile=None,
        output_format="wiggle",
        shift=0,
        extend=0,
        span=1,
        merge_pairs=None,
        min_insert_size=0,
        max_insert_size=0,
        scale_method='none',
        scale_base=1000000,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    if len(args) >= 1:
        options.samfile = args[0]
    if len(args) == 2:
        options.output_filename_pattern = args[1]
    if not options.samfile:
        raise ValueError("please provide a bam file")

    # Read BAM file using Pysam
    samfile = pysam.AlignmentFile(options.samfile, "rb")

    # Create temporary files / folders
    tmpdir = tempfile.mkdtemp()
    E.debug("temporary files are in %s" % tmpdir)
    tmpfile_wig = os.path.join(tmpdir, "wig")
    tmpfile_sizes = os.path.join(tmpdir, "sizes")

    # Create dictionary of contig sizes
    contig_sizes = dict(list(zip(samfile.references, samfile.lengths)))
    # write contig sizes
    outfile_size = IOTools.open_file(tmpfile_sizes, "w")
    for contig, size in sorted(contig_sizes.items()):
        outfile_size.write("%s\t%s\n" % (contig, size))
    outfile_size.close()

    # Shift and extend only available for bigwig format
    if options.shift or options.extend:
        if options.output_format != "bigwig":
            raise ValueError(
                "shift and extend only available for bigwig output")

    # Output filename required for bigwig / bigbed computation
    if options.output_format == "bigwig":
        if not options.output_filename_pattern:
            raise ValueError(
                "please specify an output file for bigwig computation.")

        # Define executable to use for binary conversion
        if options.output_format == "bigwig":
            executable_name = "wigToBigWig"
        else:
            raise ValueError("unknown output format `%s`" %
                             options.output_format)

        # check required executable file is in the path
        executable = IOTools.which(executable_name)
        if not executable:
            raise OSError("could not find %s in path." % executable_name)

        # Open outout file
        outfile = IOTools.open_file(tmpfile_wig, "w")
        E.info("starting output to %s" % tmpfile_wig)
    else:
        outfile = IOTools.open_file(tmpfile_wig, "w")
        E.info("starting output to stdout")

    # Set up output write functions
    if options.output_format in ("wiggle", "bigwig"):
        # wiggle is one-based, so add 1, also step-size is 1, so need
        # to output all bases
        if options.span == 1:
            outf = lambda outfile, contig, start, end, val: \
                outfile.write(
                    "".join(["%i\t%i\n" % (x, val)
                             for x in range(start + 1, end + 1)]))
        else:
            outf = SpanWriter(options.span)
    elif options.output_format == "bedgraph":
        # bed is 0-based, open-closed
        outf = lambda outfile, contig, start, end, val: \
            outfile.write("%s\t%i\t%i\t%i\n" % (contig, start, end, val))

    # initialise counters
    ninput, nskipped, ncontigs = 0, 0, 0

    # set output file name
    output_filename_pattern = options.output_filename_pattern
    if output_filename_pattern:
        output_filename = os.path.abspath(output_filename_pattern)

    # shift and extend or merge pairs. Output temporay bed file
    if options.shift > 0 or options.extend > 0 or options.merge_pairs:
        # Workflow 1: convert to bed intervals and use bedtools
        # genomecov to build a coverage file.
        # Convert to bigwig with UCSC tools bedGraph2BigWig

        if options.merge_pairs:
            # merge pairs using bam2bed
            E.info("merging pairs to temporary file")
            counter = _bam2bed.merge_pairs(
                samfile,
                outfile,
                min_insert_size=options.min_insert_size,
                max_insert_size=options.max_insert_size,
                bed_format=3)
            E.info("merging results: {}".format(counter))
            if counter.output == 0:
                raise ValueError("no pairs output after merging")
        else:
            # create bed file with shifted/extended tags
            shift, extend = options.shift, options.extend
            shift_extend = shift + extend
            counter = E.Counter()

            for contig in samfile.references:
                E.debug("output for %s" % contig)
                lcontig = contig_sizes[contig]

                for read in samfile.fetch(contig):
                    pos = read.pos
                    if read.is_reverse:
                        start = max(0, read.pos + read.alen - shift_extend)
                    else:
                        start = max(0, read.pos + shift)

                    # intervals extending beyond contig are removed
                    if start >= lcontig:
                        continue

                    end = min(lcontig, start + extend)
                    outfile.write("%s\t%i\t%i\n" % (contig, start, end))
                    counter.output += 1

        outfile.close()

        if options.scale_method == "reads":
            scale_factor = float(options.scale_base) / counter.output

            E.info("scaling: method=%s scale_quantity=%i scale_factor=%f" %
                   (options.scale_method, counter.output, scale_factor))
            scale = "-scale %f" % scale_factor
        else:
            scale = ""

        # Convert bed file to coverage file (bedgraph)
        tmpfile_bed = os.path.join(tmpdir, "bed")
        E.info("computing coverage")
        # calculate coverage - format is bedgraph
        statement = """bedtools genomecov -bg -i %(tmpfile_wig)s %(scale)s
        -g %(tmpfile_sizes)s > %(tmpfile_bed)s""" % locals()
        E.run(statement)

        # Convert bedgraph to bigwig
        E.info("converting to bigwig")
        tmpfile_sorted = os.path.join(tmpdir, "sorted")
        statement = ("sort -k 1,1 -k2,2n %(tmpfile_bed)s > %(tmpfile_sorted)s;"
                     "bedGraphToBigWig %(tmpfile_sorted)s %(tmpfile_sizes)s "
                     "%(output_filename_pattern)s" % locals())
        E.run(statement)

    else:

        # Workflow 2: use pysam column iterator to build a
        # wig file. Then convert to bigwig of bedgraph file
        # with UCSC tools.
        def column_iter(iterator):
            start = None
            end = 0
            n = None
            for t in iterator:
                if t.pos - end > 1 or n != t.n:
                    if start is not None:
                        yield start, end, n
                    start = t.pos
                    end = t.pos
                    n = t.n
                end = t.pos
            yield start, end, n

        if options.scale_method != "none":
            raise NotImplementedError(
                "scaling not implemented for pileup method")

        # Bedgraph track definition
        if options.output_format == "bedgraph":
            outfile.write("track type=bedGraph\n")

        for contig in samfile.references:
            # if contig != "chrX": continue
            E.debug("output for %s" % contig)
            lcontig = contig_sizes[contig]

            # Write wiggle header
            if options.output_format in ("wiggle", "bigwig"):
                outfile.write("variableStep chrom=%s span=%i\n" %
                              (contig, options.span))

            # Generate pileup per contig using pysam and iterate over columns
            for start, end, val in column_iter(samfile.pileup(contig)):
                # patch: there was a problem with bam files and reads
                # overextending at the end. These are usually Ns, but
                # need to check as otherwise wigToBigWig fails.
                if lcontig <= end:
                    E.warn("read extending beyond contig: %s: %i > %i" %
                           (contig, end, lcontig))
                    end = lcontig
                    if start >= end:
                        continue

                if val > 0:
                    outf(outfile, contig, start, end, val)
            ncontigs += 1

        # Close output file
        if isinstance(outf, SpanWriter):
            outf.flush(outfile)
        else:
            outfile.flush()

        E.info("finished output")

        # Report counters
        E.info("ninput=%i, ncontigs=%i, nskipped=%i" %
               (ninput, ncontigs, nskipped))

        # Convert to binary formats
        if options.output_format == "bigwig":
            outfile.close()

            E.info("starting %s conversion" % executable)
            try:
                retcode = subprocess.call(" ".join(
                    (executable, tmpfile_wig, tmpfile_sizes,
                     output_filename_pattern)),
                                          shell=True)
                if retcode != 0:
                    E.warn("%s terminated with signal: %i" %
                           (executable, -retcode))
                    return -retcode
            except OSError as msg:
                E.warn("Error while executing bigwig: %s" % msg)
                return 1
            E.info("finished bigwig conversion")
        else:
            with open(tmpfile_wig) as inf:
                sys.stdout.write(inf.read())

    # Cleanup temp files
    shutil.rmtree(tmpdir)

    E.stop()
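The nested column_iter helper above compresses consecutive pileup columns of equal depth into (start, last_position, depth) runs. A self-contained sketch of the same logic on mock columns, with a namedtuple standing in for pysam pileup columns:

from collections import namedtuple

Column = namedtuple("Column", ["pos", "n"])   # mock pileup column: position and depth


def column_iter(iterator):
    start, end, n = None, 0, None
    for t in iterator:
        # start a new run on a gap or a change in depth
        if t.pos - end > 1 or n != t.n:
            if start is not None:
                yield start, end, n
            start, n = t.pos, t.n
        end = t.pos
    yield start, end, n


cols = [Column(10, 3), Column(11, 3), Column(12, 5), Column(20, 1)]
print(list(column_iter(cols)))   # [(10, 11, 3), (12, 12, 5), (20, 20, 1)]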
Example #20
def annotateCpGIslands(infiles, outfile):
    '''annotate transcript by absence/presence of CpG islands
    '''
    cpgfile, tssfile = infiles
    cpg = Bed.readAndIndex(IOTools.openFile(cpgfile))

    extension_upstream = PARAMS["cpg_search_upstream"]
    extension_downstream = PARAMS["cpg_search_downstream"]

    c = E.Counter()
    outf = IOTools.openFile(outfile, "w")
    outf.write(
        "transcript_id\tstrand\tstart\tend\trelative_start\trelative_end\n")

    for tss in Bed.iterator(IOTools.openFile(tssfile)):
        c.tss_total += 1

        if tss.strand == "+":
            start, end = tss.start - \
                extension_upstream, tss.start + extension_downstream
        else:
            start, end = tss.end - \
                extension_downstream, tss.end + extension_upstream

        try:
            matches = list(cpg[tss.contig].find(start, end))
        except KeyError:
            c.promotor_without_matches += 1
            continue

        if len(matches) == 0:
            c.promotor_without_matches += 1
            continue

        c.promotor_output += 1
        for match in matches:
            c.matches_total += 1
            genome_start, genome_end, x = match

            l = genome_end - genome_start

            # get relative location of match
            if tss.strand == "+":
                relative_start = genome_start - tss.start
            else:
                relative_start = tss.end - genome_end

            relative_end = relative_start + l

            outf.write("\t".join(
                map(str, (tss.name, tss.strand, genome_start, genome_end,
                          relative_start, relative_end))) + "\n")
            c.matches_output += 1

    outf.close()

    with IOTools.openFile(outfile + ".summary", "w") as outf:
        outf.write("category\tcounts\n")
        outf.write(c.asTable() + "\n")

    E.info(c)
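A small worked example of the relative-coordinate calculation above for a minus-strand TSS; the numbers are invented:

# minus-strand TSS ending at 1001; CpG island match at genomic [940, 990)
tss_end, genome_start, genome_end = 1001, 940, 990

length = genome_end - genome_start        # 50
relative_start = tss_end - genome_end     # 11, measured in transcript orientation
relative_end = relative_start + length    # 61
print(relative_start, relative_end)       # 11 61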
Example #21
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-i", "--input-bam", dest="input_bam_file", type="string",
        help="input bam file")

    parser.add_option(
        "-f", "--reference-bam", dest="reference_bam_file", type="string",
        help="reference BAM file [%default]")

    parser.add_option(
        "-q", "--query-name-regex", dest="query_name_regex", type="string",
        help="regular expression to apply on query name. "
        "Potentially required to match samtools sort order and should "
        "evaluate to an integer [%default]")

    parser.set_defaults(
        input_bam_file=None,
        reference_bam_file=None,
        query_name_regex=None,
    )

    (options, args) = E.start(parser, argv, add_output_options=True)

    if len(args) == 2:
        options.input_bam_file = args[0]
        options.reference_bam_file = args[1]

    if options.input_bam_file is None:
        raise ValueError("please supply a BAM file as input")

    if options.reference_bam_file is None:
        raise ValueError("please supply a BAM file as reference")

    # update paths to absolute
    options.input_bam_file = os.path.abspath(options.input_bam_file)
    options.reference_bam_file = os.path.abspath(options.reference_bam_file)

    if not os.path.exists(options.input_bam_file):
        raise OSError("input bam file {} does not exist".format(
            options.input_bam_file))

    if not os.path.exists(options.reference_bam_file):
        raise OSError("reference bam file {} does not exist".format(
            options.reference_bam_file))

    bam_in = pysam.AlignmentFile(options.input_bam_file)
    ref_in = pysam.AlignmentFile(options.reference_bam_file)

    outf_mapped = E.open_output_file("mapped")
    outf_mapped.write("\t".join(
        ["read",
         "length",
         "status",
         "overlap",
         "comp_contig",
         "comp_start",
         "comp_end",
         "ref_contig",
         "ref_start",
         "ref_end",
         "shared_misaligned",
         "shared_aligned",
         "shared_insertion",
         "shared_deletion",
         "comp_aligned",
         "comp_insertion",
         "comp_deletion",
         "ref_aligned",
         "ref_insertion",
         "ref_deletion"]) + "\n")

    outf_missing = E.open_output_file("missing")
    outf_missing.write("\t".join(
        ["read", "length", "status", "aligned",
         "insertion", "deletion"]) + "\n")

    counter = E.Counter()

    if options.query_name_regex:
        rx = re.compile(options.query_name_regex)

    def extract_query(x):
        return int(rx.search(x).groups()[0])

    qname_fn = None
    if options.query_name_regex:
        qname_fn = extract_query

    for reads_cmp, read_ref in group_pairs(iterate_read_pairs(
            bam_in.fetch(until_eof=True),
            ref_in.fetch(until_eof=True),
            qname_fn=qname_fn)):

        if len(reads_cmp) == 0:
            counter.missing += 1
            pairs_ref = set(read_ref.get_aligned_pairs())
            outf_missing.write("\t".join(
                map(str, (
                    read_ref.query_name,
                    read_ref.query_length,
                    "missing") +
                    count_pairs(pairs_ref))) + "\n")
            continue

        if len(reads_cmp) > 1:
            # multiple matches
            counter.multi_mapping += 1
            prefix = "multi_"
        else:
            counter.unique_mapping += 1
            prefix = "unique_"

        is_mapped = False
        for read_cmp in reads_cmp:

            counter.paired += 1

            if read_cmp.is_unmapped:
                counter.unmapped += 1
                pairs_ref = set(read_ref.get_aligned_pairs())
                outf_missing.write("\t".join(
                    map(str, (
                        read_ref.query_name,
                        read_ref.query_length,
                        "unmapped") +
                        count_pairs(pairs_ref))) + "\n")
                continue

            overlap = max(0, (min(read_cmp.reference_end,
                                  read_ref.reference_end) -
                              max(read_cmp.reference_start,
                                  read_ref.reference_start)))

            pairs_cmp = set(read_cmp.get_aligned_pairs())
            pairs_ref = set(read_ref.get_aligned_pairs())
            shared_cmp = pairs_cmp.intersection(pairs_ref)
            unique_cmp = pairs_cmp.difference(pairs_ref)
            missaligned = len([x for x, y in unique_cmp
                               if x is not None and y is not None])

            if read_cmp.reference_name != read_ref.reference_name or \
               overlap == 0:
                status = "mismapped"
            else:
                counter.overlap += 1
                status = "mapped"
                is_mapped = True

            outf_mapped.write("\t".join(
                map(str, (read_cmp.query_name,
                          read_cmp.query_length,
                          prefix + status,
                          overlap,
                          read_cmp.reference_name,
                          read_cmp.reference_start,
                          read_cmp.reference_end,
                          read_ref.reference_name,
                          read_ref.reference_start,
                          read_ref.reference_end,
                          missaligned) +
                    count_pairs(shared_cmp) +
                    count_pairs(pairs_cmp) +
                    count_pairs(pairs_ref))) + "\n")
        else:
            # for/else: this block runs once the loop over reads_cmp
            # has finished (the loop body never breaks)
            if is_mapped:
                status = "mapped"
            else:
                status = "mismapped"

            counter[prefix + status] += 1

    with E.open_output_file("summary") as outf:
        outf.write("category\tcounts\n")
        outf.write(counter.asTable() + "\n")

    E.stop()
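
The mapped/mismapped decision above hinges on the reference overlap computed
at the top of the read loop. A minimal stand-alone sketch of that interval
arithmetic (plain Python, hypothetical coordinates, no pysam):

def reference_overlap(start_a, end_a, start_b, end_b):
    """return the number of reference bases shared by two half-open
    intervals, or 0 if they do not overlap."""
    return max(0, min(end_a, end_b) - max(start_a, start_b))


# a comparison read at 100-250 and its reference alignment at 200-300
# share 50 bases; disjoint intervals yield 0 and count as "mismapped"
assert reference_overlap(100, 250, 200, 300) == 50
assert reference_overlap(100, 150, 200, 300) == 0
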
def writeSequencesForIntervals(track,
                               filename,
                               dbhandle,
                               full=False,
                               halfwidth=None,
                               maxsize=None,
                               proportion=None,
                               masker=[],
                               offset=0,
                               shuffled=False,
                               num_sequences=None,
                               min_sequences=None,
                               order="peakval",
                               shift=None,
                               stranded=False):
    '''build a sequence set for motif discovery. Intervals are taken from
    the table <track>_intervals in the database *dbhandle* and saved to
    *filename* in :term:`fasta` format.

    If *shuffled* is set, the sequences are shuffled before they are
    written out.

    The sequences are masked before shuffling (it is unclear whether
    this is appropriate).

    If *full* is set, the whole intervals will be output, otherwise
    only the region around the peak given by *halfwidth*.

    If *maxsize* is set, the output is truncated at *maxsize* characters
    in order to avoid creating jobs that take too long.

    If proportion is set, only the top *proportion* intervals are output
    (sorted by peakval).

    If *num_sequences* is set, the first *num_sequences* will be used.

    *masker* can be a combination of
        * dust, dustmasker: apply dustmasker
        * softmask: mask softmasked genomic regions

    *order* is the order by which peaks should be sorted. Possible
    values are 'peakval' (peak value, descending order), 'max' (peak
    score, descending order) and 'random'.

    If *shift* is set, intervals will be shifted. ``leftright``
    creates two intervals on the left and right of the actual
    interval. The intervals will be centered around the mid-point and
    truncated the same way as the main intervals.

    '''
    cc = dbhandle.cursor()

    orderby = ""
    if order == "peakval":
        orderby = " ORDER BY peakval DESC"
    elif order == "max":
        orderby = " ORDER BY score DESC"
    elif order != "random":
        raise ValueError(
            "Unknown value passed as order parameter, check your ini file")

    tablename = "%s_intervals" % P.tablequote(track)
    statement = '''SELECT contig, start, end, interval_id, score, strand, peakcenter 
                       FROM %(tablename)s 
                       ''' % locals() + orderby

    cc.execute(statement)
    data = cc.fetchall()
    cc.close()

    E.debug("Got %s intervals for track %s" % (len(data), track))
    if len(data) == 0:
        P.touch(filename)
        return

    data = truncateList(data, track, proportion, min_sequences, num_sequences,
                        order == "random")

    beds = bedsFromList(data)

    L.info("writeSequencesForIntervals %s: masker=%s" % (track, str(masker)))

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    # At the moment the pipeline retrieves the bed regions from the DB
    # and they will always be on the positive strand. If this were to
    # change, regions on the negative strand would be counted from the
    # end of the chromosome rather than the beginning without this
    # converter. This should be tested.
    fasta.setConverter(IndexedFasta.getConverter("zero-single-open"))

    # modify the ranges
    if shift == "leftright":
        beds = shitfBeds(beds)

    if halfwidth and not full:
        beds = centreAndCrop(beds, halfwidth)

    sequences = getFASTAFromBed(beds, fasta, stranded, offset, maxsize)

    if shuffled:
        sequences = shuffleFasta(sequences)

    c = E.Counter()

    # apply maskers; use a separate loop variable to avoid shadowing the
    # *masker* argument
    for mask in masker:
        if mask not in ("unmasked", "none", None):
            ids, sequences = zip(*[(x.title, x.sequence) for x in sequences])
            sequences = maskSequences(sequences, mask)
            sequences = (FastaRecord(id, seq)
                         for id, seq in zip(ids, sequences))

    with IOTools.open_file(filename, "w") as outs:

        for sequence in sequences:
            c.input += 1
            if len(sequence.sequence) == 0:
                c.empty += 1
                continue
            if len(sequence.sequence) < 0:
                # note: this branch can never trigger as written; a
                # minimum-length threshold was probably intended here
                c.too_short += 1
                continue

            outs.write(">%s\n%s\n" % (sequence.title, sequence.sequence))
            c.output += 1

    E.info("%s" % c)

    return c.output
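
A hypothetical call to the function above might look as follows; the database
file, track name and parameter values are made up for illustration and assume
a pipeline where PARAMS points at an indexed genome:

import sqlite3

dbhandle = sqlite3.connect("csvdb")          # hypothetical database
nwritten = writeSequencesForIntervals(
    track="chipseq_rep1",                    # hypothetical track name
    filename="chipseq_rep1.foreground.fasta",
    dbhandle=dbhandle,
    full=False,
    halfwidth=100,        # 100 bp either side of the peak centre
    maxsize=100000,       # cap the total number of characters written
    proportion=0.1,       # keep the top 10% of intervals by peakval
    masker=["dustmasker"],
    order="peakval")
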
Ejemplo n.º 23
0
def mergeAndFilterGTF(infile, outfile, logfile,
                      genome,
                      max_intron_size=None,
                      remove_contigs=None,
                      rna_file=None):

    '''sanitize transcripts file for cufflinks analysis.
    Merge exons separated by small introns (< 5bp).
    Transcripts will be ignored that
       * have very long introns (`max_intron_size`) (otherwise,
         cufflinks complains)
       * are located on contigs to be ignored (usually: chrM, _random, ...)
    Optionally remove transcripts based on repetitive sequences by
       supplying a repetitive `rna_file`.
    This method preserves all features in a gtf file (exon, CDS, ...).
    Arguments
    ---------
    infile : string
       Input filename in :term:`gtf` format
    outfile : string
       Output filename in :term:`gtf` format
    logfile : string
       Output filename for logging information.
    genome : string
       Filename (without extension) of indexed genome file
       in :term:`fasta` format.
    max_intron_size : int
       Remove transcripts with introns larger than this value.
    remove_contigs : string
       Remove transcripts on contigs matching this regular
       expression string.
    rna_file : string
       Filename of :term:`gff` formatted file with repetitive
       sequences. If given, all transcripts overlapping any regions
       in this file will be removed.
    Returns
    -------
    gene_ids : dict
        a dictionary mapping transcript_id to gene_id for each
        transcript that has been kept.
        '''

    c = E.Counter()

    # open in text mode so that formatted GTF lines (str) can be written
    outf = gzip.open(outfile, "wt")

    E.info("filtering by contig and removing long introns")
    contigs = set(IndexedFasta.IndexedFasta(genome).getContigs())

    rx_contigs = None
    #
    if remove_contigs is not None:
        rx_contigs = re.compile(remove_contigs)
        E.info("removing contigs %s" % remove_contigs)

    rna_index = None
    if rna_file is not None:
        if not os.path.exists(rna_file):
            E.warn("file '%s' to remove repetetive rna does not exist" %
                   rna_file)
        else:
            rna_index = GTF.readAndIndex(
                GTF.iterator(IOTools.open_file(rna_file, "r")))
            E.info("removing ribosomal RNA in %s" % rna_file)

    gene_ids = {}

    logf = IOTools.open_file(logfile, "w")
    logf.write("gene_id\ttranscript_id\treason\n")

    for all_exons in GTF.transcript_iterator(
            GTF.iterator(IOTools.open_file(infile))):

        c.input += 1

        e = all_exons[0]
        # filtering
        if e.contig not in contigs:
            c.missing_contig += 1
            logf.write(
                "\t".join((e.gene_id, e.transcript_id,
                           "missing_contig")) + "\n")
            continue

        if rx_contigs and rx_contigs.search(e.contig):
            c.remove_contig += 1
            logf.write(
                "\t".join((e.gene_id, e.transcript_id,
                           "remove_contig")) + "\n")
            continue

        if rna_index and all_exons[0].source != 'protein_coding':
            found = False
            for exon in all_exons:
                if rna_index.contains(exon.contig, exon.start, exon.end):
                    found = True
                    break
            if found:
                logf.write(
                    "\t".join((e.gene_id, e.transcript_id,
                               "overlap_rna")) + "\n")
                c.overlap_rna += 1
                continue

        is_ok = True

        # keep exons and cds separate by grouping by feature
        all_exons.sort(key=lambda x: x.feature)
        new_exons = []

        for feature, exons in itertools.groupby(
                all_exons, lambda x: x.feature):

            tmp = sorted(list(exons), key=lambda x: x.start)

            gene_ids[tmp[0].transcript_id] = tmp[0].gene_id

            l, n = tmp[0], []

            for e in tmp[1:]:
                d = e.start - l.end
                if max_intron_size and d > max_intron_size:
                    is_ok = False
                    break
                elif d < 5:
                    l.end = max(e.end, l.end)
                    c.merged += 1
                    continue

                n.append(l)
                l = e

            n.append(l)
            new_exons.extend(n)

            if not is_ok:
                break

        if not is_ok:
            logf.write(
                "\t".join((e.gene_id, e.transcript_id,
                           "bad_transcript")) + "\n")
            c.skipped += 1
            continue

        new_exons.sort(key=lambda x: (x.start, x.gene_id, x.transcript_id))

        for e in new_exons:
            outf.write("%s\n" % str(e))
            c.exons += 1

        c.output += 1

    outf.close()
    L.info("%s" % str(c))

    return gene_ids
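
The exon-merging rule above (join features separated by fewer than 5 bp and
reject the transcript if any gap exceeds max_intron_size) can be illustrated
on plain (start, end) tuples. This is a stand-alone sketch of the rule, not
the pipeline code itself:

def merge_small_introns(exons, max_intron_size=None, min_intron_size=5):
    """merge sorted half-open (start, end) intervals separated by fewer
    than *min_intron_size* bp; return None if any gap exceeds
    *max_intron_size* (the transcript would be skipped)."""
    exons = sorted(exons)
    merged = [list(exons[0])]
    for start, end in exons[1:]:
        gap = start - merged[-1][1]
        if max_intron_size is not None and gap > max_intron_size:
            return None
        if gap < min_intron_size:
            merged[-1][1] = max(end, merged[-1][1])
        else:
            merged.append([start, end])
    return [tuple(x) for x in merged]


# a 3 bp gap is merged away, a 50 bp gap is kept as an intron
assert merge_small_introns([(0, 100), (103, 200), (250, 300)]) == \
    [(0, 200), (250, 300)]
# with max_intron_size=40 the same transcript is rejected
assert merge_small_introns([(0, 100), (103, 200), (250, 300)],
                           max_intron_size=40) is None
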
def readSegments(infile,
                 indexed_workspace,
                 truncate=False,
                 format="gtf",
                 keep_ambiguous=False,
                 remove_overhangs=False):
    """read segments from infile.

    segments not overlapping with indexed_workspace are removed.

    If *truncate* is set, segments extending beyond the workspace
    are truncated.

    returns a list of segments for each contig in a dictionary
    """
    counter = E.Counter()

    segments = collections.defaultdict(list)

    def addSegment(contig, start, end, counter):
        if contig in indexed_workspace:
            r = indexed_workspace[contig].find(start, end)
            if not r:
                counter.nskipped += 1
                return
            if len(r) > 1:
                counter.nambiguous += 1
                if not keep_ambiguous:
                    return
            if truncate:
                for x in r:
                    wstart, wend = x.start, x.end
                    rstart, rend = max(start, wstart), min(end, wend)
                    if start < wstart or end > wend:
                        counter.ntruncated += 1
                    segments[contig].append((rstart, rend))
                    counter.added += 1
            elif remove_overhangs:
                for x in r:
                    wstart, wend = x.start, x.end
                    rstart, rend = max(start, wstart), min(end, wend)
                    if start < wstart or end > wend:
                        counter.overhangs += 1
                        break
                else:
                    segments[contig].append((start, end))
            else:
                segments[contig].append((start, end))
                counter.added += 1

            counter.nkept += 1

    if format == "gtf":
        gtf_iterator = GTF.flat_gene_iterator(GTF.iterator(infile))

        for gene in gtf_iterator:
            # get start and end ignoring introns
            # contig, start, end = gene[0].contig, min( [x.start for x in gene] ), max( [x.end for x in gene] )

            contig, coords = gene[0].contig, [(x.start, x.end) for x in gene]
            counter.ninput += 1
            for start, end in coords:
                addSegment(contig, start, end, counter)

    elif format == "bed":
        bed_iterator = Bed.iterator(infile)
        for bed in bed_iterator:
            counter.ninput += 1
            addSegment(bed.contig, bed.start, bed.end, counter)

    E.info("read segments: %s" % str(counter))

    return segments
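
With truncate=True the function clips each segment to every workspace
interval it overlaps. A minimal sketch of that clipping with plain tuples
(no nested-containment index, hypothetical coordinates):

def truncate_to_workspace(segment, workspace_intervals):
    """clip a half-open (start, end) segment to each overlapping
    workspace interval, returning one clipped copy per overlap."""
    start, end = segment
    clipped = []
    for wstart, wend in workspace_intervals:
        if end <= wstart or start >= wend:
            continue
        clipped.append((max(start, wstart), min(end, wend)))
    return clipped


# a segment spanning two workspace windows is split and truncated
assert truncate_to_workspace((90, 210), [(100, 150), (180, 250)]) == \
    [(100, 150), (180, 210)]
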
Ejemplo n.º 25
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s",
                      "--summarise",
                      dest="summarise",
                      type="choice",
                      choices=("level-counts", "taxa-counts", "individual"),
                      help="summarise the taxa counts - no. phyla etc")

    parser.add_option("--output-map",
                      dest="output_map",
                      action="store_true",
                      help="ouput map of taxonomy")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if options.output_map:
        found = []
        options.stdout.write("""Domain\t \
        kingdom\t \
        phylum\t \
        class\t \
        order\t \
        family\t \
        genus\t \
        species\n""")
        # only output the mapping file - do not continue
        # summarise regardless of the specified options
        for lca in LCA.iterate(options.stdin):

            # if bacteria or archaea the kingdom will
            # be the domain
            if lca.domain == "Bacteria" or lca.domain == "Archaea":
                kingdom = lca.domain
            else:
                kingdom = lca.kingdom

            hierarchy = [
                lca.domain, kingdom, lca.phylum, lca._class, lca.order,
                lca.family, lca.genus, lca.species
            ]
            if hierarchy in found:
                continue
            else:
                found.append(hierarchy)
                options.stdout.write("\t".join(hierarchy) + "\n")
        return

    if options.summarise == "level-counts":
        level_counts = collections.defaultdict(set)
        total = 0
        nreads_domain = 0
        nreads_kingdom = 0
        nreads_kingdom_plus = 0
        nreads_phylum = 0
        nreads_phylum_plus = 0
        nreads_class = 0
        nreads_class_plus = 0
        nreads_order = 0
        nreads_order_plus = 0
        nreads_family = 0
        nreads_family_plus = 0
        nreads_genus = 0
        nreads_genus_plus = 0
        nreads_species = 0
        nreads_species_plus = 0
        nreads_subspecies = 0
        nreads_subspecies_plus = 0

        c = E.Counter()
        for lca in LCA.iterate(options.stdin):
            total += 1
            if lca.domain != "NA":
                nreads_domain += 1
                level_counts["domain"].add(lca.domain)
            else:
                c.domain_unmapped += 1

            if lca.kingdom != "NA":
                nreads_kingdom += 1
                level_counts["kingdom"].add(lca.kingdom)
            else:
                c.kingdom_unmapped += 1

            if lca.kingdom_plus != "NA":
                nreads_kingdom_plus += 1
                level_counts["kingdom+"].add(lca.kingdom_plus)
            else:
                c.kingdom_plus_unmapped += 1

            if lca.phylum != "NA":
                nreads_phylum += 1
                level_counts["phylum"].add(lca.phylum)
            else:
                c.phylum_unmapped += 1

            if lca.phylum_plus != "NA":
                nreads_phylum_plus += 1
                level_counts["phylum+"].add(lca.phylum_plus)
            else:
                c.phylum_plus_unmapped += 1

            if lca._class != "NA":
                nreads_class += 1
                level_counts["class"].add(lca._class)
            else:
                c.class_unmapped += 1

            if lca._class_plus != "NA":
                nreads_class_plus += 1
                level_counts["class+"].add(lca._class_plus)
            else:
                c.class_plus_unmapped += 1

            if lca.order != "NA":
                nreads_order += 1
                level_counts["order"].add(lca.order)
            else:
                c.order_unmapped += 1

            if lca.order_plus != "NA":
                nreads_order_plus += 1
                level_counts["order+"].add(lca.order_plus)
            else:
                c.order_plus_unmapped += 1

            if lca.family != "NA":
                nreads_family += 1
                level_counts["family"].add(lca.family)
            else:
                c.family_unmapped += 1

            if lca.family_plus != "NA":
                nreads_family_plus += 1
                level_counts["family+"].add(lca.family_plus)
            else:
                c.family_plus_unmapped += 1

            if lca.genus != "NA":
                nreads_genus += 1
                level_counts["genus"].add(lca.genus)
            else:
                c.genus_unmapped += 1

            if lca.genus_plus != "NA":
                nreads_genus_plus += 1
                level_counts["genus+"].add(lca.genus_plus)
            else:
                c.genus_plus_unmapped += 1

            if lca.species != "NA":
                nreads_species += 1
                level_counts["species"].add(lca.species)
            else:
                c.species_unmapped += 1

            if lca.species_plus != "NA":
                nreads_species_plus += 1
                level_counts["species+"].add(lca.species_plus)
            else:
                c.species_plus_unmapped += 1

            # removed subspecies mapping for the time
            # being

            # if lca.subspecies != "NA":
            #     nreads_subspecies += 1
            #     level_counts["subspecies"].add(lca.subspecies)
            # else:
            #     c.subspecies_unmapped += 1

            # if lca.subspecies_plus != "NA":
            #     nreads_subspecies_plus += 1
            #     level_counts["subspecies+"].add(lca.subspecies_plus)
            # else:
            #     c.subspecies_plus_unmapped += 1

        options.stdout.write("\t".join([
            "ndomain", "nkingdom", "nkingdom+", "nphylum", "nphylum+",
            "nclass", "nclass+", "norder", "norder+", "nfamily", "nfamily+",
            "ngenus", "ngenus+", "nspecies", "nspecies+", "nseqkingdom",
            "nseqkingdom+", "nseqphylum", "nseqphylum+", "nseqclass",
            "nseqclass+", "nseqorder", "nseqorder+", "nseqfamily",
            "nseqfamily+", "nseqgenus", "nseqgenus+", "nseqspecies",
            "nseqspecies+"
        ]) + "\n")

        options.stdout.write("\t".join(
            map(str, [
                len(level_counts["domain"]),
                len(level_counts["kingdom"]),
                len(level_counts["kingdom+"]),
                len(level_counts["phylum"]),
                len(level_counts["phylum+"]),
                len(level_counts["class"]),
                len(level_counts["class+"]),
                len(level_counts["order"]),
                len(level_counts["order+"]),
                len(level_counts["family"]),
                len(level_counts["family+"]),
                len(level_counts["genus"]),
                len(level_counts["genus+"]),
                len(level_counts["species"]),
                len(level_counts["species+"]), nreads_domain, nreads_kingdom,
                nreads_phylum, nreads_phylum_plus, nreads_class,
                nreads_class_plus, nreads_order, nreads_order_plus,
                nreads_family, nreads_family_plus, nreads_genus,
                nreads_genus_plus, nreads_species, nreads_species_plus
            ])) + "\n")
    elif options.summarise == "taxa-counts":
        unmapped = collections.defaultdict(int)
        total = 0
        taxa_counts = {
            "domain": collections.defaultdict(int),
            "kingdom": collections.defaultdict(int),
            "kingdom+": collections.defaultdict(int),
            "phylum": collections.defaultdict(int),
            "phylum+": collections.defaultdict(int),
            "class": collections.defaultdict(int),
            "class+": collections.defaultdict(int),
            "order": collections.defaultdict(int),
            "order+": collections.defaultdict(int),
            "family": collections.defaultdict(int),
            "family+": collections.defaultdict(int),
            "genus": collections.defaultdict(int),
            "genus+": collections.defaultdict(int),
            "species": collections.defaultdict(int),
            "species+": collections.defaultdict(int)
        }

        c = E.Counter()
        for lca in LCA.iterate(options.stdin):
            total += 1
            if lca.domain != "NA":
                taxa_counts["domain"][lca.domain] += 1
            else:
                c.domain_unmapped += 1
                unmapped["domain"] += 1
            if lca.kingdom != "NA":
                taxa_counts["kingdom"][lca.kingdom] += 1
            else:
                c.kingdom_unmapped += 1
                unmapped["kingdom"] += 1
            if lca.kingdom_plus != "NA":
                taxa_counts["kingdom+"][lca.kingdom_plus] += 1
            else:
                c.kingdom_plus_unmapped += 1
                unmapped["kingdom+"] += 1
            if lca.phylum != "NA":
                taxa_counts["phylum"][lca.phylum] += 1
            else:
                c.phylum_unmapped += 1
                unmapped["phylum"] += 1
            if lca.phylum_plus != "NA":
                taxa_counts["phylum+"][lca.phylum_plus] += 1
            else:
                c.phylum_plus_unmapped += 1
                unmapped["phylum+"] += 1
            if lca._class != "NA":
                taxa_counts["class"][lca._class] += 1
            else:
                c.class_unmapped += 1
                unmapped["class"] += 1
            if lca._class_plus != "NA":
                taxa_counts["class+"][lca._class_plus] += 1
            else:
                c.class_plus_unmapped += 1
                unmapped["class+"] += 1
            if lca.order != "NA":
                taxa_counts["order"][lca.order] += 1
            else:
                c.order_unmapped += 1
                unmapped["order"] += 1
            if lca.order_plus != "NA":
                taxa_counts["order+"][lca.order_plus] += 1
            else:
                c.order_plus_unmapped += 1
                unmapped["order+"] += 1
            if lca.family != "NA":
                taxa_counts["family"][lca.family] += 1
            else:
                c.family_unmapped += 1
                unmapped["family"] += 1
            if lca.family_plus != "NA":
                taxa_counts["family+"][lca.family_plus] += 1
            else:
                c.family_plus_unmapped += 1
                unmapped["family+"] += 1
            if lca.genus != "NA":
                taxa_counts["genus"][lca.genus] += 1
            else:
                c.genus_unmapped += 1
                unmapped["genus"] += 1
            if lca.genus_plus != "NA":
                taxa_counts["genus+"][lca.genus_plus] += 1
            else:
                c.genus_plus_unmapped += 1
                unmapped["genus+"] += 1
            if lca.species != "NA":
                taxa_counts["species"][lca.species] += 1
            else:
                c.species_unmapped += 1
                unmapped["species"] += 1
            if lca.species_plus != "NA":
                taxa_counts["species+"][lca.species_plus] += 1
            else:
                c.species_plus_unmapped += 1
                unmapped["species+"] += 1

        options.stdout.write("level\ttaxa\tcount\tproportion\trpm\n")
        for level, taxa_count in sorted(taxa_counts.items()):
            total_level = total - unmapped[level]
            for taxa, count in sorted(taxa_count.items()):
                options.stdout.write("\t".join([
                    level, taxa,
                    str(count), "{:.8}".format(float(count) /
                                               total_level), "{:.8}".
                    format(float(count) / (float(total_level) / 1000000))
                ]) + "\n")

        E.info(c)

    elif options.summarise == "individual":
        # each read is output with its respective
        # taxon assignments
        options.stdout.write("\t".join([
            "id", "domain", "kingdom", "kingdom+", "phylum", "phylum+",
            "class", "class+", "order", "order+", "family", "family+", "genus",
            "genus+", "species", "species+"
        ]) + "\n")
        for lca in LCA.iterate(options.stdin):
            options.stdout.write("\t".join([
                lca.identifier, lca.domain, lca.kingdom, lca.kingdom_plus,
                lca.phylum, lca.phylum_plus, lca._class, lca._class_plus,
                lca.order, lca.order_plus, lca.family, lca.family_plus,
                lca.genus, lca.genus_plus, lca.species, lca.species_plus
            ]) + "\n")

    # write footer and output benchmark information.
    E.stop()
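
The taxa-counts branch reports each taxon with a proportion and a
reads-per-million value derived from the number of reads assigned at that
level. The arithmetic, as a tiny stand-alone sketch with made-up numbers:

def proportion_and_rpm(count, total_assigned):
    """return (proportion, reads per million) for a taxon, given the
    total number of reads assigned at that taxonomic level."""
    proportion = float(count) / total_assigned
    rpm = float(count) / (float(total_assigned) / 1000000)
    return proportion, rpm


# 250 reads assigned to one genus out of 1,000,000 classified reads
assert proportion_and_rpm(250, 1000000) == (0.00025, 250.0)
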
Ejemplo n.º 26
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=('reconcile', ),
                      help="method to apply [default=%default].")

    parser.add_option("-c",
                      "--chop-identifier",
                      dest="chop",
                      action="store_true",
                      help="whether or not to trim last character of the  "
                      "sequence name. For example sometimes ids in the first "
                      "file in the pair will end with \1 and the second "
                      "with \2. If --chop-identifier is not specified "
                      "then the results will be wrong [default=%default].")

    parser.add_option("-u",
                      "--unpaired",
                      dest="unpaired",
                      action="store_true",
                      help="whether or not to write out unpaired reads "
                      "to a separate file")

    parser.add_option("--id-pattern-1",
                      dest="id_pattern_1",
                      help="If specified will use the first group from the"
                      "pattern to determine the ID for the first read",
                      default=None)

    parser.add_option("--id-pattern-2",
                      dest="id_pattern_2",
                      help="As above but for read 2",
                      default=None)

    parser.add_option("-o",
                      "--output-filename-pattern",
                      dest="output_pattern",
                      type="string",
                      help="pattern for output files [default=%default].")

    parser.set_defaults(
        method="reconcile",
        chop=False,
        unpaired=False,
        output_pattern="%s.fastq.gz",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply exactly two fastq files on the command line")

    fn1, fn2 = args
    c = E.Counter()

    if options.id_pattern_1:
        id1_getter = PatternGetter(options.id_pattern_1)
    else:
        id1_getter = plain_getter

    if options.id_pattern_2:
        id2_getter = PatternGetter(options.id_pattern_2)
    else:
        id2_getter = plain_getter

    if options.method == "reconcile":

        # IMS: switching to not storing the second set of read names and
        # only using it lazily. Since generators don't have a size, keep
        # track of the number of ids seen in each file.
        id_lengths = {fn1: 0, fn2: 0}

        def getIds(infile, id_getter=plain_getter):
            '''return ids in infile.'''
            aread = infile.readline
            while True:
                l = [aread().rstrip("\r\n") for i in range(4)]
                if not l[0]:
                    break
                r = id_getter(l[0].split()[0])
                # decide whether to chop the read number off
                id_lengths[infile.name] += 1
                if options.chop:
                    yield r[:-1]
                else:
                    yield r

        def write(outfile,
                  infile,
                  take,
                  unpaired_file=None,
                  id_getter=plain_getter):
            '''filter fastq files with ids in take.'''
            aread = infile.readline
            while True:
                l = [aread().rstrip("\r\n") for i in range(4)]
                if not l[0]:
                    break
                r = id_getter(l[0].split()[0])
                if options.chop:
                    r = r[:-1]
                if r not in take:
                    if unpaired_file is None:
                        continue
                    else:
                        unpaired_file.write("\n".join(l) + "\n")
                else:
                    outfile.write("\n".join(l) + "\n")

        E.info("reading first in pair")
        inf1 = IOTools.open_file(fn1)
        ids1 = set(getIds(inf1, id1_getter))

        E.info("reading second in pair")
        inf2 = IOTools.open_file(fn2)
        # IMS: No longer keep as a set, but lazily evaluate into intersection
        # leads to large memory saving for large inf2, particularly if
        # inf1 is small.
        ids2 = getIds(inf2, id2_getter)
        take = ids1.intersection(ids2)

        E.info("first pair: %i reads, second pair: %i reads, "
               "shared: %i reads" %
               (id_lengths[fn1], id_lengths[fn2], len(take)))

        if options.unpaired:
            unpaired_filename = IOTools.open_file(
                options.output_pattern % "unpaired", "w")
        else:
            unpaired_filename = None

        with IOTools.open_file(options.output_pattern % "1", "w") as outf:
            inf = IOTools.open_file(fn1)
            E.info("writing first in pair")
            write(outf, inf, take, unpaired_filename, id1_getter)

        with IOTools.open_file(options.output_pattern % "2", "w") as outf:
            inf = IOTools.open_file(fn2)
            E.info("writing second in pair")
            write(outf, inf, take, unpaired_filename, id2_getter)

        if options.unpaired:
            unpaired_filename.close()

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.stop()
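
Stripped of the option handling, the reconcile method keeps only reads whose
identifiers occur in both files: a set intersection over the first line of
every 4-line FASTQ record. A stand-alone sketch (uncompressed files,
hypothetical filenames):

def fastq_ids(path, chop=False):
    """yield the read identifier of every 4-line fastq record."""
    with open(path) as inf:
        while True:
            block = [inf.readline().rstrip("\r\n") for _ in range(4)]
            if not block[0]:
                break
            read_id = block[0].split()[0]
            yield read_id[:-1] if chop else read_id


def shared_ids(path1, path2, chop=False):
    """return identifiers present in both fastq files."""
    return set(fastq_ids(path1, chop)).intersection(fastq_ids(path2, chop))

# e.g. shared = shared_ids("sample_R1.fastq", "sample_R2.fastq", chop=True)
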
Ejemplo n.º 27
0
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b",
                      "--reference-bed-file",
                      dest="reference_bed_file",
                      type="string",
                      help="reference bed file "
                      "[%default]")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("lvc-comparison", ),
                      help="methods to apply [%default]")

    parser.set_defaults(method="lvc-comparison",
                        reference_fasta_file=None,
                        input_bed_file=None,
                        size_bins=(1000, 10000, 100000),
                        output_sets=True,
                        region_string=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    reference_set = collections.defaultdict(quicksect.IntervalTree)

    E.info("reading reference bed file from {}".format(
        options.reference_bed_file))
    with IOTools.open_file(options.reference_bed_file) as inf:
        for record in pysam.tabix_iterator(inf, pysam.asBed()):
            mm = reference_set[record.contig]
            mm.add(record.start, record.end)
    E.info("read reference intervals on {} contigs: {}".format(
        len(list(reference_set.keys())), ",".join(list(reference_set.keys()))))

    if options.output_sets:
        output_tp = E.open_output_file("tp")
        output_fp = E.open_output_file("fp")
        output_fn = E.open_output_file("fn")
    else:
        output_tp = None
        output_fp = None
        output_fn = None

    if options.method == "lvc-comparison":
        c = E.Counter()

        found = set()
        counts = {}
        names = set()
        nsize_bins = len(options.size_bins)
        for bin in range(len(options.size_bins) + 1):
            counts[bin] = dict([(x, collections.defaultdict(int))
                                for x in ("tp", "fn", "fp", "test", "truth")])

        for record in pysam.tabix_iterator(options.stdin, pysam.asBed()):
            if record.contig not in reference_set:
                c.ignored_no_contig += 1
                continue

            c.test += 1
            matches = reference_set[record.contig].search(
                record.start, record.end)
            size = record.end - record.start
            bin = get_size_bin(size, options.size_bins)

            if len(matches) == 0:
                c.fp += 1
                status = "fp"
                if output_fp:
                    output_fp.write(str(record) + "\n")
            elif len(matches) >= 1:
                c.tp += 1
                status = "tp"
                if output_tp:
                    output_tp.write(str(record) + "\n")
                # todo: overlap criteria

                # record found
                for match in matches:
                    found.add((record.contig, match.start, match.end))

            name = record.name.split(",")[0]
            names.add(name)
            counts[bin]["test"][name] += 1
            counts[bin][status][name] += 1

        outf = options.stdout

        with IOTools.open_file(options.reference_bed_file) as inf:
            for record in pysam.tabix_iterator(inf, pysam.asBed()):
                c.truth += 1
                bin = get_size_bin(record.end - record.start,
                                   options.size_bins)
                counts[bin]["truth"]["all"] += 1

                key = (record.contig, record.start, record.end)
                if key not in found:
                    c.fn += 1
                    counts[bin]["fn"]["all"] += 1

        outf.write("\t".join(("category", "size", "test", "tp", "fp", "truth",
                              "fn")) + "\n")

        for name in sorted(names):
            for bin in range(len(options.size_bins) + 1):
                if bin == len(options.size_bins):
                    size_bin = ">={}".format(options.size_bins[-1])
                else:
                    size_bin = "<{}".format(options.size_bins[bin])
                outf.write("\t".join(
                    map(str, (
                        name,
                        size_bin,
                        counts[bin]["test"][name],
                        counts[bin]["tp"][name],
                        counts[bin]["fp"][name],
                        counts[bin]["truth"]["all"],
                        counts[bin]["fn"]["all"],
                    ))) + "\n")

    E.info(str(c))
    E.stop()
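
get_size_bin() is not shown in this example. A plausible implementation,
assuming it returns the index of the first bin whose upper bound is greater
than the variant size (and len(size_bins) for anything at or above the last
threshold), is a single bisect call:

import bisect

def get_size_bin(size, size_bins):
    """return the size-bin index for *size*; sizes >= the last threshold
    fall into the final, open-ended bin."""
    return bisect.bisect_right(size_bins, size)


# with size_bins=(1000, 10000, 100000): 500 -> bin 0 ("<1000"),
# 25000 -> bin 2 ("<100000"), 250000 -> bin 3 (">=100000")
assert [get_size_bin(s, (1000, 10000, 100000))
        for s in (500, 25000, 250000)] == [0, 2, 3]
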
Ejemplo n.º 28
0
def pslMap(options):
    """thread psl alignments using intervals.

    """

    if options.format == "gtf":
        use_copy = False
    else:
        use_copy = True

    c = E.Counter()

    min_length = options.min_aligned

    for match, qx, tx in iterator_psl_intervals(options):

        map_query2target = match.getMapQuery2Target()

        c.input += 1

        # if no filter on qx or tx, use full segment
        if qx is None:
            qx = [(match.mQueryFrom, match.mQueryTo, 0)]
        if tx is None:
            tx = [(match.mSbjctFrom, match.mSbjctTo, 0)]

        E.debug('matches in query: %s' % qx)
        E.debug('matches in target: %s' % tx)

        # if no overlap: return
        if not qx or not tx:
            c.skipped += 1
            E.debug("no matches in query or target - skipped")
            continue

        for query in qx:

            qstart, qend, qval = query

            # skip elements that are too small
            if qend - qstart < min_length:
                E.debug("query too small - skipped at %s:%i-%i" %
                        (match.mQueryId, qstart, qend))
                c.skipped_small_queries += 1
                continue

            E.debug("working on query %s:%i-%i" %
                    (match.mQueryId, qstart, qend))

            mqstart, mqend = (
                map_query2target.mapRowToCol(
                    qstart,
                    alignlib_lite.py_RIGHT),
                map_query2target.mapRowToCol(
                    qend,
                    alignlib_lite.py_LEFT))

            if match.strand == "-":
                qstart, qend = match.mQueryLength - \
                    qend, match.mQueryLength - qstart

            for target in tx:

                tstart, tend, tval = target
                if (tstart >= mqend or tend <= mqstart):
                    E.debug("no overlap: %i-%i (%i-%i) - %i-%i" % (
                        qstart, qend, mqstart, mqend, tstart, tend))
                    continue
                if tend - tstart < min_length:
                    E.debug("target length too short: %i-%i - %i-%i" % (
                        qstart, qend, tstart, tend))
                    continue

                new = alignlib_lite.py_makeAlignmentBlocks()

                if use_copy:
                    # do copy with range filter
                    if options.loglevel >= 3:

                        mtstart, mtend = map_query2target.mapColToRow(
                            tstart), map_query2target.mapColToRow(tend)

                        E.debug(
                            ("query: %i-%i (len=%i)-> %i-%i(len=%i); "
                             "target: %i-%i (len=%i)-> %i-%i (len=%i)") %
                            (qstart, qend,
                             qend - qstart,
                             mqstart, mqend,
                             mqend - mqstart,
                             tstart, tend,
                             tend - tstart,
                             mtstart, mtend,
                             mtend - mtstart))

                    alignlib_lite.py_copyAlignment(
                        new,
                        map_query2target,
                        qstart, qend,
                        tstart, tend)
                else:
                    # do copy with alignment filter
                    map_query = qval
                    if map_query:
                        tmp = alignlib_lite.py_makeAlignmentBlocks()
                        alignlib_lite.py_copyAlignment(
                            tmp, map_query2target, map_query,
                            alignlib_lite.py_RR)
                        if options.loglevel >= 5:
                            options.stdlog.write(
                                "######## mapping query ###########\n")
                            options.stdlog.write(
                                "# %s\n" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    map_query2target)))
                            options.stdlog.write(
                                "# %s\n" % str(
                                    alignlib_lite.py_AlignmentFormatEmissions(
                                        map_query)))
                            options.stdlog.write(
                                "# %s\n" % str(
                                    alignlib_lite.py_AlignmentFormatEmissions(
                                        tmp)))
                    else:
                        tmp = map_query2target

                    map_target = tval
                    if map_target:
                        new = alignlib_lite.py_makeAlignmentBlocks()
                        alignlib_lite.py_copyAlignment(
                            new, tmp, map_target, alignlib_lite.py_CR)
                        if options.loglevel >= 5:
                            options.stdlog.write(
                                "######## mapping target ###########\n")
                            options.stdlog.write(
                                "# before: %s\n" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    tmp)))
                            options.stdlog.write(
                                "# map   : %s\n" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    map_target)))
                            options.stdlog.write(
                                "# after : %s\n" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    new)))
                    else:
                        new = tmp

                if options.loglevel >= 4:
                    E.debug("putative match with intervals: %s and %s: %i-%i" %
                            (str(query), str(target), qstart, qend))
                    if options.loglevel >= 5:
                        E.debug(
                            "input : %s" % str(
                                alignlib_lite.py_AlignmentFormatEmissions(
                                    map_query2target)))
                        E.debug("final : %s" %
                                str(alignlib_lite.py_AlignmentFormatEmissions(
                                    new)))

                    if new.getLength() > 0:
                        n = match.copy()
                        n.fromMap(new, use_strand=True)
                        E.info("match : %s" % (str(n)))

                if new.getNumAligned() > options.min_aligned:
                    n = match.copy()
                    n.fromMap(new, use_strand=True)
                    options.stdout.write(str(n) + "\n")
                    c.output += 1
                else:
                    c.discarded += 1
                break
            else:
                c.nooverlap += 1

    E.info("map: %s" % str(c))
Ejemplo n.º 29
0
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "--input-filename-fasta", dest="input_filename_fasta", type="string",
        help="filename with reference sequence in fasta format [%default]")

    parser.add_option(
        "--input-filename-bam", dest="input_filename_bam", type="string",
        help="filename with aligned reads [%default]")

    parser.add_option(
        "--method", dest="methods", type="choice", action="append",
        choices=["add-strelka-genotype",
                 "lift-over"],
        help="methods to apply [%default]")

    parser.add_option(
        "--input-filename-chain", dest="input_filename_chain", type="string",
        help="filename with alignment chain for lift-over [%default]")

    parser.add_option(
        "--normal-sample-regex", dest="normal_sample_regex", type="string",
        help="regular expression to apply to header to identify normal "
        "sample id [%default]")

    parser.add_option(
        "--output-filename-unmapped", dest="output_filename_unmapped", type="string",
        help="filename with variants that could not be lifted over [%default]")

    parser.set_defaults(
        input_filename_fasta=None,
        input_filename_bam=None,
        input_filename_vcf="-",
        sample_size=0.001,
        region_size=20,
        methods=[],
        normal_sample_regex=None,
        input_filename_chain=None,
        output_filename_unmapped=None,
    )

    (options, args) = E.start(parser,
                              argv=argv,
                              add_output_options=True)

    if len(args) > 0:
        options.input_filename_vcf = args[0]

    vcf_in = pysam.VariantFile(options.input_filename_vcf)

    if "lift-over" in options.methods:
        if options.input_filename_chain is None:
            raise ValueError("--method=lift-over requires --input-filename-chain")
        if not os.path.exists(options.input_filename_chain):
            raise OSError("file {} with chain data does not exist".format(
                options.input_filename_chain))
        E.info("reading chain from {}".format(options.input_filename_chain))
        with IOTools.open_file(options.input_filename_chain) as inf:
            map_chain, map_contig2length = read_liftover_chain(inf)

    if options.input_filename_fasta:
        fasta = pysam.FastaFile(options.input_filename_fasta)
    else:
        fasta = None

    if options.input_filename_bam:
        bam = pysam.AlignmentFile(options.input_filename_bam)
    else:
        bam = None

    outf = options.stdout

    c = E.Counter()

    if "add-strelka-genotype" in options.methods:
        map_nt2gt = {"ref": "0/0",
                     "het": "0/1",
                     "hom": "1/1",
                     "conflict": "."}

        map_tumour2gt = {"ref": "0/0",
                         "het": "0/1",
                         "hom": "1/1"}

        header = str(vcf_in.header).splitlines()

        header.insert(
            len(header) - 1,
            '##FORMAT=<ID=GT,Number=1,Type=String,Description='
            '"Genotypes of reference and alternative alleles, '
            'added by CGATCore vcf2vcf.">')

        header = "\n".join(header)
        if options.normal_sample_regex:
            normal_sample = re.search(" -bam-file \S+/([^/]+)_S\d+.bam", header).groups()[0]
        else:
            normal_sample = "NORMAL"

        is_first = True

        for record in vcf_in:
            c.input += 1

            if "GT" in record.format:
                if is_first:
                    outf.write(header + "\n")
                    is_first = False
                outf.write(str(record))
                c.has_gt += 1
                continue

            gt_normal = map_nt2gt[record.info["NT"]]
            gt_tumour = record.info["SGT"]
            norm, tumour = gt_tumour.split("->")
            if gt_tumour[0] in "ACGT":
                alts = record.alts
                if alts is None:
                    c.no_alt += 1
                    continue

                if len(record.alts) > 1:
                    c.multi_allelic += 1
                    continue

                _map_tumour2gt = {
                    record.alts[0]: "1",
                    record.ref: "0"}
                try:
                    gt_tumour = "/".join(
                        sorted([_map_tumour2gt[x] for x in tumour]))
                except KeyError:
                    gt_tumour = "."
                    c.ambiguous_genotype += 1
            else:
                gt_tumour = map_tumour2gt[tumour]

            fields = str(record)[:-1].split("\t")
            # FORMAT
            fields[8] = ":".join(("GT", fields[8]))
            # SAMPLES
            # makes a few assumptions, fix!
            header_insert_normal = False
            if len(fields) == 11:
                fields[9] = ":".join((gt_normal, fields[9]))
                fields[10] = ":".join((gt_tumour, fields[10]))
            elif len(fields) == 10:
                header_insert_normal = True
                values = fields[9].split(":")
                fields.append(":".join((gt_tumour, fields[9])))
                fields[9] = ":".join([gt_normal] + ["."] * len(values))
            else:
                raise NotImplementedError()

            if is_first:
                if not header_insert_normal:
                    outf.write(header + "\n")
                else:
                    header = re.sub(r"\tFORMAT\t",
                                    "\tFORMAT\t%s\t" % normal_sample, header)
                    outf.write(header + "\n")
                is_first = False
            outf.write("\t".join(fields) + "\n")
            c.output += 1

    elif "lift-over" in options.methods:
        header = str(vcf_in.header).splitlines()

        if fasta:
            # validate contig size
            expected_lengths = dict(list(zip(fasta.references, fasta.lengths)))
        else:
            expected_lengths = map_contig2length

        # update contig names and sizes in VCF header
        header = [x for x in header if not x.startswith("##contig")]
        header[-1:-1] = ["##contig=<ID={},length={}>".format(
            contig, length) for contig, length in sorted(expected_lengths.items())]

        header.insert(
            len(header) - 1,
            '##liftover=<CHAIN={},REFERENCE={}>'.format(
                options.input_filename_chain,
                options.input_filename_fasta))
        outf.write("\n".join(header) + "\n")

        unmapped_contigs = set()
        unknown_contigs = set()

        trans_genotypes = str.maketrans("01", "10")

        if fasta:
            # validate contig size
            expected_lengths = dict(list(zip(fasta.references, fasta.lengths)))
            for contig, length in list(map_contig2length.items()):
                if contig in expected_lengths:
                    if length != expected_lengths[contig]:
                        raise ValueError(
                            "contig lengths mismatch. For contig {} the chain "
                            "file says {}, but the fasta file says {}".format(
                                contig, length, expected_lengths[contig]))
            E.info("contig sizes in chain file and fasta file correspond.")

        if options.output_filename_unmapped:
            outfile_unmapped = IOTools.open_file(options.output_filename_unmapped, "w")
            outfile_unmapped.write("\n".join(header) + "\n")
        else:
            outfile_unmapped = None

        for record in vcf_in:
            c.input += 1

            try:
                mm = map_chain[record.contig]
            except KeyError:
                c.skipped_unmapped_contig += 1
                unmapped_contigs.add(record.contig)
                if outfile_unmapped:
                    outfile_unmapped.write("skipped_unmapped_contig\t{}".format(str(record)))
                continue

            try:
                m = mm.search(record.start, record.stop)
            except AttributeError:
                c.skipped_mapping_error += 1
                if outfile_unmapped:
                    outfile_unmapped.write("skipped_mapping_error\t{}".format(str(record)))
                continue

            if len(m) == 0:
                c.skipped_unmapped_position += 1
                if outfile_unmapped:
                    outfile_unmapped.write("skipped_unmapped_position\t{}".format(str(record)))
                continue
            elif len(m) > 1:
                c.skipped_multimapping_position += 1
                if outfile_unmapped:
                    outfile_unmapped.write("skipped_multimapping_position\t{}".format(str(record)))
                continue

            m = m[0]
            y_contig, y_start, y_end, y_invert = m.data

            if y_invert:
                y_pos = y_end - (record.start - m.start)
            else:
                y_pos = (record.start - m.start) + y_start

            # ref_base stays None when no reference fasta was supplied
            ref_base = None
            if fasta:
                try:
                    ref_base = fasta.fetch(y_contig, y_pos, y_pos + len(record.ref)).upper()
                except KeyError:
                    c.skipped_unknown_contig += 1
                    unknown_contigs.add(y_contig)
                    continue

            swap_alleles = False
            if ref_base:
                error = False
                if ref_base == record.ref:
                    c.matches += 1
                else:
                    if len(record.alts) == 1:
                        alt_base = record.alts[0]
                        if ref_base == alt_base:
                            swap_alleles = True
                            c.allele_swap_variant += 1
                        else:
                            c.error_mismatch_variant += 1
                            error = "mismatch"
                    else:
                        error = "multi-mismatch"
                        c.error_multi_mismatch_variant += 1

                if error:
                    if outfile_unmapped:
                        outfile_unmapped.write("{}\t{}".format(error, str(record)))
                    c.skipped_error_variant += 1
                    continue

            fields = str(record)[:-1].split("\t")
            fields[0] = y_contig
            fields[1] = str(y_pos)

            if swap_alleles:
                # swap REF (column 3) and ALT (column 4) so that REF
                # matches the base in the new reference
                fields[3] = alt_base
                fields[4] = record.ref
                # update genotype fields
                keep = False
                for idx in range(9, len(fields)):
                    gt, rest = fields[idx].split(":", 1)
                    keep = keep or "0" in gt
                    fields[idx] = ":".join((gt.translate(trans_genotypes), rest))

                # remove reference-only calls
                if not keep:
                    if outfile_unmapped:
                        outfile_unmapped.write("reference_call\t{}".format(str(record)))
                    c.skipped_allele_swap_reference += 1
                    continue

            c.output += 1
            outf.write("\t".join(fields) + "\n")

        c.unmapped_contigs = len(unmapped_contigs)
        c.unknown_contigs = len(unknown_contigs)

        E.info(c.asTable())
        if unknown_contigs:
            E.info("unknown contigs: {}".format(",".join(sorted(unknown_contigs))))
        if unmapped_contigs:
            E.info("unmapped contigs: {}".format(",".join(sorted(unmapped_contigs))))

    E.stop()
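
The lift-over branch maps each variant position through a chain block using
the offset from the block start, counting backwards from the block end when
the block is inverted. The same arithmetic as a stand-alone sketch with
hypothetical coordinates:

def lift_position(x_pos, block_x_start, y_start, y_end, invert):
    """map a position from a source chain block onto the target block;
    *invert* means the block maps to the reverse strand of the target."""
    offset = x_pos - block_x_start
    if invert:
        return y_end - offset
    return y_start + offset


# a position 25 bp into a forward block that starts at target position 1000
assert lift_position(10025, 10000, 1000, 1500, invert=False) == 1025
# on an inverted block the same offset counts back from the block end
assert lift_position(10025, 10000, 1000, 1500, invert=True) == 1475
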
Ejemplo n.º 30
0
def findTATABox(infiles, outfile):
    '''find TATA box in promotors. There are several matrices to choose from:

    M00216 V$TATA_C Retroviral TATA box
    M00252 V$TATA_01 cellular and viral TATA box elements
    M00311 V$ATATA_B Avian C-type TATA box
    M00320 V$MTATA_B Muscle TATA box
    '''

    # 1. create fasta file - look for TATA box
    #
    bedfile, genomefile = infiles

    statement = '''
    slopBed -i %(bedfile)s
            -l %(tata_search_upstream)i
            -r %(tata_search_downstream)i
            -s
            -g %(genomefile)s
    | cgat bed2fasta 
       --use-strand
       --genome=%(genome_dir)s/%(genome)s
       --log=%(outfile)s.log
    > %(outfile)s.fasta
    '''

    P.run()

    match_executable = '/ifs/data/biobase/transfac/match/bin/match_linux64'
    match_matrix = '/ifs/data/biobase/transfac/dat/matrix.dat'
    # write a custom profile next to the output file instead of using
    # the stock minFP_good.prf
    match_profile = outfile + ".prf"

    prf = '''tata.prf
prf to minimize sum of both errors - derived from minSUM.prf
 MIN_LENGTH 300
0.0
 1.000 0.716 0.780 M00216 V$TATA_C
 1.000 0.738 0.856 M00252 V$TATA_01
 1.000 0.717 0.934 M00311 V$ATATA_B
 1.000 0.711 0.784 M00320 V$MTATA_B
//
'''

    with IOTools.openFile(match_profile, "w") as outf:
        outf.write(prf)

    # -u : uniq - only one best match per sequence
    statement = '''
         %(match_executable)s
         %(match_matrix)s
         %(outfile)s.fasta
         %(outfile)s.match
         %(match_profile)s
         -u
    >> %(outfile)s.log
    '''
    P.run()

    transcript2pos = {}
    for entry in FastaIterator.iterate(IOTools.openFile(outfile + ".fasta")):
        transcript_id, contig, start, end, strand = re.match(
            "(\S+)\s+(\S+):(\d+)..(\d+)\s+\((\S)\)", entry.title).groups()
        transcript2pos[transcript_id] = (contig, int(start), int(end), strand)

    MATCH = collections.namedtuple(
        "MATCH",
        "pid transfac_id pos strand core_similarity matrix_similarity sequence"
    )

    def _grouper(infile):
        r = []
        keep = False
        for line in infile:
            if line.startswith("Inspecting sequence ID"):
                keep = True
                if r:
                    yield pid, r
                r = []
                pid = re.match("Inspecting sequence ID\s+(\S+)",
                               line).groups()[0]
                continue
            elif line.startswith(" Total"):
                break

            if not keep:
                continue
            if line[:-1].strip() == "":
                continue
            transfac_id, v, core_similarity, matrix_similarity, sequence = [
                x.strip() for x in line[:-1].split("|")
            ]
            pos, strand = re.match("(\d+) \((\S)\)", v).groups()
            r.append(
                MATCH._make((pid, transfac_id, int(pos), strand,
                             float(core_similarity), float(matrix_similarity),
                             sequence)))

        yield pid, r

    offset = PARAMS["tata_search_upstream"]

    outf = IOTools.openFile(outfile + ".table.gz", "w")
    outf.write("\t".join(("transcript_id", "strand", "start", "end",
                          "relative_start", "relative_end", "transfac_id",
                          "core_similarity", "matrix_similarity",
                          "sequence")) + "\n")

    bedf = IOTools.openFile(outfile, "w")

    c = E.Counter()
    found = set()
    for transcript_id, matches in _grouper(IOTools.openFile(outfile +
                                                            ".match")):
        contig, seq_start, seq_end, strand = transcript2pos[transcript_id]
        c.promotor_with_matches += 1
        nmatches = 0
        found.add(transcript_id)
        for match in matches:

            c.matches_total += 1
            lmatch = len(match.sequence)
            if match.strand == "-":
                c.matches_wrong_strand += 1
                continue

            # get genomic location of match
            if strand == "+":
                genome_start = seq_start + match.pos
            else:
                genome_start = seq_end - match.pos - lmatch

            genome_end = genome_start + lmatch

            # get relative location of match
            if strand == "+":
                tss_start = seq_start + offset
                relative_start = genome_start - tss_start
            else:
                tss_start = seq_end - offset
                relative_start = tss_start - genome_end

            relative_end = relative_start + lmatch

            outf.write("\t".join(
                map(str, (transcript_id, strand, genome_start, genome_end,
                          relative_start, relative_end, match.transfac_id,
                          match.core_similarity, match.matrix_similarity,
                          match.sequence))) + "\n")
            c.matches_output += 1
            nmatches += 1

            bedf.write("\t".join(
                map(str, (contig, genome_start, genome_end, transcript_id,
                          strand, match.matrix_similarity))) + "\n")

        if nmatches == 0:
            c.promotor_filtered += 1
        else:
            c.promotor_output += 1

    c.promotor_total = len(transcript2pos)
    c.promotor_without_matches = len(
        set(transcript2pos.keys()).difference(found))

    outf.close()
    bedf.close()

    with IOTools.openFile(outfile + ".summary", "w") as outf:
        outf.write("category\tcounts\n")
        outf.write(c.asTable() + "\n")

    E.info(c)
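
The coordinate bookkeeping at the end of findTATABox converts a match
position within the extracted promotor sequence into genomic and TSS-relative
coordinates, with the arithmetic depending on the strand. A stand-alone
sketch of that conversion with hypothetical numbers:

def match_to_coords(seq_start, seq_end, strand, match_pos, match_len, offset):
    """convert a motif match position inside an extracted promotor
    window into genomic and TSS-relative coordinates, following the
    strand logic used above."""
    if strand == "+":
        genome_start = seq_start + match_pos
    else:
        genome_start = seq_end - match_pos - match_len
    genome_end = genome_start + match_len

    if strand == "+":
        tss = seq_start + offset
        relative_start = genome_start - tss
    else:
        tss = seq_end - offset
        relative_start = tss - genome_end
    return genome_start, genome_end, relative_start, relative_start + match_len


# promotor window 5000-5600 on the minus strand, searched with a 500 bp
# upstream offset; a 6 bp match at position 80 within the window
assert match_to_coords(5000, 5600, "-", 80, 6, 500) == (5514, 5520, -420, -414)
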