Python as_listの例、cgatcore.pipeline.as_list Pythonの例

コード例 #1

0

ファイルを表示

def checkFileExistence(infile, outfile):
    '''Run the "file" metric for the track named by ``infile``.

    The track name is ``infile`` without its ``.log`` suffix. The
    suffixes to inspect are read from the ``<track>_regex_exist``
    option (empty string when unset) and handed to
    ``compute_file_metrics``.

    Files are uncompressed before checking existence.
    '''
    track = P.snip(infile, ".log")
    configured = PARAMS.get('%s_regex_exist' % track, "")
    # as_list is deliberately applied twice, exactly as before.
    suffixes = P.as_list(P.as_list(configured))
    compute_file_metrics(infile, outfile, metric="file", suffixes=suffixes)

コード例 #2

0

ファイルを表示

def buildCheckSums(infile, outfile):
    '''Run the "md5sum" metric for the track named by ``infile``.

    The track name is ``infile`` without its ``.log`` suffix. The
    suffixes to checksum are read from the ``<track>_regex_md5`` option
    (empty string when unset) and handed to ``compute_file_metrics``.

    Files are uncompressed before computing the checksum
    as gzip stores meta information such as the time stamp.
    '''
    track = P.snip(infile, ".log")
    configured = PARAMS.get('%s_regex_md5' % track, "")
    # as_list is deliberately applied twice, exactly as before.
    suffixes = P.as_list(P.as_list(configured))
    compute_file_metrics(infile, outfile, metric="md5sum", suffixes=suffixes)

コード例 #3

0

ファイルを表示

def buildLineCounts(infile, outfile):
    '''Run the "wc -l" metric for the track named by ``infile``.

    The track name is ``infile`` without its ``.log`` suffix. The
    suffixes to count are read from the ``<track>_regex_linecount``
    option (empty string when unset) and handed to
    ``compute_file_metrics``.

    Files are uncompressed before computing the number of lines.
    '''
    track = P.snip(infile, ".log")
    configured = PARAMS.get('%s_regex_linecount' % track, "")
    # as_list is deliberately applied twice, exactly as before.
    suffixes = P.as_list(P.as_list(configured))
    compute_file_metrics(infile, outfile, metric="wc -l", suffixes=suffixes)

コード例 #4

0

ファイルを表示

ファイル: motifs.py プロジェクト: kevinrue/cgat-flow

def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.

    Runs ``tomtom`` on ``infile`` against the databases listed in the
    ``tomtom_databases`` option, writing into a temporary directory.
    That directory is then moved to ``<exportdir>/tomtom/<outfile>``
    (replacing any previous copy) and its ``tomtom.txt`` is copied to
    ``outfile``.

    If ``infile`` is empty, a warning is issued, ``outfile`` is touched
    and nothing else happens.
    '''
    if iotools.is_empty(infile):
        E.warn("input is empty - no computation performed")
        iotools.touch_file(outfile)
        return

    # The temporary directory is requested only after the empty-input
    # check, so nothing is left behind when there is no work to do.
    tmpdir = P.get_temp_dir(".")
    databases = " ".join(P.as_list(P.get_params()["tomtom_databases"]))

    target_path = os.path.join(os.path.abspath(P.get_params()["exportdir"]),
                               "tomtom", outfile)

    # NOTE(review): the %(name)s placeholders are presumably filled in by
    # P.run from this function's local variables and the pipeline
    # parameters - keep the local names tmpdir/databases unchanged.
    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''

    P.run(statement)

    # copy over results; an already existing parent directory is fine,
    # any other failure to create it is reported instead of swallowed
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)

コード例 #5

0

ファイルを表示

ファイル: pipeline_motifs.py プロジェクト: tw7649116/cgat-flow

def exportMotifDiscoverySequences(infile, outfile):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    Only the sequences with the highest S/N ratio are handed to motif
    discovery:

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences*. If there are not enough sequences
          to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.

    When no sequence is written, a warning is issued and ``outfile``
    is touched.
    '''
    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    # locals() is forwarded as-is, so only infile, outfile, track and
    # dbhandle may be defined at this point.
    p = P.substitute_parameters(**locals())

    selection = dict(
        full=False,
        masker=P.as_list(p['motifs_masker']),
        halfwidth=int(p["motifs_halfwidth"]),
        maxsize=int(p["motifs_max_size"]),
        proportion=p["motifs_proportion"],
        min_sequences=p["motifs_min_sequences"],
        num_sequences=p["motifs_num_sequences"],
        order=p['motifs_score'])
    nseq = motifs.writeSequencesForIntervals(
        track, outfile, dbhandle, **selection)

    if nseq != 0:
        return

    E.warn("%s: no sequences - meme skipped" % outfile)
    iotools.touch_file(outfile)

コード例 #6

0

ファイルを表示

def run_test(infile, outfile):
    '''run a test.

    The track is ``outfile`` without its ``.log`` suffix. The pipeline
    to run is taken from the ``<track>_pipeline`` option and defaults to
    the track name without its leading ``test_``. The make targets come
    from ``<track>_target`` and default to ``full``.

    Multiple targets are run iteratively: one statement is built per
    target and the whole list is handed to a single ``P.run`` call.
    ``infile`` is not used inside this function.
    '''

    track = P.snip(outfile, ".log")
    pipeline_name = PARAMS.get("%s_pipeline" % track, track[len("test_"):])

    pipeline_targets = P.as_list(PARAMS.get("%s_target" % track, "full"))

    # do not run on cluster, mirror
    # that a pipeline is started from
    # the head node
    #to_cluster = False

    # The single %s is filled with the make target below; the doubled
    # %%(name)s placeholders survive that step as %(name)s.
    # NOTE(review): those are presumably resolved by P.run from local
    # variables (track, pipeline_name, outfile) and pipeline parameters.
    template_statement = ("cd %%(track)s.dir; "
                          "xvfb-run -d cgatflow %%(pipeline_name)s "
                          "%%(pipeline_options)s "
                          "%%(workflow_options)s make %s "
                          "-L ../%%(outfile)s "
                          "-S ../%%(outfile)s.stdout "
                          "-E ../%%(outfile)s.stderr")

    if len(pipeline_targets) == 1:
        statement = template_statement % pipeline_targets[0]
    else:
        # Previously the list was built under another name and the
        # unbound name ``statement`` was passed to P.run, which raised
        # UnboundLocalError whenever more than one target was configured.
        statement = [template_statement % pipeline_target
                     for pipeline_target in pipeline_targets]

    P.run(statement, ignore_errors=True, job_memory="unlimited")

コード例 #7

0

ファイルを表示

def get_repeat_gff(outfile):
    """Download the UCSC repetitive RNA types into ``outfile``.

    The repeat classes come from the ``ucsc_rnatypes`` option and the
    contig filter from ``ucsc_remove_contigs``.
    """
    fetch = ModuleTrna.getRepeatDataFromUCSC
    ucsc = connectToUCSC()
    rna_classes = P.as_list(PARAMS["ucsc_rnatypes"])
    contig_filter = PARAMS["ucsc_remove_contigs"]
    fetch(dbhandle=ucsc,
          repclasses=rna_classes,
          outfile=outfile,
          remove_contigs_regex=contig_filter,
          job_memory="3G")

コード例 #8

0

ファイルを表示

def importRepeatsFromUCSC(outfile):
    """Download the UCSC repeat types named in the configuration.

    The repeat classes come from the ``ucsc_repeattypes`` option and
    the result is written to ``outfile``.
    """
    fetch = gtfsubset.getRepeatDataFromUCSC
    ucsc = connectToUCSC()
    repeat_classes = P.as_list(PARAMS["ucsc_repeattypes"])
    memory = PARAMS["job_memory"]
    fetch(dbhandle=ucsc,
          repclasses=repeat_classes,
          outfile=outfile,
          job_memory=memory)

コード例 #9

0

ファイルを表示

def importRNAAnnotationFromUCSC(outfile):
    """Download the UCSC repetitive RNA types into ``outfile``.

    The repeat classes come from the ``ucsc_rnatypes`` option and the
    contig filter from ``ncbi_remove_contigs``.
    """
    fetch = gtfsubset.getRepeatDataFromUCSC
    ucsc = connectToUCSC()
    rna_classes = P.as_list(PARAMS["ucsc_rnatypes"])
    contig_filter = PARAMS["ncbi_remove_contigs"]
    memory = PARAMS["job_memory"]
    fetch(dbhandle=ucsc,
          repclasses=rna_classes,
          outfile=outfile,
          remove_contigs_regex=contig_filter,
          job_memory=memory)

コード例 #10

0

ファイルを表示

ファイル: motifs.py プロジェクト: kevinrue/cgat-flow

def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    To raise the signal/noise ratio MEME does not see every interval:
    only the top 10% of intervals (peakval) are used, and of each only
    the segment of 200 bp around the peak rather than the complete
    interval.

    * Softmasked sequence is converted to hardmasked
      sequence to avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    If no sequence is written, a warning is issued and ``outfile`` is
    touched instead of running MEME.

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    # job_options = "-l mem_free=8000M"
    params = P.get_params()

    target_path = os.path.join(
        os.path.abspath(params["exportdir"]), "meme", outfile)

    # Opened as before; the handle is not used further in this function.
    fasta = IndexedFasta.IndexedFasta(
        os.path.join(params["genome_dir"], params["genome"]))

    tmpdir = P.get_temp_dir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    selection = dict(
        full=False,
        masker=P.as_list(params['motifs_masker']),
        halfwidth=int(params["meme_halfwidth"]),
        maxsize=int(params["meme_max_size"]),
        proportion=params["meme_proportion"],
        min_sequences=params["meme_min_sequences"])
    nseq = writeSequencesForIntervals(track, tmpfasta, dbhandle, **selection)

    if nseq != 0:
        # NOTE(review): the %(name)s placeholders are presumably filled
        # in by P.run from local variables (tmpfasta, tmpdir, outfile)
        # and the pipeline parameters - keep those local names.
        statement = '''
        meme %(tmpfasta)s -dna -revcomp -mod %(meme_model)s -nmotifs %(meme_nmotifs)s -oc %(tmpdir)s -maxsize %(meme_max_size)s %(meme_options)s > %(outfile)s.log
        '''
        P.run(statement)

        collectMEMEResults(tmpdir, target_path, outfile)
    else:
        E.warn("%s: no sequences - meme skipped" % outfile)
        iotools.touch_file(outfile)

コード例 #11

0

ファイルを表示

def exportIntervalSequences(infile, outfile, track, method):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    All settings are read from options prefixed with ``method``
    (``<method>_halfwidth``, ``<method>_max_size``, ``<method>_masker``,
    ``<method>_num_sequences``, ``<method>_proportion``,
    ``<method>_min_sequences``, ``<method>_score``).

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *proportion* intervals sorted by peakval
    2. Only a region +/- *halfwidth* around the peak
    3. At least *min_sequences*. If there are not enough sequences
          to start with, all will be used.
    4. At most *max_size* sequences will be output.

    A halfwidth that cannot be converted to an integer switches to
    ``full=True`` with no halfwidth; a max size that cannot be converted
    means no size limit is passed on. When no sequence is written, a
    warning is issued and ``outfile`` is touched.
    '''
    dbhandle = connect()

    try:
        halfwidth = int(PARAMS[method+"_halfwidth"])
    except ValueError:
        halfwidth = None
    # int() never returns None, so None here means the conversion failed.
    full = halfwidth is None

    try:
        maxsize = int(PARAMS[method+"_max_size"])
    except ValueError:
        maxsize = None

    selection = dict(
        full=full,
        masker=P.as_list(PARAMS[method+'_masker']),
        halfwidth=halfwidth,
        maxsize=maxsize,
        num_sequences=PARAMS[method+"_num_sequences"],
        proportion=PARAMS[method+"_proportion"],
        min_sequences=PARAMS[method+"_min_sequences"],
        order=PARAMS[method+'_score'])
    nseq = PipelineMotifs.writeSequencesForIntervals(
        track, outfile, dbhandle, **selection)

    if nseq != 0:
        return

    E.warn("%s: no sequences - %s skipped" % (outfile, method))
    P.touch_file(outfile)

コード例 #12

0

ファイルを表示

ファイル: pipeline_readqc.py プロジェクト: tw7649116/cgat-flow

    def processReads(infile, outfiles):
        '''process reads from .fastq and other sequence files.

        Builds a chain of preprocessing steps, one per entry of the
        ``preprocessors`` option and in that order, then runs the
        statement assembled by the master processor.

        Raises NotImplementedError for an unknown preprocessor name.
        '''
        params = P.get_params()
        trimmomatic_options = params["trimmomatic_options"]

        # Adapter source for ILLUMINACLIP: the contaminants file when
        # auto_remove is set, otherwise the configured adapter (if any).
        if params["auto_remove"]:
            clip_source = "contaminants.fasta"
        elif params["trimmomatic_adapter"]:
            clip_source = params["trimmomatic_adapter"]
        else:
            clip_source = None

        if clip_source is not None:
            clip_fields = (
                clip_source,
                params["trimmomatic_mismatches"],
                params["trimmomatic_p_thresh"],
                params["trimmomatic_c_thresh"],
                params["trimmomatic_min_adapter_len"],
                params["trimmomatic_keep_both_reads"])
            trimmomatic_options = (
                " ILLUMINACLIP:%s:%s:%s:%s:%s:%s " % clip_fields
                + trimmomatic_options)

        # NOTE(review): job_threads / job_memory are not referenced again
        # in this function; presumably P.run picks them up by name.
        job_threads = params["threads"]
        job_memory = "12G"

        track = re.match(REGEX_TRACK, infile).groups()[0]

        m = preprocess.MasterProcessor(
            save=params["save"],
            summarize=params["summarize"],
            threads=params["threads"],
            qual_format=params['qual_format'])

        for tool in P.as_list(params["preprocessors"]):
            if tool == "fastx_trimmer":
                step = preprocess.FastxTrimmer(
                    params["fastx_trimmer_options"],
                    threads=params["threads"])
            elif tool == "trimmomatic":
                step = preprocess.Trimmomatic(
                    trimmomatic_options,
                    threads=params["threads"])
            elif tool == "sickle":
                step = preprocess.Sickle(
                    params["sickle_options"],
                    threads=params["threads"])
            elif tool == "trimgalore":
                step = preprocess.Trimgalore(
                    params["trimgalore_options"],
                    threads=params["threads"])
            elif tool == "flash":
                step = preprocess.Flash(
                    params["flash_options"],
                    threads=params["threads"])
            elif tool == "reversecomplement":
                step = preprocess.ReverseComplement(
                    params["reversecomplement_options"])
            elif tool == "pandaseq":
                step = preprocess.Pandaseq(
                    params["pandaseq_options"],
                    threads=params["threads"])
            elif tool == "cutadapt":
                cutadapt_options = params["cutadapt_options"]
                if params["auto_remove"]:
                    cutadapt_options += " -a file:contaminants.fasta "
                step = preprocess.Cutadapt(
                    cutadapt_options,
                    threads=params["threads"],
                    untrimmed=params['cutadapt_reroute_untrimmed'],
                    process_paired=params["cutadapt_process_paired"])
            else:
                raise NotImplementedError("tool '%s' not implemented" % tool)
            m.add(step)

        statement = m.build((infile,), "processed.dir/trimmed-", track)
        P.run(statement)

コード例 #13

0

ファイルを表示

ファイル: pipeline_assembly.py プロジェクト: Vikash84/MetaSequencing

def checkFile(infile, outfile):
    """Write a summary of the sequencing data in ``infile`` to ``outfile``.

    The summary is a tab-separated key/value table with the rows
    ``name``, ``format``, ``compressed``, ``paired`` and ``interleaved``,
    taken from the matching attributes of
    ``PipelineAssembly.SequencingData(infile)``.
    """
    seqdat = PipelineAssembly.SequencingData(infile)
    # The context manager closes the file even if the write fails.
    with open(outfile, 'w') as outf:
        outf.write(
            "name\t{}\nformat\t{}\ncompressed\t{}\npaired\t{}\ninterleaved\t{}\n".
            format(seqdat.filename, seqdat.fileformat, seqdat.compressed,
                   seqdat.paired, seqdat.interleaved))


##################################################
#Run Selected Assemblers
##################################################

# Assemblers to run on the data, read from the "Assembler_assemblers"
# option; an unset option is treated as the empty string.
ASSEMBLERS = P.as_list(PARAMS.get("Assembler_assemblers", ""))


###################################################
# Run Megahit
###################################################
@active_if("megahit" in ASSEMBLERS)
@follows(checkFile)
@follows(mkdir("megahit_out.dir"))
@transform(SEQUENCEFILES, SEQUENCEFILES_REGEX,
           r"megahit_out.dir/\1_complete.log")
def runMegahit(infile, outfile):
    job_memory = str(PARAMS["Megahit_clus_memory"]) + "G"
    job_threads = int(PARAMS["Megahit_clus_threads"])
    seqdat = PipelineAssembly.SequencingData(infile)
    assembler = PipelineAssembly.Megahit(seqdat, "megahit_out.dir", PARAMS)

コード例 #14

0

ファイルを表示

ファイル: pipeline_enumerate.py プロジェクト: Vikash84/MetaSequencing

import subprocess

###################################################
###################################################
###################################################
# Pipeline configuration
###################################################
# load options from the config file
import cgatcore.pipeline as P
# Candidate configuration files, in the order given: one inside a
# directory named after this script (its path with ".py" stripped), one
# in the parent directory and one in the current directory.
P.get_parameters([
    "%s/pipeline.yml" % __file__[:-len(".py")], "../pipeline.yml",
    "pipeline.yml"
])
PARAMS = P.PARAMS

# NOTE(review): no default is passed to PARAMS.get for either option, so
# an unset option reaches P.as_list as None - confirm that is handled.
FEATURES = P.as_list(PARAMS.get("General_feature_list"))
FEATUREPAIRS = P.as_list(PARAMS.get("General_feature_pairs"))
# Each configured pair "a:b" is renamed to "a_BY_b"; only the first two
# colon-separated fields are used.
FEATUREPAIRS = [
    "{}_BY_{}".format(x.split(":")[0],
                      x.split(":")[1]) for x in FEATUREPAIRS
]
# Single features followed by the renamed feature pairs.
ALLFEATURES = FEATURES + FEATUREPAIRS

from pipeline_assembly import PipelineAssembly
from pipeline_enumerate import PipelineEnumerate
from pipeline_filter import PipelineFilter

#get all files within the directory to process
SEQUENCEFILES = ("*.fasta", "*.fasta.gz", "*.fasta.1.gz", "*.fasta.1", "*.fna",
                 "*.fna.gz", "*.fna.1.gz", "*.fna.1", "*.fa", "*.fa.gz",
                 "*.fa.1.gz", "*.fa.1", "*.fastq", "*.fastq.gz",

コード例 #15

0

ファイルを表示

    ["%s/pipeline.yml" % os.path.splitext(__file__)[0],
     "../pipeline.yml",
     "pipeline.yml"])

dbname = PARAMS['db_name']
unmapped = enrichment.getUnmapped(PARAMS)
# File name endings appended to each entry of ``unmapped`` below.
outfilesuffixes = ["_genestoterms.tsv",
                   "_termstogenes.tsv",
                   "_termstodetails.tsv",
                   "_termstoont.tsv"]

# One inner list per entry of ``unmapped``, holding
# "annotations.dir/<entry><suffix>" for every suffix above.
unmappedouts = [["annotations.dir/%s%s" % (u, s)
                 for s in outfilesuffixes]
                for u in unmapped]

# Tissues from the "hpa_tissue" option (empty list when unset), turned
# into background file paths; spaces in a tissue name become underscores.
hpatissues = P.as_list(PARAMS.get('hpa_tissue', []))
hpatissues = ['clean_backgrounds.dir/%s_hpa_background.tsv'
              % tissue.replace(" ", "_") for tissue in hpatissues]

########################################################
# Set up database connection
########################################################


def connect():
    '''utility function to connect to database.

    Use this method to connect to the pipeline database.
    Additional databases can be attached here as well.

    Returns an sqlite3 database handle.

コード例 #16

0

ファイルを表示

           regex(".*/(.*).bed.gz"),
           r"motifs/\1.control.fasta")
def exportMotifControlSequences(infile, outfile):
    '''for each interval, export the left and right
    sequence segment of the same size.
    '''
    PipelineMotifs.exportSequencesFromBedFile(
        infile, outfile,
        masker=PARAMS['motifs_masker'],
        mode="leftright")


############################################################
############################################################
############################################################
@active_if(any(method in P.as_list(PARAMS["methods"])
               for method in ("meme", "disc_meme")))
@transform(loadIntervals, suffix("_intervals.load"), ".meme.fasta")
def exportMemeIntervalSequences(infile, outfile):
    '''Export the interval sequences of one track for the "meme" method.

    The track is the base name of ``infile`` without its
    ``_intervals.load`` suffix. The task is active when either "meme"
    or "disc_meme" is listed in the ``methods`` option.
    '''
    track_name = os.path.basename(P.snip(infile, "_intervals.load"))
    exportIntervalSequences(infile, outfile, track_name, "meme")


############################################################
@follows(mkdir("meme.dir"))
@active_if("meme" in P.as_list(PARAMS["methods"]))
@transform(exportMemeIntervalSequences, regex("(.+).meme.fasta"),

コード例 #17

0

ファイルを表示

ファイル: pipeline_motifs.py プロジェクト: tw7649116/cgat-flow

def _placeholder_values(section, name):
    '''Yield the values of ``%`` entries in config ``section`` matching ``name``.

    Every ``%`` in an entry's key stands for a run of non-whitespace
    characters; the resulting pattern is searched for in ``name``
    ignoring case. Keys without ``%`` are skipped.
    '''
    for pattern, value in P.CONFIG.items(section):
        if "%" not in pattern:
            continue
        # Plain string replacement: re.sub() would parse "\S" in the
        # replacement template and reject it ("bad escape") on
        # Python >= 3.7.
        regex = pattern.replace("%", r"\S+")
        if re.search(regex, name, re.IGNORECASE):
            yield value


def getAssociatedBAMFiles(track):
    '''return a list of BAM files associated with a track.

    By default, this method searches for ``track.bam`` file in the
    current directory and returns an offset of 0.

    Associations can be defined in the .yml file in the section
    [bams]. For example, the following snippet associates track
    track1 with the bamfiles :file:`track1.bam` and :file:`track2.bam`::

       [bams]
       track1=track1.bam,track2.bam

    Glob expressions are permitted.

    Offsets are used to shift tags in ChIP experiments. Offsets
    need to be defined in the [offsets] sections. If no offsets
    are defined, the method returns a list of 0 offsets.

    Offsets need to be defined in the same order as the bam files::

       [offsets]
       track1=120,200

    returns a list of BAM files and offsets.

    Default tracks and offsets can be specified using a placeholder ``%``. The
    following will associate all tracks with the same bam file::

        [bams]
        %=all.bam

    Raises ValueError if the number of BAM files differs from the
    number of offsets.
    '''
    fn = track.asFile()
    bamfiles = glob.glob("%s.bam" % fn)

    # The configuration is consulted only when no <track>.bam exists.
    if not bamfiles:
        bams_key = "bams_%s" % fn.lower()
        if bams_key in PARAMS:
            for ff in P.as_list(PARAMS[bams_key]):
                bamfiles.extend(glob.glob(ff))
        else:
            for value in _placeholder_values("bams", fn):
                bamfiles.extend(glob.glob(value))

    offsets = []
    offsets_key = "offsets_%s" % fn.lower()
    if offsets_key in PARAMS:
        offsets = [int(x) for x in P.as_list(PARAMS[offsets_key])]
    else:
        for value in _placeholder_values("offsets", fn):
            offsets.extend(int(x) for x in value.split(","))

    if not offsets:
        offsets = [0] * len(bamfiles)

    if len(bamfiles) != len(offsets):
        raise ValueError("number of BAM files %s is not the "
                         "same as number of offsets: %s" %
                         (str(bamfiles), str(offsets)))

    return bamfiles, offsets

コード例 #18

0

ファイルを表示

# Configuration is looked up next to this script (in a directory named
# after it), in the parent directory and in the current directory.
PARAMS = P.get_parameters([
    "%s/pipeline.yml" % os.path.splitext(__file__)[0],
    "../pipeline.yml",
    "pipeline.yml",
])

# WARNING: pipeline names with underscores in their name are not allowed
# (only the field right after "test_" is kept as the test name).
TESTS = sorted({
    "test_{}".format(x.split("_")[1])
    for x in PARAMS.keys()
    if x.startswith("test_")
})


# obtain prerequisite generic data
@files([(None, "%s.tgz" % x)
        for x in P.as_list(PARAMS.get("prerequisites", ""))])
def setupPrerequisites(infile, outfile):
    '''setup pre-requisites.

    These are tar-balls that are unpacked, but not run.
    '''

    #to_cluster = False
    track = P.snip(outfile, ".tgz")

    # obtain data - should overwrite pipeline.yml file
    statement = '''
    wget --no-check-certificate -O %(track)s.tgz %(data_url)s/%(track)s.tgz'''
    P.run(statement)

    tf = tarfile.open(outfile)

コード例 #19

0

ファイルを表示

def _compile_track_regex(track, kind):
    '''Return the compiled ``<track>_regex_<kind>`` option, or None if unset.

    Several configured patterns are joined into one alternation.
    '''
    patterns = PARAMS.get('%s_regex_%s' % (track, kind), None)
    if not patterns:
        return None
    return re.compile("|".join(P.as_list(patterns)))


def _read_stats_table(filename):
    '''Read a tab-separated table indexed by its first column.

    The file handle is closed as soon as the table has been parsed.
    '''
    with iotools.open_file(filename) as inf:
        return pandas.read_csv(inf, sep="\t", index_col=0)


def compareCheckSums(infiles, outfile):
    '''compare checksum files against existing reference data.

    For every ``<track>.stats`` file in ``infiles`` the table is compared
    with ``<track>.ref`` and one summary row is written to ``outfile``.
    Files present in both tables are classified in this order:

    1. files matching ``<track>_regex_exist`` only need to exist,
    2. remaining files matching ``<track>_regex_linecount`` are compared
       by their ``nlines`` column,
    3. remaining files matching ``<track>_regex_md5`` are compared by
       their ``md5`` column.

    A track is "OK" when all of its ``<track>*.log`` files are complete
    and nothing is missing, extra or different; otherwise "FAIL".

    Raises ValueError if a track has no reference file.
    '''
    header = (
        "track", "status", "job_finished", "nfiles", "nref", "missing",
        "extra", "different", "different_md5", "different_lines", "same",
        "same_md5", "same_lines", "same_exist", "files_missing",
        "files_extra", "files_different_md5", "files_different_lines")

    # The context manager closes the summary even when a missing
    # reference file aborts the comparison part-way through.
    with iotools.open_file(outfile, "w") as outf:
        outf.write("\t".join(header) + "\n")

        for infile in infiles:
            E.info("working on {}".format(infile))
            track = P.snip(infile, ".stats")

            job_finished = True
            for logfile in glob.glob(track + "*.log"):
                is_complete = iotools.is_complete(logfile)
                E.debug("logcheck: {} = {}".format(logfile, is_complete))
                job_finished = job_finished and is_complete

            reffile = track + ".ref"

            # regular expression of files to test only for existence
            regex_exist = _compile_track_regex(track, "exist")
            regex_linecount = _compile_track_regex(track, "linecount")
            regex_md5 = _compile_track_regex(track, "md5")

            if not os.path.exists(reffile):
                raise ValueError('no reference data defined for %s' % track)

            cmp_data = _read_stats_table(infile)
            ref_data = _read_stats_table(reffile)

            shared_files = set(cmp_data.index).intersection(ref_data.index)
            missing = set(ref_data.index).difference(cmp_data.index)
            extra = set(cmp_data.index).difference(ref_data.index)

            # files still to be classified
            different = set(shared_files)

            # remove those for which only check for existence
            if regex_exist:
                same_exist = {x for x in different if regex_exist.search(x)}
                different = {x for x in different
                             if not regex_exist.search(x)}
            else:
                same_exist = set()

            # select those for which only check for number of lines
            if regex_linecount:
                check_lines = [x for x in different
                               if regex_linecount.search(x)]

                dd = (cmp_data['nlines'][check_lines] !=
                      ref_data['nlines'][check_lines])
                different_lines = set(dd.index[dd])
                different = different.difference(check_lines)

                # "==" is evaluated separately rather than negating "!="
                # so that the original comparison semantics are kept.
                dd = (cmp_data['nlines'][check_lines] ==
                      ref_data['nlines'][check_lines])
                same_lines = set(dd.index[dd])
            else:
                different_lines = set()
                same_lines = set()

            # remainder - check md5
            if regex_md5:
                check_md5 = [x for x in different if regex_md5.search(x)]

                dd = (cmp_data['md5'][check_md5] !=
                      ref_data['md5'][check_md5])
                different_md5 = set(dd.index[dd])

                dd = (cmp_data['md5'][check_md5] ==
                      ref_data['md5'][check_md5])
                same_md5 = set(dd.index[dd])
            else:
                different_md5 = set()
                same_md5 = set()

            n_problems = (len(missing) + len(extra) + len(different_md5) +
                          len(different_lines))
            if job_finished and n_problems == 0:
                status = "OK"
            else:
                status = "FAIL"

            # File lists are sorted so that the report is reproducible
            # between runs (set iteration order is not).
            row = (
                track,
                status,
                job_finished,
                len(cmp_data),
                len(ref_data),
                len(missing),
                len(extra),
                len(different_md5) + len(different_lines),
                len(different_md5),
                len(different_lines),
                len(same_md5) + len(same_lines) + len(same_exist),
                len(same_md5),
                len(same_lines),
                len(same_exist),
                ",".join(sorted(missing)),
                ",".join(sorted(extra)),
                ",".join(sorted(different_md5)),
                ",".join(sorted(different_lines)),
            )
            outf.write("\t".join(map(str, row)) + "\n")

コード例 #20

0

ファイルを表示

                    (entry.gene_id, transcript2gene_dict[entry.transcript_id]))
        else:
            transcript2gene_dict[entry.transcript_id] = entry.gene_id

    with iotools.open_file(outfile, "w") as outf:
        outf.write("transcript_id\tgene_id\n")
        for key, value in sorted(transcript2gene_dict.items()):
            outf.write("%s\t%s\n" % (key, value))


###################################################
# count-based quantifiers
###################################################


@active_if("featurecounts" in P.as_list(PARAMS["quantifiers"]))
@follows(mkdir("featurecounts.dir"))
@transform(["%s.bam" % x.asFile() for x in BAM_TRACKS], regex("(\S+).bam"),
           add_inputs(PARAMS['geneset']), [
               r"featurecounts.dir/\1/transcripts.tsv.gz",
               r"featurecounts.dir/\1/genes.tsv.gz"
           ])
def runFeatureCounts(infiles, outfiles):
    '''
    Counts reads falling into "features" - in each transcript and
    each gene.

    A read is counted as overlapping with a feature if at least one bp
    overlaps.

    Pairs and strandedness can be used to resolve reads falling into