def summariseGTFs(outfile):
    job_memory = str(PARAMS["Merge_memory"]) + "G"
    statement = "python {}scripts/annotationSummaryTable.py --gtf-dir combined_annotations.dir --annot-output {} --orf-output {}".format(
        os.path.dirname(__file__).rstrip("pipelines"), outfile,
        "report.dir/orf_summary.tsv")
    P.run(statement)
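Note that the examples in this listing rely on cgatcore's P.run() reading resource hints such as job_memory and job_threads from the caller's local variables and interpolating %(name)s placeholders from the local namespace and the pipeline parameter dictionary. A minimal, hypothetical task sketching that pattern (the command and values below are invented for illustration):

from cgatcore import pipeline as P

def compressOutput(infile, outfile):
    # resource hints are picked up from the caller's locals by P.run()
    job_memory = "4G"
    job_threads = 2
    # %(infile)s and %(outfile)s are interpolated from locals;
    # additional keys may come from the pipeline's PARAMS
    statement = "gzip -c %(infile)s > %(outfile)s"
    P.run(statement)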
Example #2
def run_report(clean=True,
               with_pipeline_status=True,
               pipeline_status_format="svg"):
    '''run cgatreport.

    This will also run ruffus to create an svg image of the pipeline
    status unless *with_pipeline_status* is set to False. The image
    will be saved into the export directory.

    '''

    params = P.get_params()

    if with_pipeline_status:
        targetdir = params["exportdir"]
        if not os.path.exists(targetdir):
            os.mkdir(targetdir)

        ruffus.pipeline_printout_graph(
            os.path.join(targetdir, "pipeline.%s" % pipeline_status_format),
            pipeline_status_format, ["full"],
            checksum_level=params["ruffus_checksums_level"])

    dirname, basename = os.path.split(P.get_caller().__file__)

    report_engine = params.get("report_engine", "cgatreport")
    assert report_engine in ('sphinxreport', 'cgatreport')

    docdir = os.path.join(dirname, "pipeline_docs",
                          iotools.snip(basename, ".py"))
    themedir = os.path.join(dirname, "pipeline_docs", "themes")
    relpath = os.path.relpath(docdir)
    trackerdir = os.path.join(docdir, "trackers")

    # use a fake X display in order to avoid windows popping up
    # from R plots.
    xvfb_command = iotools.which("xvfb-run")

    # permit multiple servers using -d option
    if xvfb_command:
        xvfb_command += " -d "
    else:
        xvfb_command = ""

    # if there is no DISPLAY variable set, xvfb runs, but
    # exits with error when killing process. Thus, ignore return
    # value.
    # print os.getenv("DISPLAY"), "command=", xvfb_command
    if not os.getenv("DISPLAY"):
        erase_return = "|| true"
    else:
        erase_return = ""

    if os.path.exists("conf.py"):
        conf_dir = os.path.abspath(".")
    else:
        conf_dir = os.path.join(os.path.dirname(__file__), "configuration")

    # in the current version, xvfb always returns with an error, thus
    # ignore these.
    erase_return = "|| true"

    if clean:
        clean = "rm -rf report _cache _static;"
    else:
        clean = ""

    # with sphinx >1.3.1 the PYTHONPATH needs to be set explicitly, as
    # the virtual environment seems to be stripped. It is therefore set
    # to the contents of the current sys.path
    syspath = ":".join(sys.path)

    statement = '''
    %(clean)s
    (export SPHINX_DOCSDIR=%(docdir)s;
    export SPHINX_THEMEDIR=%(themedir)s;
    export PYTHONPATH=%(syspath)s;
    %(xvfb_command)s
    %(report_engine)s-build
    --num-jobs=%(report_threads)s
    sphinx-build
    -b html
    -d %(report_doctrees)s
    -c %(conf_dir)s
    -j %(report_threads)s
    %(docdir)s %(report_html)s
    >& report.log %(erase_return)s )
    '''

    P.run(statement)

    E.info(
        'the report is available at %s' %
        os.path.abspath(os.path.join(params['report_html'], "contents.html")))
Example #3
File: go.py Project: tw7649116/cgat-flow
def runGOFromFiles(outfile,
                   outdir,
                   fg_file,
                   bg_file=None,
                   go_file=None,
                   ontology_file=None,
                   samples=None,
                   minimum_counts=0,
                   pairs=False,
                   gene2name=None):
    """check for GO enrichment.

    The gene lists are supplied by files.
    This method is a wrapper for `runGO.py`.

    Arguments
    ---------
    outfile : string
        Output filename
    outdir : string
        Output directory for auxiliary files
    fg_file : string
        Gene list of foreground.
    bg_file : string
        Gene list for background. If None, all genes
        with GO annotations are used as background.
    go_file : string
        Filename with Gene-to-GO assignments
    ontology_file : string
        Filename with ontology information.
    samples : int
        Number of samples for empirical FDR. If not given, use
        BH FDR.
    minimum_counts : int
        Minimum number of observations required in a GO category
        for it to be used.
    pairs : bool
       If True, each category for each pair of gene sets will
       be tested for differential enrichment.
    gene2name : string
        Filename with a gene-to-genename information.
    """

    if ontology_file is None:
        ontology_file = PARAMS.get("go_ontology", None)

    options = []
    if ontology_file:
        options.append("--filename-ontology=%(ontology_file)s" % locals())

    if bg_file is not None:
        options.append("--background-tsv-file=%(bg_file)s" % locals())

    if samples is not None:
        options.append("--fdr")
        options.append("--sample-size=%(samples)i" % locals())
        options.append("--fdr-method=empirical")
    else:
        options.append("--fdr")
        options.append("--fdr-method=BH")

    if pairs:
        options.append("--pairwise")

    if gene2name:
        options.append("--gene2name-map-tsv-file=%s" % gene2name)

    options = " ".join(options)
    statement = '''
    cgat runGO
    --filename-input=%(go_file)s
    --genes-tsv-file=%(fg_file)s
    --output-filename-pattern='%(outdir)s/%%(set)s.%%(go)s.%%(section)s'
    --min-counts=%(minimum_counts)i
    --log=%(outfile)s.log
    %(options)s
    > %(outfile)s'''

    P.run(statement)
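A minimal, hypothetical invocation of the wrapper above (all file names are invented for illustration):

runGOFromFiles(outfile="go.dir/foreground_vs_background.go.tsv",
               outdir="go.dir",
               fg_file="foreground_genes.tsv",
               bg_file="background_genes.tsv",
               go_file="gene2go.tsv.gz",
               samples=None,        # None selects BH FDR rather than empirical FDR
               minimum_counts=5)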
Example #4
def cellrangerCount(infile, outfile):
    '''
    Execute the Cell Ranger count pipeline for all samples.
    '''
    # set key parameters
    transcriptome = PARAMS["cellranger_transcriptome"]

    if transcriptome is None:
        raise ValueError('"cellranger_transcriptome" parameter not set'
                         ' in file "pipeline.yml"')

    if not os.path.exists(transcriptome):
        raise ValueError('The specified "cellranger_transcriptome"'
                         ' file does not exist')

    # set the maximum number of jobs for cellranger
    max_jobs = PARAMS["cellranger_maxjobs"]

    # parse the sample name and expected cell number
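    # the infile basename is expected to follow '<library_id>.<cellnumber>.<batch>.<suffix>'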
    library_id, cellnumber, batch, trash = os.path.basename(infile).split(".")

    # build lists of the sample files
    seq_folders = []
    sample_ids = []

    # Parse the list of sequencing runs (i.e., paths) for the sample
    with open(infile, "r") as sample_list:
        for line in sample_list:
            seq_folder_path = line.strip()
            if seq_folder_path != "":
                seq_folders.append(seq_folder_path)
                sample_ids.append(os.path.basename(seq_folder_path))

    input_fastqs = ",".join(seq_folders)
    input_samples = ",".join(sample_ids)

    id_tag = library_id + "-count"

    log_file = id_tag + ".log"

    ## send a single job script to the slurm queue, which orchestrates the cellranger run
    ## resources are hard-coded to ensure enough are available
    job_threads = 6
    job_memory = "24000M"
    statement = (
        '''cellranger count
                   --id %(id_tag)s
                   --fastqs %(input_fastqs)s
                   --sample %(input_samples)s
                   --transcriptome %(transcriptome)s
                   --expect-cells %(cellnumber)s
                   --chemistry %(cellranger_chemistry)s
                   --jobmode=slurm
                   --maxjobs=%(max_jobs)s
                   --nopreflight
            &> %(log_file)s
        ''')

    P.run(statement)

    IOTools.touch_file(outfile)
Example #5
def subsetAndDownsample(infiles, outfile):
    '''
    Generate datasets that include subsets of the 10x samples.

    Optionally downsample UMI counts to normalise between samples.
    '''

    outdir = os.path.dirname(outfile)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    agg_matrix_dir = os.path.join(os.path.dirname(infiles[0]),
                                  "agg.processed.dir")

    sample_table = pd.read_csv(infiles[1], sep="\t")

    subsets = [k.split("_", 1)[1] for k in PARAMS.keys()
               if k.startswith("datasets_")]

    # Titles of fields encoded in filenames
    name_field_titles = PARAMS["name_field_titles"]

    if PARAMS["downsampling_enabled"]:
        downsampling_function = PARAMS['downsampling_function']
    else:
        downsampling_function = "no"

    downsampling_apply = PARAMS["downsampling_apply"]

    job_memory = PARAMS["postprocess_memory"]

    statements = []

    for subset in subsets:

        if subset == "all":
            if not PARAMS["datasets_all"]:
                continue

            sample_ids = set(sample_table["sample_id"].values)
            sample_ids_str = ",".join(sample_ids)

        else:

            sample_ids = PARAMS["datasets" + "_" + subset]

            sample_ids_str = ",".join([x.strip() for x in
                                       sample_ids.split(",")])

        out_dir = os.path.join(os.path.dirname(outfile),
                               subset)

        tenx_dir = PARAMS["tenx_dir"]

        log_file = outfile.replace(".sentinel",
                                   "." + subset + ".log")

        statement = '''Rscript %(tenx_dir)s/R/cellranger_subsetAndDownsample.R
                       --tenxdir=%(agg_matrix_dir)s
                       --sampleids=%(sample_ids_str)s
                       --downsample=%(downsampling_function)s
                       --apply=%(downsampling_apply)s
                       --samplenamefields=%(name_field_titles)s
                       --outdir=%(out_dir)s
                       &> %(log_file)s
                    ''' % locals()

        statements.append(statement)

    P.run(statements)

    IOTools.touch_file(outfile)
Example #6
def build_report():
    scriptloc = "/".join(os.path.dirname(
        sys.argv[0]).split("/")[0:-1]) + "/scripts/assembly_report.Rmd"
    statement = 'R -e "rmarkdown::render(\'{}\',output_file=\'{}/report.dir/assembly_report.html\')" --args {}/contigs.dir/Contigs.Summary'.format(
        scriptloc, os.getcwd(), os.getcwd())
    P.run(statement)
Example #7
    def run(self, infile, outfile, params):

        options = []
        reference_fasta = params.reference_fasta
        reference_fasta_map = build_reference_fasta_map(
            params.reference_fasta_map)
        reference_label = None
        use_target_regions = True
        if params.reference_fasta:
            map_path2name = dict([(x[1], x[0])
                                  for x in list(reference_fasta_map.items())])
            if params.reference_fasta == "auto":

                fasta = resolve_argument(list(reference_fasta_map.values()),
                                         ",").split(",")

                reference_fasta, diffs = get_reference_for_bam(
                    infile, fastafiles=fasta)

                if reference_fasta:
                    options.append("--ref-seq {}".format(reference_fasta))
                    reference_label = map_path2name[reference_fasta]
                elif diffs:
                    E.warn(
                        "attempted to detect reference fasta, but unable to do so. "
                        "diffs: {}".format(diffs))
                else:
                    E.warn("sequence dict is empty, BAM likely to be empty. "
                           "target_regions will be ignored")
                    use_target_regions = False
            else:
                options.append("--ref-seq {}".format(params.reference_fasta))
                reference_label = map_path2name.get(params.reference_fasta,
                                                    None)

        if params.target_regions and use_target_regions:
            target_regions = get_associated_file(params, reference_label,
                                                 "target_regions")
            # convert to 1-based coordinates and decompress
            if target_regions.endswith(".bed.gz"):
                target_regions = (
                    "<(zcat {} "
                    "| awk '{{printf(\"%%s\\t%%i\\t%%i\\n\", $1, $2+1, $3)}}')"
                    .format(target_regions))
            options.append("--target-regions {}".format(target_regions))

        options = " ".join(options)
        if not os.path.exists(outfile + ".tmp"):
            try:
                retval = P.run("{params.path} stats "
                               "{self.options} "
                               "{options} "
                               "{infile} "
                               "2> {outfile}.log "
                               "> {outfile}.tmp; ".format(**locals()),
                               job_memory="16G")
            except OSError as e:
                E.warn("input file {} gave the following errors: {}".format(
                    infile, str(e)))
                return None
        else:
            retval = None

        def split_output(lines):
            is_comment = True
            section, body = None, []
            for line in lines:
                if line.startswith("#"):
                    if body:
                        yield section, body
                    body = []
                    is_comment = True
                else:
                    # strip any trailing inline comment; the newline is kept
                    # here and removed below
                    line = re.sub("\t#.*", "", line)
                    fields = line[:-1].split("\t")
                    section = fields[0]
                    body.append(fields[1:])
                    is_comment = False

            if body:
                yield section, body

        # split into separate files for upload
        with IOTools.open_file(outfile + ".tmp") as inf:
            for section, body in split_output(inf):
                try:
                    tablename, columns = self._map_section_to_table[section]
                except KeyError:
                    continue

                output_file = self.map_table_to_file(tablename, outfile)
                with IOTools.open_file(output_file, "w") as outf:

                    if len(columns) > 1 and columns[1].startswith("VAR_"):
                        outf.write("{}\t{}\n".format(columns[0],
                                                     columns[1][4:]))
                        for data in body:
                            outf.write("{}\t{}\n".format(
                                data[0], ",".join(data)))
                    else:
                        outf.write("\t".join(columns) + "\n")
                        # remove first column, which contains the identifier
                        outf.write("\n".join(["\t".join(x)
                                              for x in body]) + "\n")

        os.rename(outfile + ".tmp", outfile)

        return retval
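The nested split_output generator above groups samtools stats output by the section code in the first column; lines beginning with "#" act as separators. A small, invented illustration of the lines it consumes and what it yields:

# hypothetical samtools-stats-style lines as consumed by split_output above
example_stats_lines = [
    "# This file was produced by samtools stats\n",
    "SN\traw total sequences:\t1000\n",
    "SN\treads mapped:\t950\n",
    "# First fragment qualities\n",
    "FFQ\t1\t20\t30\n",
]
# split_output(example_stats_lines) would yield
#   ("SN", [["raw total sequences:", "1000"], ["reads mapped:", "950"]])
# followed by
#   ("FFQ", [["1", "20", "30"]])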
Example #8
def mergeTables(outfile):
    statement = PipelineHumann2.humann2Merge(outfile, PARAMS)
    P.run(statement)
Example #9
def normTables(infile, outfile):
    if re.search("coverage", infile):
        statement = "ln -s ../{} {}".format(infile, outfile)
    else:
        statement = PipelineHumann2.humann2Norm(infile, outfile, PARAMS)
    P.run(statement)
Example #10
def flagstat(infile, outfile):
    statement = '''samtools flagstat %(infile)s > %(outfile)s'''
    P.run(statement)
Example #11
def runHumann2(infile, outfile):
    job_threads = int(PARAMS["Humann2_threads"])
    job_memory = PARAMS["Humann2_memory"] + "G"
    statement = PipelineHumann2.humann2Call(infile, PARAMS)
    P.run(statement)
Example #12
def idxstats(infile, outfile):
    statement = '''samtools idxstats %(infile)s > %(outfile)s'''
    P.run(statement)
Example #13
def multiqc(infiles, outfile):
    statement = ''' export LC_ALL=en_US.UTF-8 && export LANG=en_US.UTF-8 
    && multiqc . -f -n %(outfile)s '''
    P.run(statement)
Example #14
def fastqc(infile, outfile):
    statement = '''fastqc %(infile)s > %(outfile)s.log'''
    P.run(statement)
Example #15
def mapReads(infiles, outfile):
    '''Map reads to the genome using BWA '''
    job_threads = PARAMS["bwa_threads"]
    m = PipelineMapping.BWA()
    statement = m.build((infiles, ), outfile)
    P.run(statement)
Example #16
def picard_rmdup(infile, outfile):
    final_memory = str(int(params['picard_memory']) + 2) + 'g'
    statement = """picard -Xmx%(picard_memory)sg MarkDuplicates REMOVE_DUPLICATES = true
                    I= %(infile)s O=%(outfile)s M= %(outfile)s.metrics
                    && samtools index %(outfile)s"""
    P.run(statement, job_queue=params['q'], job_memory=final_memory)
Example #17
def summariseContigs(infile, outfile):
    #summarise each contigs file
    statement = PipelineAssembly.SummariseContigs(infile, outfile)
    P.run(statement)
Example #18
def filter_read(infile, outfile):
    statement = """samtools view -h -b -f1 -f2 -F 4 -F 0x100%(infile)s chr{1..19} > %(outfile)s
    && samtools index %(outfile)s"""
    P.run(statement, job_queue=params['q'])
Example #19
def indexBams(infile, outfile):
    '''index merged bams'''

    statement = f'''samtools index -b {infile} {outfile}'''

    P.run(statement)
Example #20
def macs2(infiles, outfile):
    treatment, control = infiles
    name = outfile.replace('_peaks.narrowPeak', "").replace('peaks/', '')
    statement = """macs2 callpeak -t %(treatment)s -c %(control)s
                   -n %(name)s -f BAM -g %(macs2_genome)s --outdir peaks"""
    P.run(statement, job_queue=params["q"])
Example #21
    def run(self, infile, outfile, params):
        # TODO: bam_fastqc_sequence_length_distribution.tsv may
        # contain ranges such as '30-31'. Convert to beginning of
        # range like in this perl command:
        #
        # perl -p -i -e "s/\-\d+//"
        # *.dir/bam_fastqc.dir/bam_fastqc.tsv.bam_fastqc_sequence_length_distribution.tsv

        if infile.endswith(".gz"):
            prefix = IOTools.snip(os.path.basename(infile[:-3]))
        else:
            prefix = IOTools.snip(os.path.basename(infile))

        outdir = os.path.dirname(outfile)

        datafile = os.path.join(outdir, "{}_fastqc".format(prefix),
                                "fastqc_data.txt")

        if not os.path.exists(datafile):
            if not os.path.exists(outdir):
                os.makedirs(outdir)

            retval = P.run(
                "{params.path} "
                "{params.options} "
                "--extract "
                "--outdir {outdir} "
                "{infile} "
                ">& {outfile} ".format(**locals()), **params._asdict())
        else:
            IOTools.touch_file(outfile)
            retval = None

        def _split_output(lines):
            body, header, section, status = [], None, None, None
            for line in lines:
                if line.startswith("##FastQC"):
                    continue
                elif line.startswith("#"):
                    header, body = line[1:-1].split("\t"), []
                elif line.startswith(">>END_MODULE"):
                    yield section, header, body, status
                    body, header, section, status = [], None, None, None
                elif line.startswith(">>"):
                    section, status = line[2:-1].split("\t")
                else:
                    fields = line[:-1].split("\t")
                    body.append(fields)

        # split into separate files for upload
        summary_data = []
        with IOTools.open_file(datafile) as inf:
            for section, header, body, status in _split_output(inf):
                if len(body) == 0:
                    continue
                summary_data.append((section, status))
                tablename = "{}_".format(self.name) + re.sub(
                    " ", "_", section).lower()
                if tablename not in self.tablenames:
                    raise ValueError(
                        "unknown tablename {}, expected one of {}".format(
                            tablename, self.tablenames))
                output_file = ".".join((outfile, tablename, "tsv"))
                with open(output_file, "w") as outf:
                    outf.write("\t".join([x.lower() for x in header]) + "\n")
                    # remove first column, which contains the identifier
                    outf.write("\n".join(["\t".join(x) for x in body]) + "\n")

        output_file = ".".join(
            (outfile, "{}_summary".format(self.name), "tsv"))
        with IOTools.open_file(output_file, "w") as outf:
            outf.write("section\tstatus\n")
            for section, status in summary_data:
                outf.write("{}\t{}\n".format(section, status))

        return retval
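For orientation, the nested _split_output generator above consumes the module layout of a FastQC fastqc_data.txt file: ">>Section\tstatus" opens a module, "#"-prefixed lines carry the column header, and ">>END_MODULE" closes it. A schematic fragment (values invented for illustration), written as a Python string:

# hypothetical fragment of a fastqc_data.txt file as parsed by _split_output
example_fastqc_data = (
    "##FastQC\t0.11.9\n"
    ">>Basic Statistics\tpass\n"            # section name and status
    "#Measure\tValue\n"                     # header line
    "Filename\tsample1.fastq.gz\n"
    "Total Sequences\t1000\n"
    ">>END_MODULE\n"                        # terminates the module
    ">>Per base sequence quality\tfail\n"
    "#Base\tMean\tMedian\n"
    "1\t30.1\t31.0\n"
    ">>END_MODULE\n"
)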
Example #22
def fastqc(infile, outfile):
    statement = "fastqc --nogroup -o fastqc %(infile)s > %(outfile)s.log"
    P.run(statement, job_queue=params["q"])
Example #23
def postprocessAggrMatrix(infiles, outfile):
    '''
    Post-process the cellranger aggr count matrix.

    Batch, sample_name and aggregation ID metadata are added.

    Optionally cells with barcodes shared (within sequencing batch)
    between samples can be removed (known index hopping on Illumina 4000).
    '''

    outdir = os.path.dirname(outfile)
    if not os.path.exists(outdir):
        os.mkdir(outdir)

    infile = infiles[0]

    sample_table = infiles[1]

    agg_dir = os.path.dirname(infile)
    out_dir = os.path.dirname(outfile)

    # Clean barcode hopping
    if PARAMS["postprocess_barcodes"]:
        hopping = "--hopping"
    else:
        hopping = ""

    # Additional options
    options = PARAMS["postprocess_options"]

    mexdir = PARAMS["postprocess_mexdir"]

    if mexdir is None:
        raise ValueError('"postprocess_mexdir" parameter not set'
                         ' in file "pipeline.yml"')

    tenxdir = os.path.join(agg_dir, mexdir)
    if not os.path.exists(tenxdir):
        raise ValueError('The specified "postprocess_mexdir"'
                         ' directory does not exist in directory ' + agg_dir)

    job_memory = PARAMS["postprocess_memory"]

    blacklist = PARAMS["postprocess_blacklist"]

    log_file = outfile.replace(".sentinel", ".log")

    statement = '''Rscript %(tenx_dir)s/R/cellranger_postprocessAggrMatrix.R
                   --tenxdir=%(tenxdir)s
                   --sampletable=%(sample_table)s
                   --samplenamefields=%(name_field_titles)s
                   --downsample=no
                   %(hopping)s
                   --blacklist=%(blacklist)s
                   %(options)s
                   --outdir=%(out_dir)s
                   &> %(log_file)s
                '''

    P.run(statement)

    IOTools.touch_file(outfile)
Example #24
def multiqc(infiles, outfile):
    statement = """export LC_ALL=en_US.UTF-8; export LANG=en_US.UTF-8;
    multiqc fastqc/ -f -n %(outfile)s"""
    P.run(statement, job_queue=params["q"])
Example #25
def combine_means(infiles, outfile):
    infiles = " ".join(infiles)
    statement = ("cat %(infiles)s " "> %(outfile)s ".format(**locals()))
    P.run(statement)
Example #26
def picard(infile, outfile):
    final_memory = str(int(params['picard_memory']) + 2) + 'g'
    statement = """picard -Xmx%(picard_memory)sg CollectAlignmentSummaryMetrics 
                    R=%(picard_genome)s I= %(infile)s O=%(outfile)s"""
    P.run(statement, job_queue=params['q'], job_memory=final_memory)
Example #27
    def processReads(infile, outfiles):
        '''process reads from .fastq and other sequence files.
        '''
        trimmomatic_options = P.get_params()["trimmomatic_options"]
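        # the blocks below assemble Trimmomatic's adapter-clipping option in the form
        # ILLUMINACLIP:<adapter fasta>:<seed mismatches>:<palindrome clip threshold>:
        # <simple clip threshold>:<min adapter length>:<keep both reads>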

        if P.get_params()["auto_remove"]:
            trimmomatic_options = " ILLUMINACLIP:%s:%s:%s:%s:%s:%s " % (
                "contaminants.fasta",
                P.get_params()["trimmomatic_mismatches"],
                P.get_params()["trimmomatic_p_thresh"],
                P.get_params()["trimmomatic_c_thresh"],
                P.get_params()["trimmomatic_min_adapter_len"],
                P.get_params()["trimmomatic_keep_both_reads"]) + trimmomatic_options

        elif P.get_params()["trimmomatic_adapter"]:
            trimmomatic_options = " ILLUMINACLIP:%s:%s:%s:%s:%s:%s " % (
                P.get_params()["trimmomatic_adapter"],
                P.get_params()["trimmomatic_mismatches"],
                P.get_params()["trimmomatic_p_thresh"],
                P.get_params()["trimmomatic_c_thresh"],
                P.get_params()["trimmomatic_min_adapter_len"],
                P.get_params()["trimmomatic_keep_both_reads"]) + trimmomatic_options

        job_threads = P.get_params()["threads"]
        job_memory = "12G"

        track = re.match(REGEX_TRACK, infile).groups()[0]

        m = preprocess.MasterProcessor(
            save=P.get_params()["save"],
            summarize=P.get_params()["summarize"],
            threads=P.get_params()["threads"],
            qual_format=P.get_params()['qual_format'])

        for tool in P.as_list(P.get_params()["preprocessors"]):

            if tool == "fastx_trimmer":
                m.add(preprocess.FastxTrimmer(
                    P.get_params()["fastx_trimmer_options"],
                    threads=P.get_params()["threads"]))
            elif tool == "trimmomatic":
                m.add(preprocess.Trimmomatic(
                    trimmomatic_options,
                    threads=P.get_params()["threads"]))
            elif tool == "sickle":
                m.add(preprocess.Sickle(
                    P.get_params()["sickle_options"],
                    threads=P.get_params()["threads"]))
            elif tool == "trimgalore":
                m.add(preprocess.Trimgalore(
                    P.get_params()["trimgalore_options"],
                    threads=P.get_params()["threads"]))
            elif tool == "flash":
                m.add(preprocess.Flash(
                    P.get_params()["flash_options"],
                    threads=P.get_params()["threads"]))
            elif tool == "reversecomplement":
                m.add(preprocess.ReverseComplement(
                    P.get_params()["reversecomplement_options"]))
            elif tool == "pandaseq":
                m.add(preprocess.Pandaseq(
                    P.get_params()["pandaseq_options"],
                    threads=P.get_params()["threads"]))
            elif tool == "cutadapt":
                cutadapt_options = P.get_params()["cutadapt_options"]
                if P.get_params()["auto_remove"]:
                    cutadapt_options += " -a file:contaminants.fasta "
                m.add(preprocess.Cutadapt(
                    cutadapt_options,
                    threads=P.get_params()["threads"],
                    untrimmed=P.get_params()['cutadapt_reroute_untrimmed'],
                    process_paired=P.get_params()["cutadapt_process_paired"]))
            else:
                raise NotImplementedError("tool '%s' not implemented" % tool)

        statement = m.build((infile,), "processed.dir/trimmed-", track)
        P.run(statement)
Example #28
def flagstats(infile, outfile):
    statement = """samtools flagstats %(infile)s > %(outfile)s"""
    P.run(statement, job_queue=params['q'])
Example #29
File: go.py Project: tw7649116/cgat-flow
def createGOFromGeneOntology(infile, outfile):
    """get GO assignments from Geneontology.org

    GO terms are mapped to ensembl gene names via uniprot identifiers.

    Configuration
    -------------
    geneontology_file
       Filename on geneontology database, e.g.,
       gene_association.goa_human.gz
    database_name
       Pipeline database name

    Arguments
    ---------
    infile : string
        Unused
    outfile : string
        Output filename
    """

    filename = os.path.join(os.path.dirname(outfile), "geneontology.goa.gz")
    if not os.path.exists(filename):
        statement = '''
        wget -O %(filename)s http://cvsweb.geneontology.org/cgi-bin/cvsweb.cgi/go/gene-associations/%(go_geneontology_file)s?rev=HEAD
    '''

        P.run(statement)

    # see http://www.geneontology.org/gene-associations/readme/goa.README
    Data = collections.namedtuple(
        "Data",
        "db db_object_id db_object_symbol qualifier goid dbreference evidence "
        " with_id aspect "
        " db_object_name synonym db_object_type "
        " taxon_id date assigned_by "
        " annotation_extension"
        " gene_product_form_id")

    dbh = sqlite3.connect(PARAMS["database_name"])
    cc = dbh.cursor()
    map_uniprot2ensembl = dict(
        cc.execute("SELECT DISTINCT gene_name, gene_id FROM transcript_info").
        fetchall())
    map_goid2description = dict(
        cc.execute("SELECT DISTINCT go_id, description FROM go_assignments").
        fetchall())

    aspect2name = {
        "P": "biol_process",
        "F": "mol_function",
        "C": "cell_location"
    }

    c = E.Counter()
    found_uniprot, found_genes, notfound_uniprot = set(), set(), set()
    outf = iotools.open_file(outfile, "w")
    outf.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")
    for line in iotools.open_file(filename):
        if line.startswith("!"):
            continue
        c.input += 1
        data = Data._make(line[:-1].split("\t"))

        if data.db_object_symbol in map_uniprot2ensembl:
            gene_id = map_uniprot2ensembl[data.db_object_symbol]
            found_uniprot.add(data.db_object_symbol)
            found_genes.add(gene_id)
            outf.write(
                "%s\t%s\t%s\t%s\t%s\n" %
                (aspect2name[data.aspect], gene_id, data.goid,
                 map_goid2description.get(data.goid, ""), data.evidence))
            c.output += 1

        else:
            c.notfound += 1
            notfound_uniprot.add(data.db_object_symbol)

    c.found_genes = len(found_genes)
    c.found_uniprot = len(found_uniprot)
    c.notfound_uniprot = len(notfound_uniprot)

    E.info("%s" % str(c))
    E.info("not found=%s" % str(notfound_uniprot))
    outf.close()
Example #30
def meganAnnot(infile, outfile):
    job_memory = str(PARAMS["Blast2lca_memory"]) + "G"
    job_threads = int(PARAMS["Blast2lca_threads"])
    #generate call to blast2lca
    statement = PipelineAnnotate.runBlast2Lca(infile, outfile, PARAMS)
    P.run(statement)