Example #1
def DumpGOFromDatabase(outfile, dbhandle, options):
    """read go assignments from database.

    and dump them into a flatfile.
    (one to many mapping of genes to GO categories)
    and a dictionary of go-term to go information
    """

    E.info("category\ttotal\tgenes\tcategories")

    all_genes = collections.defaultdict(int)
    all_categories = collections.defaultdict(int)
    all_ntotal = 0

    outfile.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")

    for go_type in options.ontology:

        genes = collections.defaultdict(int)
        categories = collections.defaultdict(int)
        ntotal = 0
        statement = GetGOStatement(go_type, options.database_name,
                                   options.species)

        results = Database.executewait(dbhandle, statement,
                                       retries=0).fetchall()

        for result in results:
            outfile.write("\t".join(map(str, (go_type, ) + result)) + "\n")
            gene_id, goid, description, evidence = result
            genes[gene_id] += 1
            categories[goid] += 1
            ntotal += 1
            all_genes[gene_id] += 1
            all_categories[goid] += 1
            all_ntotal += 1

        E.info("%s\t%i\t%i\t%i" %
               (go_type, ntotal, len(genes), len(categories)))

    E.info("%s\t%i\t%i\t%i" %
           ("all", all_ntotal, len(all_genes), len(all_categories)))

    return
Example #2
def splitFiles(infile, nchunks, out_dir):
    '''
    Give files names based on splitting into an arbitrary number of chunks
    '''

    df = pd.read_table(infile, sep="\t", header=0, index_col=0)
    total = len(df.index.tolist())

    # split into aribitrary number of chunks, or arbitrary chunk size?
    # small n bad for large input size, large n bad for small input size
    # set min/max chunk size, e.g. 100 genes minimum, 500 maximum?

    if total // nchunks < 100:
        step = 100
        E.warn("too few genes in each chunk, resetting to 100 genes per chunk")
    elif total // nchunks > 500:
        step = 500
        E.warn("too many genes per chunk, resetting to 500 genes per chunk")
    else:
        step = total // nchunks
        E.info("chunking input file into chunks of %i genes" % step)

    # note: rstrip() removes a set of characters, not a suffix, so strip
    # the "-expression.tsv" suffix explicitly
    file_pattern = infile.split("/")[1]
    if file_pattern.endswith("-expression.tsv"):
        file_pattern = file_pattern[:-len("-expression.tsv")]
    idx = 0
    for i in range(step, total, step):
        start = "%s" % idx
        end = "%s" % i
        file_name = "%s/%s-%s_%s-split.tsv" % (out_dir,
                                               file_pattern,
                                               start,
                                               end)
        with open(file_name, "w") as file_handle:
            file_handle.write(file_name + "\n")
        idx = i

    # final file
    start = "%s" % idx
    end = "%s" % total
    file_name = "%s/%s-%s_%s-split.tsv" % (out_dir,
                                           file_pattern,
                                           start,
                                           end)
    with open(file_name, "w") as file_handle:
        file_handle.write(file_name + "\n")
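
A minimal sketch of the chunk-boundary arithmetic used above, assuming a hypothetical gene count and chunk number; it only collects the (start, end) ranges instead of writing files:

total = 1250                                  # hypothetical number of genes
nchunks = 4
step = max(100, min(500, total // nchunks))   # clamp chunk size to [100, 500]

boundaries = []
idx = 0
for i in range(step, total, step):
    boundaries.append((idx, i))
    idx = i
boundaries.append((idx, total))               # final, possibly shorter chunk

print(boundaries)
# [(0, 312), (312, 624), (624, 936), (936, 1248), (1248, 1250)]
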
Example #3
    def parseHeader(self, infile, outfile, options):
        """parse header in infile."""
        # skip comments until header
        while True:
            l = infile.readline()
            if not l:
                break
            if self.header_regex:
                if self.header_regex.search(l):
                    break
            elif l[0] != "#":
                break
            options.stdlog.write(l)

        # print only the first header and check if
        # all the headers are the same.
        if self.header:
            if self.header != l:
                raise ValueError("inconsistent header in file %s\n"
                                 "got=%s\nexpected=%s" %
                                 (infile, l, self.header))
        else:
            outfile.write(l)
            self.header = l
            self.nfields = l.count("\t")
            if self.nfields == 0:
                E.warn("only single column in header: %s" % l[:-1])

            if self.mFieldIndex is None and self.mFieldName:
                try:
                    self.mFieldIndex = self.header.split("\t").index(
                        self.mFieldName)
                except ValueError:
                    E.warn("no mapping, can not find field %s in %s" %
                           (self.mFieldName, self.header))
                    self.mFieldName = None

                E.debug("substituting field: %s, %s" %
                        (self.mFieldName, self.mFieldIndex))
Example #4
def annotate(infile, annotation_file, outfile):
    '''
    annotate infile with annotations from
    annotation gtf file
    '''
    inf = open(infile)
    header = inf.readline()
    include = set()

    E.info("reading genes to keep")
    for line in inf.readlines():
        data = line[:-1].split("\t")
        gene_id = data[8].strip('"')
        include.add(gene_id)

    E.info("reading annotations file")
    annotations = {}
    for gtf in GTF.iterator(IOTools.openFile(annotation_file)):
        if gtf.gene_id in include:
            annotations[gtf.gene_id] = \
                [gtf.gene_name, gtf.species, gtf.description]

    inf = open(infile)
    header = inf.readline()

    E.info("writing results with annotations")
    outf = open(outfile, "w")
    outf.write(header.strip("\n") +
               "\tgene_name\tspecies_centroid\tdescription\n")
    for line in inf.readlines():
        data = line[:-1].split("\t")
        gene_id = data[8].strip('"')
        try:
            outf.write("\t".join(data + annotations[gene_id]) + "\n")
        except KeyError:
            outf.write("\t".join(data + ["NA", "NA", "NA"]) + "\n")
    outf.close()
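
The try/except KeyError fallback above can equivalently be written with dict.get; a small self-contained sketch with made-up gene identifiers and annotation values:

annotations = {"gene1": ["BRCA2", "hsapiens", "DNA repair"]}  # made-up values

for gene_id in ("gene1", "gene2"):
    row = ["chr1", "100", "200", gene_id]
    # fall back to NA columns when the gene has no annotation
    extra = annotations.get(gene_id, ["NA", "NA", "NA"])
    print("\t".join(row + extra))
# gene1 gets its annotation columns; gene2 is padded with NA/NA/NA
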
Example #5
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--test",
                      dest="test",
                      type="string",
                      help="supply help")

    parser.add_option("--plot-type",
                      dest="plot_type",
                      type="choice",
                      choices=["manhattan", "qqplot", "epistasis"],
                      help="plot type to generate")

    parser.add_option("--resolution",
                      dest="resolution",
                      type="choice",
                      choices=["genome_wide", "chromosome", "fine_map"],
                      help="the resolution of plotting, wether the plot "
                      "depicts the whole genome, a single chromosome or "
                      "a specific locus")

    parser.add_option("--file-format",
                      dest="file_format",
                      type="choice",
                      choices=["plink", "cassi", "cassi_covar"],
                      help="input file format, used to parse the file "
                      "properly")

    parser.add_option("--save-path",
                      dest="save_path",
                      type="string",
                      help="path and filename to save image to")

    parser.set_defaults(resolution="genome_wide",
                        plot_type="manhattan",
                        file_format="plink")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # if the input is a list of files, split them
    infile = argv[-1]
    infiles = infile.split(",")

    # need to parse epistasis output slightly differently
    if options.plot_type == "epistasis":
        epi = True
    else:
        epi = False

    if len(infiles) > 1:
        results = gwas.GWASResults(assoc_file=infiles,
                                   epistasis=epi,
                                   file_format=options.file_format)
    elif len(infiles) == 1:
        results = gwas.GWASResults(assoc_file=infile,
                                   epistasis=epi,
                                   file_format=options.file_format)
    else:
        raise IOError("no input files detected, please specifiy association "
                      "results files as the last command line argument")

    if options.plot_type == "manhattan":
        df = results.plotManhattan(resolution=options.resolution,
                                   save_path=options.save_path)
    elif options.plot_type == "qqplot":
        results.plotQQ(save_path=options.save_path,
                       resolution=options.resolution)
    elif options.plot_type == "epistasis":
        results.plotEpistasis(save_path=options.save_path,
                              resolution=options.resolution)
    else:
        pass

    # only output appended results for Manhattan plot, not qqplot
    try:
        df.to_csv(options.stdout, sep="\t", index=None)
    except UnboundLocalError:
        pass

    # write footer and output benchmark information.
    E.stop()
Example #6
def mergeVariants(variants):
    '''merge overlapping variants.

    Overlapping variants occur if there are two deletions
    at the same location:

        WT      ACTG  
        Allele1 -CT-   
        Allele2 ----

    This will be encoded by samtools as (0-based coordinates)::

        0 * -A/ACTG
        3 * -G/-G

    This upsets the re-constitution algorithm.

    This method separates these two variants into two non-overlapping
    variants making use of variable length deletions.

        0 * -A/-A
        1 * ---G/-CTG

    Another case:

        WT      ACTG  
        Allele1 ACT-   
        Allele2 ----

    This will be encoded by samtools as (0-based coordinates)::

        0 * */-ACTG
        3 * -G/*

    This method separates these two as::

        0 * */-ACT
        3 * -G/-G

    '''

    if len(variants) == 0:
        return []

    # sorts by start and then end
    variants.sort()
    merged_variants = []

    def _add(offset, dest, src):
        for x, c in enumerate(src):
            dest[x + offset] = c

    def _split(seq0, seq1):
        # split
        was_0, was_1 = seq0[0] == "-", seq1[0] == "-"
        for x, cc in enumerate(zip(seq0, seq1)):
            is_0, is_1 = cc[0] == "-", cc[1] == "-"
            # yield all changes
            if (is_0 ^ was_0) or (is_1 ^ was_1):
                yield x, was_0, was_1
            was_0, was_1 = is_0, is_1

        yield x + 1, was_0, was_1

    last = variants[0]
    for this in variants[1:]:

        if this.start < last.end and \
                this.action == "-" and \
                last.action == "-":

            E.warn("merging overlapping deletions: %s and %s" %
                   (str(last), str(this)))

            mend = max(last.end, this.end)
            mstart = min(this.start, last.start)
            l = mend - mstart

            seq0 = list("-" * l)
            seq1 = list("-" * l)

            _add(last.start - mstart, seq0, last.variantseqs[0])
            _add(last.start - mstart, seq1, last.variantseqs[1])
            _add(this.start - mstart, seq0, this.variantseqs[0])
            _add(this.start - mstart, seq1, this.variantseqs[1])

            last_x = 0
            n = []
            for x, was_0, was_1 in _split(seq0, seq1):
                if last_x == x:
                    continue

                this = ExtendedVariant._make((
                    mstart + last_x,
                    mstart + x,
                    "*",
                    last.action,
                    was_0 ^ was_1,
                    ["".join(seq0[last_x:x]), "".join(seq1[last_x:x])],
                ))
                n.append(this)
                last_x = x

            E.warn("overlapping deletions merged in %i blocks as: %s" %
                   (len(n), list(map(str, n))))
            merged_variants.extend(n[:-1])
            this = n[-1]
        else:
            merged_variants.append(last)

        last = this

    merged_variants.append(last)

    return merged_variants
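
The core of the merge is the _split generator, which cuts the overlaid allele strings wherever the gap ("-") state of either allele changes. A self-contained sketch of the same idea on made-up allele strings (not the exact docstring case):

def _split(seq0, seq1):
    # yield positions where the gap state of either allele changes
    was_0, was_1 = seq0[0] == "-", seq1[0] == "-"
    for x, (c0, c1) in enumerate(zip(seq0, seq1)):
        is_0, is_1 = c0 == "-", c1 == "-"
        if (is_0 ^ was_0) or (is_1 ^ was_1):
            yield x, was_0, was_1
        was_0, was_1 = is_0, is_1
    yield x + 1, was_0, was_1

seq0, seq1 = "-A--", "ACTG"          # hypothetical overlaid alleles
last_x, blocks = 0, []
for x, was_0, was_1 in _split(seq0, seq1):
    if x == last_x:
        continue
    blocks.append((seq0[last_x:x], seq1[last_x:x]))
    last_x = x

print(blocks)                        # [('-', 'A'), ('A', 'C'), ('--', 'TG')]
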
Example #7
def main(argv=None):

    parser = getOptionParser()

    (options, args) = E.Start(parser, add_cluster_options=True)

    if len(args) == 0:
        raise ValueError(
            "command line argument missing - see usage information")

    options.renumber_column = [x.split(":") for x in options.renumber_column]

    cmd = args[0]
    if len(args) > 1:
        cmd += " '" + "' '".join(args[1:]) + "'"

    if options.dry_run:

        cmd = re.sub("%DIR%", "", cmd)
        retcode = subprocess.call(cmd,
                                  shell=True,
                                  stdin=sys.stdin,
                                  stdout=sys.stdout,
                                  cwd=os.getcwd(),
                                  close_fds=True)
        E.Stop()
        sys.exit(0)

    failed_requests = []
    started_requests = []
    niterations = 0

    if not options.collect:
        tmpdir = os.path.abspath(tempfile.mkdtemp(dir=options.tmpdir))

        E.info(" working in directory %s" % tmpdir)

        if options.split_at_lines:
            chunk_iterator = chunk_iterator_lines
            args = (options.split_at_lines, )
        elif options.split_at_column:
            chunk_iterator = chunk_iterator_column
            args = (options.split_at_column - 1, options.max_files)
        elif options.split_at_regex:
            chunk_iterator = chunk_iterator_regex_split
            args = (re.compile(options.split_at_regex), 0, options.chunksize,
                    options.max_lines)
        elif options.group_by_regex:
            chunk_iterator = chunk_iterator_regex_group
            args = (re.compile(options.group_by_regex), 0, options.chunksize)
        else:
            raise ValueError("please specify a way to chunk input data")

        data = [(x, cmd, options, None, options.subdirs)
                for x in chunk_iterator(options.stdin,
                                        args,
                                        prefix=tmpdir,
                                        use_header=options.input_header)]

        started_requests = [(x[0], x[0] + ".out") for x in data]

        if len(data) == 0:
            E.warn("no data received")
            E.Stop()
            sys.exit(0)

        if options.method == "multiprocessing":
            pool = Pool(options.cluster_num_jobs)
            results = pool.map(runCommand, data, chunksize=1)
        elif options.method == "drmaa":
            results = []
            runDRMAA(data, environment=options.environment)
        elif options.method == "threads":
            pool = ThreadPool(options.cluster_num_jobs)
            results = pool.map(runCommand, data, chunksize=1)

        niterations = 0
        for retcode, filename, cmd, logfile, iterations in results:
            niterations += iterations
            if not hasFinished(retcode, filename, options.output_tag, logfile):
                failed_requests.append((filename, cmd))

    else:
        tmpdir = options.collect
        started_requests = [(x[:-4], x) for x in glob.glob(tmpdir + "/*.out")]

        E.info("collecting %i files from %s" % (len(started_requests), tmpdir))

    if failed_requests:
        for fn, cmd in failed_requests:
            E.error("failed request: filename= %s, cmd= %s" % (fn, cmd))
    else:
        E.info("building result from %i parts" % len(started_requests))

        if options.renumber:
            mapper = MapperLocal(pattern=options.renumber)
        else:
            mapper = MapperEmpty()

        # deal with stdout
        name = None
        index = None

        for pattern, column in options.renumber_column:

            if re.search(pattern, "stdout"):
                try:
                    index = int(column) - 1
                except ValueError:
                    name = column
                break

        if options.binary:
            ResultBuilderBinary()(started_requests, options.stdout, options)
        else:
            regex = None
            if options.output_regex_header:
                regex = re.compile(options.output_regex_header)
            ResultBuilder(mapper=mapper,
                          field_index=index,
                          field_name=name,
                          header_regex=regex)(started_requests, options.stdout,
                                              options)

        # deal with logfiles : combine them into a single file
        rr = re.search("'--log=(\S+)'", cmd) or re.search("'--L\s+(\S+)'", cmd)
        if rr:
            E.info("logging output goes to %s" % rr.groups()[0])
            logfile = IOTools.openFile(rr.groups()[0], "a")
            ResultBuilderLog()([(x[0], "%s.log" % x[0])
                                for x in started_requests], logfile, options)
            logfile.close()

        # deal with other files
        if options.subdirs:

            files = glob.glob("%s/*.dir/*" % tmpdir)
            # remove directory
            filenames = set([os.path.basename(x) for x in files])
            xx = len(".out")

            for filename in filenames:

                _, filetype = os.path.splitext(filename)

                name = None
                index = None

                for pattern, column in options.renumber_column:
                    if re.search(pattern, filename):
                        try:
                            index = int(column) - 1
                        except ValueError:
                            name = column
                        break

                if options.binary:
                    builder = ResultBuilderBinary(mapper=mapper)
                elif filetype in (".fa", ".fasta"):
                    builder = ResultBuilderFasta(mapper=mapper)
                elif filetype in (".mali", ):
                    builder = ResultBuilderFasta(mapper=MapperEmpty())
                elif filetype in (".psl"):
                    builder = ResultBuilderPSL(mapper=mapper)
                elif filetype in (".gtf", ".gff"):
                    builder = ResultBuilderGFF(mapper=mapper,
                                               field_index=index,
                                               field_name=name)
                elif filetype in (".png"):
                    builder = ResultBuilderCopies(mapper=mapper)
                else:
                    builder = ResultBuilder(mapper=mapper,
                                            field_index=index,
                                            field_name=name)

                E.debug("chose the following builder for %s: %s: %s" %
                        (filename, filetype, str(builder)))

                E.info("collecting results for %s" % filename)

                input_filenames = []
                for fi, fn in started_requests:
                    fn = fn[:-xx] + ".dir/" + filename
                    if os.path.exists(fn):
                        input_filenames.append((fi, fn))

                E.info("output of %i files goes to %s" %
                       (len(filenames), filename))

                outfile = IOTools.openFile(options.output_pattern % filename,
                                           "w")
                builder(input_filenames, outfile, options)
                outfile.close()

    if not options.debug and (not options.resume or not options.collect):
        if len(failed_requests) == 0:
            E.info("removing directory %s" % tmpdir)
            shutil.rmtree(tmpdir)
        else:
            E.info("directory %s not removed due to %i failed jobs" %
                   (tmpdir, len(failed_requests)))

    E.info("job control: nstarted=%i, nfinished=%i, nerrors=%i, nrepeats=%i" %
           (len(started_requests), len(started_requests) -
            len(failed_requests), len(failed_requests), niterations))

    E.Stop()
Example #8
def runDRMAA(data, environment):
    '''run jobs in data using drmaa to connect to the cluster.'''

    # SNS: Error detection now taken care of with Cluster.py
    # expandStatement function

    # working directory - needs to be the one from which
    # the script is called, to resolve input files.
    cwd = os.getcwd()

    session = drmaa.Session()
    session.initialize()

    jobids = []
    kwargs = {}

    for filename, cmd, options, tmpdir, subdirs in data:

        from_stdin, to_stdout = True, True

        if subdirs:
            outdir = "%s.dir/" % (filename)
            os.mkdir(outdir)
            cmd = re.sub("%DIR%", outdir, cmd)

        x = re.search("'--log=(\S+)'", cmd) or re.search("'--L\s+(\S+)'", cmd)
        if x:
            logfile = filename + ".log"
            cmd = cmd[:x.start()] + "--log=%s" % logfile + cmd[x.end():]
        else:
            logfile = filename + ".out"

        if "%STDIN%" in cmd:
            cmd = re.sub("%STDIN%", filename, cmd)
            from_stdin = False

        if "%STDOUT%" in cmd:
            cmd = re.sub("%STDOUT%", filename + ".out", cmd)
            to_stdout = False

        cmd = " ".join(re.sub("\t+", " ", cmd).split("\n"))
        E.info("running statement:\n%s" % cmd)

        job_script = tempfile.NamedTemporaryFile(dir=os.getcwd(),
                                                 delete=False,
                                                 mode="w+t")
        job_script.write("#!/bin/bash\n")  # -l -O expand_aliases\n" )
        job_script.write(Cluster.expandStatement(cmd) + "\n")
        job_script.close()

        job_path = os.path.abspath(job_script.name)

        os.chmod(job_path, stat.S_IRWXG | stat.S_IRWXU)

        # get session for process - only one is permitted

        job_name = os.path.basename(kwargs.get("outfile", "farm.py"))

        options_dict = vars(options)
        options_dict["workingdir"] = os.getcwd()

        if options.job_memory:
            job_memory = options.job_memory
        elif options.cluster_memory_default:
            job_memory = options.cluster_memory_default
        else:
            job_memory = "2G"

        jt = Cluster.setupDrmaaJobTemplate(session, options_dict, job_name,
                                           job_memory)

        jt.remoteCommand = job_path

        # update the environment
        e = {'BASH_ENV': options.bashrc}
        if environment:
            for en in environment:
                try:
                    e[en] = os.environ[en]
                except KeyError:
                    raise KeyError(
                        "could not export environment variable '%s'" % en)
        jt.jobEnvironment = e

        # SNS: Native specification setting abstracted
        # to Pipeline/Cluster.setupDrmaaJobTemplate()

        # use stdin for data
        if from_stdin:
            jt.inputPath = ":" + filename

        # set paths.

        # later: allow redirection of stdout and stderr to files
        # could this even be across hosts?
        if to_stdout:
            jt.outputPath = ":" + filename + ".out"
        else:
            jt.outputPath = ":" + filename + ".stdout"

        jt.errorPath = ":" + filename + ".err"

        jobid = session.runJob(jt)
        jobids.append((jobid, job_path, filename, cmd, logfile))

    E.debug("%i jobs have been submitted" % len(jobids))

    results = []

    for jobid, job_path, filename, cmd, logfile in jobids:

        try:
            retval = session.wait(jobid, drmaa.Session.TIMEOUT_WAIT_FOREVER)
        except Exception as msg:
            # ignore message 24 in PBS
            # code 24: drmaa: Job finished but resource usage information
            # and/or termination status could not be provided.":
            if not str(msg).startswith("code 24"):
                raise
            retval = None

        if retval and retval.exitStatus != 0:
            raise OSError("Child was terminated by signal %i: \n%s\n" %
                          (retval.exitStatus, cmd))

        results.append((retval, filename, cmd, logfile, 1))

        os.unlink(job_path)

    session.deleteJobTemplate(jt)
    session.exit()
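
For reference, the bare drmaa session lifecycle that runDRMAA builds on looks roughly like this; a minimal sketch assuming the Python drmaa bindings and a configured cluster, with a hypothetical job script path:

import drmaa

session = drmaa.Session()
session.initialize()

jt = session.createJobTemplate()
jt.remoteCommand = "/path/to/job_script.sh"   # hypothetical job script
jt.outputPath = ":/path/to/job.out"
jt.errorPath = ":/path/to/job.err"

jobid = session.runJob(jt)
retval = session.wait(jobid, drmaa.Session.TIMEOUT_WAIT_FOREVER)
print("job %s finished with exit status %s" % (jobid, retval.exitStatus))

session.deleteJobTemplate(jt)
session.exit()
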
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
        """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=(
                          "geneprofile",
                          "tssprofile",
                          "utrprofile",
                          "intervalprofile",
                          "midpointprofile",
                          "geneprofilewithintrons",
                          "geneprofileabsolutedistancefromthreeprimeend",
                          "separateexonprofile",
                          "separateexonprofilewithintrons",
                      ),
                      help='counters to use. Counters describe the '
                      'meta-gene structure to use. '
                      'Note using geneprofilewithintrons, or '
                      'geneprofileabsolutedistancefromthreeprimeend will '
                      'automatically turn on the --use-base-accuracy option'
                      '[%default].')

    parser.add_option("-b",
                      "--bam-file",
                      "--bedfile",
                      "--bigwigfile",
                      dest="infiles",
                      metavar="BAM",
                      type="string",
                      action="append",
                      help="BAM/bed/bigwig files to use. Do not mix "
                      "different types [%default]")

    parser.add_option("-c",
                      "--control-bam-file",
                      dest="controlfiles",
                      metavar="BAM",
                      type="string",
                      action="append",
                      help="control/input to use. Should be of the same "
                      "type as the bam/bed/bigwig file"
                      " [%default]")

    parser.add_option("-g",
                      "--gtf-file",
                      dest="gtffile",
                      type="string",
                      metavar="GTF",
                      help="GTF file to use. "
                      "[%default]")

    parser.add_option("--normalize-transcript",
                      dest="transcript_normalization",
                      type="choice",
                      choices=("none", "max", "sum", "total-max", "total-sum"),
                      help="normalization to apply on each transcript "
                      "profile before adding to meta-gene profile. "
                      "[%default]")

    parser.add_option("--normalize-profile",
                      dest="profile_normalizations",
                      type="choice",
                      action="append",
                      choices=("all", "none", "area", "counts", "background"),
                      help="normalization to apply on meta-gene "
                      "profile normalization. "
                      "[%default]")

    parser.add_option(
        "-r",
        "--reporter",
        dest="reporter",
        type="choice",
        choices=("gene", "transcript"),
        help="report results for genes or transcripts."
        " When 'genes` is chosen, exons across all transcripts for"
        " a gene are merged. When 'transcript' is chosen, counts are"
        " computed for each transcript separately with each transcript"
        " contributing equally to the meta-gene profile."
        " [%default]")

    parser.add_option("-i",
                      "--shift-size",
                      dest="shifts",
                      type="int",
                      action="append",
                      help="shift reads in :term:`bam` formatted file "
                      "before computing densities (ChIP-Seq). "
                      "[%default]")

    parser.add_option("-a",
                      "--merge-pairs",
                      dest="merge_pairs",
                      action="store_true",
                      help="merge pairs in :term:`bam` formatted "
                      "file before computing "
                      "densities (ChIP-Seq). "
                      "[%default]")

    parser.add_option("-u",
                      "--use-base-accuracy",
                      dest="base_accuracy",
                      action="store_true",
                      help="compute densities with base accuracy. The default "
                      "is to only use the start and end of the aligned region "
                      "(RNA-Seq) "
                      "[%default]")

    parser.add_option("-e",
                      "--extend",
                      dest="extends",
                      type="int",
                      action="append",
                      help="extend reads in :term:`bam` formatted file "
                      "(ChIP-Seq). "
                      "[%default]")

    parser.add_option("--resolution-upstream",
                      dest="resolution_upstream",
                      type="int",
                      help="resolution of upstream region in bp "
                      "[%default]")

    parser.add_option("--resolution-downstream",
                      dest="resolution_downstream",
                      type="int",
                      help="resolution of downstream region in bp "
                      "[%default]")

    parser.add_option("--resolution-upstream-utr",
                      dest="resolution_upstream_utr",
                      type="int",
                      help="resolution of upstream UTR region in bp "
                      "[%default]")

    parser.add_option("--resolution-downstream-utr",
                      dest="resolution_downstream_utr",
                      type="int",
                      help="resolution of downstream UTR region in bp "
                      "[%default]")

    parser.add_option("--resolution-cds",
                      dest="resolution_cds",
                      type="int",
                      help="resolution of cds region in bp "
                      "[%default]")

    parser.add_option("--resolution-first-exon",
                      dest="resolution_first",
                      type="int",
                      help="resolution of first exon in gene, in bp"
                      "[%default]")

    parser.add_option("--resolution-last-exon",
                      dest="resolution_last",
                      type="int",
                      help="resolution of last exon in gene, in bp"
                      "[%default]")

    parser.add_option("--resolution-introns",
                      dest="resolution_introns",
                      type="int",
                      help="resolution of introns region in bp "
                      "[%default]")

    parser.add_option("--resolution-exons-absolute-distance-topolya",
                      dest="resolution_exons_absolute_distance_topolya",
                      type="int",
                      help="resolution of exons absolute distance "
                      "topolya in bp "
                      "[%default]")

    parser.add_option("--resolution-introns-absolute-distance-topolya",
                      dest="resolution_introns_absolute_distance_topolya",
                      type="int",
                      help="resolution of introns absolute distance "
                      "topolya in bp "
                      "[%default]")

    parser.add_option("--extension-exons-absolute-distance-topolya",
                      dest="extension_exons_absolute_distance_topolya",
                      type="int",
                      help="extension for exons from the absolute "
                      "distance from the topolya in bp "
                      "[%default]")

    parser.add_option(
        "--extension-introns-absolute-distance-topolya",
        dest="extension_introns_absolute_distance_topolya",
        type="int",
        help="extension for introns from the absolute distance from "
        "the topolya in bp [%default]")

    parser.add_option("--extension-upstream",
                      dest="extension_upstream",
                      type="int",
                      help="extension upstream from the first exon in bp"
                      "[%default]")

    parser.add_option("--extension-downstream",
                      dest="extension_downstream",
                      type="int",
                      help="extension downstream from the last exon in bp"
                      "[%default]")

    parser.add_option("--extension-inward",
                      dest="extension_inward",
                      type="int",
                      help="extension inward from a TSS start site in bp"
                      "[%default]")

    parser.add_option("--extension-outward",
                      dest="extension_outward",
                      type="int",
                      help="extension outward from a TSS start site in bp"
                      "[%default]")

    parser.add_option("--scale-flank-length",
                      dest="scale_flanks",
                      type="int",
                      help="scale flanks to (integer multiples of) gene length"
                      "[%default]")

    parser.add_option(
        "--control-factor",
        dest="control_factor",
        type="float",
        help="factor for normalizing control and foreground data. "
        "Computed from data if not set. "
        "[%default]")

    parser.add_option("--output-all-profiles",
                      dest="output_all_profiles",
                      action="store_true",
                      help="keep individual profiles for each "
                      "transcript and output. "
                      "[%default]")

    parser.add_option("--counts-tsv-file",
                      dest="input_filename_counts",
                      type="string",
                      help="filename with count data for each transcript. "
                      "Use this instead "
                      "of recomputing the profile. Useful for plotting the "
                      "meta-gene profile "
                      "from previously computed counts "
                      "[%default]")

    parser.add_option(
        "--background-region-bins",
        dest="background_region_bins",
        type="int",
        help="number of bins on either end of the profile "
        "to be considered for background meta-gene normalization "
        "[%default]")

    parser.set_defaults(
        remove_rna=False,
        ignore_pairs=False,
        force_output=False,
        bin_size=10,
        extends=[],
        shifts=[],
        sort=[],
        reporter="transcript",
        resolution_cds=1000,
        resolution_introns=1000,
        # 3kb is a good balance between seeing enough 3' bias and not
        # omitting too many genes. Tim 31st Aug 2013
        resolution_exons_absolute_distance_topolya=3000,
        # introns are only used to assess the noise level, so they do not
        # need a long region; a long region has the side effect of omitting
        # more genes. Tim 31st Aug 2013
        resolution_introns_absolute_distance_topolya=500,
        # extension can simply just be the same as resolution
        extension_exons_absolute_distance_topolya=3000,
        extension_introns_absolute_distance_topolya=500,
        resolution_upstream_utr=1000,
        resolution_downstream_utr=1000,
        resolution_upstream=1000,
        resolution_downstream=1000,
        resolution_first=1000,
        resolution_last=1000,
        # mean length of transcripts: about 2.5 kb
        extension_upstream=2500,
        extension_downstream=2500,
        extension_inward=3000,
        extension_outward=3000,
        plot=True,
        methods=[],
        infiles=[],
        controlfiles=[],
        gtffile=None,
        profile_normalizations=[],
        transcript_normalization=None,
        scale_flanks=0,
        merge_pairs=False,
        min_insert_size=0,
        max_insert_size=1000,
        base_accuracy=False,
        matrix_format="single",
        control_factor=None,
        output_all_profiles=False,
        background_region_bins=10,
        input_filename_counts=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    # Keep for backwards compatibility
    if len(args) == 2:
        infile, gtf = args
        options.infiles.append(infile)
        options.gtffile = gtf

    if not options.gtffile:
        raise ValueError("no GTF file specified")

    if options.gtffile == "-":
        options.gtffile = options.stdin
    else:
        options.gtffile = IOTools.open_file(options.gtffile)

    if len(options.infiles) == 0:
        raise ValueError("no bam/wig/bed files specified")

    for methodsRequiresBaseAccuracy in [
            "geneprofilewithintrons",
            "geneprofileabsolutedistancefromthreeprimeend",
    ]:
        # If you implement any methods for which spliced-out introns or
        # exons should not appear to be covered by non-existent reads, it
        # is better to let those methods imply --base-accuracy by adding
        # them here.
        if methodsRequiresBaseAccuracy in options.methods:
            options.base_accuracy = True

    if options.reporter == "gene":
        gtf_iterator = GTF.flat_gene_iterator(GTF.iterator(options.gtffile))
    elif options.reporter == "transcript":
        gtf_iterator = GTF.transcript_iterator(GTF.iterator(options.gtffile))

    # Select rangecounter based on file type
    if len(options.infiles) > 0:
        if options.infiles[0].endswith(".bam"):
            bamfiles = [pysam.AlignmentFile(x, "rb") for x in options.infiles]

            if options.controlfiles:
                controlfiles = [
                    pysam.AlignmentFile(x, "rb") for x in options.controlfiles
                ]
            else:
                controlfiles = None

            format = "bam"
            if options.merge_pairs:
                range_counter = _bam2geneprofile.RangeCounterBAM(
                    bamfiles,
                    shifts=options.shifts,
                    extends=options.extends,
                    merge_pairs=options.merge_pairs,
                    min_insert_size=options.min_insert_size,
                    max_insert_size=options.max_insert_size,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)

            elif options.shifts or options.extends:
                range_counter = _bam2geneprofile.RangeCounterBAM(
                    bamfiles,
                    shifts=options.shifts,
                    extends=options.extends,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)

            elif options.base_accuracy:
                range_counter = _bam2geneprofile.RangeCounterBAMBaseAccuracy(
                    bamfiles,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)
            else:
                range_counter = _bam2geneprofile.RangeCounterBAM(
                    bamfiles,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)

        elif options.infiles[0].endswith(".bed.gz"):
            bedfiles = [pysam.Tabixfile(x) for x in options.infiles]

            if options.controlfiles:
                controlfiles = [
                    pysam.Tabixfile(x) for x in options.controlfiles
                ]
            else:
                controlfiles = None

            range_counter = _bam2geneprofile.RangeCounterBed(
                bedfiles,
                controlfiles=controlfiles,
                control_factor=options.control_factor)

        elif options.infiles[0].endswith(".bw"):
            wigfiles = [BigWigFile(file=open(x)) for x in options.infiles]
            range_counter = _bam2geneprofile.RangeCounterBigWig(wigfiles)

        else:
            raise NotImplementedError("can't determine file type for %s" %
                                      str(options.infiles))

    counters = []
    for method in options.methods:
        if method == "utrprofile":
            counters.append(
                _bam2geneprofile.UTRCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_upstream_utr,
                    options.resolution_cds,
                    options.resolution_downstream_utr,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream,
                ))

        elif method == "geneprofile":
            counters.append(
                _bam2geneprofile.GeneCounter(
                    range_counter, options.resolution_upstream,
                    options.resolution_cds, options.resolution_downstream,
                    options.extension_upstream, options.extension_downstream,
                    options.scale_flanks))

        elif method == "geneprofilewithintrons":
            counters.append(
                _bam2geneprofile.GeneCounterWithIntrons(
                    range_counter, options.resolution_upstream,
                    options.resolution_cds, options.resolution_introns,
                    options.resolution_downstream, options.extension_upstream,
                    options.extension_downstream, options.scale_flanks))

        elif method == "geneprofileabsolutedistancefromthreeprimeend":
            # options.extension_exons_absolute_distance_tostartsite,
            # options.extension_introns_absolute_distance_tostartsite,
            # Tim 31st Aug 2013: a possible future feature, if five-prime
            # bias is of interest (you would need to create another class;
            # deriving from this class is not very difficult, but it is
            # not implemented yet). This future feature differs slightly
            # from the TSS profile already implemented, because here
            # introns would be skipped.
            counters.append(
                _bam2geneprofile.GeneCounterAbsoluteDistanceFromThreePrimeEnd(
                    range_counter, options.resolution_upstream,
                    options.resolution_downstream,
                    options.resolution_exons_absolute_distance_topolya,
                    options.resolution_introns_absolute_distance_topolya,
                    options.extension_upstream, options.extension_downstream,
                    options.extension_exons_absolute_distance_topolya,
                    options.extension_introns_absolute_distance_topolya,
                    options.scale_flanks))

        elif method == "tssprofile":
            counters.append(
                _bam2geneprofile.TSSCounter(range_counter,
                                            options.extension_outward,
                                            options.extension_inward))

        elif method == "intervalprofile":
            counters.append(
                _bam2geneprofile.RegionCounter(range_counter,
                                               options.resolution_upstream,
                                               options.resolution_cds,
                                               options.resolution_downstream,
                                               options.extension_upstream,
                                               options.extension_downstream))

        elif method == "midpointprofile":
            counters.append(
                _bam2geneprofile.MidpointCounter(range_counter,
                                                 options.resolution_upstream,
                                                 options.resolution_downstream,
                                                 options.extension_upstream,
                                                 options.extension_downstream))

        # add new method to split 1st and last exons out
        # requires a representative transcript for each gene
        # gtf should be sorted by gene position
        elif method == "separateexonprofile":
            counters.append(
                _bam2geneprofile.SeparateExonCounter(
                    range_counter, options.resolution_upstream,
                    options.resolution_first, options.resolution_last,
                    options.resolution_cds, options.resolution_downstream,
                    options.extension_upstream, options.extension_downstream))

        elif method == "separateexonprofilewithintrons":
            counters.append(
                _bam2geneprofile.SeparateExonWithIntronCounter(
                    range_counter, options.resolution_upstream,
                    options.resolution_first, options.resolution_last,
                    options.resolution_cds, options.resolution_introns,
                    options.resolution_downstream, options.extension_upstream,
                    options.extension_downstream))

    # set normalization
    for c in counters:
        c.setNormalization(options.transcript_normalization)
        if options.output_all_profiles:
            c.setOutputProfiles(
                IOTools.open_file(
                    E.getOutputFile(c.name) + ".profiles.tsv.gz", "w"))

    if options.input_filename_counts:
        # read counts from file
        E.info("reading counts from %s" % options.input_filename_counts)
        all_counts = pandas.read_csv(IOTools.open_file(
            options.input_filename_counts),
                                     sep='\t',
                                     header=0,
                                     index_col=0)

        if len(counters) != 1:
            raise NotImplementedError(
                'counting from matrix only implemented for 1 counter.')
        # build counter based on reference counter
        counter = _bam2geneprofile.UnsegmentedCounter(counters[0])
        counters = [counter]
        _bam2geneprofile.countFromCounts(counters, all_counts)

    else:
        E.info("starting counting with %i counters" % len(counters))
        feature_names = _bam2geneprofile.countFromGTF(counters, gtf_iterator)

    # output matrices
    if not options.profile_normalizations:
        options.profile_normalizations.append("none")
    elif "all" in options.profile_normalizations:
        options.profile_normalizations = [
            "none", "area", "counts", "background"
        ]

    for method, counter in zip(options.methods, counters):
        profiles = []
        for norm in options.profile_normalizations:
            # build matrix, apply normalization
            profile = counter.getProfile(
                normalize=norm,
                background_region_bins=options.background_region_bins)
            profiles.append(profile)

        for x in range(1, len(profiles)):
            assert profiles[0].shape == profiles[x].shape

        # build a single matrix of all profiles for output
        matrix = numpy.concatenate(profiles)
        matrix.shape = len(profiles), len(profiles[0])
        matrix = matrix.transpose()

        with IOTools.open_file(
                E.getOutputFile(counter.name) + ".matrix.tsv.gz",
                "w") as outfile:
            outfile.write("bin\tregion\tregion_bin\t%s\n" %
                          "\t".join(options.profile_normalizations))
            fields = []
            bins = []
            for field, nbins in zip(counter.fields, counter.nbins):
                fields.extend([field] * nbins)
                bins.extend(list(range(nbins)))

            for row, cols in enumerate(zip(fields, bins, matrix)):
                outfile.write("%i\t%s\t" %
                              (row, "\t".join([str(x) for x in cols[:-1]])))
                outfile.write("%s\n" % ("\t".join([str(x) for x in cols[-1]])))

        with IOTools.open_file(
                E.getOutputFile(counter.name) + ".lengths.tsv.gz",
                "w") as outfile:
            counter.writeLengthStats(outfile)

        if options.output_all_profiles:
            counter.closeOutputProfiles()

    if options.plot:

        import matplotlib
        # avoid Tk or any X
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt

        for method, counter in zip(options.methods, counters):

            if method in ("geneprofile", "geneprofilewithintrons",
                          "geneprofileabsolutedistancefromthreeprimeend",
                          "utrprofile", "intervalprofile",
                          "separateexonprofile",
                          "separateexonprofilewithintrons"):

                plt.figure()
                plt.subplots_adjust(wspace=0.05)
                max_scale = max([max(x) for x in counter.aggregate_counts])

                for x, counts in enumerate(counter.aggregate_counts):
                    plt.subplot(6, 1, x + 1)
                    plt.plot(list(range(len(counts))), counts)
                    plt.title(counter.fields[x])
                    plt.ylim(0, max_scale)

                figname = counter.name + ".full"

                fn = E.getOutputFile(figname) + ".png"
                plt.savefig(os.path.expanduser(fn))

                plt.figure()

                points = []
                cuts = []
                for x, counts in enumerate(counter.aggregate_counts):
                    points.extend(counts)
                    cuts.append(len(counts))

                plt.plot(list(range(len(points))), points)

                xx, xxx = 0, []
                for x in cuts:
                    xxx.append(xx + x // 2)
                    xx += x
                    plt.axvline(xx, color="r", ls="--")

                plt.xticks(xxx, counter.fields)

                figname = counter.name + ".detail"

                fn = E.getOutputFile(figname) + ".png"
                plt.savefig(os.path.expanduser(fn))

            elif method == "tssprofile":

                plt.figure()
                plt.subplot(1, 3, 1)
                plt.plot(
                    list(
                        range(-options.extension_outward,
                              options.extension_inward)),
                    counter.aggregate_counts[0])
                plt.title(counter.fields[0])
                plt.subplot(1, 3, 2)
                plt.plot(
                    list(
                        range(-options.extension_inward,
                              options.extension_outward)),
                    counter.aggregate_counts[1])
                plt.title(counter.fields[1])
                plt.subplot(1, 3, 3)
                plt.title("combined")
                plt.plot(
                    list(
                        range(-options.extension_outward,
                              options.extension_inward)),
                    counter.aggregate_counts[0])
                plt.plot(
                    list(
                        range(-options.extension_inward,
                              options.extension_outward)),
                    counter.aggregate_counts[1])
                plt.legend(counter.fields[:2])

                fn = E.getOutputFile(counter.name) + ".png"
                plt.savefig(os.path.expanduser(fn))

            elif method == "midpointprofile":

                plt.figure()
                plt.plot(numpy.arange(-options.resolution_upstream, 0),
                         counter.aggregate_counts[0])
                plt.plot(numpy.arange(0, options.resolution_downstream),
                         counter.aggregate_counts[1])

                fn = E.getOutputFile(counter.name) + ".png"
                plt.savefig(os.path.expanduser(fn))

    # write footer and output benchmark information.
    E.stop()
    def _write_tabbed(self, name, lines, E):
        outfile = E.openOutputFile(name)
        outfile.write('\n'.join(lines))
        outfile.write('\n')
        outfile.close()
Example #11
def concatenate_tables(outfile, options, args):
    '''concatenate tables.'''

    missing_value = options.missing_value

    rx = re.compile(options.regex_filename)

    if options.headers is None or options.headers == "auto":
        row_headers = [[y for y in rx.search(x).groups()]
                       for x in options.filenames]
    else:
        row_headers = [options.headers]

    tables, headers = [], []
    # read all tables
    for filename, header in zip(options.filenames, row_headers):
        table = read_table(filename, options)
        if len(table) == 0:
            E.warn("table '%s' is empty" % filename)
            continue
        tables.append(table)
        headers.append(header)
    row_headers = headers

    if options.cat is None:
        if len(row_headers) == 1:
            row_head_titles = ["filename"]
        else:
            row_head_titles = [
                "pattern" + str(x) for x in range(len(row_headers))
            ]
    else:
        row_head_titles = [x.strip() for x in options.cat.split(",")]
        if len(row_headers[0]) != len(row_head_titles):
            raise ValueError(
                "row header (%i) has different number of fields in "
                "regular expression than supplied by the --cat option (%i)" %
                (len(row_headers[0]), len(row_head_titles)))

    # collect titles
    if options.input_has_titles:
        titles = collections.OrderedDict()
        for table in tables:
            for key in table[0][:-1].split("\t"):
                # skip any titles that conflict with
                # the newly added titles
                if key in row_head_titles:
                    continue
                titles[key] = 1

        outfile.write("%s\t%s\n" % ("\t".join(
            [x for x in row_head_titles]), "\t".join(list(titles.keys()))))

        map_title2column = collections.defaultdict(lambda: None)
        for x, title in enumerate(titles.keys()):
            map_title2column[title] = x
    else:
        ncolumns = [len(table[0].split('\t')) for table in tables]
        if min(ncolumns) != max(ncolumns):
            raise ValueError('tables have unequal number of columns '
                             '(min=%i, max=%i)' %
                             (min(ncolumns), max(ncolumns)))
        # create a pseudo dictionary of columns
        titles = collections.OrderedDict([(x, x)
                                          for x in range(min(ncolumns))])

    all_titles = set(titles.keys())
    for nindex, table in enumerate(tables):
        if options.input_has_titles:
            titles = table[0][:-1].split("\t")
            map_old2new = [map_title2column[t] for t in titles]
            del table[0]
        else:
            map_old2new = list(range(len(all_titles)))

        for l in table:
            data = [missing_value] * len(all_titles)
            for x, value in enumerate(l[:-1].split("\t")):
                if map_old2new[x] is None:
                    continue

                data[map_old2new[x]] = value

            row = "\t".join([str(x)
                             for x in row_headers[nindex]] + data) + "\n"
            outfile.write(row)
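
The column-alignment step above maps each table's own column order onto the union of all titles and fills gaps with the missing value; a toy sketch with hypothetical titles and one row:

all_titles = ["gene", "count", "length"]            # union of titles, in order
map_title2column = {t: i for i, t in enumerate(all_titles)}

# one input table that lacks "length" and lists its columns in another order
table_titles = ["count", "gene"]
map_old2new = [map_title2column.get(t) for t in table_titles]

row = ["12", "ENSG01"]
data = ["na"] * len(all_titles)
for x, value in enumerate(row):
    if map_old2new[x] is not None:
        data[map_old2new[x]] = value

print("\t".join(data))          # prints "ENSG01", "12", "na" tab-separated
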
Example #12
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--no-titles",
                      dest="input_has_titles",
                      action="store_false",
                      help="no titles in input [%default].")

    parser.add_option("--ignore-titles",
                      dest="ignore_titles",
                      action="store_true",
                      help="ignore titles in input [%default]")

    parser.add_option("-i",
                      "--skip-titles",
                      dest="skip_titles",
                      action="store_true",
                      help="skip output of titles.")

    parser.add_option("-m",
                      "--missing-value",
                      dest="missing_value",
                      type="string",
                      help="entry to use for missing values.")

    parser.add_option("--header-names",
                      dest="headers",
                      type="string",
                      help="add headers for files as a ,-separated "
                      "list [%default].")

    parser.add_option("-c",
                      "--columns",
                      dest="columns",
                      type="string",
                      help="columns to use for joining. Multiple columns "
                      "can be specified as a comma-separated list "
                      "[default=%default].")

    parser.add_option("-k",
                      "--take",
                      dest="take",
                      type="string",
                      action="append",
                      help="columns to take. If not set, all columns "
                      "except for "
                      "the join columns are taken [%default]")

    parser.add_option("-g",
                      "--glob",
                      dest="glob",
                      type="string",
                      help="wildcard expression for table names.")

    parser.add_option("-s",
                      "--sort-order",
                      dest="sort",
                      type="string",
                      help="sort by column titles in particular given order: "
                      "alphabetical|numeric|list of columns.")

    parser.add_option("-e",
                      "--merge-overlapping",
                      dest="merge",
                      action="store_true",
                      help="simply merge tables without matching up "
                      "rows. [default=%default].")

    parser.add_option("-a",
                      "--cat",
                      dest="cat",
                      type="string",
                      help="simply concatenate tables. Adds an "
                      "additional column called X with the filename "
                      " [default=%default].")

    parser.add_option("--sort-keys",
                      dest="sort_keys",
                      type="choice",
                      choices=("numeric", "alphabetic"),
                      help="sort key columns by value.")

    parser.add_option("--keep-empty",
                      dest="ignore_empty",
                      action="store_false",
                      help="keep empty tables. The default is "
                      "to ignore them.")

    parser.add_option("--ignore-empty",
                      dest="ignore_empty",
                      action="store_true",
                      help="ignore empty tables - this is "
                      "the default [%default].")

    parser.add_option("--add-file-prefix",
                      dest="add_file_prefix",
                      action="store_true",
                      help="add file prefix to "
                      "columns headers. Suitable for multi-column"
                      "tables [default=%default]")

    parser.add_option("--use-file-prefix",
                      dest="use_file_prefix",
                      action="store_true",
                      help="use file prefix as column headers. "
                      "Suitable for two-column tables "
                      "[default=%default]")

    parser.add_option("--prefixes",
                      dest="prefixes",
                      type="string",
                      help="list of prefixes to use. "
                      ", separated list of prefixes. "
                      "The number of prefixes need to correspond to the "
                      "number of input files [default=%default]")

    parser.add_option("--regex-filename",
                      dest="regex_filename",
                      type="string",
                      help="pattern to apply to filename to "
                      "build prefix [default=%default]")

    parser.add_option("--regex-start",
                      dest="regex_start",
                      type="string",
                      help="regular expression to start "
                      "collecting table in a file [default=%default]")

    parser.add_option("--regex-end",
                      dest="regex_end",
                      type="string",
                      help="regular expression to end collecting "
                      "table in a file [default=%default]")

    parser.add_option("--test",
                      dest="test",
                      type="int",
                      help="test combining tables with "
                      "first X rows [default=%default]")

    parser.set_defaults(
        input_has_titles=True,
        skip_titles=False,
        missing_value="na",
        headers=None,
        sort=None,
        glob=None,
        columns="1",
        sort_keys=False,
        merge=False,
        ignore_empty=True,
        regex_start=None,
        regex_end=None,
        add_file_prefix=False,
        use_file_prefix=False,
        cat=None,
        take=[],
        regex_filename="(.*)",
        prefixes=None,
        test=0,
    )

    (options, args) = E.start(parser, argv=argv)

    if options.headers:
        if "," in options.headers:
            options.headers = options.headers.split(",")
        else:
            options.headers = re.split(r"\s+", options.headers.strip())

    if options.sort and options.sort not in ("numeric", "alphabetical"):
        if "," in options.sort:
            options.sort = options.sort.split(",")
        else:
            options.sort = re.split(r"\s+", options.sort)

    if options.merge:
        options.columns = []
    else:
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    options.filenames = []

    if options.glob:
        options.filenames += glob.glob(options.glob)

    options.filenames += args

    if len(options.filenames) < 1:
        raise ValueError("no tables found.")

    E.info("combining %i tables" % len(options.filenames))

    if options.cat:
        concatenate_tables(options.stdout, options, args)
    else:
        join_tables(options.stdout, options, args)

    E.stop()
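
A quick sketch of the option normalisation performed in main() above, on made-up values, showing how 1-based join columns become 0-based indices and how header and sort-order strings are split:

import re

columns = "1,3"
headers = "sample_a sample_b"
sort = "alphabetical"

join_columns = [int(x) - 1 for x in columns.split(",")]          # [0, 2]
header_list = (headers.split(",") if "," in headers
               else re.split(r"\s+", headers.strip()))           # ['sample_a', 'sample_b']
sort_order = (sort if sort in ("numeric", "alphabetical")
              else sort.split(","))                              # 'alphabetical'
print(join_columns, header_list, sort_order)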
Example #13
0
def join_tables(outfile, options, args):
    '''join tables.'''

    if options.headers and options.headers[0] != "auto" and \
            len(options.headers) != len(options.filenames):
        raise ValueError("number of provided headers (%i) "
                         "is not equal to number filenames (%i)." %
                         (len(options.headers), len(options.filenames)))

    tables = []
    keys = {}
    sorted_keys = []
    sizes = {}

    if options.merge:
        titles = ["count"]
    else:
        titles = []

    headers_to_delete = []

    if options.prefixes:
        prefixes = [x.strip() for x in options.prefixes.split(",")]
        if len(prefixes) != len(options.filenames):
            raise ValueError(
                ("number of prefixes (%i) and tables (%i) "
                 "do not match") % (len(prefixes), len(options.filenames)))
    else:
        prefixes = None

    E.debug("joining on columns %s and taking columns %s" %
            (options.columns, options.take))

    for nindex, filename in enumerate(options.filenames):

        E.info("processing %s (%i/%i)" %
               (filename, nindex + 1, len(options.filenames)))

        prefix = os.path.basename(filename)

        lines = read_table(filename, options)

        # skip (or not skip) empty tables
        if len(lines) == 0 and options.ignore_empty:
            E.warn("%s is empty - skipped" % filename)
            headers_to_delete.append(nindex)
            continue

        table = {}
        sizes = {}
        max_size = 0
        ncolumns = 0

        if options.input_has_titles:
            data = lines[0][:-1].split("\t")
            # no titles have been defined so far
            if not titles:
                key = "-".join([data[x] for x in options.columns])
                titles = [key]

            # set take based on column titles or numerically
            if options.take:
                take = []
                # convert numeric columns for filtering
                for x in options.take:
                    try:
                        take.append(int(x) - 1)
                    except ValueError:
                        # will raise error if x is not present
                        take.append(data.index(x))
            else:
                # no --take option given: keep all columns
                take = None

            for x in range(len(data)):
                if x in options.columns or (take and x not in take):
                    continue
                ncolumns += 1
                if options.add_file_prefix:
                    try:
                        p = re.search(options.regex_filename,
                                      prefix).groups()[0]
                    except AttributeError:
                        E.warn("can't extract title from filename %s" % prefix)
                        p = "unknown"
                    titles.append("%s_%s" % (p, data[x]))
                elif options.use_file_prefix:
                    try:
                        p = re.search(options.regex_filename,
                                      prefix).groups()[0]
                    except AttributeError:
                        E.warn("can't extract title from filename %s" % prefix)
                        p = "unknown"
                    titles.append("%s" % p)
                elif prefixes:
                    titles.append("%s_%s" % (prefixes[nindex], data[x]))
                else:
                    titles.append(data[x])

            del lines[0]
        else:

            # set take based on numeric columns if no titles are present
            if options.take:
                take = []
                # convert numeric columns for filtering
                for x in options.take:
                    take.append(int(x) - 1)
            else:
                # no --take option given: keep all columns
                take = None

            # IMS: We might still want filename titles even if the input
            # columns don't have titles.
            if options.add_file_prefix:
                if not titles:
                    titles = ["ID"]
                try:
                    p = re.search(options.regex_filename, prefix).groups()[0]
                except AttributeError:
                    E.warn("can't extract title from filename %s" % prefix)
                    p = "unknown"
                # without input titles there is no column name to combine
                # with, so use the file prefix on its own
                titles.append(p)
            elif options.use_file_prefix:
                if not titles:
                    titles = ["ID"]
                try:
                    p = re.search(options.regex_filename, prefix).groups()[0]
                except AttributeError:
                    E.warn("can't extract title from filename %s" % prefix)
                    p = "unknown"
                titles.append("%s" % p)
            ncolumns = 1

        n = 0
        for line in lines:
            data = line[:-1].split("\t")
            try:
                row_keys = [data[x] for x in options.columns]
            except IndexError as msg:
                raise IndexError("error while parsing %s: %s" %
                                 (filename, msg))
            if options.sort_keys:
                if options.sort_keys == "numeric":
                    row_keys.sort(key=float)
                else:
                    row_keys.sort()
            if options.merge:
                key = n
            else:
                key = "-".join(row_keys)

            if key not in keys:
                sorted_keys.append(key)
                keys[key] = 1
                sizes[key] = 0

            if take:
                max_size = len(take)
                table[key] = [data[x] for x in take]
            else:
                max_size = max(len(data) - len(options.columns), max_size)
                table[key] = [
                    data[x] for x in range(0, len(data))
                    if x not in options.columns
                ]
            n += 1

        # enter columns of "na" for empty tables.
        if max_size == 0:
            max_size = ncolumns

        tables.append((max_size, table))

    # delete in reverse order
    if options.headers:
        for nindex in headers_to_delete[::-1]:
            del options.headers[nindex]

    if len(tables) == len(titles) - 1:

        if options.headers:
            headers = ["bin"]
            if options.headers[0] == 'auto':
                for t in range(len(tables)):
                    headers.append(os.path.basename(options.filenames[t]))
                    headers += [""] * (tables[t][0] - 1)

            else:
                for t in range(len(tables)):
                    headers.append(options.headers[t])
                    headers += [""] * (tables[t][0] - 1)

            # use headers as titles, if headers is given and skip-titles is
            # turned on
            if options.input_has_titles and options.skip_titles:
                titles = headers
            else:
                # otherwise: print the headers out right away
                outfile.write("\t".join(headers) + "\n")

        order = list(range(0, len(tables) + 1))

        if options.input_has_titles or \
           (options.use_file_prefix or options.add_file_prefix):

            if options.sort:
                sort_order = []

                if options.sort == "numeric":
                    t = list(
                        zip(list(map(int, titles[1:])),
                            list(range(1,
                                       len(titles) + 1))))
                    t.sort()

                    for tt in t:
                        sort_order.append(titles[tt[1]])

                elif options.sort == "alphabetical":
                    t = list(zip(titles[1:], list(range(1, len(titles) + 1))))
                    t.sort()

                    for tt in t:
                        sort_order.append(titles[tt[1]])
                else:
                    sort_order = options.sort

                map_title2pos = {}
                for x in range(1, len(titles)):
                    map_title2pos[titles[x]] = x

                order = [
                    0,
                ]
                for x in sort_order:
                    if x in map_title2pos:
                        order.append(map_title2pos[x])

            else:
                order = list(range(0, len(titles)))

            outfile.write("\t".join(
                [titles[order[x]] for x in range(len(titles))]))
            outfile.write("\n")

        if options.sort_keys:
            if options.sort_keys == "numeric":
                sorted_keys.sort(key=float)
            else:
                sorted_keys.sort()

        for key in sorted_keys:

            outfile.write("%s" % key)

            for x in order[1:]:

                max_size, table = tables[x - 1]
                c = 0
                if key in table:
                    outfile.write("\t")
                    outfile.write("\t".join(table[key]))
                    c = len(table[key])

                assert (max_size == 1)

                outfile.write("\t%s" % options.missing_value * (max_size - c))

            outfile.write("\n")

    else:

        # for multi-column table, just write
        if options.input_has_titles:
            outfile.write("\t".join([titles[x] for x in range(len(titles))]))
            outfile.write("\n")

        for key in sorted_keys:

            outfile.write("%s" % key)

            for x in range(len(tables)):

                max_size, table = tables[x]
                c = 0
                if key in table:
                    outfile.write("\t")
                    outfile.write("\t".join(table[key]))
                    c = len(table[key])

                outfile.write("\t%s" % options.missing_value * (max_size - c))

            outfile.write("\n")
Example #14
0
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-i", "--input-bam", dest="input_bam_file", type="string",
        help="input bam file")

    parser.add_option(
        "-f", "--reference-bam", dest="reference_bam_file", type="string",
        help="reference BAM file [%default]")

    parser.add_option(
        "-q", "--query-name-regex", dest="query_name_regex", type="string",
        help="regular expression to apply on query name. "
        "Potentially required to match samtools sort order and should "
        "evaluate to an integer [%default]")

    parser.set_defaults(
        input_bam_file=None,
        reference_bam_file=None,
        query_name_regex=None,
    )

    (options, args) = E.start(parser, argv, add_output_options=True)

    if len(args) == 2:
        options.input_bam_file = args[0]
        options.reference_bam_file = args[1]

    if options.input_bam_file is None:
        raise ValueError("please supply a BAM file as input")

    if options.reference_bam_file is None:
        raise ValueError("please supply a BAM file as reference")

    # update paths to absolute
    options.input_bam_file = os.path.abspath(options.input_bam_file)
    options.reference_bam_file = os.path.abspath(options.reference_bam_file)

    if not os.path.exists(options.input_bam_file):
        raise OSError("input bam file {} does not exist".format(
            options.input_bam_file))

    if not os.path.exists(options.reference_bam_file):
        raise OSError("reference bam file {} does not exist".format(
            options.reference_bam_file))

    bam_in = pysam.AlignmentFile(options.input_bam_file)
    ref_in = pysam.AlignmentFile(options.reference_bam_file)

    outf_mapped = E.open_output_file("mapped")
    outf_mapped.write("\t".join(
        ["read",
         "length",
         "status",
         "overlap",
         "comp_contig",
         "comp_start",
         "comp_end",
         "ref_contig",
         "ref_start",
         "ref_end",
         "shared_misaligned",
         "shared_aligned",
         "shared_insertion",
         "shared_deletion",
         "comp_aligned",
         "comp_insertion",
         "comp_deletion",
         "ref_aligned",
         "ref_insertion",
         "ref_deletion"]) + "\n")

    outf_missing = E.open_output_file("missing")
    outf_missing.write("\t".join(
        ["read", "length", "status", "aligned",
         "insertion", "deletion"]) + "\n")

    counter = E.Counter()

    if options.query_name_regex:
        rx = re.compile(options.query_name_regex)

    def extract_query(x):
        return int(rx.search(x).groups()[0])

    qname_fn = None
    if options.query_name_regex:
        qname_fn = extract_query

    for reads_cmp, read_ref in group_pairs(iterate_read_pairs(
            bam_in.fetch(until_eof=True),
            ref_in.fetch(until_eof=True),
            qname_fn=qname_fn)):

        if len(reads_cmp) == 0:
            counter.missing += 1
            pairs_ref = set(read_ref.get_aligned_pairs())
            outf_missing.write("\t".join(
                map(str, (
                    read_ref.query_name,
                    read_ref.query_length,
                    "missing") +
                    count_pairs(pairs_ref))) + "\n")
            continue

        if len(reads_cmp) > 1:
            # multiple matches
            counter.multi_mapping += 1
            prefix = "multi_"
        else:
            counter.unique_mapping += 1
            prefix = "unique_"

        is_mapped = False
        for read_cmp in reads_cmp:

            counter.paired += 1

            if read_cmp.is_unmapped:
                counter.unmapped += 1
                pairs_ref = set(read_ref.get_aligned_pairs())
                outf_missing.write("\t".join(
                    map(str, (
                        read_ref.query_name,
                        read_ref.query_length,
                        "unmapped") +
                        count_pairs(pairs_ref))) + "\n")
                continue

            overlap = max(0, (min(read_cmp.reference_end,
                                  read_ref.reference_end) -
                              max(read_cmp.reference_start,
                                  read_ref.reference_start)))

            pairs_cmp = set(read_cmp.get_aligned_pairs())
            pairs_ref = set(read_ref.get_aligned_pairs())
            shared_cmp = pairs_cmp.intersection(pairs_ref)
            unique_cmp = pairs_cmp.difference(pairs_ref)
            misaligned = len([x for x, y in unique_cmp
                              if x is not None and y is not None])

            if read_cmp.reference_name != read_ref.reference_name or \
               overlap == 0:
                status = "mismapped"
            else:
                counter.overlap += 1
                status = "mapped"
                is_mapped = True

            outf_mapped.write("\t".join(
                map(str, (read_cmp.query_name,
                          read_cmp.query_length,
                          prefix + status,
                          overlap,
                          read_cmp.reference_name,
                          read_cmp.reference_start,
                          read_cmp.reference_end,
                          read_ref.reference_name,
                          read_ref.reference_start,
                          read_ref.reference_end,
                          misaligned) +
                    count_pairs(shared_cmp) +
                    count_pairs(pairs_cmp) +
                    count_pairs(pairs_ref))) + "\n")
        else:
            if is_mapped:
                status = "mapped"
            else:
                status = "mismapped"

            counter[prefix + status] += 1

    with E.open_output_file("summary") as outf:
        outf.write("category\tcounts\n")
        outf.write(counter.asTable() + "\n")

    E.stop()
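
count_pairs (and the pairing helpers group_pairs/iterate_read_pairs) are referenced above but not included in this excerpt. Judging from the output columns (aligned, insertion, deletion), a plausible sketch of count_pairs is the following; this is an assumption, not the script's actual implementation:

def count_pairs(pairs):
    """Summarise a set of (query_pos, ref_pos) tuples from pysam's
    get_aligned_pairs() into (aligned, insertion, deletion) counts.
    Hedged reconstruction of the helper used above."""
    aligned = sum(1 for q, r in pairs if q is not None and r is not None)
    insertion = sum(1 for q, r in pairs if q is not None and r is None)
    deletion = sum(1 for q, r in pairs if q is None and r is not None)
    return aligned, insertion, deletion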
Example #15
0
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b",
                      "--reference-bed-file",
                      dest="reference_bed_file",
                      type="string",
                      help="reference bed file "
                      "[%default]")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("lvc-comparison", ),
                      help="methods to apply [%default]")

    parser.set_defaults(method="lvc-comparison",
                        reference_fasta_file=None,
                        input_bed_file=None,
                        size_bins=(1000, 10000, 100000),
                        output_sets=True,
                        region_string=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    reference_set = collections.defaultdict(quicksect.IntervalTree)

    E.info("reading reference bed file from {}".format(
        options.reference_bed_file))
    with IOTools.open_file(options.reference_bed_file) as inf:
        for record in pysam.tabix_iterator(inf, pysam.asBed()):
            mm = reference_set[record.contig]
            mm.add(record.start, record.end)
    E.info("read reference intervals on {} contigs: {}".format(
        len(list(reference_set.keys())), ",".join(list(reference_set.keys()))))

    if options.output_sets:
        output_tp = E.open_output_file("tp")
        output_fp = E.open_output_file("fp")
        output_fn = E.open_output_file("fn")
    else:
        output_tp = None
        output_fp = None
        output_fn = None

    if options.method == "lvc-comparison":
        c = E.Counter()

        found = set()
        counts = {}
        names = set()
        nsize_bins = len(options.size_bins)
        for bin in range(len(options.size_bins) + 1):
            counts[bin] = dict([(x, collections.defaultdict(int))
                                for x in ("tp", "fn", "fp", "test", "truth")])

        for record in pysam.tabix_iterator(options.stdin, pysam.asBed()):
            if record.contig not in reference_set:
                c.ignored_no_contig += 1
                continue

            c.test += 1
            matches = reference_set[record.contig].search(
                record.start, record.end)
            size = record.end - record.start
            bin = get_size_bin(size, options.size_bins)

            if len(matches) == 0:
                c.fp += 1
                status = "fp"
                if output_fp:
                    output_fp.write(str(record) + "\n")
            elif len(matches) >= 1:
                c.tp += 1
                status = "tp"
                if output_tp:
                    output_tp.write(str(record) + "\n")
                # todo: overlap criteria

                # record found
                for match in matches:
                    found.add((record.contig, match.start, match.end))

            name = record.name.split(",")[0]
            names.add(name)
            counts[bin]["test"][name] += 1
            counts[bin][status][name] += 1

        outf = options.stdout

        with IOTools.open_file(options.reference_bed_file) as inf:
            for record in pysam.tabix_iterator(inf, pysam.asBed()):
                c.truth += 1
                bin = get_size_bin(record.end - record.start,
                                   options.size_bins)
                counts[bin]["truth"]["all"] += 1

                key = (record.contig, record.start, record.end)
                if key not in found:
                    c.fn += 1
                    counts[bin]["fn"]["all"] += 1

        outf.write("\t".join(("category", "size", "test", "tp", "fp", "truth",
                              "fn")) + "\n")

        for name in sorted(names):
            for bin in range(len(options.size_bins) + 1):
                if bin == len(options.size_bins):
                    size_bin = ">={}".format(options.size_bins[-1])
                else:
                    size_bin = "<{}".format(options.size_bins[bin])
                outf.write("\t".join(
                    map(str, (
                        name,
                        size_bin,
                        counts[bin]["test"][name],
                        counts[bin]["tp"][name],
                        counts[bin]["fp"][name],
                        counts[bin]["truth"]["all"],
                        counts[bin]["fn"]["all"],
                    ))) + "\n")

    E.info(str(c))
    E.stop()
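
get_size_bin is used above but not defined in this excerpt. Given the bin labels written out ("<threshold" for each size bin and ">=largest" for the overflow bin), a plausible reconstruction is:

def get_size_bin(size, size_bins):
    """Return the index of the first threshold that `size` falls below,
    or len(size_bins) for sizes >= the largest threshold.
    Assumed behaviour, not the original helper."""
    for i, threshold in enumerate(size_bins):
        if size < threshold:
            return i
    return len(size_bins)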
Example #16
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--min-overlap",
                      dest="min_overlap",
                      type="float",
                      help="minimum overlap [%default]")

    parser.add_option("-a",
                      "--bam-file",
                      dest="filename_bam",
                      metavar="bam",
                      type="string",
                      help="bam-file to use (required) [%default]")

    parser.add_option("-b",
                      "--bed-file",
                      dest="filename_bed",
                      metavar="bed",
                      type="string",
                      help="bed-file to use (required) [%default]")

    parser.add_option("-s",
                      "--sort-bed",
                      dest="sort_bed",
                      action="store_true",
                      help="sort the bed file by chromosomal location before "
                      "processing. "
                      "[%default]")

    parser.add_option(
        "--assume-sorted",
        dest="sort_bed",
        action="store_false",
        help="assume that the bed-file is sorted by chromosomal location. "
        "[%default]")

    parser.add_option(
        "--split-intervals",
        dest="split_intervals",
        action="store_true",
        help="treat split BAM intervals, for example spliced intervals, "
        "as separate intervals. Note that a single alignment might be "
        "counted several times as a result. "
        "[%default]")

    parser.set_defaults(
        min_overlap=0.5,
        filename_bam=None,
        filename_bed=None,
        sort_bed=True,
        split_intervals=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    filename_bam = options.filename_bam
    filename_bed = options.filename_bed

    if filename_bam is None and filename_bed is None:
        if len(args) != 2:
            raise ValueError(
                "please supply a bam and a bed file or two bed-files.")

        filename_bam, filename_bed = args

    if filename_bed is None:
        raise ValueError("please supply a bed file to compare to.")

    if filename_bam is None:
        raise ValueError("please supply a bam file to compare with.")

    E.info("intersecting the two files")

    min_overlap = options.min_overlap

    options.stdout.write("category\talignments\n")

    # get number of columns of reference bed file
    for bed in Bed.iterator(IOTools.open_file(filename_bed)):
        ncolumns_bed = bed.columns
        break
    E.info("assuming %s is bed%i format" % (filename_bed, ncolumns_bed))

    if ncolumns_bed < 4:
        raise ValueError("please supply a name attribute in the bed file")

    # get information about
    if filename_bam.endswith(".bam"):
        format = "-abam"
        samfile = pysam.AlignmentFile(filename_bam, "rb")
        total = samfile.mapped
        # latest bedtools uses bed12 format when bam is input
        ncolumns_bam = 12
        # count per read
        sort_key = lambda x: x.name
    else:
        format = "-a"
        total = IOTools.get_num_lines(filename_bam)
        # get bed format
        ncolumns_bam = 0
        for bed in Bed.iterator(IOTools.open_file(filename_bam)):
            ncolumns_bam = bed.columns
            break

        if ncolumns_bam > 0:
            E.info("assuming %s is bed%i fomat" % (filename_bam, ncolumns_bam))
            if ncolumns_bam == 3:
                # count per interval
                sort_key = lambda x: (x.contig, x.start, x.end)
            else:
                # count per interval category
                sort_key = lambda x: x.name

    # use fields for bam/bed file (regions to count with)
    data_fields = [
        "contig", "start", "end", "name", "score", "strand", "thickstart",
        "thickend", "rgb", "blockcount", "blockstarts", "blockends"
    ][:ncolumns_bam]

    # add fields for second bed (regions to count in)
    data_fields.extend([
        "contig2", "start2", "end2", "name2", "score2", "strand2",
        "thickstart2", "thickend2", "rgb2", "blockcount2", "blockstarts2",
        "blockends2"
    ][:ncolumns_bed])

    # add bases overlap
    data_fields.append("bases_overlap")

    data = collections.namedtuple("data", data_fields)

    options.stdout.write("total\t%i\n" % total)

    if total == 0:
        E.warn("no data in %s" % filename_bam)
        return

    # SNS: sorting optional, off by default
    if options.sort_bed:
        bedcmd = "<( gunzip < %s | sort -k1,1 -k2,2n)" % filename_bed
    else:
        bedcmd = filename_bed

    if options.split_intervals:
        split = "-split"
    else:
        split = ""

    # IMS: newer versions of intersectBed have a very high memory
    #      requirement unless passed sorted bed files.
    statement = """bedtools intersect %(format)s %(filename_bam)s
    -b %(bedcmd)s
    %(split)s
    -sorted -bed -wo -f %(min_overlap)f""" % locals()

    E.info("starting counting process: %s" % statement)
    proc = E.run(statement, return_popen=True, stdout=subprocess.PIPE)

    E.info("counting")
    counts_per_alignment = collections.defaultdict(int)
    take_columns = len(data._fields)

    def iterate(infile):
        for line in infile:
            if not line.strip():
                continue
            yield data._make(line[:-1].split()[:take_columns])

    for read, overlaps in itertools.groupby(iterate(
            IOTools.force_str(proc.stdout)),
                                            key=sort_key):
        annotations = [x.name2 for x in overlaps]
        for anno in annotations:
            counts_per_alignment[anno] += 1

    for key, counts in sorted(counts_per_alignment.items()):
        options.stdout.write("%s\t%i\n" % (key, counts))

    # write footer and output benchmark information.
    E.stop()
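
The counting loop above depends on the bedtools output arriving grouped by the chosen sort key, because itertools.groupby only groups adjacent records. A toy sketch of the same pattern on hypothetical records:

import collections
import itertools

Record = collections.namedtuple("Record", "name name2")
records = [Record("read1", "geneA"), Record("read1", "geneA"),
           Record("read2", "geneB")]            # already grouped by name

counts_per_alignment = collections.defaultdict(int)
for read, overlaps in itertools.groupby(records, key=lambda x: x.name):
    annotations = [x.name2 for x in overlaps]
    for anno in annotations:
        counts_per_alignment[anno] += 1

print(dict(counts_per_alignment))               # {'geneA': 2, 'geneB': 1}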
Example #17
0
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-a",
                      "--as-gtf",
                      dest="as_gtf",
                      action="store_true",
                      help="output as gtf.")

    parser.add_option(
        "-f",
        "--id-format",
        dest="id_format",
        type="string",
        help="format for numeric identifier if --as-gtf is set and "
        "no name in bed file [%default].")

    parser.set_defaults(as_gtf=False, id_format="%08i", test=None)

    (options, args) = E.start(parser, add_pipe_options=True)

    as_gtf = options.as_gtf
    id_format = options.id_format

    # as_gtf only changes which attributes are filled in below;
    # both output formats start from the same entry object
    gff = GTF.Entry()

    gff.source = "bed"
    gff.feature = "exon"

    ninput, noutput, nskipped = 0, 0, 0

    id = 0
    for bed in Bed.iterator(options.stdin):

        ninput += 1

        gff.contig = bed.contig
        gff.start = bed.start
        gff.end = bed.end
        if bed.fields and len(bed.fields) >= 3:
            gff.strand = bed.fields[2]
        else:
            gff.strand = "."

        if bed.fields and len(bed.fields) >= 2:
            gff.score = bed.fields[1]

        if as_gtf:
            if bed.fields:
                gff.gene_id = bed.fields[0]
                gff.transcript_id = bed.fields[0]
            else:
                id += 1
                gff.gene_id = id_format % id
                gff.transcript_id = id_format % id
        else:
            if bed.fields:
                gff.source = bed.fields[0]

        options.stdout.write(str(gff) + "\n")

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.stop()
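
For reference, the field mapping performed above boils down to the following plain-Python sketch for a BED6 line (GTF coordinates are 1-based, a shift that GTF.Entry is expected to handle when it formats the record; the toy example below does it by hand):

bed_line = "chr1\t100\t200\tmygene\t0.9\t+"
contig, start, end, name, score, strand = bed_line.split("\t")

gtf_line = "\t".join([
    contig, "bed", "exon", str(int(start) + 1), end, score, strand, ".",
    'gene_id "%s"; transcript_id "%s";' % (name, name)])
print(gtf_line)
# tab-separated: chr1 bed exon 101 200 0.9 + . gene_id "mygene"; transcript_id "mygene";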
Example #18
0
def main(argv=None):

    if not argv:
        argv = sys.argv

    # get the options
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-c",
                      "--chain-file",
                      dest="chainfile",
                      type="string",
                      help="the chain file to analyse",
                      metavar="FILE")

    parser.add_option(
        "--alignments-per-contig",
        dest="nperchrom",
        type="int",
        help="Number of aligments to report on per chromosome pair",
        default=2)

    parser.add_option(
        "--aggregate-by",
        dest="aggregate",
        type="choice",
        choices=("contig", "none"),
        help="Set to `contig` to perform per chromosome pair analysis",
        default="none")

    parser.add_option(
        "-i",
        "--output-identity",
        dest="output_identity",
        action="store_true",
        help="Generate stats on the sequence identity of the gapped "
        "chains. Requires FastaIndex.py",
        default=False)

    parser.add_option("-d",
                      "--dbpath",
                      dest="dbpath",
                      type="string",
                      help="The path to the indexed fasta files",
                      default=".")

    parser.add_option("-t",
                      "--target-genome",
                      dest="targetgenome",
                      type="string",
                      help="The target genome, eg. Mm19",
                      default=False)

    parser.add_option("-q",
                      "--query-genome",
                      dest="querygenome",
                      type="string",
                      help="The query genome eg. Hg17",
                      default=False)

    parser.add_option(
        "-e",
        "--errors",
        dest="errors",
        action="store_true",
        help="Check chains for erroneous contig sizes using the given db",
        default=False)

    parser.add_option("-r",
                      "--output-report",
                      dest="output_report",
                      action="store_true",
                      help="Write out tab-delimited reports for each analysis",
                      default=False)

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    # make a list of counting objects
    counters = []

    counters.append(CounterPerChromosome(gapped=True))
    counters.append(CounterPerChromosome(gapped=False))

    if options.aggregate == "contig":
        counters.append(CounterPerChromosomePair(gapped=True))
        counters.append(CounterPerChromosomePair(gapped=False))

    counters.append(CounterOfGappedChainLengths(gapped=True))
    counters.append(CounterOfGappedChainLengths(gapped=False))

    if options.output_identity is True:
        if not options.targetgenome or not options.querygenome:
            raise Exception(
                "Target and query genomes must be specified with the "
                "\"-t\" and \"-q\" options when using \"-i\"")
        t_db_path = os.path.join(options.dbpath, options.targetgenome)
        q_db_path = os.path.join(options.dbpath, options.querygenome)
        counters.append(CounterPercentIdentify(t_db_path, q_db_path))

    if options.errors is True:
        if not options.targetgenome or not options.querygenome:
            raise Exception(
                "Target and query genomes must be specified with the "
                "\"-t\" and \"-q\" options when using \"-e\"")
        counters.append(CounterOfErrors(options))

    # iterate over the chains and counters
    for chain in chain_iterator(options.stdin):
        c = Chain(chain)
        for counter in counters:
            counter.add(c)

    # write a report to stdout and individual reports to tab delimited files
    options.stdout.write(
        "\n\n********** chain2stats report starts **********\n")

    for counter in counters:
        counter.report(options)
        if options.output_report is True:
            counter.tabbed_report(options, E)

    options.stdout.write("\n********** chain2stats report ends **********\n\n")

    E.stop()
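
The Counter* classes are not part of this excerpt; from the loop above they share a small interface (add a chain, report to stdout, optionally write a tabbed report). A hedged sketch of that interface as a hypothetical base class:

class ChainCounter:
    """Hypothetical sketch of the interface the loop above relies on."""

    def __init__(self, gapped=True):
        self.gapped = gapped
        self.nchains = 0

    def add(self, chain):
        # accumulate statistics for a single chain
        self.nchains += 1

    def report(self, options):
        # write a human-readable summary to the main report
        options.stdout.write("chains seen: %i\n" % self.nchains)

    def tabbed_report(self, options, E):
        # write a tab-delimited per-counter report (omitted in this sketch)
        pass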
Example #19
0
    def _iterate(self):
        """iterate over muliple files."""
        def _iter(infile):

            identifier = None
            is_new = False

            for line in infile:
                if line.startswith("#"):
                    continue
                if line.startswith(">"):

                    if self.regexIdentifier:
                        try:
                            identifier = re.search(self.regexIdentifier,
                                                   line[1:-1]).groups()[0]
                        except AttributeError:
                            raise ValueError(
                                "could not parse identifier from line %s "
                                "- check the input" % line[1:-1])
                    else:
                        identifier = re.split(r"\s", line[1:-1])[0]
                    is_new = True
                else:
                    if not identifier:
                        raise ValueError(
                            "refusing to emit sequence without identifier "
                            "- check the input")
                    yield is_new, identifier, line.strip()
                    is_new = False

        for filename in self.filenames:
            if self.format == "tar.gz" or self.format == "tar" or \
               (self.format == "auto" and filename.endswith("tar.gz")):
                if filename == "-":
                    tf = tarfile.open(fileobj=sys.stdin.buffer, mode="r|*")
                else:
                    tf = tarfile.open(filename, mode="r")
                for f in tf:
                    b, ext = os.path.splitext(f.name)
                    if ext.lower() in (".fasta", ".fa"):
                        E.info("extracting %s" % f.name)
                        if sys.version_info.major >= 3:
                            infile = io.TextIOWrapper(tf.extractfile(f),
                                                      encoding="ascii")
                        else:
                            infile = tf.extractfile(f)
                        for x in _iter(infile):
                            yield x
                    else:
                        E.info("skipping %s" % f.name)

                if tf != sys.stdin:
                    tf.close()
                continue
            elif self.format == "fasta.gz" or (self.format == "auto"
                                               and filename.endswith(".gz")):
                infile = IOTools.open_file(filename, "r")
            elif filename == "-":
                infile = sys.stdin
            else:
                infile = IOTools.open_file(filename, "r")

            for x in _iter(infile):
                yield x
            if filename != "-":
                infile.close()

        # falling off the end of the function ends the generator;
        # raising StopIteration explicitly is an error under PEP 479 (Python 3.7+)
        return
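
_iterate yields (is_new, identifier, sequence_line) triples one line at a time. A small sketch of how a consumer might assemble complete records from that stream (not part of the original class):

def assemble_records(triples):
    """Collect (is_new, identifier, line) triples into (identifier, sequence)
    records; sketch of a possible consumer of _iterate()."""
    identifier, chunks = None, []
    for is_new, ident, line in triples:
        if is_new and chunks:
            yield identifier, "".join(chunks)
            chunks = []
        identifier = ident
        chunks.append(line)
    if chunks:
        yield identifier, "".join(chunks)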
Example #20
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: set_diff.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-p", "--add-percent", dest="add_percent", action="store_true",
                      help="add percentage information to each line.")

    parser.add_option("-t", "--header-names", dest="headers", type="string",
                      help="comma separated list of headers. If empty or set to '-', filenames are used.")

    parser.add_option("--skip-header", dest="add_header", action="store_false",
                      help="do not add header to flat format.")

    parser.add_option("--output-with-header", dest="write_header", action="store_true",
                      help="write header and exit.")

    parser.add_option("--with-title", dest="with_title", action="store_true",
                      help="use column titles in input data [%default].")

    parser.add_option("--no-title", dest="with_title", action="store_false",
                      help="there are no titles in input data [%default].")

    parser.set_defaults(
        add_percent=False,
        percent_format="%5.2f",
        headers=None,
        add_header=True,
        write_header=False,
        with_title=True,
    )

    (options, args) = E.start(parser)

    if options.add_header:
        options.stdout.write(
            "set1\tset2\tn1\tn2\tunion\tinter\tunique1\tunique2")
        if options.add_percent:
            options.stdout.write(
                "\tpinter\tpunique1\tpunique2\tpcov1\tpcov2\tpcovmax")
        options.stdout.write("\n")

        if options.write_header:
            sys.exit(0)

    if len(args) < 2:
        raise ValueError("please supply at least two filenames.")

    headers, titles, sets = [], [], []

    if options.headers:
        if options.headers == "-":
            headers = args
        else:
            headers = options.headers.split(",")
            if len(headers) != len(args):
                raise ValueError(
                    "please supply the same number of headers as there are filenames.")

    for f in args:
        if options.with_title:
            title, data = IOTools.readList(
               IOTools.open_file(f, "r"), with_title=options.with_title)
            titles.append(title)
        else:
            data = IOTools.readList(open(f, "r"))
        sets.append(set(data))

    # fall back to column titles or filenames only if no headers were supplied
    if not headers:
        headers = titles if titles else args

    for x in range(len(sets) - 1):
        set1 = sets[x]

        for y in range(x + 1, len(sets)):
            set2 = sets[y]
            l1, l2 = len(set1), len(set2)
            options.stdout.write("%s\t%s\t%i\t%i\t%i\t%i\t%i\t%i" % (headers[x], headers[y],
                                                                     l1, l2,
                                                                     len(set1.union(
                                                                         set2)),
                                                                     len(set1.intersection(
                                                                         set2)),
                                                                     len(set1.difference(
                                                                         set2)),
                                                                     len(set2.difference(set1))))

            if options.add_percent:
                if len(set1) == 0:
                    ri, r1, r2 = 0, 1, 0
                    c1, c2, cm = 1, 0, 0
                elif len(set2) == 0:
                    ri, r1, r2 = 0, 0, 1
                    c1, c2, cm = 0, 1, 0
                else:
                    i = len(set1.intersection(set2))
                    ri, r1, r2 = (
                        i / float(len(set1.union(set2))),
                        len(set1.difference(set2)) / float(l1),
                        len(set2.difference(set1)) / float(l2))
                    c1, c2 = (i / float(l1), i / float(l2))
                    cm = max(c1, c2)

                options.stdout.write(
                    "\t" + ("\t".join([options.percent_format for z in range(6)])) % (ri, r1, r2, c1, c2, cm))

            options.stdout.write("\n")

    E.stop()
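
The percentage block above computes a Jaccard-style overlap plus per-set unique and coverage fractions. A worked toy example of those quantities:

set1 = {"a", "b", "c", "d"}              # l1 = 4
set2 = {"c", "d", "e"}                   # l2 = 3

i = len(set1.intersection(set2))         # 2
ri = i / len(set1.union(set2))           # 2 / 5 = 0.40   (pinter)
r1 = len(set1 - set2) / len(set1)        # 2 / 4 = 0.50   (punique1)
r2 = len(set2 - set1) / len(set2)        # 1 / 3 = 0.33   (punique2)
c1, c2 = i / len(set1), i / len(set2)    # 0.50, 0.67     (pcov1, pcov2)
cm = max(c1, c2)                         # 0.67           (pcovmax)
print(ri, r1, r2, c1, c2, cm)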
Example #21
0
def runCommand(data):

    filename, cmd, options, tmpdir, subdirs = data

    if subdirs:
        outdir = "%s.dir/" % (filename)
        os.mkdir(outdir)
        cmd = re.sub("%DIR%", outdir, cmd)

    x = re.search(r"'--log=(\S+)'", cmd) or re.search(r"'--L\s+(\S+)'", cmd)
    if x:
        logfile = filename + ".log"
        cmd = cmd[:x.start()] + "--log=%s" % logfile + cmd[x.end():]
    else:
        logfile = filename + ".out"

    # working directory - needs to be the one from which the
    # the script is called to resolve input files.
    cwd = os.getcwd()

    if "<(" in cmd or "|" in cmd:
        if "'" in cmd:
            raise ValueError(
                "advanced bash syntax `<()` combined with single quotes")
        cmd = """/bin/bash -c '%s'""" % cmd

    if "|" in cmd:
        if r"\|" not in cmd:
            E.warn("pipes (`|`) within command need to be escaped, "
                   "otherwise jobs run on submit host")

    c = '%s -v "BASH_ENV=%s" -q %s -p %i %s %s' % (
        options.cluster_cmd, options.bashrc, options.cluster_queue,
        options.cluster_priority, options.cluster_options, cmd)

    iteration = 0
    while 1:

        iteration += 1
        if iteration > 1:
            E.info("%s: re-submitting command (repeat=%i): %s" %
                   (filename, iteration, c))
        else:
            E.info("%s: submitting command: %s" % (filename, c))

        infile = IOTools.openFile(filename, "r")
        outfile = IOTools.openFile(filename + ".out", "w")
        errfile = IOTools.openFile(filename + ".err", "a")

        retcode = subprocess.call(c,
                                  shell=True,
                                  stdin=infile,
                                  stdout=outfile,
                                  stderr=errfile,
                                  cwd=cwd,
                                  close_fds=True)

        infile.close()
        outfile.close()
        errfile.close()

        if hasFinished(retcode, filename, options.output_tag, logfile):
            break

        if iteration > options.resubmit:
            E.warn("%s: giving up executing command: retcode=%i" %
                   (filename, retcode))
            break

        E.warn("%s: error while executing command: retcode=%i" %
               (filename, retcode))

    return (retcode, filename, cmd, logfile, iteration)
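
hasFinished is called above but not shown. From its arguments (return code, input filename, output tag, logfile) a plausible check is a clean exit plus the completion tag in the job output; this is a guess at its behaviour, not the original code:

def hasFinished(retcode, filename, output_tag, logfile):
    """Assumed completion check: the job must exit with status 0 and the
    completion tag must appear in the job's output file."""
    if retcode != 0:
        return False
    try:
        with open(filename + ".out") as inf:
            return any(output_tag in line for line in inf)
    except OSError:
        return False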
Example #22
0
def build_report():
    '''build report from scratch.'''

    E.info("starting documentation build process from scratch")
    P.run_report(clean=True)
Example #23
0
def getOptionParser():
    """create parser and add options."""

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--split-at-lines",
                      dest="split_at_lines",
                      type="int",
                      help="split jobs according to line number [%default].")

    parser.add_option(
        "--split-at-column",
        dest="split_at_column",
        type="int",
        help="split jobs according to column. Columns start at number 1 "
        "and the input should be sorted by this column [%default].")

    parser.add_option(
        "--group-by-regex",
        dest="group_by_regex",
        type="string",
        help="group jobs according to a regular expression [%default].")

    parser.add_option(
        "--split-at-regex",
        dest="split_at_regex",
        type="string",
        help="split jobs according to a regular expression [%default].")

    parser.add_option("--split-at-tag",
                      dest="split_at_tag",
                      type="int",
                      help="split a file at a tag [%default].")

    parser.add_option(
        "--chunk-size",
        dest="chunksize",
        type="int",
        help="when splitting at regex or tag, aggregate x entries [%default].")

    parser.add_option(
        "--debug",
        dest="debug",
        action="store_true",
        help="debug mode. Do not delete temporary file [%default].")

    parser.add_option(
        "--dry-run",
        dest="dry_run",
        action="store_true",
        help="dry run. Do not split input and simply forward stdin to stdout. "
        "Useful for debugging the command [%default].")

    parser.add_option("--input-header",
                      dest="input_header",
                      action="store_true",
                      help="The input stream contains a table header. "
                      "This header is replicated for each job [%default].")

    parser.add_option(
        "--output-header",
        dest="output_header",
        action="store_true",
        help="The output jobs contain a table header. "
        "The header is removed for each job except for the first [%default].")

    parser.add_option(
        "--output-regex-header",
        dest="output_regex_header",
        type="string",
        help="Regular expression for header (in stdout stream). Any lines "
        "before the first line matching this regular expression are ignored"
        "[%default].")

    parser.add_option(
        "--output-tag",
        dest="output_tag",
        type="string",
        help="The output jobs contain a tag in the last line denoting "
        "job completion. If the unix return value denotes an error, the "
        "presence of this tag is checked [%default].")

    parser.add_option(
        "--subdirs",
        dest="subdirs",
        action="store_true",
        help="Run within separate subdirs for jobs. This permits "
        "multiple output streams. Use a placeholder %DIR% if you supply "
        "the ouput pattern as a command line option [%default].")

    parser.add_option(
        "-T",
        "--temp-dir",
        dest="tmpdir",
        type="string",
        help="Temporary directory to be used. Default is the current "
        "directory [%default].")

    parser.add_option("--max-files",
                      dest="max_files",
                      type="int",
                      help="create at most x files [%default].")

    parser.add_option(
        "--max-lines",
        dest="max_lines",
        type="int",
        help="in addition to splitting into chunksize, also split if "
        "more than max-lines is reached [%default].")

    parser.add_option(
        "--renumber",
        dest="renumber",
        type="string",
        help="renumber ids consecutively, supply a pattern [%default].")

    parser.add_option(
        "--renumber-column",
        dest="renumber_column",
        type="string",
        action="append",
        help="specify column to renumber. The format is regex:column, "
        "for example csv:1 or csv:id [%default].")

    parser.add_option(
        "-r",
        "--reduce",
        dest="reduce",
        type="string",
        action="append",
        help="Add reduce functions for specific files. The format is "
        "file:reducer. The default reducer is 'table' for all files "
        "[%default].")

    parser.add_option(
        "-m",
        "--map",
        dest="map",
        type="string",
        action="append",
        help="Map specific columns in tables. The format is "
        "file:column:pattern, for example .table:1:%06i [%default].")

    parser.add_option("--resume",
                      dest="resume",
                      type="string",
                      help="resume aborted run from files in dir [%default]")

    parser.add_option("--collect",
                      dest="collect",
                      type="string",
                      help="collect files in dir and process as normally "
                      "[%default]")

    parser.add_option("--is-binary",
                      dest="binary",
                      action="store_true",
                      help="the output is binary - files are concatenated "
                      "without parsing [%default]")

    parser.add_option(
        "--resubmit",
        dest="resubmit",
        type="int",
        help="if a job fails, automatically resubmit # times. Set to 0 "
        "in order to disable resubmission [%default]")

    parser.add_option("--fail",
                      dest="resubmit",
                      action="store_false",
                      help="if a job fails, do not resubmit [%default]")

    parser.add_option("--bashrc",
                      dest="bashrc",
                      type="string",
                      help="bashrc file to use [%default]")

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("multiprocessing", "threads", "drmaa"),
                      help="method to submit jobs [%default]")

    parser.add_option("--job-memory",
                      dest="job_memory",
                      type="string",
                      help="per-job memory requirement."
                      "Unit must be specified, eg. 100M, 1G ")

    parser.add_option(
        "-e",
        "--env",
        dest="environment",
        type="string",
        action="append",
        help="environment variables to be passed to the jobs [%default]")

    parser.add_option(
        "--output-filename-pattern",
        dest="output_pattern",
        type="string",
        help="Pattern for secondary output filenames. Should contain a '%s' "
        "[%default].")

    parser.set_defaults(
        split_at_lines=None,
        split_at_column=None,
        split_at_regex=None,
        group_by_regex=None,
        split_at_tag=None,
        chunksize=100,
        cluster_cmd='qrsh -cwd -now n',
        bashrc="~/.bashrc",
        input_header=False,
        output_header=False,
        output_regex_header=None,
        debug=False,
        dry_run=False,
        tmpdir="./",
        subdirs=False,
        renumber=None,
        output_tag="# job finished",
        map=[],
        reduce=[],
        resume=None,
        renumber_column=[],
        resubmit=5,
        collect=None,
        method="drmaa",
        job_memory=None,
        max_files=None,
        max_lines=None,
        binary=False,
        environment=[],
        output_pattern="%s",
    )

    # stop parsing options at the first argument
    parser.disable_interspersed_args()

    return parser
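
A minimal usage sketch of the parser factory above, following the E.start/E.stop pattern used in the other examples (the job-splitting body itself is omitted):

import sys

def main(argv=sys.argv):
    parser = getOptionParser()
    (options, args) = E.start(parser, argv=argv)
    # ... split the input stream, submit one job per chunk, collect results ...
    E.stop()

if __name__ == "__main__":
    sys.exit(main(sys.argv))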
Example #24
0
def update_report():
    '''update report.'''

    E.info("updating documentation")
    P.run_report(clean=False)
Example #25
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_mann_whitney_u.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m", "--method", dest="method", type="string",
                      help="method to use [ks=Kolmogorov-Smirnov,mwu=Mann-WhitneyU]")
    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file.", metavar="FILE")
    parser.add_option("-1", "--infile1", dest="filename_input1", type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2", "--infile2", dest="filename_input2", type="string",
                      help="input filename for distribution 2.")
    parser.add_option("-p", "--infile-map", dest="filename_input_map", type="string",
                      help="input filename for mapping categories to values.")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
    )

    (options, args) = E.start(parser,
                              add_pipe_options=True)

    map_category2value = {}
    if options.filename_input_map:
        map_category2value = IOTools.ReadMap(open(options.filename_input_map, "r"),
                                             map_functions=(str, float))

    values1, errors1 = IOTools.ReadList(open(options.filename_input1, "r"),
                                        map_category=map_category2value)
    values2, errors2 = IOTools.ReadList(open(options.filename_input2, "r"),
                                        map_category=map_category2value)

    E.info("ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i" % (len(values1), len(errors1),
                                                                 len(values2), len(errors2)))

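    # open a PNG graphics device if a hardcopy file was requested;
    # all plots below are then written to that file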
    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2)
    elif options.method == "mwu":
        result = R.wilcox_test(values1, values2, paired=False)

    R.assign("v1", values1)
    R.assign("v2", values2)

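    # 2x2 plot layout: boxplot, quantile-quantile plot,
    # relative and absolute frequency histograms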
    R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

    R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")

    R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );""")

    R("""hist( v1, freq=FALSE, width=0.5, density=10, main='Relative frequency histogram')""")
    R("""hist( v2, freq=FALSE, add=TRUE,   width=0.5, col='red', offset=0.5, density=20, angle=135)""")
    R("""hist( v1, freq=TRUE,  width=0.5, density=10, main='Absolute frequency histogram')""")
    R("""hist( v2, freq=TRUE,  add=TRUE,   width=0.5, col='red', offset=0.5, density=20, angle=135)""")

    print("## Results for %s" % result['method'])
    for x in ['p.value', 'statistic', 'alternative', 'method']:
        print(x, result[x])

    E.stop()
Example #26
0
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-f", "--fasta", dest="input_filename_fasta",
        type="string",
        help="filename with fasta sequences. ")

    parser.add_option(
        "-o", "--output-filename-sequences", dest="output_filename_sequences",
        type="string",
        help="output per sequence information to filename")

    parser.set_defaults(
        input_filename_fasta=None,
    )

    (options, args) = E.start(parser, argv=argv)

    if len(args) > 0:
        options.input_filename_fasta = args[0]

    sequence_pairs = []

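    # if a samtools faidx index (.fai) exists, take names and lengths from it
    # without reading the sequences; otherwise iterate over the records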
    if options.input_filename_fasta != "-" and os.path.exists(
            options.input_filename_fasta + ".fai"):
        has_index = 1
        fastafile = pysam.FastaFile(options.input_filename_fasta)
        sequence_pairs = list(zip(fastafile.references, fastafile.lengths))
    else:
        has_index = 0
        iterator = pysam.FastxFile(options.input_filename_fasta)
        for record in iterator:
            sequence_pairs.append(
                (record.name,
                 len(record.sequence)))

    lengths = numpy.array([x[1] for x in sequence_pairs])

    options.stdout.write("\t".join((
        "has_index", "nsequences", "total_length", "min_length",
        "max_length", "median_length", "mean_length")) + "\n")

    if len(lengths) > 0:
        options.stdout.write("\t".join(map(str, (
            has_index,
            len(sequence_pairs),
            lengths.sum(),
            lengths.min(),
            lengths.max(),
            numpy.median(lengths),
            lengths.mean()))) + "\n")
    else:
        options.stdout.write("\t".join(map(str, (
            has_index,
            len(sequence_pairs),
            0,
            "",
            "",
            "",
            ""))) + "\n")

    if options.output_filename_sequences:
        with IOTools.open_file(options.output_filename_sequences, "w") as outf:
            outf.write("name\tlength\n")
            outf.write(
                "\n".join(["\t".join(map(str, x)) for x in sequence_pairs]) + "\n")

    E.stop()
Example #27
0
def buildAlleles(sequence, variants, reference_start=0, phased=True):
    '''build alleles for ``sequence`` adding ``variants``.

    Variants are assumed to be in 0-based coordinates on the same strand as the sequence.
    ``reference_start`` is the position of the first base of ``sequence``. Set it to 0 if
    the positions in ``variants`` are relative to ``sequence``.
    '''
    def _delete(allele, del_start, del_end, variant, sequence, startoffset,
                endoffset, feature_start, feature_end):
        '''little helper: update ``allele`` with a deletion ``del_start:del_end``.
        '''

        # truncate variant according to the feature
        variant = variant[startoffset:len(variant) - endoffset]

        n = variant.count("-")
        if n:
            if variant.startswith("-"):
                del_start += n
                variant = variant[n:]
            else:
                del_end -= n
                variant = variant[:-n]

        # due to gaps, the variant is not actually within the feature
        if del_start >= del_end:
            return

        refseq = sequence[del_start:del_end].upper()

        assert refseq == variant, \
            'reference base mismatch at deletion: expected %s %s %s, got %s[%i:%i] at feature=%i-%i, variant=%i-%i, relative=%i-%i, del=%i-%i, action=%s' % \
            (sequence[del_start - 10:del_start],
             refseq,
             sequence[del_end:del_end + 10],
             variant, startoffset, len(variant) - endoffset,
             feature_start, feature_end,
             var_start, var_end,
             rel_start, rel_end,
             del_start, del_end,
             action)

        l = del_end - del_start

        # assert len("".join(allele[del_start:del_end])) == l, \
        #     "deletion conflicts with other indels: " \
        #     "got %s[%i:%i] (ref=%s, allele=%s) at feature=%i-%i, variant=%i-%i, relative=%i-%i, del=%i-%i, action=%s" % \
        #     (variant, startoffset, len(variant)-endoffset,
        #      refseq, str(allele[del_start:del_end]),
        #      feature_start, feature_end,
        #      var_start, var_end,
        #      rel_start, rel_end,
        #      del_start, del_end,
        #      action)

        allele[del_start:del_end] = [""] * l

    allele1 = list(sequence.lower())
    allele2 = list(sequence.lower())

    if reference_start is None:
        feature_start = 0
    else:
        feature_start = reference_start

    feature_end = feature_start + len(sequence)

    # main loop: insert variants into allele sequences
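    # each variant carries an action code:
    #   "="  substitution (SNP)
    #   "-"  deletion
    #   "+"  insertion
    #   ">"  insertion applied to allele 1, deletion applied to allele 2
    #   "<"  insertion applied to allele 2, deletion applied to allele 1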
    for var_start, var_end, reference, action, has_wildtype, variantseqs in variants:

        # skip variants that are out-of-range
        if var_end <= feature_start or var_start >= feature_end:
            continue

        is_homozygous = len(variantseqs) == 1 and not has_wildtype

        rel_start, rel_end = var_start - feature_start, var_end - feature_start
        startoffset = max(0, feature_start - var_start)
        endoffset = max(0, var_end - feature_end)
        pruned_start, pruned_end = max(0,
                                       rel_start), min(len(sequence), rel_end)

        if action == "=":

            if E.global_options.loglevel >= 10:
                E.debug(
                    "adding SNP at postition %i: reference=%s variants=%s" %
                    (var_start, reference, variantseqs))

            if allele1[rel_start] == "" or allele2[rel_start] == "":
                # these can be cases, where a base is deleted in one allele,
                # but recorded as a homozygous substitution in another allele.
                E.warn("substitution conflicts with a deletion - ignored: %s" %
                       str((var_start, var_end, reference, action,
                            has_wildtype, variantseqs)))
                continue

            assert rel_start >= 0
            assert sequence[rel_start].upper() == reference, \
                'reference base mismatch: expected %s %s %s, got %s at feature=%i-%i, variant=%i-%i, relative=%i-%i, pruned=%i-%i, action=%s' % \
                (sequence[rel_start - 10:rel_start],
                 sequence[rel_start].upper(),
                 sequence[rel_start + 1:rel_start + 10],
                 reference,
                 feature_start, feature_end,
                 var_start, var_end,
                 rel_start, rel_end,
                 pruned_start, pruned_end,
                 action)

            if phased:
                allele1[rel_start] = variantseqs[0] + allele1[rel_start][1:]
                allele2[rel_start] = variantseqs[1] + allele2[rel_start][1:]
            elif is_homozygous:
                allele1[rel_start] = variantseqs[0] + allele1[rel_start][1:]
                allele2[rel_start] = variantseqs[0] + allele2[rel_start][1:]
            else:
                if has_wildtype:
                    if reference == variantseqs[0]:
                        allele2[rel_start] = variantseqs[1] + allele2[
                            rel_start][1:]
                    else:
                        allele2[rel_start] = variantseqs[0] + allele2[
                            rel_start][1:]
                else:
                    allele1[
                        rel_start] = variantseqs[0] + allele1[rel_start][1:]
                    allele2[
                        rel_start] = variantseqs[1] + allele2[rel_start][1:]

        elif action == "-":
            if phased:
                _delete(allele1, pruned_start, pruned_end, variantseqs[0],
                        sequence, startoffset, endoffset, feature_start,
                        feature_end)
                _delete(allele2, pruned_start, pruned_end, variantseqs[1],
                        sequence, startoffset, endoffset, feature_start,
                        feature_end)
            elif is_homozygous:
                _delete(allele1, pruned_start, pruned_end, variantseqs[0],
                        sequence, startoffset, endoffset, feature_start,
                        feature_end)
                _delete(allele2, pruned_start, pruned_end, variantseqs[0],
                        sequence, startoffset, endoffset, feature_start,
                        feature_end)
            else:
                if has_wildtype:
                    _delete(allele2, pruned_start, pruned_end, variantseqs[0],
                            sequence, startoffset, endoffset, feature_start,
                            feature_end)
                else:
                    _delete(allele1, pruned_start, pruned_end, variantseqs[0],
                            sequence, startoffset, endoffset, feature_start,
                            feature_end)
                    _delete(allele2, pruned_start, pruned_end, variantseqs[1],
                            sequence, startoffset, endoffset, feature_start,
                            feature_end)

        elif action == "+":
            # ignore insertions at position -1
            if rel_start < 0:
                continue

            if phased:
                allele1[rel_start] += variantseqs[0].upper()
                allele2[rel_start] += variantseqs[1].upper()
            elif is_homozygous:
                allele1[rel_start] += variantseqs[0].upper()
                allele2[rel_start] += variantseqs[0].upper()
            else:
                if has_wildtype:
                    allele2[rel_start] += variantseqs[0].upper()
                else:
                    allele1[rel_start] += variantseqs[0].upper()
                    allele2[rel_start] += variantseqs[1].upper()

        elif action == ">":
            # indel
            if rel_start >= 0:
                allele1[rel_start] += variantseqs[0].upper()
            _delete(allele2, pruned_start, pruned_end, variantseqs[1],
                    sequence, startoffset, endoffset, feature_start,
                    feature_end)

        elif action == "<":
            # delin
            if rel_start >= 0:
                allele2[rel_start] += variantseqs[1].upper()
            _delete(allele1, pruned_start, pruned_end, variantseqs[0],
                    sequence, startoffset, endoffset, feature_start,
                    feature_end)

    assert len(sequence) == len(allele1)
    assert len(sequence) == len(allele2)

    return (allele1, allele2)
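

# A minimal usage sketch for buildAlleles, not part of the original script.
# It assumes the variant tuple layout used in the loop above,
# (var_start, var_end, reference, action, has_wildtype, variantseqs),
# and that E.start() has been called so that E.global_options is initialised.
def _example_buildAlleles():
    sequence = "ACGTACGT"
    # heterozygous SNP at position 2 (0-based): G on one allele, A on the other
    variants = [(2, 3, "G", "=", True, ("G", "A"))]
    allele1, allele2 = buildAlleles(sequence, variants,
                                    reference_start=0, phased=True)
    # phased=True assigns variantseqs[0] to allele 1 and variantseqs[1] to allele 2
    return "".join(allele1), "".join(allele2)  # -> ("acGtacgt", "acAtacgt")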
Example #28
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--glob", dest="glob_pattern", type="string",
        help="glob pattern to use for collecting files [%default].")

    parser.add_option(
        "-f", "--file-pattern", dest="file_pattern", type="string",
        help="only check files matching this pattern [%default].")

    parser.add_option("-m", "--mode", dest="mode", type="choice",
                      choices=("file", "node"),
                      help="analysis mode [%default].")

    parser.add_option(
        "-r", "--recursive", action="store_true",
        help="recursively look for logfiles from current directory "
        "[%default].")

    parser.set_defaults(
        truncate_sites_list=0,
        glob_pattern="*.log",
        mode="file",
        recursive=False,
    )

    (options, args) = E.Start(parser)

    if args:
        filenames = args
    elif options.glob_pattern:
        filenames = glob.glob(options.glob_pattern)

    if len(filenames) == 0:
        raise ValueError("no files to analyse")

    if options.mode == "file":
        totals = Logfile.LogFileData()

        options.stdout.write("file\t%s\n" % totals.getHeader())

        for filename in filenames:
            if filename == "-":
                infile = sys.stdin
            elif filename[-3:] == ".gz":
                infile = gzip.open(filename, "r")
            else:
                infile = open(filename, "r")

            subtotals = Logfile.LogFileData()
            for line in infile:
                subtotals.add(line)

            infile.close()

            options.stdout.write("%s\t%s\n" % (filename, str(subtotals)))
            totals += subtotals

        options.stdout.write("%s\t%s\n" % ("total", str(totals)))

    elif options.mode == "node":

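        # group log data by compute node: every "# job started ... on <node>"
        # line opens a new chunk that collects the lines following it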
        chunks_per_node = {}

        rx_node = re.compile(r"# job started at .* \d+ on (\S+)")

        for filename in filenames:
            if filename == "-":
                infile = sys.stdin
            elif filename[-3:] == ".gz":
                infile = gzip.open(filename, "r")
            else:
                infile = open(filename, "r")

            data = Logfile.LogFileDataLines()

            for line in infile:

                if rx_node.match(line):
                    node_id = rx_node.match(line).groups()[0]
                    data = Logfile.LogFileDataLines()
                    if node_id not in chunks_per_node:
                        chunks_per_node[node_id] = []
                    chunks_per_node[node_id].append(data)
                    continue

                data.add(line)

        options.stdout.write("node\t%s\n" % data.getHeader())
        total = Logfile.LogFileDataLines()

        for node, data in sorted(chunks_per_node.items()):
            subtotal = Logfile.LogFileDataLines()
            for d in data:
                # options.stdout.write( "%s\t%s\n" % (node, str(d) ) )
                subtotal += d

            options.stdout.write("%s\t%s\n" % (node, str(subtotal)))

            total += subtotal

        options.stdout.write("%s\t%s\n" % ("total", str(total)))

    E.Stop()
Example #29
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: split_fasta.py 1714 2007-12-11 16:51:12Z andreas $"
    )

    parser.add_option("-f",
                      "--file",
                      dest="input_filename",
                      type="string",
                      help="input filename. If not given, stdin is used.",
                      metavar="FILE")

    parser.add_option(
        "-i",
        "--input-pattern",
        dest="input_pattern",
        type="string",
        help="input pattern. Parses description line in order to extract id.")

    parser.add_option(
        "-o",
        "--output-filename-pattern",
        dest="output_pattern",
        type="string",
        help="output pattern. Gives filename for a given sequence.")

    parser.add_option(
        "-n",
        "--num-sequences",
        dest="num_sequences",
        type="int",
        help="split by number of sequences (not implemented yet).")

    parser.add_option("-m",
                      "--map",
                      dest="map_filename",
                      type="string",
                      help="map filename. Map identifiers to filenames",
                      metavar="FILE")

    parser.add_option("-s",
                      "--skip-identifiers",
                      dest="skip_identifiers",
                      action="store_true",
                      help="do not write identifiers.",
                      metavar="FILE")

    parser.add_option("--min-size",
                      dest="min_size",
                      type="int",
                      help="minimum cluster size.")

    parser.set_defaults(input_filename=None,
                        map_filename=None,
                        skip_identifiers=False,
                        input_pattern="^(\S+)",
                        min_size=0,
                        num_sequences=None,
                        output_pattern="%s")

    (options, args) = E.start(parser)

    if options.input_filename:
        infile = IOTools.open_file(options.input_filename, "r")
    else:
        infile = sys.stdin

    if options.map_filename:
        map_id2filename = IOTools.ReadMap(open(options.map_filename, "r"))
    else:
        map_id2filename = {}

    if options.num_sequences:
        files = FilesChunks(chunk_size=options.num_sequences,
                            output_pattern=options.output_pattern,
                            skip_identifiers=options.skip_identifiers)

    else:
        files = Files(output_pattern=options.output_pattern,
                      skip_identifiers=options.skip_identifiers)

    if options.input_pattern:
        rx = re.compile(options.input_pattern)
    else:
        rx = None

    ninput = 0
    noutput = 0
    identifier = None
    chunk = 0

    for seq in FastaIterator.iterate(infile):

        ninput += 1

        if rx:
            try:
                identifier = rx.search(seq.title).groups()[0]
            except AttributeError:
                print("# parsing error in description line %s" % (seq.title))
        else:
            identifier = seq.title

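        # with a mapping file, rename identifiers via the map and skip
        # sequences that have no entry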
        if map_id2filename:
            if identifier in map_id2filename:
                identifier = map_id2filename[identifier]
            else:
                continue

        files.Write(identifier, seq)
        noutput += 1

    if options.input_filename:
        infile.close()

    # delete all clusters below a minimum size
    # Note: this has to be done at the end, because
    # clusters sizes are only available once both the fasta
    # file and the map has been parsed.
    if options.min_size:
        ndeleted = files.DeleteFiles(min_size=options.min_size)
    else:
        ndeleted = 0

    if options.loglevel >= 1:
        print("# input=%i, output=%i, ndeleted=%i" %
              (ninput, noutput, ndeleted))

    E.stop()


def main(argv=None):

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-e",
                      "--output-equivalent",
                      dest="write_equivalent",
                      action="store_true",
                      help="write equivalent entries [default=%default].")

    parser.add_option("-f",
                      "--output-full",
                      dest="write_full",
                      action="store_true",
                      help="write full gff entries [default=%default].")

    parser.add_option("-p",
                      "--add-percent",
                      dest="add_percent",
                      action="store_true",
                      help="add percentage columns [default=%default].")

    parser.add_option("-s",
                      "--ignore-strand",
                      dest="ignore_strand",
                      action="store_true",
                      help="ignore strand information [default=%default].")

    parser.set_defaults(
        write_equivalent=False,
        write_full=False,
        add_percent=False,
        ignore_strand=False,
        as_gtf=False,
    )

    (options, args) = E.start(parser, argv, add_output_options=True)

    if len(args) != 2:
        raise ValueError("two arguments required")

    input_filename1, input_filename2 = args

    # duplicated features cause a problem. Make sure
    # features are non-overlapping by running
    # gff_combine.py on GFF files first.

    E.info("reading data started")

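    # build one interval tree per contig from the second file; gene ids
    # from file 2 are kept for the overlap statistics below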
    idx, genes2 = {}, set()
    for e in GTF.readFromFile(IOTools.open_file(input_filename2, "r")):
        genes2.add(e.gene_id)
        if e.contig not in idx:
            idx[e.contig] = bx.intervals.intersection.Intersecter()
        idx[e.contig].add_interval(
            bx.intervals.Interval(e.start, e.end, value=e))

    overlaps_genes = []

    E.info("reading data finished: %i contigs" % len(idx))

    # outfile_diff and outfile_overlap not implemented
    # outfile_diff = getFile( options, "diff" )
    # outfile_overlap = getFile( options, "overlap" )
    overlapping_genes = set()

    genes1 = set()

    # iterate over exons
    with IOTools.open_file(input_filename1, "r") as infile:
        for this in GTF.iterator(infile):

            genes1.add(this.gene_id)

            try:
                intervals = idx[this.contig].find(this.start, this.end)
            except KeyError:
                continue

            others = [x.value for x in intervals]
            for other in others:
                overlapping_genes.add((this.gene_id, other.gene_id))

            # check for identical/half-identical matches
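            #   "=" both boundaries identical, "|" one shared boundary,
            #   "~" overlap only; the classification is currently unused
            #   because the diff/overlap outputs are not implemented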
            output = None
            for other in others:
                if this.start == other.start and this.end == other.end:
                    output, symbol = other, "="
                    break
            else:
                for other in others:
                    if this.start == other.start or this.end == other.end:
                        output, symbol = other, "|"
                        break
                else:
                    symbol = "~"

    # if outfile_diff != options.stdout: outfile_diff.close()
    # if outfile_overlap != options.stdout: outfile_overlap.close()

    outfile = None
    ##################################################################
    ##################################################################
    ##################################################################
    # print gene based information
    ##################################################################
    if overlapping_genes:
        outfile = getFile(options, "genes_ovl")
        outfile.write("gene_id1\tgene_id2\n")
        for a, b in sorted(overlapping_genes):
            outfile.write("%s\t%s\n" % (a, b))
        if outfile != options.stdout:
            outfile.close()

        outfile_total = getFile(options, "genes_total")
        outfile_total.write(
            "set\tngenes\tnoverlapping\tpoverlapping\tnunique\tpunique\n")

        outfile = getFile(options, "genes_uniq1")
        b = set([x[0] for x in overlapping_genes])
        d = genes1.difference(b)
        outfile.write("gene_id1\n")
        outfile.write("\n".join(sorted(d)) + "\n")
        if outfile != options.stdout:
            outfile.close()
        outfile_total.write(
            "%s\t%i\t%i\t%5.2f\t%i\t%5.2f\n" %
            (os.path.basename(input_filename1), len(genes1), len(b),
             100.0 * len(b) / len(genes1), len(d), 100.0 * len(d) / len(genes1)))

        outfile = getFile(options, "genes_uniq2")
        b = set([x[1] for x in overlapping_genes])
        d = genes2.difference(b)
        outfile.write("gene_id2\n")
        outfile.write("\n".join(sorted(d)) + "\n")
        if outfile != options.stdout:
            outfile.close()

        outfile_total.write(
            "%s\t%i\t%i\t%5.2f\t%i\t%5.2f\n" %
            (os.path.basename(input_filename2), len(genes2), len(b),
             100.0 * len(b) / len(genes2), len(d), 100.0 * len(d) / len(genes2)))
        if outfile_total != options.stdout:
            outfile_total.close()

    E.stop()