Example #1
def DumpGOFromDatabase(outfile, dbhandle, options):
    """read go assignments from database.

    and dump them into a flatfile.
    (one to many mapping of genes to GO categories)
    and a dictionary of go-term to go information
    """

    E.info("category\ttotal\tgenes\tcategories")

    all_genes = collections.defaultdict(int)
    all_categories = collections.defaultdict(int)
    all_ntotal = 0

    outfile.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")

    for go_type in options.ontology:

        genes = collections.defaultdict(int)
        categories = collections.defaultdict(int)
        ntotal = 0
        statement = GetGOStatement(go_type, options.database_name,
                                   options.species)

        results = Database.executewait(dbhandle, statement,
                                       retries=0).fetchall()

        for result in results:
            outfile.write("\t".join(map(str, (go_type, ) + result)) + "\n")
            gene_id, goid, description, evidence = result
            genes[gene_id] += 1
            categories[goid] += 1
            ntotal += 1
            all_genes[gene_id] += 1
            all_categories[goid] += 1
            all_ntotal += 1

        E.info("%s\t%i\t%i\t%i" %
               (go_type, ntotal, len(genes), len(categories)))

    E.info("%s\t%i\t%i\t%i" %
           ("all", all_ntotal, len(all_genes), len(all_categories)))

    return
Example #2
def splitFiles(infile, nchunks, out_dir):
    '''
    Give files names based on splitting into an arbitrary number of chunks
    '''

    df = pd.read_table(infile, sep="\t", header=0, index_col=0)
    total = len(df.index.tolist())

    # split into aribitrary number of chunks, or arbitrary chunk size?
    # small n bad for large input size, large n bad for small input size
    # set min/max chunk size, e.g. 100 genes minimum, 500 maximum?

    if total // nchunks < 100:
        step = 100
        E.warn("too few genes in each chunk, resetting to 100 genes per chunk")
    elif total // nchunks > 500:
        step = 500
        E.warn("too many genes per chunk, resetting to 500 genes per chunk")
    else:
        step = total // nchunks
        E.info("chunking input file into chunks of %i genes" % step)

    # note: rstrip() removes a set of characters, not a suffix, so strip
    # the "-expression.tsv" suffix explicitly
    file_pattern = infile.split("/")[1]
    if file_pattern.endswith("-expression.tsv"):
        file_pattern = file_pattern[:-len("-expression.tsv")]
    idx = 0
    for i in range(step, total, step):
        start = "%s" % idx
        end = "%s" % i
        file_name = "%s/%s-%s_%s-split.tsv" % (out_dir,
                                               file_pattern,
                                               start,
                                               end)
        with open(file_name, "w") as file_handle:
            file_handle.write(file_name + "\n")
        idx = i

    # final file
    start = "%s" % idx
    end = "%s" % total
    file_name = "%s/%s-%s_%s-split.tsv" % (out_dir,
                                           file_pattern,
                                           start,
                                           end)
    with open(file_name, "w") as file_handle:
        file_handle.write(file_name + "\n")
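
A minimal sketch of the chunk-boundary arithmetic used above, assuming a hypothetical gene count and chunk number; it only collects the (start, end) ranges instead of writing files:

total = 1250                                  # hypothetical number of genes
nchunks = 4
step = max(100, min(500, total // nchunks))   # clamp chunk size to [100, 500]

boundaries = []
idx = 0
for i in range(step, total, step):
    boundaries.append((idx, i))
    idx = i
boundaries.append((idx, total))               # final, possibly shorter chunk

print(boundaries)
# [(0, 312), (312, 624), (624, 936), (936, 1248), (1248, 1250)]
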
Example #3
    def parseHeader(self, infile, outfile, options):
        """parse header in infile."""
        # skip comments until header
        while True:
            l = infile.readline()
            if not l:
                break
            if self.header_regex:
                if self.header_regex.search(l):
                    break
            elif l[0] != "#":
                break
            options.stdlog.write(l)

        # print only the first header and check if
        # all the headers are the same.
        if self.header:
            if self.header != l:
                raise ValueError("inconsistent header in file %s\n"
                                 "got=%s\nexpected=%s" %
                                 (infile, l, self.header))
        else:
            outfile.write(l)
            self.header = l
            self.nfields = l.count("\t")
            if self.nfields == 0:
                E.warn("only single column in header: %s" % l[:-1])

            if self.mFieldIndex is None and self.mFieldName:
                try:
                    self.mFieldIndex = self.header.split("\t").index(
                        self.mFieldName)
                except ValueError:
                    E.warn("no mapping, can not find field %s in %s" %
                           (self.mFieldName, self.header))
                    self.mFieldName = None

                E.debug("substituting field: %s, %s" %
                        (self.mFieldName, self.mFieldIndex))
Example #4
def annotate(infile, annotation_file, outfile):
    '''
    annotate infile with annotations from
    annotation gtf file
    '''
    inf = open(infile)
    header = inf.readline()
    include = set()

    E.info("reading genes to keep")
    for line in inf.readlines():
        data = line[:-1].split("\t")
        gene_id = data[8].strip('"')
        include.add(gene_id)

    E.info("reading annotations file")
    annotations = {}
    for gtf in GTF.iterator(IOTools.openFile(annotation_file)):
        if gtf.gene_id in include:
            annotations[gtf.gene_id] = \
                [gtf.gene_name, gtf.species, gtf.description]

    inf = open(infile)
    header = inf.readline()

    E.info("writing results with annotations")
    outf = open(outfile, "w")
    outf.write(header.strip("\n") +
               "\tgene_name\tspecies_centroid\tdescription\n")
    for line in inf.readlines():
        data = line[:-1].split("\t")
        gene_id = data[8].strip('"')
        try:
            outf.write("\t".join(data + annotations[gene_id]) + "\n")
        except KeyError:
            outf.write("\t".join(data + ["NA", "NA", "NA"]) + "\n")
    outf.close()
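
The try/except KeyError fallback above can equivalently be written with dict.get; a small self-contained sketch with made-up gene identifiers and annotation values:

annotations = {"gene1": ["BRCA2", "hsapiens", "DNA repair"]}  # made-up values

for gene_id in ("gene1", "gene2"):
    row = ["chr1", "100", "200", gene_id]
    # fall back to NA columns when the gene has no annotation
    extra = annotations.get(gene_id, ["NA", "NA", "NA"])
    print("\t".join(row + extra))
# gene1 gets its annotation columns; gene2 is padded with NA/NA/NA
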
Example #5
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--test",
                      dest="test",
                      type="string",
                      help="supply help")

    parser.add_option("--plot-type",
                      dest="plot_type",
                      type="choice",
                      choices=["manhattan", "qqplot", "epistasis"],
                      help="plot type to generate")

    parser.add_option("--resolution",
                      dest="resolution",
                      type="choice",
                      choices=["genome_wide", "chromosome", "fine_map"],
                      help="the resolution of plotting, wether the plot "
                      "depicts the whole genome, a single chromosome or "
                      "a specific locus")

    parser.add_option("--file-format",
                      dest="file_format",
                      type="choice",
                      choices=["plink", "cassi", "cassi_covar"],
                      help="input file format, used to parse the file "
                      "properly")

    parser.add_option("--save-path",
                      dest="save_path",
                      type="string",
                      help="path and filename to save image to")

    parser.set_defaults(resolution="genome_wide",
                        plot_type="manhattan",
                        file_format="plink")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    # if the input is a list of files, split them
    infile = argv[-1]
    infiles = infile.split(",")

    # need to parse epistasis output slightly differently
    if options.plot_type == "epistasis":
        epi = True
    else:
        epi = False

    if len(infiles) > 1:
        results = gwas.GWASResults(assoc_file=infiles,
                                   epistasis=epi,
                                   file_format=options.file_format)
    elif len(infiles) == 1:
        results = gwas.GWASResults(assoc_file=infile,
                                   epistasis=epi,
                                   file_format=options.file_format)
    else:
        raise IOError("no input files detected, please specifiy association "
                      "results files as the last command line argument")

    if options.plot_type == "manhattan":
        df = results.plotManhattan(resolution=options.resolution,
                                   save_path=options.save_path)
    elif options.plot_type == "qqplot":
        results.plotQQ(save_path=options.save_path,
                       resolution=options.resolution)
    elif options.plot_type == "epistasis":
        results.plotEpistasis(save_path=options.save_path,
                              resolution=options.resolution)
    else:
        pass

    # only output appended results for Manhattan plot, not qqplot
    try:
        df.to_csv(options.stdout, sep="\t", index=None)
    except UnboundLocalError:
        pass

    # write footer and output benchmark information.
    E.stop()
Example #6
def mergeVariants(variants):
    '''merge overlapping variants.

    Overlapping variants occur if there are two deletions
    at the same location:

        WT      ACTG  
        Allele1 -CT-   
        Allele2 ----

    This will be encoded by samtools as (0-based coordinates)::

        0 * -A/ACTG
        3 * -G/-G

    This upsets the re-constitution algorithm.

    This method separates these two variants into two non-overlapping
    variants making use of variable length deletions.

        0 * -A/-A
        1 * ---G/-CTG

    Another case:

        WT      ACTG  
        Allele1 ACT-   
        Allele2 ----

    This will be encoded by samtools as (0-based coordinates)::

        0 * */-ACTG
        3 * -G/*

    This method separates these two as::

        0 * */-ACT
        3 * -G/-G

    '''

    if len(variants) == 0:
        return []

    # sorts by start and then end
    variants.sort()
    merged_variants = []

    def _add(offset, dest, src):
        for x, c in enumerate(src):
            dest[x + offset] = c

    def _split(seq0, seq1):
        # split
        was_0, was_1 = seq0[0] == "-", seq1[0] == "-"
        for x, cc in enumerate(zip(seq0, seq1)):
            is_0, is_1 = cc[0] == "-", cc[1] == "-"
            # yield all changes
            if (is_0 ^ was_0) or (is_1 ^ was_1):
                yield x, was_0, was_1
            was_0, was_1 = is_0, is_1

        yield x + 1, was_0, was_1

    last = variants[0]
    for this in variants[1:]:

        if this.start < last.end and \
                this.action == "-" and \
                last.action == "-":

            E.warn("merging overlapping deletions: %s and %s" %
                   (str(last), str(this)))

            mend = max(last.end, this.end)
            mstart = min(this.start, last.start)
            l = mend - mstart

            seq0 = list("-" * l)
            seq1 = list("-" * l)

            _add(last.start - mstart, seq0, last.variantseqs[0])
            _add(last.start - mstart, seq1, last.variantseqs[1])
            _add(this.start - mstart, seq0, this.variantseqs[0])
            _add(this.start - mstart, seq1, this.variantseqs[1])

            last_x = 0
            n = []
            for x, was_0, was_1 in _split(seq0, seq1):
                if last_x == x:
                    continue

                this = ExtendedVariant._make((
                    mstart + last_x,
                    mstart + x,
                    "*",
                    last.action,
                    was_0 ^ was_1,
                    ["".join(seq0[last_x:x]), "".join(seq1[last_x:x])],
                ))
                n.append(this)
                last_x = x

            E.warn("overlapping deletions merged in %i blocks as: %s" %
                   (len(n), list(map(str, n))))
            merged_variants.extend(n[:-1])
            this = n[-1]
        else:
            merged_variants.append(last)

        last = this

    merged_variants.append(last)

    return merged_variants
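
The core of the merge is the _split generator, which cuts the overlaid allele strings wherever the gap ("-") state of either allele changes. A self-contained sketch of the same idea on made-up allele strings (not the exact docstring case):

def _split(seq0, seq1):
    # yield positions where the gap state of either allele changes
    was_0, was_1 = seq0[0] == "-", seq1[0] == "-"
    for x, (c0, c1) in enumerate(zip(seq0, seq1)):
        is_0, is_1 = c0 == "-", c1 == "-"
        if (is_0 ^ was_0) or (is_1 ^ was_1):
            yield x, was_0, was_1
        was_0, was_1 = is_0, is_1
    yield x + 1, was_0, was_1

seq0, seq1 = "-A--", "ACTG"          # hypothetical overlaid alleles
last_x, blocks = 0, []
for x, was_0, was_1 in _split(seq0, seq1):
    if x == last_x:
        continue
    blocks.append((seq0[last_x:x], seq1[last_x:x]))
    last_x = x

print(blocks)                        # [('-', 'A'), ('A', 'C'), ('--', 'TG')]
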
Example #7
def main(argv=None):

    parser = getOptionParser()

    (options, args) = E.Start(parser, add_cluster_options=True)

    if len(args) == 0:
        raise ValueError(
            "command line argument missing - see usage information")

    options.renumber_column = [x.split(":") for x in options.renumber_column]

    cmd = args[0]
    if len(args) > 1:
        cmd += " '" + "' '".join(args[1:]) + "'"

    if options.dry_run:

        cmd = re.sub("%DIR%", "", cmd)
        retcode = subprocess.call(cmd,
                                  shell=True,
                                  stdin=sys.stdin,
                                  stdout=sys.stdout,
                                  cwd=os.getcwd(),
                                  close_fds=True)
        E.Stop()
        sys.exit(0)

    failed_requests = []
    started_requests = []
    niterations = 0

    if not options.collect:
        tmpdir = os.path.abspath(tempfile.mkdtemp(dir=options.tmpdir))

        E.info(" working in directory %s" % tmpdir)

        if options.split_at_lines:
            chunk_iterator = chunk_iterator_lines
            args = (options.split_at_lines, )
        elif options.split_at_column:
            chunk_iterator = chunk_iterator_column
            args = (options.split_at_column - 1, options.max_files)
        elif options.split_at_regex:
            chunk_iterator = chunk_iterator_regex_split
            args = (re.compile(options.split_at_regex), 0, options.chunksize,
                    options.max_lines)
        elif options.group_by_regex:
            chunk_iterator = chunk_iterator_regex_group
            args = (re.compile(options.group_by_regex), 0, options.chunksize)
        else:
            raise ValueError("please specify a way to chunk input data")

        data = [(x, cmd, options, None, options.subdirs)
                for x in chunk_iterator(options.stdin,
                                        args,
                                        prefix=tmpdir,
                                        use_header=options.input_header)]

        started_requests = [(x[0], x[0] + ".out") for x in data]

        if len(data) == 0:
            E.warn("no data received")
            E.Stop()
            sys.exit(0)

        if options.method == "multiprocessing":
            pool = Pool(options.cluster_num_jobs)
            results = pool.map(runCommand, data, chunksize=1)
        elif options.method == "drmaa":
            results = []
            runDRMAA(data, environment=options.environment)
        elif options.method == "threads":
            pool = ThreadPool(options.cluster_num_jobs)
            results = pool.map(runCommand, data, chunksize=1)

        niterations = 0
        for retcode, filename, cmd, logfile, iterations in results:
            niterations += iterations
            if not hasFinished(retcode, filename, options.output_tag, logfile):
                failed_requests.append((filename, cmd))

    else:
        tmpdir = options.collect
        started_requests = [(x[:-4], x) for x in glob.glob(tmpdir + "/*.out")]

        E.info("collecting %i files from %s" % (len(started_requests), tmpdir))

    if failed_requests:
        for fn, cmd in failed_requests:
            E.error("failed request: filename= %s, cmd= %s" % (fn, cmd))
    else:
        E.info("building result from %i parts" % len(started_requests))

        if options.renumber:
            mapper = MapperLocal(pattern=options.renumber)
        else:
            mapper = MapperEmpty()

        # deal with stdout
        name = None
        index = None

        for pattern, column in options.renumber_column:

            if re.search(pattern, "stdout"):
                try:
                    index = int(column) - 1
                except ValueError:
                    name = column
                break

        if options.binary:
            ResultBuilderBinary()(started_requests, options.stdout, options)
        else:
            regex = None
            if options.output_regex_header:
                regex = re.compile(options.output_regex_header)
            ResultBuilder(mapper=mapper,
                          field_index=index,
                          field_name=name,
                          header_regex=regex)(started_requests, options.stdout,
                                              options)

        # deal with logfiles : combine them into a single file
        rr = re.search("'--log=(\S+)'", cmd) or re.search("'--L\s+(\S+)'", cmd)
        if rr:
            E.info("logging output goes to %s" % rr.groups()[0])
            logfile = IOTools.openFile(rr.groups()[0], "a")
            ResultBuilderLog()([(x[0], "%s.log" % x[0])
                                for x in started_requests], logfile, options)
            logfile.close()

        # deal with other files
        if options.subdirs:

            files = glob.glob("%s/*.dir/*" % tmpdir)
            # remove directory
            filenames = set([os.path.basename(x) for x in files])
            xx = len(".out")

            for filename in filenames:

                _, filetype = os.path.splitext(filename)

                name = None
                index = None

                for pattern, column in options.renumber_column:
                    if re.search(pattern, filename):
                        try:
                            index = int(column) - 1
                        except ValueError:
                            name = column
                        break

                if options.binary:
                    builder = ResultBuilderBinary(mapper=mapper)
                elif filetype in (".fa", ".fasta"):
                    builder = ResultBuilderFasta(mapper=mapper)
                elif filetype in (".mali", ):
                    builder = ResultBuilderFasta(mapper=MapperEmpty())
                elif filetype in (".psl"):
                    builder = ResultBuilderPSL(mapper=mapper)
                elif filetype in (".gtf", ".gff"):
                    builder = ResultBuilderGFF(mapper=mapper,
                                               field_index=index,
                                               field_name=name)
                elif filetype in (".png"):
                    builder = ResultBuilderCopies(mapper=mapper)
                else:
                    builder = ResultBuilder(mapper=mapper,
                                            field_index=index,
                                            field_name=name)

                E.debug("chose the following builder for %s: %s: %s" %
                        (filename, filetype, str(builder)))

                E.info("collecting results for %s" % filename)

                input_filenames = []
                for fi, fn in started_requests:
                    fn = fn[:-xx] + ".dir/" + filename
                    if os.path.exists(fn):
                        input_filenames.append((fi, fn))

                E.info("output of %i files goes to %s" %
                       (len(filenames), filename))

                outfile = IOTools.openFile(options.output_pattern % filename,
                                           "w")
                builder(input_filenames, outfile, options)
                outfile.close()

    if not options.debug and (not options.resume or not options.collect):
        if len(failed_requests) == 0:
            E.info("removing directory %s" % tmpdir)
            shutil.rmtree(tmpdir)
        else:
            E.info("directory %s not removed due to %i failed jobs" %
                   (tmpdir, len(failed_requests)))

    E.info("job control: nstarted=%i, nfinished=%i, nerrors=%i, nrepeats=%i" %
           (len(started_requests), len(started_requests) -
            len(failed_requests), len(failed_requests), niterations))

    E.Stop()
Example #8
def runDRMAA(data, environment):
    '''run jobs in data using drmaa to connect to the cluster.'''

    # SNS: Error detection now taken care of with Cluster.py
    # expandStatement function

    # working directory - needs to be the one from which
    # the script is called, to resolve input files.
    cwd = os.getcwd()

    session = drmaa.Session()
    session.initialize()

    jobids = []
    kwargs = {}

    for filename, cmd, options, tmpdir, subdirs in data:

        from_stdin, to_stdout = True, True

        if subdirs:
            outdir = "%s.dir/" % (filename)
            os.mkdir(outdir)
            cmd = re.sub("%DIR%", outdir, cmd)

        x = re.search("'--log=(\S+)'", cmd) or re.search("'--L\s+(\S+)'", cmd)
        if x:
            logfile = filename + ".log"
            cmd = cmd[:x.start()] + "--log=%s" % logfile + cmd[x.end():]
        else:
            logfile = filename + ".out"

        if "%STDIN%" in cmd:
            cmd = re.sub("%STDIN%", filename, cmd)
            from_stdin = False

        if "%STDOUT%" in cmd:
            cmd = re.sub("%STDOUT%", filename + ".out", cmd)
            to_stdout = False

        cmd = " ".join(re.sub("\t+", " ", cmd).split("\n"))
        E.info("running statement:\n%s" % cmd)

        job_script = tempfile.NamedTemporaryFile(dir=os.getcwd(),
                                                 delete=False,
                                                 mode="w+t")
        job_script.write("#!/bin/bash\n")  # -l -O expand_aliases\n" )
        job_script.write(Cluster.expandStatement(cmd) + "\n")
        job_script.close()

        job_path = os.path.abspath(job_script.name)

        os.chmod(job_path, stat.S_IRWXG | stat.S_IRWXU)

        # get session for process - only one is permitted

        job_name = os.path.basename(kwargs.get("outfile", "farm.py"))

        options_dict = vars(options)
        options_dict["workingdir"] = os.getcwd()

        if options.job_memory:
            job_memory = options.job_memory
        elif options.cluster_memory_default:
            job_memory = options.cluster_memory_default
        else:
            job_memory = "2G"

        jt = Cluster.setupDrmaaJobTemplate(session, options_dict, job_name,
                                           job_memory)

        jt.remoteCommand = job_path

        # update the environment
        e = {'BASH_ENV': options.bashrc}
        if environment:
            for en in environment:
                try:
                    e[en] = os.environ[en]
                except KeyError:
                    raise KeyError(
                        "could not export environment variable '%s'" % en)
        jt.jobEnvironment = e

        # SNS: Native specification setting abstracted
        # to Pipeline/Cluster.setupDrmaaJobTemplate()

        # use stdin for data
        if from_stdin:
            jt.inputPath = ":" + filename

        # set paths.

        # later: allow redirection of stdout and stderr to files
        # could this even be across hosts?
        if to_stdout:
            jt.outputPath = ":" + filename + ".out"
        else:
            jt.outputPath = ":" + filename + ".stdout"

        jt.errorPath = ":" + filename + ".err"

        jobid = session.runJob(jt)
        jobids.append((jobid, job_path, filename, cmd, logfile))

    E.debug("%i jobs have been submitted" % len(jobids))

    results = []

    for jobid, job_path, filename, cmd, logfile in jobids:

        try:
            retval = session.wait(jobid, drmaa.Session.TIMEOUT_WAIT_FOREVER)
        except Exception as msg:
            # ignore message 24 in PBS
            # code 24: drmaa: Job finished but resource usage information
            # and/or termination status could not be provided.":
            if not str(msg).startswith("code 24"):
                raise
            retval = None

        if retval and retval.exitStatus != 0:
            raise OSError("Child was terminated by signal %i: \n%s\n" %
                          (retval.exitStatus, cmd))

        results.append((retval, filename, cmd, logfile, 1))

        os.unlink(job_path)

    session.deleteJobTemplate(jt)
    session.exit()
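
For reference, the bare drmaa session lifecycle that runDRMAA builds on looks roughly like this; a minimal sketch assuming the Python drmaa bindings and a configured cluster, with a hypothetical job script path:

import drmaa

session = drmaa.Session()
session.initialize()

jt = session.createJobTemplate()
jt.remoteCommand = "/path/to/job_script.sh"   # hypothetical job script
jt.outputPath = ":/path/to/job.out"
jt.errorPath = ":/path/to/job.err"

jobid = session.runJob(jt)
retval = session.wait(jobid, drmaa.Session.TIMEOUT_WAIT_FOREVER)
print("job %s finished with exit status %s" % (jobid, retval.exitStatus))

session.deleteJobTemplate(jt)
session.exit()
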
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
        """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--method",
                      dest="methods",
                      type="choice",
                      action="append",
                      choices=(
                          "geneprofile",
                          "tssprofile",
                          "utrprofile",
                          "intervalprofile",
                          "midpointprofile",
                          "geneprofilewithintrons",
                          "geneprofileabsolutedistancefromthreeprimeend",
                          "separateexonprofile",
                          "separateexonprofilewithintrons",
                      ),
                      help='counters to use. Counters describe the '
                      'meta-gene structure to use. '
                      'Note using geneprofilewithintrons, or '
                      'geneprofileabsolutedistancefromthreeprimeend will '
                      'automatically turn on the --use-base-accuracy option'
                      '[%default].')

    parser.add_option("-b",
                      "--bam-file",
                      "--bedfile",
                      "--bigwigfile",
                      dest="infiles",
                      metavar="BAM",
                      type="string",
                      action="append",
                      help="BAM/bed/bigwig files to use. Do not mix "
                      "different types [%default]")

    parser.add_option("-c",
                      "--control-bam-file",
                      dest="controlfiles",
                      metavar="BAM",
                      type="string",
                      action="append",
                      help="control/input to use. Should be of the same "
                      "type as the bam/bed/bigwig file"
                      " [%default]")

    parser.add_option("-g",
                      "--gtf-file",
                      dest="gtffile",
                      type="string",
                      metavar="GTF",
                      help="GTF file to use. "
                      "[%default]")

    parser.add_option("--normalize-transcript",
                      dest="transcript_normalization",
                      type="choice",
                      choices=("none", "max", "sum", "total-max", "total-sum"),
                      help="normalization to apply on each transcript "
                      "profile before adding to meta-gene profile. "
                      "[%default]")

    parser.add_option("--normalize-profile",
                      dest="profile_normalizations",
                      type="choice",
                      action="append",
                      choices=("all", "none", "area", "counts", "background"),
                      help="normalization to apply on meta-gene "
                      "profile normalization. "
                      "[%default]")

    parser.add_option(
        "-r",
        "--reporter",
        dest="reporter",
        type="choice",
        choices=("gene", "transcript"),
        help="report results for genes or transcripts."
        " When 'genes` is chosen, exons across all transcripts for"
        " a gene are merged. When 'transcript' is chosen, counts are"
        " computed for each transcript separately with each transcript"
        " contributing equally to the meta-gene profile."
        " [%default]")

    parser.add_option("-i",
                      "--shift-size",
                      dest="shifts",
                      type="int",
                      action="append",
                      help="shift reads in :term:`bam` formatted file "
                      "before computing densities (ChIP-Seq). "
                      "[%default]")

    parser.add_option("-a",
                      "--merge-pairs",
                      dest="merge_pairs",
                      action="store_true",
                      help="merge pairs in :term:`bam` formatted "
                      "file before computing "
                      "densities (ChIP-Seq). "
                      "[%default]")

    parser.add_option("-u",
                      "--use-base-accuracy",
                      dest="base_accuracy",
                      action="store_true",
                      help="compute densities with base accuracy. The default "
                      "is to only use the start and end of the aligned region "
                      "(RNA-Seq) "
                      "[%default]")

    parser.add_option("-e",
                      "--extend",
                      dest="extends",
                      type="int",
                      action="append",
                      help="extend reads in :term:`bam` formatted file "
                      "(ChIP-Seq). "
                      "[%default]")

    parser.add_option("--resolution-upstream",
                      dest="resolution_upstream",
                      type="int",
                      help="resolution of upstream region in bp "
                      "[%default]")

    parser.add_option("--resolution-downstream",
                      dest="resolution_downstream",
                      type="int",
                      help="resolution of downstream region in bp "
                      "[%default]")

    parser.add_option("--resolution-upstream-utr",
                      dest="resolution_upstream_utr",
                      type="int",
                      help="resolution of upstream UTR region in bp "
                      "[%default]")

    parser.add_option("--resolution-downstream-utr",
                      dest="resolution_downstream_utr",
                      type="int",
                      help="resolution of downstream UTR region in bp "
                      "[%default]")

    parser.add_option("--resolution-cds",
                      dest="resolution_cds",
                      type="int",
                      help="resolution of cds region in bp "
                      "[%default]")

    parser.add_option("--resolution-first-exon",
                      dest="resolution_first",
                      type="int",
                      help="resolution of first exon in gene, in bp"
                      "[%default]")

    parser.add_option("--resolution-last-exon",
                      dest="resolution_last",
                      type="int",
                      help="resolution of last exon in gene, in bp"
                      "[%default]")

    parser.add_option("--resolution-introns",
                      dest="resolution_introns",
                      type="int",
                      help="resolution of introns region in bp "
                      "[%default]")

    parser.add_option("--resolution-exons-absolute-distance-topolya",
                      dest="resolution_exons_absolute_distance_topolya",
                      type="int",
                      help="resolution of exons absolute distance "
                      "topolya in bp "
                      "[%default]")

    parser.add_option("--resolution-introns-absolute-distance-topolya",
                      dest="resolution_introns_absolute_distance_topolya",
                      type="int",
                      help="resolution of introns absolute distance "
                      "topolya in bp "
                      "[%default]")

    parser.add_option("--extension-exons-absolute-distance-topolya",
                      dest="extension_exons_absolute_distance_topolya",
                      type="int",
                      help="extension for exons from the absolute "
                      "distance from the topolya in bp "
                      "[%default]")

    parser.add_option(
        "--extension-introns-absolute-distance-topolya",
        dest="extension_introns_absolute_distance_topolya",
        type="int",
        help="extension for introns from the absolute distance from "
        "the topolya in bp [%default]")

    parser.add_option("--extension-upstream",
                      dest="extension_upstream",
                      type="int",
                      help="extension upstream from the first exon in bp"
                      "[%default]")

    parser.add_option("--extension-downstream",
                      dest="extension_downstream",
                      type="int",
                      help="extension downstream from the last exon in bp"
                      "[%default]")

    parser.add_option("--extension-inward",
                      dest="extension_inward",
                      type="int",
                      help="extension inward from a TSS start site in bp"
                      "[%default]")

    parser.add_option("--extension-outward",
                      dest="extension_outward",
                      type="int",
                      help="extension outward from a TSS start site in bp"
                      "[%default]")

    parser.add_option("--scale-flank-length",
                      dest="scale_flanks",
                      type="int",
                      help="scale flanks to (integer multiples of) gene length"
                      "[%default]")

    parser.add_option(
        "--control-factor",
        dest="control_factor",
        type="float",
        help="factor for normalizing control and foreground data. "
        "Computed from data if not set. "
        "[%default]")

    parser.add_option("--output-all-profiles",
                      dest="output_all_profiles",
                      action="store_true",
                      help="keep individual profiles for each "
                      "transcript and output. "
                      "[%default]")

    parser.add_option("--counts-tsv-file",
                      dest="input_filename_counts",
                      type="string",
                      help="filename with count data for each transcript. "
                      "Use this instead "
                      "of recomputing the profile. Useful for plotting the "
                      "meta-gene profile "
                      "from previously computed counts "
                      "[%default]")

    parser.add_option(
        "--background-region-bins",
        dest="background_region_bins",
        type="int",
        help="number of bins on either end of the profile "
        "to be considered for background meta-gene normalization "
        "[%default]")

    parser.set_defaults(
        remove_rna=False,
        ignore_pairs=False,
        force_output=False,
        bin_size=10,
        extends=[],
        shifts=[],
        sort=[],
        reporter="transcript",
        resolution_cds=1000,
        resolution_introns=1000,
        # 3kb is a good balance between seeing enough 3' bias and not
        # omitting too many genes. Tim 31st Aug 2013
        resolution_exons_absolute_distance_topolya=3000,
        # introns are only used to assess the noise level, so they do not
        # need a long region; a long region has the side effect of omitting
        # more genes. Tim 31st Aug 2013
        resolution_introns_absolute_distance_topolya=500,
        # extension can simply just be the same as resolution
        extension_exons_absolute_distance_topolya=3000,
        extension_introns_absolute_distance_topolya=500,
        resolution_upstream_utr=1000,
        resolution_downstream_utr=1000,
        resolution_upstream=1000,
        resolution_downstream=1000,
        resolution_first=1000,
        resolution_last=1000,
        # mean length of transcripts: about 2.5 kb
        extension_upstream=2500,
        extension_downstream=2500,
        extension_inward=3000,
        extension_outward=3000,
        plot=True,
        methods=[],
        infiles=[],
        controlfiles=[],
        gtffile=None,
        profile_normalizations=[],
        transcript_normalization=None,
        scale_flanks=0,
        merge_pairs=False,
        min_insert_size=0,
        max_insert_size=1000,
        base_accuracy=False,
        matrix_format="single",
        control_factor=None,
        output_all_profiles=False,
        background_region_bins=10,
        input_filename_counts=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    # Keep for backwards compatibility
    if len(args) == 2:
        infile, gtf = args
        options.infiles.append(infile)
        options.gtffile = gtf

    if not options.gtffile:
        raise ValueError("no GTF file specified")

    if options.gtffile == "-":
        options.gtffile = options.stdin
    else:
        options.gtffile = IOTools.open_file(options.gtffile)

    if len(options.infiles) == 0:
        raise ValueError("no bam/wig/bed files specified")

    for methodsRequiresBaseAccuracy in [
            "geneprofilewithintrons",
            "geneprofileabsolutedistancefromthreeprimeend",
    ]:
        # If you implement any methods for which spliced-out introns or
        # exons should not appear to be covered by non-existent reads, it
        # is better to let those methods imply --base-accuracy by adding
        # them here.
        if methodsRequiresBaseAccuracy in options.methods:
            options.base_accuracy = True

    if options.reporter == "gene":
        gtf_iterator = GTF.flat_gene_iterator(GTF.iterator(options.gtffile))
    elif options.reporter == "transcript":
        gtf_iterator = GTF.transcript_iterator(GTF.iterator(options.gtffile))

    # Select rangecounter based on file type
    if len(options.infiles) > 0:
        if options.infiles[0].endswith(".bam"):
            bamfiles = [pysam.AlignmentFile(x, "rb") for x in options.infiles]

            if options.controlfiles:
                controlfiles = [
                    pysam.AlignmentFile(x, "rb") for x in options.controlfiles
                ]
            else:
                controlfiles = None

            format = "bam"
            if options.merge_pairs:
                range_counter = _bam2geneprofile.RangeCounterBAM(
                    bamfiles,
                    shifts=options.shifts,
                    extends=options.extends,
                    merge_pairs=options.merge_pairs,
                    min_insert_size=options.min_insert_size,
                    max_insert_size=options.max_insert_size,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)

            elif options.shifts or options.extends:
                range_counter = _bam2geneprofile.RangeCounterBAM(
                    bamfiles,
                    shifts=options.shifts,
                    extends=options.extends,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)

            elif options.base_accuracy:
                range_counter = _bam2geneprofile.RangeCounterBAMBaseAccuracy(
                    bamfiles,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)
            else:
                range_counter = _bam2geneprofile.RangeCounterBAM(
                    bamfiles,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)

        elif options.infiles[0].endswith(".bed.gz"):
            bedfiles = [pysam.Tabixfile(x) for x in options.infiles]

            if options.controlfiles:
                controlfiles = [
                    pysam.Tabixfile(x) for x in options.controlfiles
                ]
            else:
                controlfiles = None

            range_counter = _bam2geneprofile.RangeCounterBed(
                bedfiles,
                controlfiles=controlfiles,
                control_factor=options.control_factor)

        elif options.infiles[0].endswith(".bw"):
            wigfiles = [BigWigFile(file=open(x)) for x in options.infiles]
            range_counter = _bam2geneprofile.RangeCounterBigWig(wigfiles)

        else:
            raise NotImplementedError("can't determine file type for %s" %
                                      str(options.infiles))

    counters = []
    for method in options.methods:
        if method == "utrprofile":
            counters.append(
                _bam2geneprofile.UTRCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_upstream_utr,
                    options.resolution_cds,
                    options.resolution_downstream_utr,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream,
                ))

        elif method == "geneprofile":
            counters.append(
                _bam2geneprofile.GeneCounter(
                    range_counter, options.resolution_upstream,
                    options.resolution_cds, options.resolution_downstream,
                    options.extension_upstream, options.extension_downstream,
                    options.scale_flanks))

        elif method == "geneprofilewithintrons":
            counters.append(
                _bam2geneprofile.GeneCounterWithIntrons(
                    range_counter, options.resolution_upstream,
                    options.resolution_cds, options.resolution_introns,
                    options.resolution_downstream, options.extension_upstream,
                    options.extension_downstream, options.scale_flanks))

        elif method == "geneprofileabsolutedistancefromthreeprimeend":
            # options.extension_exons_absolute_distance_tostartsite,
            # options.extension_introns_absolute_distance_tostartsite,
            # Tim 31st Aug 2013: a possible future feature, if five-prime
            # bias is of interest (you would need to create another class;
            # deriving from this class is not very difficult, but it is
            # not implemented yet). This future feature differs slightly
            # from the TSS profile already implemented, because here
            # introns would be skipped.
            counters.append(
                _bam2geneprofile.GeneCounterAbsoluteDistanceFromThreePrimeEnd(
                    range_counter, options.resolution_upstream,
                    options.resolution_downstream,
                    options.resolution_exons_absolute_distance_topolya,
                    options.resolution_introns_absolute_distance_topolya,
                    options.extension_upstream, options.extension_downstream,
                    options.extension_exons_absolute_distance_topolya,
                    options.extension_introns_absolute_distance_topolya,
                    options.scale_flanks))

        elif method == "tssprofile":
            counters.append(
                _bam2geneprofile.TSSCounter(range_counter,
                                            options.extension_outward,
                                            options.extension_inward))

        elif method == "intervalprofile":
            counters.append(
                _bam2geneprofile.RegionCounter(range_counter,
                                               options.resolution_upstream,
                                               options.resolution_cds,
                                               options.resolution_downstream,
                                               options.extension_upstream,
                                               options.extension_downstream))

        elif method == "midpointprofile":
            counters.append(
                _bam2geneprofile.MidpointCounter(range_counter,
                                                 options.resolution_upstream,
                                                 options.resolution_downstream,
                                                 options.extension_upstream,
                                                 options.extension_downstream))

        # add new method to split 1st and last exons out
        # requires a representative transcript for each gene
        # gtf should be sorted by gene position
        elif method == "separateexonprofile":
            counters.append(
                _bam2geneprofile.SeparateExonCounter(
                    range_counter, options.resolution_upstream,
                    options.resolution_first, options.resolution_last,
                    options.resolution_cds, options.resolution_downstream,
                    options.extension_upstream, options.extension_downstream))

        elif method == "separateexonprofilewithintrons":
            counters.append(
                _bam2geneprofile.SeparateExonWithIntronCounter(
                    range_counter, options.resolution_upstream,
                    options.resolution_first, options.resolution_last,
                    options.resolution_cds, options.resolution_introns,
                    options.resolution_downstream, options.extension_upstream,
                    options.extension_downstream))

    # set normalization
    for c in counters:
        c.setNormalization(options.transcript_normalization)
        if options.output_all_profiles:
            c.setOutputProfiles(
                IOTools.open_file(
                    E.getOutputFile(c.name) + ".profiles.tsv.gz", "w"))

    if options.input_filename_counts:
        # read counts from file
        E.info("reading counts from %s" % options.input_filename_counts)
        all_counts = pandas.read_csv(IOTools.open_file(
            options.input_filename_counts),
                                     sep='\t',
                                     header=0,
                                     index_col=0)

        if len(counters) != 1:
            raise NotImplementedError(
                'counting from matrix only implemented for 1 counter.')
        # build counter based on reference counter
        counter = _bam2geneprofile.UnsegmentedCounter(counters[0])
        counters = [counter]
        _bam2geneprofile.countFromCounts(counters, all_counts)

    else:
        E.info("starting counting with %i counters" % len(counters))
        feature_names = _bam2geneprofile.countFromGTF(counters, gtf_iterator)

    # output matrices
    if not options.profile_normalizations:
        options.profile_normalizations.append("none")
    elif "all" in options.profile_normalizations:
        options.profile_normalizations = [
            "none", "area", "counts", "background"
        ]

    for method, counter in zip(options.methods, counters):
        profiles = []
        for norm in options.profile_normalizations:
            # build matrix, apply normalization
            profile = counter.getProfile(
                normalize=norm,
                background_region_bins=options.background_region_bins)
            profiles.append(profile)

        for x in range(1, len(profiles)):
            assert profiles[0].shape == profiles[x].shape

        # build a single matrix of all profiles for output
        matrix = numpy.concatenate(profiles)
        matrix.shape = len(profiles), len(profiles[0])
        matrix = matrix.transpose()

        with IOTools.open_file(
                E.getOutputFile(counter.name) + ".matrix.tsv.gz",
                "w") as outfile:
            outfile.write("bin\tregion\tregion_bin\t%s\n" %
                          "\t".join(options.profile_normalizations))
            fields = []
            bins = []
            for field, nbins in zip(counter.fields, counter.nbins):
                fields.extend([field] * nbins)
                bins.extend(list(range(nbins)))

            for row, cols in enumerate(zip(fields, bins, matrix)):
                outfile.write("%i\t%s\t" %
                              (row, "\t".join([str(x) for x in cols[:-1]])))
                outfile.write("%s\n" % ("\t".join([str(x) for x in cols[-1]])))

        with IOTools.open_file(
                E.getOutputFile(counter.name) + ".lengths.tsv.gz",
                "w") as outfile:
            counter.writeLengthStats(outfile)

        if options.output_all_profiles:
            counter.closeOutputProfiles()

    if options.plot:

        import matplotlib
        # avoid Tk or any X
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt

        for method, counter in zip(options.methods, counters):

            if method in ("geneprofile", "geneprofilewithintrons",
                          "geneprofileabsolutedistancefromthreeprimeend",
                          "utrprofile", "intervalprofile",
                          "separateexonprofile",
                          "separateexonprofilewithintrons"):

                plt.figure()
                plt.subplots_adjust(wspace=0.05)
                max_scale = max([max(x) for x in counter.aggregate_counts])

                for x, counts in enumerate(counter.aggregate_counts):
                    plt.subplot(6, 1, x + 1)
                    plt.plot(list(range(len(counts))), counts)
                    plt.title(counter.fields[x])
                    plt.ylim(0, max_scale)

                figname = counter.name + ".full"

                fn = E.getOutputFile(figname) + ".png"
                plt.savefig(os.path.expanduser(fn))

                plt.figure()

                points = []
                cuts = []
                for x, counts in enumerate(counter.aggregate_counts):
                    points.extend(counts)
                    cuts.append(len(counts))

                plt.plot(list(range(len(points))), points)

                xx, xxx = 0, []
                for x in cuts:
                    xxx.append(xx + x // 2)
                    xx += x
                    plt.axvline(xx, color="r", ls="--")

                plt.xticks(xxx, counter.fields)

                figname = counter.name + ".detail"

                fn = E.getOutputFile(figname) + ".png"
                plt.savefig(os.path.expanduser(fn))

            elif method == "tssprofile":

                plt.figure()
                plt.subplot(1, 3, 1)
                plt.plot(
                    list(
                        range(-options.extension_outward,
                              options.extension_inward)),
                    counter.aggregate_counts[0])
                plt.title(counter.fields[0])
                plt.subplot(1, 3, 2)
                plt.plot(
                    list(
                        range(-options.extension_inward,
                              options.extension_outward)),
                    counter.aggregate_counts[1])
                plt.title(counter.fields[1])
                plt.subplot(1, 3, 3)
                plt.title("combined")
                plt.plot(
                    list(
                        range(-options.extension_outward,
                              options.extension_inward)),
                    counter.aggregate_counts[0])
                plt.plot(
                    list(
                        range(-options.extension_inward,
                              options.extension_outward)),
                    counter.aggregate_counts[1])
                plt.legend(counter.fields[:2])

                fn = E.getOutputFile(counter.name) + ".png"
                plt.savefig(os.path.expanduser(fn))

            elif method == "midpointprofile":

                plt.figure()
                plt.plot(numpy.arange(-options.resolution_upstream, 0),
                         counter.aggregate_counts[0])
                plt.plot(numpy.arange(0, options.resolution_downstream),
                         counter.aggregate_counts[1])

                fn = E.getOutputFile(counter.name) + ".png"
                plt.savefig(os.path.expanduser(fn))

    # write footer and output benchmark information.
    E.stop()
    def _write_tabbed(self, name, lines, E):
        outfile = E.openOutputFile(name)
        outfile.write('\n'.join(lines))
        outfile.write('\n')
        outfile.close()
Example #11
def concatenate_tables(outfile, options, args):
    '''concatenate tables.'''

    missing_value = options.missing_value

    rx = re.compile(options.regex_filename)

    if options.headers is None or options.headers == "auto":
        row_headers = [[y for y in rx.search(x).groups()]
                       for x in options.filenames]
    else:
        row_headers = [options.headers]

    tables, headers = [], []
    # read all tables
    for filename, header in zip(options.filenames, row_headers):
        table = read_table(filename, options)
        if len(table) == 0:
            E.warn("table '%s' is empty" % filename)
            continue
        tables.append(table)
        headers.append(header)
    row_headers = headers

    if options.cat is None:
        if len(row_headers) == 1:
            row_head_titles = ["filename"]
        else:
            row_head_titles = [
                "pattern" + str(x) for x in range(len(row_headers))
            ]
    else:
        row_head_titles = [x.strip() for x in options.cat.split(",")]
        if len(row_headers[0]) != len(row_head_titles):
            raise ValueError(
                "row header (%i) has different number of fields in "
                "regular expression than supplied by the --cat option (%i)" %
                (len(row_headers[0]), len(row_head_titles)))

    # collect titles
    if options.input_has_titles:
        titles = collections.OrderedDict()
        for table in tables:
            for key in table[0][:-1].split("\t"):
                # skip any titles that conflict with
                # the newly added titles
                if key in row_head_titles:
                    continue
                titles[key] = 1

        outfile.write("%s\t%s\n" % ("\t".join(
            [x for x in row_head_titles]), "\t".join(list(titles.keys()))))

        map_title2column = collections.defaultdict(lambda: None)
        for x, title in enumerate(titles.keys()):
            map_title2column[title] = x
    else:
        ncolumns = [len(table[0].split('\t')) for table in tables]
        if min(ncolumns) != max(ncolumns):
            raise ValueError('tables have unequal number of columns '
                             '(min=%i, max=%i)' %
                             (min(ncolumns), max(ncolumns)))
        # create a pseudo dictionary of columns
        titles = collections.OrderedDict([(x, x)
                                          for x in range(min(ncolumns))])

    all_titles = set(titles.keys())
    for nindex, table in enumerate(tables):
        if options.input_has_titles:
            titles = table[0][:-1].split("\t")
            map_old2new = [map_title2column[t] for t in titles]
            del table[0]
        else:
            map_old2new = list(range(len(all_titles)))

        for l in table:
            data = [missing_value] * len(all_titles)
            for x, value in enumerate(l[:-1].split("\t")):
                if map_old2new[x] is None:
                    continue

                data[map_old2new[x]] = value

            row = "\t".join([str(x)
                             for x in row_headers[nindex]] + data) + "\n"
            outfile.write(row)
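
The column-alignment step above maps each table's own column order onto the union of all titles and fills gaps with the missing value; a toy sketch with hypothetical titles and one row:

all_titles = ["gene", "count", "length"]            # union of titles, in order
map_title2column = {t: i for i, t in enumerate(all_titles)}

# one input table that lacks "length" and lists its columns in another order
table_titles = ["count", "gene"]
map_old2new = [map_title2column.get(t) for t in table_titles]

row = ["12", "ENSG01"]
data = ["na"] * len(all_titles)
for x, value in enumerate(row):
    if map_old2new[x] is not None:
        data[map_old2new[x]] = value

print("\t".join(data))          # prints "ENSG01", "12", "na" tab-separated
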
Example #12
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t",
                      "--no-titles",
                      dest="input_has_titles",
                      action="store_false",
                      help="no titles in input [%default].")

    parser.add_option("--ignore-titles",
                      dest="ignore_titles",
                      action="store_true",
                      help="ignore titles in input [%default]")

    parser.add_option("-i",
                      "--skip-titles",
                      dest="skip_titles",
                      action="store_true",
                      help="skip output of titles.")

    parser.add_option("-m",
                      "--missing-value",
                      dest="missing_value",
                      type="string",
                      help="entry to use for missing values.")

    parser.add_option("--header-names",
                      dest="headers",
                      type="string",
                      help="add headers for files as a ,-separated "
                      "list [%default].")

    parser.add_option("-c",
                      "--columns",
                      dest="columns",
                      type="string",
                      help="columns to use for joining. Multiple columns "
                      "can be specified as a comma-separated list "
                      "[default=%default].")

    parser.add_option("-k",
                      "--take",
                      dest="take",
                      type="string",
                      action="append",
                      help="columns to take. If not set, all columns "
                      "except for "
                      "the join columns are taken [%default]")

    parser.add_option("-g",
                      "--glob",
                      dest="glob",
                      type="string",
                      help="wildcard expression for table names.")

    parser.add_option("-s",
                      "--sort-order",
                      dest="sort",
                      type="string",
                      help="sort by column titles in particular given order: "
                      "alphabetical|numeric|list of columns.")

    parser.add_option("-e",
                      "--merge-overlapping",
                      dest="merge",
                      action="store_true",
                      help="simply merge tables without matching up "
                      "rows. [default=%default].")

    parser.add_option("-a",
                      "--cat",
                      dest="cat",
                      type="string",
                      help="simply concatenate tables. Adds an "
                      "additional column called X with the filename "
                      " [default=%default].")

    parser.add_option("--sort-keys",
                      dest="sort_keys",
                      type="choice",
                      choices=("numeric", "alphabetic"),
                      help="sort key columns by value.")

    parser.add_option("--keep-empty",
                      dest="ignore_empty",
                      action="store_false",
                      help="keep empty tables. The default is "
                      "to ignore them.")

    parser.add_option("--ignore-empty",
                      dest="ignore_empty",
                      action="store_true",
                      help="ignore empty tables - this is "
                      "the default [%default].")

    parser.add_option("--add-file-prefix",
                      dest="add_file_prefix",
                      action="store_true",
                      help="add file prefix to "
                      "columns headers. Suitable for multi-column"
                      "tables [default=%default]")

    parser.add_option("--use-file-prefix",
                      dest="use_file_prefix",
                      action="store_true",
                      help="use file prefix as column headers. "
                      "Suitable for two-column tables "
                      "[default=%default]")

    parser.add_option("--prefixes",
                      dest="prefixes",
                      type="string",
                      help="list of prefixes to use. "
                      ", separated list of prefixes. "
                      "The number of prefixes need to correspond to the "
                      "number of input files [default=%default]")

    parser.add_option("--regex-filename",
                      dest="regex_filename",
                      type="string",
                      help="pattern to apply to filename to "
                      "build prefix [default=%default]")

    parser.add_option("--regex-start",
                      dest="regex_start",
                      type="string",
                      help="regular expression to start "
                      "collecting table in a file [default=%default]")

    parser.add_option("--regex-end",
                      dest="regex_end",
                      type="string",
                      help="regular expression to end collecting "
                      "table in a file [default=%default]")

    parser.add_option("--test",
                      dest="test",
                      type="int",
                      help="test combining tables with "
                      "first X rows [default=%default]")

    parser.set_defaults(
        input_has_titles=True,
        skip_titles=False,
        missing_value="na",
        headers=None,
        sort=None,
        glob=None,
        columns="1",
        sort_keys=False,
        merge=False,
        ignore_empty=True,
        regex_start=None,
        regex_end=None,
        add_file_prefix=False,
        use_file_prefix=False,
        cat=None,
        take=[],
        regex_filename="(.*)",
        prefixes=None,
        test=0,
    )

    (options, args) = E.start(parser, argv=argv)

    if options.headers:
        if "," in options.headers:
            options.headers = options.headers.split(",")
        else:
            options.headers = re.split(r"\s+", options.headers.strip())

    if options.sort and options.sort not in ("numeric", "alphabetical"):
        if "," in options.sort:
            options.sort = options.sort.split(",")
        else:
            options.sort = re.split(r"\s+", options.sort)

    if options.merge:
        options.columns = []
    else:
        options.columns = [int(x) - 1 for x in options.columns.split(",")]

    options.filenames = []

    if options.glob:
        options.filenames += glob.glob(options.glob)

    options.filenames += args

    if len(options.filenames) < 1:
        raise ValueError("no tables found.")

    E.info("combining %i tables" % len(options.filenames))

    if options.cat:
        concatenate_tables(options.stdout, options, args)
    else:
        join_tables(options.stdout, options, args)

    E.stop()
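
A quick sketch of the option normalisation performed in main() above, on made-up values, showing how 1-based join columns become 0-based indices and how header and sort-order strings are split:

import re

columns = "1,3"
headers = "sample_a sample_b"
sort = "alphabetical"

join_columns = [int(x) - 1 for x in columns.split(",")]          # [0, 2]
header_list = (headers.split(",") if "," in headers
               else re.split(r"\s+", headers.strip()))           # ['sample_a', 'sample_b']
sort_order = (sort if sort in ("numeric", "alphabetical")
              else sort.split(","))                              # 'alphabetical'
print(join_columns, header_list, sort_order)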
Example #13
0
def join_tables(outfile, options, args):
    '''join tables.'''

    if options.headers and options.headers[0] != "auto" and \
            len(options.headers) != len(options.filenames):
        raise ValueError("number of provided headers (%i) "
                         "is not equal to number filenames (%i)." %
                         (len(options.headers), len(options.filenames)))

    tables = []
    keys = {}
    sorted_keys = []
    sizes = {}

    if options.merge:
        titles = ["count"]
    else:
        titles = []

    headers_to_delete = []

    if options.prefixes:
        prefixes = [x.strip() for x in options.prefixes.split(",")]
        if len(prefixes) != len(options.filenames):
            raise ValueError(
                ("number of prefixes (%i) and tables (%i) "
                 "do not match") % (len(prefixes), len(options.filenames)))
    else:
        prefixes = None

    E.debug("joining on columns %s and taking columns %s" %
            (options.columns, options.take))

    for nindex, filename in enumerate(options.filenames):

        E.info("processing %s (%i/%i)" %
               (filename, nindex + 1, len(options.filenames)))

        prefix = os.path.basename(filename)

        lines = read_table(filename, options)

        # skip (or not skip) empty tables
        if len(lines) == 0 and options.ignore_empty:
            E.warn("%s is empty - skipped" % filename)
            headers_to_delete.append(nindex)
            continue

        table = {}
        sizes = {}
        max_size = 0
        ncolumns = 0

        if options.input_has_titles:
            data = lines[0][:-1].split("\t")
            # no titles have been defined so far
            if not titles:
                key = "-".join([data[x] for x in options.columns])
                titles = [key]

            # set take based on column titles or numerically
            if options.take:
                take = []
                # convert numeric columns for filtering
                for x in options.take:
                    try:
                        take.append(int(x) - 1)
                    except ValueError:
                        # will raise error if x is not present
                        take.append(data.index(x))
            else:
                # no --take option given: keep all columns
                take = None

            for x in range(len(data)):
                if x in options.columns or (take and x not in take):
                    continue
                ncolumns += 1
                if options.add_file_prefix:
                    try:
                        p = re.search(options.regex_filename,
                                      prefix).groups()[0]
                    except AttributeError:
                        E.warn("can't extract title from filename %s" % prefix)
                        p = "unknown"
                    titles.append("%s_%s" % (p, data[x]))
                elif options.use_file_prefix:
                    try:
                        p = re.search(options.regex_filename,
                                      prefix).groups()[0]
                    except AttributeError:
                        E.warn("can't extract title from filename %s" % prefix)
                        p = "unknown"
                    titles.append("%s" % p)
                elif prefixes:
                    titles.append("%s_%s" % (prefixes[nindex], data[x]))
                else:
                    titles.append(data[x])

            del lines[0]
        else:

            # set take based on numeric columns if no titles are present
            if options.take:
                take = []
                # convert numeric columns for filtering
                for x in options.take:
                    take.append(int(x) - 1)
            else:
                # no --take option given: keep all columns
                take = None

            # IMS: We might still want filename titles even if the input
            # columns don't have titles.
            if options.add_file_prefix:
                if not titles:
                    titles = ["ID"]
                try:
                    p = re.search(options.regex_filename, prefix).groups()[0]
                except AttributeError:
                    E.warn("can't extract title from filename %s" % prefix)
                    p = "unknown"
                # without input titles there is no column name to combine
                # with, so use the file prefix on its own
                titles.append(p)
            elif options.use_file_prefix:
                if not titles:
                    titles = ["ID"]
                try:
                    p = re.search(options.regex_filename, prefix).groups()[0]
                except AttributeError:
                    E.warn("can't extract title from filename %s" % prefix)
                    p = "unknown"
                titles.append("%s" % p)
            ncolumns = 1

        n = 0
        for line in lines:
            data = line[:-1].split("\t")
            try:
                row_keys = [data[x] for x in options.columns]
            except IndexError as msg:
                raise IndexError("error while parsing %s: %s" %
                                 (filename, msg))
            if options.sort_keys:
                if options.sort_keys == "numeric":
                    row_keys.sort(key=float)
                else:
                    row_keys.sort()
            if options.merge:
                key = n
            else:
                key = "-".join(row_keys)

            if key not in keys:
                sorted_keys.append(key)
                keys[key] = 1
                sizes[key] = 0

            if take:
                max_size = len(take)
                table[key] = [data[x] for x in take]
            else:
                max_size = max(len(data) - len(options.columns), max_size)
                table[key] = [
                    data[x] for x in range(0, len(data))
                    if x not in options.columns
                ]
            n += 1

        # enter columns of "na" for empty tables.
        if max_size == 0:
            max_size = ncolumns

        tables.append((max_size, table))

    # delete in reverse order
    if options.headers:
        for nindex in headers_to_delete[::-1]:
            del options.headers[nindex]

    if len(tables) == len(titles) - 1:

        if options.headers:
            headers = ["bin"]
            if options.headers[0] == 'auto':
                for t in range(len(tables)):
                    headers.append(os.path.basename(options.filenames[t]))
                    headers += [""] * (tables[t][0] - 1)

            else:
                for t in range(len(tables)):
                    headers.append(options.headers[t])
                    headers += [""] * (tables[t][0] - 1)

            # use headers as titles, if headers is given and skip-titles is
            # turned on
            if options.input_has_titles and options.skip_titles:
                titles = headers
            else:
                # otherwise: print the headers out right away
                outfile.write("\t".join(headers) + "\n")

        order = list(range(0, len(tables) + 1))

        if options.input_has_titles or \
           (options.use_file_prefix or options.add_file_prefix):

            if options.sort:
                sort_order = []

                if options.sort == "numeric":
                    t = list(
                        zip(list(map(int, titles[1:])),
                            list(range(1,
                                       len(titles) + 1))))
                    t.sort()

                    for tt in t:
                        sort_order.append(titles[tt[1]])

                elif options.sort == "alphabetical":
                    t = list(zip(titles[1:], list(range(1, len(titles) + 1))))
                    t.sort()

                    for tt in t:
                        sort_order.append(titles[tt[1]])
                else:
                    sort_order = options.sort

                map_title2pos = {}
                for x in range(1, len(titles)):
                    map_title2pos[titles[x]] = x

                order = [
                    0,
                ]
                for x in sort_order:
                    if x in map_title2pos:
                        order.append(map_title2pos[x])

            else:
                order = list(range(0, len(titles)))

            outfile.write("\t".join(
                [titles[order[x]] for x in range(len(titles))]))
            outfile.write("\n")

        if options.sort_keys:
            if options.sort_keys == "numeric":
                sorted_keys.sort(key=float)
            else:
                sorted_keys.sort()

        for key in sorted_keys:

            outfile.write("%s" % key)

            for x in order[1:]:

                max_size, table = tables[x - 1]
                c = 0
                if key in table:
                    outfile.write("\t")
                    outfile.write("\t".join(table[key]))
                    c = len(table[key])

                assert (max_size == 1)

                outfile.write("\t%s" % options.missing_value * (max_size - c))

            outfile.write("\n")

    else:

        # for multi-column table, just write
        if options.input_has_titles:
            outfile.write("\t".join([titles[x] for x in range(len(titles))]))
            outfile.write("\n")

        for key in sorted_keys:

            outfile.write("%s" % key)

            for x in range(len(tables)):

                max_size, table = tables[x]
                c = 0
                if key in table:
                    outfile.write("\t")
                    outfile.write("\t".join(table[key]))
                    c = len(table[key])

                outfile.write("\t%s" % options.missing_value * (max_size - c))

            outfile.write("\n")
Example #14
0
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-i", "--input-bam", dest="input_bam_file", type="string",
        help="input bam file")

    parser.add_option(
        "-f", "--reference-bam", dest="reference_bam_file", type="string",
        help="reference BAM file [%default]")

    parser.add_option(
        "-q", "--query-name-regex", dest="query_name_regex", type="string",
        help="regular expression to apply on query name. "
        "Potentially required to match samtools sort order and should "
        "evaluate to an integer [%default]")

    parser.set_defaults(
        input_bam_file=None,
        reference_bam_file=None,
        query_name_regex=None,
    )

    (options, args) = E.start(parser, argv, add_output_options=True)

    if len(args) == 2:
        options.input_bam_file = args[0]
        options.reference_bam_file = args[1]

    if options.input_bam_file is None:
        raise ValueError("please supply a BAM file as input")

    if options.reference_bam_file is None:
        raise ValueError("please supply a BAM file as reference")

    # update paths to absolute
    options.input_bam_file = os.path.abspath(options.input_bam_file)
    options.reference_bam_file = os.path.abspath(options.reference_bam_file)

    if not os.path.exists(options.input_bam_file):
        raise OSError("input bam file {} does not exist".format(
            options.input_bam_file))

    if not os.path.exists(options.reference_bam_file):
        raise OSError("reference bam file {} does not exist".format(
            options.reference_bam_file))

    bam_in = pysam.AlignmentFile(options.input_bam_file)
    ref_in = pysam.AlignmentFile(options.reference_bam_file)

    outf_mapped = E.open_output_file("mapped")
    outf_mapped.write("\t".join(
        ["read",
         "length",
         "status",
         "overlap",
         "comp_contig",
         "comp_start",
         "comp_end",
         "ref_contig",
         "ref_start",
         "ref_end",
         "shared_misaligned",
         "shared_aligned",
         "shared_insertion",
         "shared_deletion",
         "comp_aligned",
         "comp_insertion",
         "comp_deletion",
         "ref_aligned",
         "ref_insertion",
         "ref_deletion"]) + "\n")

    outf_missing = E.open_output_file("missing")
    outf_missing.write("\t".join(
        ["read", "length", "status", "aligned",
         "insertion", "deletion"]) + "\n")

    counter = E.Counter()

    if options.query_name_regex:
        rx = re.compile(options.query_name_regex)

    def extract_query(x):
        return int(rx.search(x).groups()[0])

    qname_fn = None
    if options.query_name_regex:
        qname_fn = extract_query

    for reads_cmp, read_ref in group_pairs(iterate_read_pairs(
            bam_in.fetch(until_eof=True),
            ref_in.fetch(until_eof=True),
            qname_fn=qname_fn)):

        if len(reads_cmp) == 0:
            counter.missing += 1
            pairs_ref = set(read_ref.get_aligned_pairs())
            outf_missing.write("\t".join(
                map(str, (
                    read_ref.query_name,
                    read_ref.query_length,
                    "missing") +
                    count_pairs(pairs_ref))) + "\n")
            continue

        if len(reads_cmp) > 1:
            # multiple matches
            counter.multi_mapping += 1
            prefix = "multi_"
        else:
            counter.unique_mapping += 1
            prefix = "unique_"

        is_mapped = False
        for read_cmp in reads_cmp:

            counter.paired += 1

            if read_cmp.is_unmapped:
                counter.unmapped += 1
                pairs_ref = set(read_ref.get_aligned_pairs())
                outf_missing.write("\t".join(
                    map(str, (
                        read_ref.query_name,
                        read_ref.query_length,
                        "unmapped") +
                        count_pairs(pairs_ref))) + "\n")
                continue

            overlap = max(0, (min(read_cmp.reference_end,
                                  read_ref.reference_end) -
                              max(read_cmp.reference_start,
                                  read_ref.reference_start)))

            pairs_cmp = set(read_cmp.get_aligned_pairs())
            pairs_ref = set(read_ref.get_aligned_pairs())
            shared_cmp = pairs_cmp.intersection(pairs_ref)
            unique_cmp = pairs_cmp.difference(pairs_ref)
            misaligned = len([x for x, y in unique_cmp
                              if x is not None and y is not None])

            if read_cmp.reference_name != read_ref.reference_name or \
               overlap == 0:
                status = "mismapped"
            else:
                counter.overlap += 1
                status = "mapped"
                is_mapped = True

            outf_mapped.write("\t".join(
                map(str, (read_cmp.query_name,
                          read_cmp.query_length,
                          prefix + status,
                          overlap,
                          read_cmp.reference_name,
                          read_cmp.reference_start,
                          read_cmp.reference_end,
                          read_ref.reference_name,
                          read_ref.reference_start,
                          read_ref.reference_end,
                          misaligned) +
                    count_pairs(shared_cmp) +
                    count_pairs(pairs_cmp) +
                    count_pairs(pairs_ref))) + "\n")
        else:
            if is_mapped:
                status = "mapped"
            else:
                status = "mismapped"

            counter[prefix + status] += 1

    with E.open_output_file("summary") as outf:
        outf.write("category\tcounts\n")
        outf.write(counter.asTable() + "\n")

    E.stop()
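
count_pairs (and the pairing helpers group_pairs/iterate_read_pairs) are referenced above but not included in this excerpt. Judging from the output columns (aligned, insertion, deletion), a plausible sketch of count_pairs is the following; this is an assumption, not the script's actual implementation:

def count_pairs(pairs):
    """Summarise a set of (query_pos, ref_pos) tuples from pysam's
    get_aligned_pairs() into (aligned, insertion, deletion) counts.
    Hedged reconstruction of the helper used above."""
    aligned = sum(1 for q, r in pairs if q is not None and r is not None)
    insertion = sum(1 for q, r in pairs if q is not None and r is None)
    deletion = sum(1 for q, r in pairs if q is None and r is not None)
    return aligned, insertion, deletion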
Example #15
0
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b",
                      "--reference-bed-file",
                      dest="reference_bed_file",
                      type="string",
                      help="reference bed file "
                      "[%default]")

    parser.add_option("-m",
                      "--method",
                      dest="method",
                      type="choice",
                      choices=("lvc-comparison", ),
                      help="methods to apply [%default]")

    parser.set_defaults(method="lvc-comparison",
                        reference_fasta_file=None,
                        input_bed_file=None,
                        size_bins=(1000, 10000, 100000),
                        output_sets=True,
                        region_string=None)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    reference_set = collections.defaultdict(quicksect.IntervalTree)

    E.info("reading reference bed file from {}".format(
        options.reference_bed_file))
    with IOTools.open_file(options.reference_bed_file) as inf:
        for record in pysam.tabix_iterator(inf, pysam.asBed()):
            mm = reference_set[record.contig]
            mm.add(record.start, record.end)
    E.info("read reference intervals on {} contigs: {}".format(
        len(list(reference_set.keys())), ",".join(list(reference_set.keys()))))

    if options.output_sets:
        output_tp = E.open_output_file("tp")
        output_fp = E.open_output_file("fp")
        output_fn = E.open_output_file("fn")
    else:
        output_tp = None
        output_fp = None
        output_fn = None

    if options.method == "lvc-comparison":
        c = E.Counter()

        found = set()
        counts = {}
        names = set()
        nsize_bins = len(options.size_bins)
        for bin in range(len(options.size_bins) + 1):
            counts[bin] = dict([(x, collections.defaultdict(int))
                                for x in ("tp", "fn", "fp", "test", "truth")])

        for record in pysam.tabix_iterator(options.stdin, pysam.asBed()):
            if record.contig not in reference_set:
                c.ignored_no_contig += 1
                continue

            c.test += 1
            matches = reference_set[record.contig].search(
                record.start, record.end)
            size = record.end - record.start
            bin = get_size_bin(size, options.size_bins)

            if len(matches) == 0:
                c.fp += 1
                status = "fp"
                if output_fp:
                    output_fp.write(str(record) + "\n")
            elif len(matches) >= 1:
                c.tp += 1
                status = "tp"
                if output_tp:
                    output_tp.write(str(record) + "\n")
                # todo: overlap criteria

                # record found
                for match in matches:
                    found.add((record.contig, match.start, match.end))

            name = record.name.split(",")[0]
            names.add(name)
            counts[bin]["test"][name] += 1
            counts[bin][status][name] += 1

        outf = options.stdout

        with IOTools.open_file(options.reference_bed_file) as inf:
            for record in pysam.tabix_iterator(inf, pysam.asBed()):
                c.truth += 1
                bin = get_size_bin(record.end - record.start,
                                   options.size_bins)
                counts[bin]["truth"]["all"] += 1

                key = (record.contig, record.start, record.end)
                if key not in found:
                    c.fn += 1
                    counts[bin]["fn"]["all"] += 1

        outf.write("\t".join(("category", "size", "test", "tp", "fp", "truth",
                              "fn")) + "\n")

        for name in sorted(names):
            for bin in range(len(options.size_bins) + 1):
                if bin == len(options.size_bins):
                    size_bin = ">={}".format(options.size_bins[-1])
                else:
                    size_bin = "<{}".format(options.size_bins[bin])
                outf.write("\t".join(
                    map(str, (
                        name,
                        size_bin,
                        counts[bin]["test"][name],
                        counts[bin]["tp"][name],
                        counts[bin]["fp"][name],
                        counts[bin]["truth"]["all"],
                        counts[bin]["fn"]["all"],
                    ))) + "\n")

    E.info(str(c))
    E.stop()
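
get_size_bin is used above but not defined in this excerpt. Given the bin labels written out ("<threshold" for each size bin and ">=largest" for the overflow bin), a plausible reconstruction is:

def get_size_bin(size, size_bins):
    """Return the index of the first threshold that `size` falls below,
    or len(size_bins) for sizes >= the largest threshold.
    Assumed behaviour, not the original helper."""
    for i, threshold in enumerate(size_bins):
        if size < threshold:
            return i
    return len(size_bins)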
Example #16
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m",
                      "--min-overlap",
                      dest="min_overlap",
                      type="float",
                      help="minimum overlap [%default]")

    parser.add_option("-a",
                      "--bam-file",
                      dest="filename_bam",
                      metavar="bam",
                      type="string",
                      help="bam-file to use (required) [%default]")

    parser.add_option("-b",
                      "--bed-file",
                      dest="filename_bed",
                      metavar="bed",
                      type="string",
                      help="bed-file to use (required) [%default]")

    parser.add_option("-s",
                      "--sort-bed",
                      dest="sort_bed",
                      action="store_true",
                      help="sort the bed file by chromosomal location before "
                      "processing. "
                      "[%default]")

    parser.add_option(
        "--assume-sorted",
        dest="sort_bed",
        action="store_false",
        help="assume that the bed-file is sorted by chromosomal location. "
        "[%default]")

    parser.add_option(
        "--split-intervals",
        dest="split_intervals",
        action="store_true",
        help="treat split BAM intervals, for example spliced intervals, "
        "as separate intervals. Note that a single alignment might be "
        "counted several times as a result. "
        "[%default]")

    parser.set_defaults(
        min_overlap=0.5,
        filename_bam=None,
        filename_bed=None,
        sort_bed=True,
        split_intervals=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    filename_bam = options.filename_bam
    filename_bed = options.filename_bed

    if filename_bam is None and filename_bed is None:
        if len(args) != 2:
            raise ValueError(
                "please supply a bam and a bed file or two bed-files.")

        filename_bam, filename_bed = args

    if filename_bed is None:
        raise ValueError("please supply a bed file to compare to.")

    if filename_bam is None:
        raise ValueError("please supply a bam file to compare with.")

    E.info("intersecting the two files")

    min_overlap = options.min_overlap

    options.stdout.write("category\talignments\n")

    # get number of columns of reference bed file
    for bed in Bed.iterator(IOTools.open_file(filename_bed)):
        ncolumns_bed = bed.columns
        break
    E.info("assuming %s is bed%i format" % (filename_bed, ncolumns_bed))

    if ncolumns_bed < 4:
        raise ValueError("please supply a name attribute in the bed file")

    # get information about
    if filename_bam.endswith(".bam"):
        format = "-abam"
        samfile = pysam.AlignmentFile(filename_bam, "rb")
        total = samfile.mapped
        # latest bedtools uses bed12 format when bam is input
        ncolumns_bam = 12
        # count per read
        sort_key = lambda x: x.name
    else:
        format = "-a"
        total = IOTools.get_num_lines(filename_bam)
        # get bed format
        ncolumns_bam = 0
        for bed in Bed.iterator(IOTools.open_file(filename_bam)):
            ncolumns_bam = bed.columns
            break

        if ncolumns_bam > 0:
            E.info("assuming %s is bed%i fomat" % (filename_bam, ncolumns_bam))
            if ncolumns_bam == 3:
                # count per interval
                sort_key = lambda x: (x.contig, x.start, x.end)
            else:
                # count per interval category
                sort_key = lambda x: x.name

    # use fields for bam/bed file (regions to count with)
    data_fields = [
        "contig", "start", "end", "name", "score", "strand", "thickstart",
        "thickend", "rgb", "blockcount", "blockstarts", "blockends"
    ][:ncolumns_bam]

    # add fields for second bed (regions to count in)
    data_fields.extend([
        "contig2", "start2", "end2", "name2", "score2", "strand2",
        "thickstart2", "thickend2", "rgb2", "blockcount2", "blockstarts2",
        "blockends2"
    ][:ncolumns_bed])

    # add bases overlap
    data_fields.append("bases_overlap")

    data = collections.namedtuple("data", data_fields)

    options.stdout.write("total\t%i\n" % total)

    if total == 0:
        E.warn("no data in %s" % filename_bam)
        return

    # SNS: sorting optional, off by default
    if options.sort_bed:
        bedcmd = "<( gunzip < %s | sort -k1,1 -k2,2n)" % filename_bed
    else:
        bedcmd = filename_bed

    if options.split_intervals:
        split = "-split"
    else:
        split = ""

    # IMS: newer versions of intersectBed have a very high memory
    #      requirement unless passed sorted bed files.
    statement = """bedtools intersect %(format)s %(filename_bam)s
    -b %(bedcmd)s
    %(split)s
    -sorted -bed -wo -f %(min_overlap)f""" % locals()

    E.info("starting counting process: %s" % statement)
    proc = E.run(statement, return_popen=True, stdout=subprocess.PIPE)

    E.info("counting")
    counts_per_alignment = collections.defaultdict(int)
    take_columns = len(data._fields)

    def iterate(infile):
        for line in infile:
            if not line.strip():
                continue
            yield data._make(line[:-1].split()[:take_columns])

    for read, overlaps in itertools.groupby(iterate(
            IOTools.force_str(proc.stdout)),
                                            key=sort_key):
        annotations = [x.name2 for x in overlaps]
        for anno in annotations:
            counts_per_alignment[anno] += 1

    for key, counts in sorted(counts_per_alignment.items()):
        options.stdout.write("%s\t%i\n" % (key, counts))

    # write footer and output benchmark information.
    E.stop()
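
The counting loop above depends on the bedtools output arriving grouped by the chosen sort key, because itertools.groupby only groups adjacent records. A toy sketch of the same pattern on hypothetical records:

import collections
import itertools

Record = collections.namedtuple("Record", "name name2")
records = [Record("read1", "geneA"), Record("read1", "geneA"),
           Record("read2", "geneB")]            # already grouped by name

counts_per_alignment = collections.defaultdict(int)
for read, overlaps in itertools.groupby(records, key=lambda x: x.name):
    annotations = [x.name2 for x in overlaps]
    for anno in annotations:
        counts_per_alignment[anno] += 1

print(dict(counts_per_alignment))               # {'geneA': 2, 'geneB': 1}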
Example #17
0
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-a",
                      "--as-gtf",
                      dest="as_gtf",
                      action="store_true",
                      help="output as gtf.")

    parser.add_option(
        "-f",
        "--id-format",
        dest="id_format",
        type="string",
        help="format for numeric identifier if --as-gtf is set and "
        "no name in bed file [%default].")

    parser.set_defaults(as_gtf=False, id_format="%08i", test=None)

    (options, args) = E.start(parser, add_pipe_options=True)

    as_gtf = options.as_gtf
    id_format = options.id_format

    # as_gtf only changes which attributes are filled in below;
    # both output formats start from the same entry object
    gff = GTF.Entry()

    gff.source = "bed"
    gff.feature = "exon"

    ninput, noutput, nskipped = 0, 0, 0

    id = 0
    for bed in Bed.iterator(options.stdin):

        ninput += 1

        gff.contig = bed.contig
        gff.start = bed.start
        gff.end = bed.end
        if bed.fields and len(bed.fields) >= 3:
            gff.strand = bed.fields[2]
        else:
            gff.strand = "."

        if bed.fields and len(bed.fields) >= 2:
            gff.score = bed.fields[1]

        if as_gtf:
            if bed.fields:
                gff.gene_id = bed.fields[0]
                gff.transcript_id = bed.fields[0]
            else:
                id += 1
                gff.gene_id = id_format % id
                gff.transcript_id = id_format % id
        else:
            if bed.fields:
                gff.source = bed.fields[0]

        options.stdout.write(str(gff) + "\n")

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.stop()
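
For reference, the field mapping performed above boils down to the following plain-Python sketch for a BED6 line (GTF coordinates are 1-based, a shift that GTF.Entry is expected to handle when it formats the record; the toy example below does it by hand):

bed_line = "chr1\t100\t200\tmygene\t0.9\t+"
contig, start, end, name, score, strand = bed_line.split("\t")

gtf_line = "\t".join([
    contig, "bed", "exon", str(int(start) + 1), end, score, strand, ".",
    'gene_id "%s"; transcript_id "%s";' % (name, name)])
print(gtf_line)
# tab-separated: chr1 bed exon 101 200 0.9 + . gene_id "mygene"; transcript_id "mygene";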
Example #18
0
def main(argv=None):

    if not argv:
        argv = sys.argv

    # get the options
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-c",
                      "--chain-file",
                      dest="chainfile",
                      type="string",
                      help="the chain file to analyse",
                      metavar="FILE")

    parser.add_option(
        "--alignments-per-contig",
        dest="nperchrom",
        type="int",
        help="Number of aligments to report on per chromosome pair",
        default=2)

    parser.add_option(
        "--aggregate-by",
        dest="aggregate",
        type="choice",
        choices=("contig", "none"),
        help="Set to `contig` to perform per chromosome pair analysis",
        default="none")

    parser.add_option(
        "-i",
        "--output-identity",
        dest="output_identity",
        action="store_true",
        help="Generate stats on the sequence identity of the gapped "
        "chains. Requires FastaIndex.py",
        default=False)

    parser.add_option("-d",
                      "--dbpath",
                      dest="dbpath",
                      type="string",
                      help="The path to the indexed fasta files",
                      default=".")

    parser.add_option("-t",
                      "--target-genome",
                      dest="targetgenome",
                      type="string",
                      help="The target genome, eg. Mm19",
                      default=False)

    parser.add_option("-q",
                      "--query-genome",
                      dest="querygenome",
                      type="string",
                      help="The query genome eg. Hg17",
                      default=False)

    parser.add_option(
        "-e",
        "--errors",
        dest="errors",
        action="store_true",
        help="Check chains for erroneous contig sizes using the given db",
        default=False)

    parser.add_option("-r",
                      "--output-report",
                      dest="output_report",
                      action="store_true",
                      help="Write out tab-delimited reports for each analysis",
                      default=False)

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    # make a list of counting objects
    counters = []

    counters.append(CounterPerChromosome(gapped=True))
    counters.append(CounterPerChromosome(gapped=False))

    if options.aggregate == "contig":
        counters.append(CounterPerChromosomePair(gapped=True))
        counters.append(CounterPerChromosomePair(gapped=False))

    counters.append(CounterOfGappedChainLengths(gapped=True))
    counters.append(CounterOfGappedChainLengths(gapped=False))

    if options.output_identity is True:
        if not options.targetgenome or not options.querygenome:
            raise Exception(
                "Target and query genomes must be specified with the "
                "\"-t\" and \"-q\" options when using \"-i\"")
        t_db_path = os.path.join(options.dbpath, options.targetgenome)
        q_db_path = os.path.join(options.dbpath, options.querygenome)
        counters.append(CounterPercentIdentify(t_db_path, q_db_path))

    if options.errors is True:
        if not options.targetgenome or not options.querygenome:
            raise Exception(
                "Target and query genomes must be specified with the "
                "\"-t\" and \"-q\" options when using \"-e\"")
        counters.append(CounterOfErrors(options))

    # iterate over the chains and counters
    for chain in chain_iterator(options.stdin):
        c = Chain(chain)
        for counter in counters:
            counter.add(c)

    # write a report to stdout and individual reports to tab delimited files
    options.stdout.write(
        "\n\n********** chain2stats report starts **********\n")

    for counter in counters:
        counter.report(options)
        if options.output_report is True:
            counter.tabbed_report(options, E)

    options.stdout.write("\n********** chain2stats report ends **********\n\n")

    E.stop()
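
The Counter* classes are not part of this excerpt; from the loop above they share a small interface (add a chain, report to stdout, optionally write a tabbed report). A hedged sketch of that interface as a hypothetical base class:

class ChainCounter:
    """Hypothetical sketch of the interface the loop above relies on."""

    def __init__(self, gapped=True):
        self.gapped = gapped
        self.nchains = 0

    def add(self, chain):
        # accumulate statistics for a single chain
        self.nchains += 1

    def report(self, options):
        # write a human-readable summary to the main report
        options.stdout.write("chains seen: %i\n" % self.nchains)

    def tabbed_report(self, options, E):
        # write a tab-delimited per-counter report (omitted in this sketch)
        pass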
Example #19
0
    def _iterate(self):
        """iterate over muliple files."""
        def _iter(infile):

            identifier = None
            is_new = False

            for line in infile:
                if line.startswith("#"):
                    continue
                if line.startswith(">"):

                    if self.regexIdentifier:
                        try:
                            identifier = re.search(self.regexIdentifier,
                                                   line[1:-1]).groups()[0]
                        except AttributeError:
                            raise ValueError(
                                "could not parse identifier from line %s "
                                "- check the input" % line[1:-1])
                    else:
                        identifier = re.split(r"\s", line[1:-1])[0]
                    is_new = True
                else:
                    if not identifier:
                        raise ValueError(
                            "refusing to emit sequence without identifier "
                            "- check the input")
                    yield is_new, identifier, line.strip()
                    is_new = False

        for filename in self.filenames:
            if self.format == "tar.gz" or self.format == "tar" or \
               (self.format == "auto" and filename.endswith("tar.gz")):
                if filename == "-":
                    tf = tarfile.open(fileobj=sys.stdin.buffer, mode="r|*")
                else:
                    tf = tarfile.open(filename, mode="r")
                for f in tf:
                    b, ext = os.path.splitext(f.name)
                    if ext.lower() in (".fasta", ".fa"):
                        E.info("extracting %s" % f.name)
                        if sys.version_info.major >= 3:
                            infile = io.TextIOWrapper(tf.extractfile(f),
                                                      encoding="ascii")
                        else:
                            infile = tf.extractfile(f)
                        for x in _iter(infile):
                            yield x
                    else:
                        E.info("skipping %s" % f.name)

                if tf != sys.stdin:
                    tf.close()
                continue
            elif self.format == "fasta.gz" or (self.format == "auto"
                                               and filename.endswith(".gz")):
                infile = IOTools.open_file(filename, "r")
            elif filename == "-":
                infile = sys.stdin
            else:
                infile = IOTools.open_file(filename, "r")

            for x in _iter(infile):
                yield x
            if filename != "-":
                infile.close()

        # falling off the end of the function ends the generator;
        # raising StopIteration explicitly is an error under PEP 479 (Python 3.7+)
        return
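
_iterate yields (is_new, identifier, sequence_line) triples one line at a time. A small sketch of how a consumer might assemble complete records from that stream (not part of the original class):

def assemble_records(triples):
    """Collect (is_new, identifier, line) triples into (identifier, sequence)
    records; sketch of a possible consumer of _iterate()."""
    identifier, chunks = None, []
    for is_new, ident, line in triples:
        if is_new and chunks:
            yield identifier, "".join(chunks)
            chunks = []
        identifier = ident
        chunks.append(line)
    if chunks:
        yield identifier, "".join(chunks)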
Example #20
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: set_diff.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-p", "--add-percent", dest="add_percent", action="store_true",
                      help="add percentage information to each line.")

    parser.add_option("-t", "--header-names", dest="headers", type="string",
                      help="comma separated list of headers. If empty or set to '-', filenames are used.")

    parser.add_option("--skip-header", dest="add_header", action="store_false",
                      help="do not add header to flat format.")

    parser.add_option("--output-with-header", dest="write_header", action="store_true",
                      help="write header and exit.")

    parser.add_option("--with-title", dest="with_title", action="store_true",
                      help="use column titles in input data [%default].")

    parser.add_option("--no-title", dest="with_title", action="store_false",
                      help="there are no titles in input data [%default].")

    parser.set_defaults(
        add_percent=False,
        percent_format="%5.2f",
        headers=None,
        add_header=True,
        write_header=False,
        with_title=True,
    )

    (options, args) = E.start(parser)

    if options.add_header:
        options.stdout.write(
            "set1\tset2\tn1\tn2\tunion\tinter\tunique1\tunique2")
        if options.add_percent:
            options.stdout.write(
                "\tpinter\tpunique1\tpunique2\tpcov1\tpcov2\tpcovmax")
        options.stdout.write("\n")

        if options.write_header:
            sys.exit(0)

    if len(args) < 2:
        raise ValueError("please supply at least two filenames.")

    headers, titles, sets = [], [], []

    if options.headers:
        if options.headers == "-":
            headers = args
        else:
            headers = options.headers.split(",")
            if len(headers) != len(args):
                raise ValueError(
                    "please supply the same number of headers as there are filenames.")

    for f in args:
        if options.with_title:
            title, data = IOTools.readList(
               IOTools.open_file(f, "r"), with_title=options.with_title)
            titles.append(title)
        else:
            data = IOTools.readList(open(f, "r"))
        sets.append(set(data))

    # fall back to column titles or filenames only if no headers were supplied
    if not headers:
        headers = titles if titles else args

    for x in range(len(sets) - 1):
        set1 = sets[x]

        for y in range(x + 1, len(sets)):
            set2 = sets[y]
            l1, l2 = len(set1), len(set2)
            options.stdout.write("%s\t%s\t%i\t%i\t%i\t%i\t%i\t%i" % (headers[x], headers[y],
                                                                     l1, l2,
                                                                     len(set1.union(
                                                                         set2)),
                                                                     len(set1.intersection(
                                                                         set2)),
                                                                     len(set1.difference(
                                                                         set2)),
                                                                     len(set2.difference(set1))))

            if options.add_percent:
                if len(set1) == 0:
                    ri, r1, r2 = 0, 1, 0
                    c1, c2, cm = 1, 0, 0
                elif len(set2) == 0:
                    ri, r1, r2 = 0, 0, 1
                    c1, c2, cm = 0, 1, 0
                else:
                    i = len(set1.intersection(set2))
                    ri, r1, r2 = (
                        i / float(len(set1.union(set2))),
                        len(set1.difference(set2)) / float(l1),
                        len(set2.difference(set1)) / float(l2))
                    c1, c2 = (i / float(l1), i / float(l2))
                    cm = max(c1, c2)

                options.stdout.write(
                    "\t" + ("\t".join([options.percent_format for z in range(6)])) % (ri, r1, r2, c1, c2, cm))

            options.stdout.write("\n")

    E.stop()
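
The percentage block above computes a Jaccard-style overlap plus per-set unique and coverage fractions. A worked toy example of those quantities:

set1 = {"a", "b", "c", "d"}              # l1 = 4
set2 = {"c", "d", "e"}                   # l2 = 3

i = len(set1.intersection(set2))         # 2
ri = i / len(set1.union(set2))           # 2 / 5 = 0.40   (pinter)
r1 = len(set1 - set2) / len(set1)        # 2 / 4 = 0.50   (punique1)
r2 = len(set2 - set1) / len(set2)        # 1 / 3 = 0.33   (punique2)
c1, c2 = i / len(set1), i / len(set2)    # 0.50, 0.67     (pcov1, pcov2)
cm = max(c1, c2)                         # 0.67           (pcovmax)
print(ri, r1, r2, c1, c2, cm)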
Example #21
0
def runCommand(data):

    filename, cmd, options, tmpdir, subdirs = data

    if subdirs:
        outdir = "%s.dir/" % (filename)
        os.mkdir(outdir)
        cmd = re.sub("%DIR%", outdir, cmd)

    x = re.search(r"'--log=(\S+)'", cmd) or re.search(r"'--L\s+(\S+)'", cmd)
    if x:
        logfile = filename + ".log"
        cmd = cmd[:x.start()] + "--log=%s" % logfile + cmd[x.end():]
    else:
        logfile = filename + ".out"

    # working directory - needs to be the one from which the
    # the script is called to resolve input files.
    cwd = os.getcwd()

    if "<(" in cmd or "|" in cmd:
        if "'" in cmd:
            raise ValueError(
                "advanced bash syntax `<()` combined with single quotes")
        cmd = """/bin/bash -c '%s'""" % cmd

    if "|" in cmd:
        if r"\|" not in cmd:
            E.warn("pipes (`|`) within command need to be escaped, "
                   "otherwise jobs run on submit host")

    c = '%s -v "BASH_ENV=%s" -q %s -p %i %s %s' % (
        options.cluster_cmd, options.bashrc, options.cluster_queue,
        options.cluster_priority, options.cluster_options, cmd)

    iteration = 0
    while 1:

        iteration += 1
        if iteration > 1:
            E.info("%s: re-submitting command (repeat=%i): %s" %
                   (filename, iteration, c))
        else:
            E.info("%s: submitting command: %s" % (filename, c))

        infile = IOTools.openFile(filename, "r")
        outfile = IOTools.openFile(filename + ".out", "w")
        errfile = IOTools.openFile(filename + ".err", "a")

        retcode = subprocess.call(c,
                                  shell=True,
                                  stdin=infile,
                                  stdout=outfile,
                                  stderr=errfile,
                                  cwd=cwd,
                                  close_fds=True)

        infile.close()
        outfile.close()
        errfile.close()

        if hasFinished(retcode, filename, options.output_tag, logfile):
            break

        if iteration > options.resubmit:
            E.warn("%s: giving up executing command: retcode=%i" %
                   (filename, retcode))
            break

        E.warn("%s: error while executing command: retcode=%i" %
               (filename, retcode))

    return (retcode, filename, cmd, logfile, iteration)
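
hasFinished is called above but not shown. From its arguments (return code, input filename, output tag, logfile) a plausible check is a clean exit plus the completion tag in the job output; this is a guess at its behaviour, not the original code:

def hasFinished(retcode, filename, output_tag, logfile):
    """Assumed completion check: the job must exit with status 0 and the
    completion tag must appear in the job's output file."""
    if retcode != 0:
        return False
    try:
        with open(filename + ".out") as inf:
            return any(output_tag in line for line in inf)
    except OSError:
        return False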
Example #22
0
def build_report():
    '''build report from scratch.'''

    E.info("starting documentation build process from scratch")
    P.run_report(clean=True)
Example #23
0
def getOptionParser():
    """create parser and add options."""

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--split-at-lines",
                      dest="split_at_lines",
                      type="int",
                      help="split jobs according to line number [%default].")

    parser.add_option(
        "--split-at-column",
        dest="split_at_column",
        type="int",
        help="split jobs according to column. Columns start at number 1 "
        "and the input should be sorted by this column [%default].")

    parser.add_option(
        "--group-by-regex",
        dest="group_by_regex",
        type="string",
        help="group jobs according to a regular expression [%default].")

    parser.add_option(
        "--split-at-regex",
        dest="split_at_regex",
        type="string",
        help="split jobs according to a regular expression [%default].")

    parser.add_option("--split-at-tag",
                      dest="split_at_tag",
                      type="int",
                      help="split a file at a tag [%default].")

    parser.add_option(
        "--chunk-size",
        dest="chunksize",
        type="int",
        help="when splitting at regex or tag, aggregate x entries [%default].")

    parser.add_option(
        "--debug",
        dest="debug",
        action="store_true",
        help="debug mode. Do not delete temporary file [%default].")

    parser.add_option(
        "--dry-run",
        dest="dry_run",
        action="store_true",
        help="dry run. Do not split input and simply forward stdin to stdout. "
        "Useful for debugging the command [%default].")

    parser.add_option("--input-header",
                      dest="input_header",
                      action="store_true",
                      help="The input stream contains a table header. "
                      "This header is replicated for each job [%default].")

    parser.add_option(
        "--output-header",
        dest="output_header",
        action="store_true",
        help="The output jobs contain a table header. "
        "The header is removed for each job except for the first [%default].")

    parser.add_option(
        "--output-regex-header",
        dest="output_regex_header",
        type="string",
        help="Regular expression for header (in stdout stream). Any lines "
        "before the first line matching this regular expression are ignored"
        "[%default].")

    parser.add_option(
        "--output-tag",
        dest="output_tag",
        type="string",
        help="The output jobs contain a tag in the last line denoting "
        "job completion. If the unix return value denotes an error, the "
        "presence of this tag is checked [%default].")

    parser.add_option(
        "--subdirs",
        dest="subdirs",
        action="store_true",
        help="Run within separate subdirs for jobs. This permits "
        "multiple output streams. Use a placeholder %DIR% if you supply "
        "the ouput pattern as a command line option [%default].")

    parser.add_option(
        "-T",
        "--temp-dir",
        dest="tmpdir",
        type="string",
        help="Temporary directory to be used. Default is the current "
        "directory [%default].")

    parser.add_option("--max-files",
                      dest="max_files",
                      type="int",
                      help="create at most x files [%default].")

    parser.add_option(
        "--max-lines",
        dest="max_lines",
        type="int",
        help="in addition to splitting into chunksize, also split if "
        "more than max-lines is reached [%default].")

    parser.add_option(
        "--renumber",
        dest="renumber",
        type="string",
        help="renumber ids consecutively, supply a pattern [%default].")

    parser.add_option(
        "--renumber-column",
        dest="renumber_column",
        type="string",
        action="append",
        help="specify column to renumber. The format is regex:column, "
        "for example csv:1 or csv:id [%default].")

    parser.add_option(
        "-r",
        "--reduce",
        dest="reduce",
        type="string",
        action="append",
        help="Add reduce functions for specific files. The format is "
        "file:reducer. The default reducer is 'table' for all files "
        "[%default].")

    parser.add_option(
        "-m",
        "--map",
        dest="map",
        type="string",
        action="append",
        help="Map specific columns in tables. The format is "
        "file:column:pattern, for example .table:1:%06i [%default].")

    parser.add_option("--resume",
                      dest="resume",
                      type="string",
                      help="resume aborted run from files in dir [%default]")

    parser.add_option("--collect",
                      dest="collect",
                      type="string",
                      help="collect files in dir and process as normally "
                      "[%default]")

    parser.add_option("--is-binary",
                      dest="binary",
                      action="store_true",
                      help="the output is binary - files are concatenated "
                      "without parsing [%default]")

    parser.add_option(
        "--resubmit",
        dest="resubmit",
        type="int",
        help="if a job fails, automatically resubmit # times. Set to 0 "
        "in order to disable resubmission [%default]")

    parser.add_option("--fail",
                      dest="resubmit",
                      action="store_false",
                      help="if a job fails, do not resubmit [%default]")

    parser.add_option("--bashrc",
                      dest="bashrc",
                      type="string",
                      help="bashrc file to use [%default]")

    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("multiprocessing", "threads", "drmaa"),
                      help="method to submit jobs [%default]")

    parser.add_option("--job-memory",
                      dest="job_memory",
                      type="string",
                      help="per-job memory requirement."
                      "Unit must be specified, eg. 100M, 1G ")

    parser.add_option(
        "-e",
        "--env",
        dest="environment",
        type="string",
        action="append",
        help="environment variables to be passed to the jobs [%default]")

    parser.add_option(
        "--output-filename-pattern",
        dest="output_pattern",
        type="string",
        help="Pattern for secondary output filenames. Should contain a '%s' "
        "[%default].")

    parser.set_defaults(
        split_at_lines=None,
        split_at_column=None,
        split_at_regex=None,
        group_by_regex=None,
        split_at_tag=None,
        chunksize=100,
        cluster_cmd='qrsh -cwd -now n',
        bashrc="~/.bashrc",
        input_header=False,
        output_header=False,
        output_regex_header=None,
        debug=False,
        dry_run=False,
        tmpdir="./",
        subdirs=False,
        renumber=None,
        output_tag="# job finished",
        map=[],
        reduce=[],
        resume=None,
        renumber_column=[],
        resubmit=5,
        collect=None,
        method="drmaa",
        job_memory=None,
        max_files=None,
        max_lines=None,
        binary=False,
        environment=[],
        output_pattern="%s",
    )

    # stop parsing options at the first argument
    parser.disable_interspersed_args()

    return parser
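
A minimal usage sketch of the parser factory above, following the E.start/E.stop pattern used in the other examples (the job-splitting body itself is omitted):

import sys

def main(argv=sys.argv):
    parser = getOptionParser()
    (options, args) = E.start(parser, argv=argv)
    # ... split the input stream, submit one job per chunk, collect results ...
    E.stop()

if __name__ == "__main__":
    sys.exit(main(sys.argv))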
Example #24
0
def update_report():
    '''update report.'''

    E.info("updating documentation")
    P.run_report(clean=False)
Example #25
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_mann_whitney_u.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m", "--method", dest="method", type="string",
                      help="method to use [ks=Kolmogorov-Smirnov,mwu=Mann-WhitneyU]")
    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file.", metavar="FILE")
    parser.add_option("-1", "--infile1", dest="filename_input1", type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2", "--infile2", dest="filename_input2", type="string",
                      help="input filename for distribution 2.")
    parser.add_option("-p", "--infile-map", dest="filename_input_map", type="string",
                      help="input filename for mapping categories to values.")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
    )

    (options, args) = E.start(parser,
                              add_pipe_options=True)

    map_category2value = {}
    if options.filename_input_map:
        map_category2value = IOTools.ReadMap(open(options.filename_input_map, "r"),
                                             map_functions=(str, float))

    values1, errors1 = IOTools.ReadList(open(options.filename_input1, "r"),
                                        map_category=map_category2value)
    values2, errors2 = IOTools.ReadList(open(options.filename_input2, "r"),
                                        map_category=map_category2value)

    E.info("ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i" % (len(values1), len(errors1),
                                                                 len(values2), len(errors2)))

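    # open a PNG graphics device if a hardcopy file was requested;
    # all plots below are then written to that file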
    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2)
    elif options.method == "mwu":
        result = R.wilcox_test(values1, values2, paired=False)

    R.assign("v1", values1)
    R.assign("v2", values2)

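    # 2x2 plot layout: boxplot, quantile-quantile plot,
    # relative and absolute frequency histograms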
    R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

    R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")

    R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );""")

    R("""hist( v1, freq=FALSE, width=0.5, density=10, main='Relative frequency histogram')""")
    R("""hist( v2, freq=FALSE, add=TRUE,   width=0.5, col='red', offset=0.5, density=20, angle=135)""")
    R("""hist( v1, freq=TRUE,  width=0.5, density=10, main='Absolute frequency histogram')""")
    R("""hist( v2, freq=TRUE,  add=TRUE,   width=0.5, col='red', offset=0.5, density=20, angle=135)""")

    print("## Results for %s" % result['method'])
    for x in ['p.value', 'statistic', 'alternative', 'method']:
        print(x, result[x])

    E.stop()
Example #26
0
def main(argv=None):

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option(
        "-f", "--fasta", dest="input_filename_fasta",
        type="string",
        help="filename with fasta sequences. ")

    parser.add_option(
        "-o", "--output-filename-sequences", dest="output_filename_sequences",
        type="string",
        help="output per sequence information to filename")

    parser.set_defaults(
        input_filename_fasta=None,
    )

    (options, args) = E.start(parser, argv=argv)

    if len(args) > 0:
        options.input_filename_fasta = args[0]

    sequence_pairs = []

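    # if a samtools faidx index (.fai) exists, take names and lengths from it
    # without reading the sequences; otherwise iterate over the records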
    if options.input_filename_fasta != "-" and os.path.exists(
            options.input_filename_fasta + ".fai"):
        has_index = 1
        fastafile = pysam.FastaFile(options.input_filename_fasta)
        sequence_pairs = list(zip(fastafile.references, fastafile.lengths))
    else:
        has_index = 0
        iterator = pysam.FastxFile(options.input_filename_fasta)
        for record in iterator:
            sequence_pairs.append(
                (record.name,
                 len(record.sequence)))

    lengths = numpy.array([x[1] for x in sequence_pairs])

    options.stdout.write("\t".join((
        "has_index", "nsequences", "total_length", "min_length",
        "max_length", "median_length", "mean_length")) + "\n")

    if len(lengths) > 0:
        options.stdout.write("\t".join(map(str, (
            has_index,
            len(sequence_pairs),
            lengths.sum(),
            lengths.min(),
            lengths.max(),
            numpy.median(lengths),
            lengths.mean()))) + "\n")
    else:
        options.stdout.write("\t".join(map(str, (
            has_index,
            len(sequence_pairs),
            0,
            "",
            "",
            "",
            ""))) + "\n")

    if options.output_filename_sequences:
        with IOTools.open_file(options.output_filename_sequences, "w") as outf:
            outf.write("name\tlength\n")
            outf.write(
                "\n".join(["\t".join(map(str, x)) for x in sequence_pairs]) + "\n")

    E.stop()
Example #27
0
def buildAlleles(sequence, variants, reference_start=0, phased=True):
    '''build alleles for ``sequence`` adding ``variants``.

    Variants are assumed to be in 0-based coordinates on the same strand as the sequence.
    ``reference_start`` is the position of the first base of ``sequence``. Set it to 0 if
    the positions in ``variants`` are relative to ``sequence``.
    '''
    def _delete(allele, del_start, del_end, variant, sequence, startoffset,
                endoffset, feature_start, feature_end):
        '''little helper: update ``allele`` with a deletion ``del_start:del_end``.
        '''

        # truncate variant according to the feature
        variant = variant[startoffset:len(variant) - endoffset]

        n = variant.count("-")
        if n:
            if variant.startswith("-"):
                del_start += n
                variant = variant[n:]
            else:
                del_end -= n
                variant = variant[:-n]

        # due to gaps, the variant is not actually within the feature
        if del_start >= del_end:
            return

        refseq = sequence[del_start:del_end].upper()

        assert refseq == variant, \
            'reference base mismatch at deletion: expected %s %s %s, got %s[%i:%i] at feature=%i-%i, variant=%i-%i, relative=%i-%i, del=%i-%i, action=%s' % \
            (sequence[del_start - 10:del_start],
             refseq,
             sequence[del_end:del_end + 10],
             variant, startoffset, len(variant) - endoffset,
             feature_start, feature_end,
             var_start, var_end,
             rel_start, rel_end,
             del_start, del_end,
             action)

        l = del_end - del_start

        # assert len("".join(allele[del_start:del_end])) == l, \
        #     "deletion conflicts with other indels: " \
        #     "got %s[%i:%i] (ref=%s, allele=%s) at feature=%i-%i, variant=%i-%i, relative=%i-%i, del=%i-%i, action=%s" % \
        #     (variant, startoffset, len(variant)-endoffset,
        #      refseq, str(allele[del_start:del_end]),
        #      feature_start, feature_end,
        #      var_start, var_end,
        #      rel_start, rel_end,
        #      del_start, del_end,
        #      action)

        allele[del_start:del_end] = [""] * l

    allele1 = list(sequence.lower())
    allele2 = list(sequence.lower())

    if reference_start is None:
        feature_start = 0
    else:
        feature_start = reference_start

    feature_end = feature_start + len(sequence)

    # main loop: insert variants into allele sequences
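    # each variant carries an action code:
    #   "="  substitution (SNP)
    #   "-"  deletion
    #   "+"  insertion
    #   ">"  insertion applied to allele 1, deletion applied to allele 2
    #   "<"  insertion applied to allele 2, deletion applied to allele 1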
    for var_start, var_end, reference, action, has_wildtype, variantseqs in variants:

        # skip variants that are out-of-range
        if var_end <= feature_start or var_start >= feature_end:
            continue

        is_homozygous = len(variantseqs) == 1 and not has_wildtype

        rel_start, rel_end = var_start - feature_start, var_end - feature_start
        startoffset = max(0, feature_start - var_start)
        endoffset = max(0, var_end - feature_end)
        pruned_start, pruned_end = max(0,
                                       rel_start), min(len(sequence), rel_end)

        if action == "=":

            if E.global_options.loglevel >= 10:
                E.debug(
                    "adding SNP at postition %i: reference=%s variants=%s" %
                    (var_start, reference, variantseqs))

            if allele1[rel_start] == "" or allele2[rel_start] == "":
                # these can be cases, where a base is deleted in one allele,
                # but recorded as a homozygous substitution in another allele.
                E.warn("substitution conflicts with a deletion - ignored: %s" %
                       str((var_start, var_end, reference, action,
                            has_wildtype, variantseqs)))
                continue

            assert rel_start >= 0
            assert sequence[rel_start].upper() == reference, \
                'reference base mismatch: expected %s %s %s, got %s at feature=%i-%i, variant=%i-%i, relative=%i-%i, pruned=%i-%i, action=%s' % \
                (sequence[rel_start - 10:rel_start],
                 sequence[rel_start].upper(),
                 sequence[rel_start + 1:rel_start + 10],
                 reference,
                 feature_start, feature_end,
                 var_start, var_end,
                 rel_start, rel_end,
                 pruned_start, pruned_end,
                 action)

            if phased:
                allele1[rel_start] = variantseqs[0] + allele1[rel_start][1:]
                allele2[rel_start] = variantseqs[1] + allele2[rel_start][1:]
            elif is_homozygous:
                allele1[rel_start] = variantseqs[0] + allele1[rel_start][1:]
                allele2[rel_start] = variantseqs[0] + allele2[rel_start][1:]
            else:
                if has_wildtype:
                    if reference == variantseqs[0]:
                        allele2[rel_start] = variantseqs[1] + allele2[
                            rel_start][1:]
                    else:
                        allele2[rel_start] = variantseqs[0] + allele2[
                            rel_start][1:]
                else:
                    allele1[
                        rel_start] = variantseqs[0] + allele1[rel_start][1:]
                    allele2[
                        rel_start] = variantseqs[1] + allele2[rel_start][1:]

        elif action == "-":
            if phased:
                _delete(allele1, pruned_start, pruned_end, variantseqs[0],
                        sequence, startoffset, endoffset, feature_start,
                        feature_end)
                _delete(allele2, pruned_start, pruned_end, variantseqs[1],
                        sequence, startoffset, endoffset, feature_start,
                        feature_end)
            elif is_homozygous:
                _delete(allele1, pruned_start, pruned_end, variantseqs[0],
                        sequence, startoffset, endoffset, feature_start,
                        feature_end)
                _delete(allele2, pruned_start, pruned_end, variantseqs[0],
                        sequence, startoffset, endoffset, feature_start,
                        feature_end)
            else:
                if has_wildtype:
                    _delete(allele2, pruned_start, pruned_end, variantseqs[0],
                            sequence, startoffset, endoffset, feature_start,
                            feature_end)
                else:
                    _delete(allele1, pruned_start, pruned_end, variantseqs[0],
                            sequence, startoffset, endoffset, feature_start,
                            feature_end)
                    _delete(allele2, pruned_start, pruned_end, variantseqs[1],
                            sequence, startoffset, endoffset, feature_start,
                            feature_end)

        elif action == "+":
            # ignore insertions at position -1
            if rel_start < 0:
                continue

            if phased:
                allele1[rel_start] += variantseqs[0].upper()
                allele2[rel_start] += variantseqs[1].upper()
            elif is_homozygous:
                allele1[rel_start] += variantseqs[0].upper()
                allele2[rel_start] += variantseqs[0].upper()
            else:
                if has_wildtype:
                    allele2[rel_start] += variantseqs[0].upper()
                else:
                    allele1[rel_start] += variantseqs[0].upper()
                    allele2[rel_start] += variantseqs[1].upper()

        elif action == ">":
            # indel
            if rel_start >= 0:
                allele1[rel_start] += variantseqs[0].upper()
            _delete(allele2, pruned_start, pruned_end, variantseqs[1],
                    sequence, startoffset, endoffset, feature_start,
                    feature_end)

        elif action == "<":
            # delin
            if rel_start >= 0:
                allele2[rel_start] += variantseqs[1].upper()
            _delete(allele1, pruned_start, pruned_end, variantseqs[0],
                    sequence, startoffset, endoffset, feature_start,
                    feature_end)

    assert len(sequence) == len(allele1)
    assert len(sequence) == len(allele2)

    return (allele1, allele2)
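

# A minimal usage sketch for buildAlleles, not part of the original script.
# It assumes the variant tuple layout used in the loop above,
# (var_start, var_end, reference, action, has_wildtype, variantseqs),
# and that E.start() has been called so that E.global_options is initialised.
def _example_buildAlleles():
    sequence = "ACGTACGT"
    # heterozygous SNP at position 2 (0-based): G on one allele, A on the other
    variants = [(2, 3, "G", "=", True, ("G", "A"))]
    allele1, allele2 = buildAlleles(sequence, variants,
                                    reference_start=0, phased=True)
    # phased=True assigns variantseqs[0] to allele 1 and variantseqs[1] to allele 2
    return "".join(allele1), "".join(allele2)  # -> ("acGtacgt", "acAtacgt")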
Example #28
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id$",
        usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--glob", dest="glob_pattern", type="string",
        help="glob pattern to use for collecting files [%default].")

    parser.add_option(
        "-f", "--file-pattern", dest="file_pattern", type="string",
        help="only check files matching this pattern [%default].")

    parser.add_option("-m", "--mode", dest="mode", type="choice",
                      choices=("file", "node"),
                      help="analysis mode [%default].")

    parser.add_option(
        "-r", "--recursive", action="store_true",
        help="recursively look for logfiles from current directory "
        "[%default].")

    parser.set_defaults(
        truncate_sites_list=0,
        glob_pattern="*.log",
        mode="file",
        recursive=False,
    )

    (options, args) = E.Start(parser)

    if args:
        filenames = args
    elif options.glob_pattern:
        filenames = glob.glob(options.glob_pattern)

    if len(filenames) == 0:
        raise ValueError("no files to analyse")

    if options.mode == "file":
        totals = Logfile.LogFileData()

        options.stdout.write("file\t%s\n" % totals.getHeader())

        for filename in filenames:
            if filename == "-":
                infile = sys.stdin
            elif filename[-3:] == ".gz":
                infile = gzip.open(filename, "r")
            else:
                infile = open(filename, "r")

            subtotals = Logfile.LogFileData()
            for line in infile:
                subtotals.add(line)

            infile.close()

            options.stdout.write("%s\t%s\n" % (filename, str(subtotals)))
            totals += subtotals

        options.stdout.write("%s\t%s\n" % ("total", str(totals)))

    elif options.mode == "node":

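        # group log data by compute node: every "# job started ... on <node>"
        # line opens a new chunk that collects the lines following it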
        chunks_per_node = {}

        rx_node = re.compile(r"# job started at .* \d+ on (\S+)")

        for filename in filenames:
            if filename == "-":
                infile = sys.stdin
            elif filename[-3:] == ".gz":
                infile = gzip.open(filename, "r")
            else:
                infile = open(filename, "r")

            data = Logfile.LogFileDataLines()

            for line in infile:

                if rx_node.match(line):
                    node_id = rx_node.match(line).groups()[0]
                    data = Logfile.LogFileDataLines()
                    if node_id not in chunks_per_node:
                        chunks_per_node[node_id] = []
                    chunks_per_node[node_id].append(data)
                    continue

                data.add(line)

        options.stdout.write("node\t%s\n" % data.getHeader())
        total = Logfile.LogFileDataLines()

        for node, data in sorted(chunks_per_node.items()):
            subtotal = Logfile.LogFileDataLines()
            for d in data:
                # options.stdout.write( "%s\t%s\n" % (node, str(d) ) )
                subtotal += d

            options.stdout.write("%s\t%s\n" % (node, str(subtotal)))

            total += subtotal

        options.stdout.write("%s\t%s\n" % ("total", str(total)))

    E.Stop()
Example #29
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: split_fasta.py 1714 2007-12-11 16:51:12Z andreas $"
    )

    parser.add_option("-f",
                      "--file",
                      dest="input_filename",
                      type="string",
                      help="input filename. If not given, stdin is used.",
                      metavar="FILE")

    parser.add_option(
        "-i",
        "--input-pattern",
        dest="input_pattern",
        type="string",
        help="input pattern. Parses description line in order to extract id.")

    parser.add_option(
        "-o",
        "--output-filename-pattern",
        dest="output_pattern",
        type="string",
        help="output pattern. Gives filename for a given sequence.")

    parser.add_option(
        "-n",
        "--num-sequences",
        dest="num_sequences",
        type="int",
        help="split by number of sequences (not implemented yet).")

    parser.add_option("-m",
                      "--map",
                      dest="map_filename",
                      type="string",
                      help="map filename. Map identifiers to filenames",
                      metavar="FILE")

    parser.add_option("-s",
                      "--skip-identifiers",
                      dest="skip_identifiers",
                      action="store_true",
                      help="do not write identifiers.",
                      metavar="FILE")

    parser.add_option("--min-size",
                      dest="min_size",
                      type="int",
                      help="minimum cluster size.")

    parser.set_defaults(input_filename=None,
                        map_filename=None,
                        skip_identifiers=False,
                        input_pattern="^(\S+)",
                        min_size=0,
                        num_sequences=None,
                        output_pattern="%s")

    (options, args) = E.start(parser)

    if options.input_filename:
        infile = IOTools.open_file(options.input_filename, "r")
    else:
        infile = sys.stdin

    if options.map_filename:
        map_id2filename = IOTools.ReadMap(open(options.map_filename, "r"))
    else:
        map_id2filename = {}

    if options.num_sequences:
        files = FilesChunks(chunk_size=options.num_sequences,
                            output_pattern=options.output_pattern,
                            skip_identifiers=options.skip_identifiers)

    else:
        files = Files(output_pattern=options.output_pattern,
                      skip_identifiers=options.skip_identifiers)

    if options.input_pattern:
        rx = re.compile(options.input_pattern)
    else:
        rx = None

    ninput = 0
    noutput = 0
    identifier = None
    chunk = 0

    for seq in FastaIterator.iterate(infile):

        ninput += 1

        if rx:
            try:
                identifier = rx.search(seq.title).groups()[0]
            except AttributeError:
                print("# parsing error in description line %s" % (seq.title))
        else:
            identifier = seq.title

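        # with a mapping file, rename identifiers via the map and skip
        # sequences that have no entry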
        if map_id2filename:
            if identifier in map_id2filename:
                identifier = map_id2filename[identifier]
            else:
                continue

        files.Write(identifier, seq)
        noutput += 1

    if options.input_filename:
        infile.close()

    # delete all clusters below a minimum size
    # Note: this has to be done at the end, because
    # clusters sizes are only available once both the fasta
    # file and the map has been parsed.
    if options.min_size:
        ndeleted = files.DeleteFiles(min_size=options.min_size)
    else:
        ndeleted = 0

    if options.loglevel >= 1:
        print("# input=%i, output=%i, ndeleted=%i" %
              (ninput, noutput, ndeleted))

    E.stop()


def main(argv=None):

    if not argv:
        argv = sys.argv

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-e",
                      "--output-equivalent",
                      dest="write_equivalent",
                      action="store_true",
                      help="write equivalent entries [default=%default].")

    parser.add_option("-f",
                      "--output-full",
                      dest="write_full",
                      action="store_true",
                      help="write full gff entries [default=%default].")

    parser.add_option("-p",
                      "--add-percent",
                      dest="add_percent",
                      action="store_true",
                      help="add percentage columns [default=%default].")

    parser.add_option("-s",
                      "--ignore-strand",
                      dest="ignore_strand",
                      action="store_true",
                      help="ignore strand information [default=%default].")

    parser.set_defaults(
        write_equivalent=False,
        write_full=False,
        add_percent=False,
        ignore_strand=False,
        as_gtf=False,
    )

    (options, args) = E.start(parser, argv, add_output_options=True)

    if len(args) != 2:
        raise ValueError("two arguments required")

    input_filename1, input_filename2 = args

    # duplicated features cause a problem. Make sure
    # features are non-overlapping by running
    # gff_combine.py on GFF files first.

    E.info("reading data started")

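    # build one interval tree per contig from the second file; gene ids
    # from file 2 are kept for the overlap statistics below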
    idx, genes2 = {}, set()
    for e in GTF.readFromFile(IOTools.open_file(input_filename2, "r")):
        genes2.add(e.gene_id)
        if e.contig not in idx:
            idx[e.contig] = bx.intervals.intersection.Intersecter()
        idx[e.contig].add_interval(
            bx.intervals.Interval(e.start, e.end, value=e))

    overlaps_genes = []

    E.info("reading data finished: %i contigs" % len(idx))

    # outfile_diff and outfile_overlap not implemented
    # outfile_diff = getFile( options, "diff" )
    # outfile_overlap = getFile( options, "overlap" )
    overlapping_genes = set()

    genes1 = set()

    # iterate over exons
    with IOTools.open_file(input_filename1, "r") as infile:
        for this in GTF.iterator(infile):

            genes1.add(this.gene_id)

            try:
                intervals = idx[this.contig].find(this.start, this.end)
            except KeyError:
                continue

            others = [x.value for x in intervals]
            for other in others:
                overlapping_genes.add((this.gene_id, other.gene_id))

            # check for identical/half-identical matches
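            #   "=" both boundaries identical, "|" one shared boundary,
            #   "~" overlap only; the classification is currently unused
            #   because the diff/overlap outputs are not implemented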
            output = None
            for other in others:
                if this.start == other.start and this.end == other.end:
                    output, symbol = other, "="
                    break
            else:
                for other in others:
                    if this.start == other.start or this.end == other.end:
                        output, symbol = other, "|"
                        break
                else:
                    symbol = "~"

    # if outfile_diff != options.stdout: outfile_diff.close()
    # if outfile_overlap != options.stdout: outfile_overlap.close()

    outfile = None
    ##################################################################
    ##################################################################
    ##################################################################
    # print gene based information
    ##################################################################
    if overlapping_genes:
        outfile = getFile(options, "genes_ovl")
        outfile.write("gene_id1\tgene_id2\n")
        for a, b in sorted(overlapping_genes):
            outfile.write("%s\t%s\n" % (a, b))
        if outfile != options.stdout:
            outfile.close()

        outfile_total = getFile(options, "genes_total")
        outfile_total.write(
            "set\tngenes\tnoverlapping\tpoverlapping\tnunique\tpunique\n")

        outfile = getFile(options, "genes_uniq1")
        b = set([x[0] for x in overlapping_genes])
        d = genes1.difference(b)
        outfile.write("gene_id1\n")
        outfile.write("\n".join(sorted(d)) + "\n")
        if outfile != options.stdout:
            outfile.close()
        outfile_total.write(
            "%s\t%i\t%i\t%5.2f\t%i\t%5.2f\n" %
            (os.path.basename(input_filename1), len(genes1), len(b),
             100.0 * len(b) / len(genes1), len(d), 100.0 * len(d) / len(genes1)))

        outfile = getFile(options, "genes_uniq2")
        b = set([x[1] for x in overlapping_genes])
        d = genes2.difference(b)
        outfile.write("gene_id2\n")
        outfile.write("\n".join(sorted(d)) + "\n")
        if outfile != options.stdout:
            outfile.close()

        outfile_total.write(
            "%s\t%i\t%i\t%5.2f\t%i\t%5.2f\n" %
            (os.path.basename(input_filename2), len(genes2), len(b),
             100.0 * len(b) / len(genes2), len(d), 100.0 * len(d) / len(genes2)))
        if outfile_total != options.stdout:
            outfile_total.close()

    E.stop()