Example #1
def buildFastQCSummaryStatus(infiles, outfile, datadir):
    '''load fastqc status summaries into a single table.'''

    outf = IOTools.openFile(outfile, "w")
    names = set()
    results = []
    for infile in infiles:
        track = P.snip(os.path.basename(infile), ".fastqc")
        filename = os.path.join(datadir,
                                track + "*_fastqc",
                                "fastqc_data.txt")
        
        # there can be missing sections
        for fn in glob.glob(filename):
            stats = collections.defaultdict(str)
            for name, status, header, data in FastqcSectionIterator(
                    IOTools.openFile(fn)):
                stats[name] = status

            results.append((track, fn, stats))
            names.update(stats.keys())
            
    names = list(names)
    outf.write("track\tfilename\t%s\n" % "\t".join(names))
    for track, fn, stats in results:
        outf.write("%s\t%s\t%s\n" %
                   (track, os.path.dirname(fn),
                    "\t".join(stats[x] for x in names)))
    outf.close()
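# Note: "FastqcSectionIterator" (and the IOTools/P helpers) used above belong to
# the surrounding pipeline code and are not shown in these examples.  The sketch
# below only illustrates what such a section iterator could look like, assuming
# the usual fastqc_data.txt layout of ">>Section name<TAB>status" blocks ending
# in ">>END_MODULE"; the real helper may differ.
def _fastqc_section_iterator_sketch(lines):
    '''yield (name, status, header, data) for each section of a FastQC report.'''
    name, status, header, data = None, None, None, []
    for line in lines:
        line = line.rstrip("\n")
        if line.startswith(">>END_MODULE"):
            yield name, status, header, data
            name, status, header, data = None, None, None, []
        elif line.startswith(">>"):
            fields = line[2:].split("\t")
            name, status = fields[0], fields[-1]
        elif line.startswith("#"):
            header = line
        else:
            data.append(line)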
Example #2
def summarizeAllProcessing( infiles, outfile ):
    '''summarize processing information.'''

    outf = IOTools.openFile( outfile, "w" )
    data = []
    for infile in infiles:
        inf = IOTools.openFile( infile )
        for line in inf:
            track, step, pair, ninput, noutput = line[:-1].split("\t")
            if track == "track": continue
            data.append( (track, step, pair, ninput, noutput) )
            
    # sort by track, pair, input
    data.sort( key = lambda x: (x[0], x[2], -int(x[3])))
    first = True
    for key, v in itertools.groupby( data, lambda x: (x[0], x[2])):
        vals = list(v)
        track,pair = key
        ninput = int(vals[0][3])
        outputs = [int(x[4]) for x in vals]
        if first:
            outf.write( "track\tpair\tninput\t%s\t%s\t%s\t%s\n" % ("\t".join( [x[1] for x in vals] ),
                                                                   "noutput",
                                                                   "\t".join( ["percent_%s" % x[1] for x in vals] ),
                                                                   "percent_output" ))
            first = False
        outf.write( "%s\t%s\t%i\t%s\t%i\t%s\t%s\n" % ( track, pair, ninput, 
                                                       "\t".join( map(str,outputs)),
                                                       outputs[-1], 
                                                       "\t".join( [ "%5.2f" % (100.0 * x / ninput) for x in outputs ] ),
                                                       "%5.2f" % (100.0 * outputs[-1] / ninput)))
    outf.close()
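# itertools.groupby only groups *adjacent* items, which is why the data list is
# sorted on (track, pair) before grouping above.  A minimal illustration:
#
#   >>> data = [("a", 1), ("b", 2), ("a", 3)]
#   >>> [k for k, v in itertools.groupby(data, lambda x: x[0])]
#   ['a', 'b', 'a']     # unsorted input: "a" appears as two groups
#   >>> [k for k, v in itertools.groupby(sorted(data), lambda x: x[0])]
#   ['a', 'b']          # sorted input: one group per key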
def annotate(infile, outfile, geneset):
    '''
    annotate NOGs into functional categories
    '''
    annotation = {}
    E.info("loading geneset")
    anno = IOTools.openFile(geneset)
    for line in anno.readlines():
        data = line[:-1].split("\t")
        nog, funccat = data[1], data[3]
        annotation[nog] = funccat
    E.info("finished loading gene set")

    E.info("annotating infile")
    inf = IOTools.openFile(infile)
    header = inf.readline()
    outf = IOTools.openFile(outfile, "w")
    outf.write(header[:-1] + "\ttaxa\n")
    for line in inf.readlines():
        data = line[:-1].split("\t")
        nog = data[0]
        try:
            pathway = annotation[nog]
        except KeyError:
            pathway = "Function unknown"
        outf.write(line[:-1] + "\t" + pathway + "\n")
    outf.close()
Example #4
def filterGTF(gtf, filterstring, tempout):

    if "!=" in filterstring:
        column, value = filterstring.split("!=")
        value = value.split("+")
        filtertype = "notin"

    elif "=" in filterstring:
        column, value = filterstring.split("=")
        value = value.split("+")
        filtertype = "in"

    elif "-in_file-" in filterstring:
        column, value = filterstring.split("-in_file-")
        value = [line.strip() for line in IOTools.openFile(value)]
        filtertype = "in_file"

    elif "-notin_file-" in filterstring:
        column, value = filterstring.split("-notin_file-")
        value = [line.strip() for line in IOTools.openFile(value)]
        filtertype = "notin_file"

    elif "-morethan-" in filterstring:
        column, value = filterstring.split("-morethan-")
        value = float(value)
        filtertype = "morethan"

    elif "-lessthan-" in filterstring:
        column, value = filterstring.split("-lessthan-")
        value = float(value)
        filtertype = "lessthan"

    gfile = IOTools.openFile(gtf)
    G = GTF.iterator(gfile)

    out = IOTools.openFile(tempout, "w")
    for item in G:
        D = item.asDict()
        D['contig'] = item.contig
        D['source'] = item.source
        D['feature'] = item.feature
        D['start'] = item.start
        D['end'] = item.end
        D['strand'] = item.strand
        D['frame'] = item.frame

        if filtertype == "in" or filtertype == 'in_file':
            if D[column] in value:
                out.write("%s\n" % str(item))
        elif filtertype == "notin" or filtertype == 'notin_file':
            if D[column] not in value:
                out.write("%s\n" % str(item))
        elif filtertype == "morethan":
            if float(D[column]) > value:
                out.write("%s\n" % str(item))
        elif filtertype == "lessthan":
            if float(D[column]) < value:
                out.write("%s\n" % str(item))
    out.close()
    gfile.close()
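# A usage sketch for the filter-string mini-language parsed above (file names
# and attribute values are purely illustrative):
#
#   filterGTF("in.gtf.gz", "gene_biotype=protein_coding+lincRNA", "out.gtf")  # "in"
#   filterGTF("in.gtf.gz", "gene_id-notin_file-blacklist.txt", "out.gtf")     # "notin_file"
#   filterGTF("in.gtf.gz", "exon_number-lessthan-2", "out.gtf")               # "lessthan"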
Example #5
def buildInputFiles(infile, outfiles):
    '''
    build input file based on parameters and fasta sequences
    that primers are to be designed for
    '''
    PARAMS["constraints_primer_mispriming_library"] = glob.glob("mispriming.dir/*.lib")[0]

    fasta, identifiers = infile[0], "identifiers.tsv"

    E.info("Reading ids for primer design")
    ids = readIdentifiers(identifiers)
    
    E.info("collecting sequences")
    for f in FastaIterator.iterate(IOTools.openFile(fasta)):
        if f.title in ids:
            outf = IOTools.openFile(os.path.join(
                "input.dir",f.title.replace(" ", "_").replace("/","_") + ".input").replace('"', ''), "w")
            seq = f.sequence
            outf.write("SEQUENCE_ID=%s\n" % f.title)
            for key, value in PARAMS.iteritems():
                if "constraints" in key:
                    outf.write("%s=%s\n" % (key.replace("constraints_", "").upper(), value))
            outf.write("SEQUENCE_TEMPLATE=%s\n=\n" % seq)
            outf.close()
Example #6
def chunk_iterator_lines(infile, args, prefix, use_header=False):
    """split by lines."""

    chunk_size = args[0]
    n = 0
    filename = "%s/%010i.in" % (prefix, n)
    outfile = IOTools.openFile(filename, "w")
    header = None

    for line in infile:
        if line[0] == "#":
            continue

        if not header and n == 0 and use_header:
            header = line
            outfile.write(header)
            continue

        n += 1

        if n % chunk_size == 0:
            outfile.close()
            yield filename
            filename = "%s/%010i.in" % (prefix, n)
            outfile = IOTools.openFile(filename, "w")
            if header:
                outfile.write(header)

        outfile.write(line)
    outfile.close()
    yield filename
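# A usage sketch (directory and helper names are hypothetical): args[0] is the
# number of lines per chunk; the generator writes numbered files under "prefix"
# and yields each filename as soon as it is complete, including the final,
# possibly partial, chunk.
#
#   with IOTools.openFile("input.tsv.gz") as inf:
#       for chunk in chunk_iterator_lines(inf, args=[100000],
#                                         prefix="chunks.dir", use_header=True):
#           run_on_cluster(chunk)   # hypothetical downstream step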
    def copy(src, dst, name):

        # remove "template" and the pipeline type from file/directory
        # names.
        fn_dest = os.path.join(
            destination_dir,
            dst,
            rx_type.sub("", rx_file.sub(name, src)))

        fn_src = os.path.join(srcdir,
                              "pipeline_template_data", src)

        E.debug("fn_src=%s, fn_dest=%s, src=%s, dest=%s" %
                (fn_src, fn_dest, src, dst))

        if os.path.exists(fn_dest) and not options.force:
            raise OSError(
                "file %s already exists - not overwriting." % fn_dest)

        if fn_src.endswith(".png"):
            shutil.copyfile(fn_src, fn_dest)
        else:
            with IOTools.openFile(fn_dest, "w") as outfile:
                with IOTools.openFile(fn_src) as infile:
                    for line in infile:
                        outfile.write(rx_reportdir.sub(reportdir,
                                                       rx_template.sub(name, line)))
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $", 
                             usage = globals()["__doc__"] )

    parser.add_option("-a", "--fastq1", dest="fastq1", type="string",
                      help="supply read1 fastq file"  )
    parser.add_option("-b", "--fastq2", dest="fastq2", type="string",
                      help="supply read2 fastq file"  )

    ## add common options (-h/--help, ...) and parse command line 
    (options, args) = E.Start( parser, argv = argv )

    fastq1 = IOTools.openFile(options.fastq1)
    fastq2 = IOTools.openFile(options.fastq2)

    E.info("iterating over fastq files")
    f1_count = 0
    for f1, f2 in itertools.izip_longest(Fastq.iterate(fastq1), Fastq.iterate(fastq2)):
        if not (f1 and f2):
            raise PairedReadError("unpaired reads detected. Are files sorted? "
                                  "Are files of equal length?")
        else:
            assert f1.identifier.endswith("/1") and f2.identifier.endswith("/2"), "Reads in file 1 must end with /1 and reads in file 2 with /2"
            options.stdout.write(">%s\n%s\n>%s\n%s\n" % (f1.identifier, f1.seq, f2.identifier, f2.seq))
            f1_count += 1
def buildForegroundSets(infiles, outfile):
    '''
    build multiset of genes that are differentially
    expressed based on cluster assignments
    '''
    clusters, probe2gene_file = infiles

    # read probe 2 gene map
    probe2gene = {}
    probe2gene_file = IOTools.openFile(probe2gene_file)
    for line in probe2gene_file.readlines():
        data = line[:-1].split("\t")
        probe, gene = [x.replace('"', '') for x in data]
        probe2gene[probe] = gene
        
    # read probe 2 cluster map
    probe2cluster = {}
    clusters = IOTools.openFile(clusters)
    clusters.readline()
    for line in clusters.readlines():
        data = line[:-1].split("\t")
        probe, cluster = data
        probe2cluster[probe] = cluster
    
    # output genes in each cluster
    for c in set(probe2cluster.values()):
        outname = "pathways.dir/C%s.foreground" % c
        outf = IOTools.openFile(outname, "w")
        for probe, cluster in probe2cluster.iteritems():
            if cluster == c:
                outf.write("%s\n" % probe2gene[probe])
        outf.close()
def mergeAdaptorFasta(infiles, outfile):
    '''
    Merge fasta files of adapter contamination,
    include reverse complement, remove duplicate sequences
    '''

    fasta_dict = {}
    for each in infiles:
        with IOTools.openFile(each, "r") as inf:
            for line in inf:
                if line[0] == '>':
                    adapt = line.lstrip(">").rstrip("\n")
                    fasta_dict[adapt] = set()
                    fasta_dict[adapt + "_R"] = set()
                else:
                    seq = line.rstrip("\n")
                    rev_seq = reverseComplement(seq)
                    fasta_dict[adapt].add(seq)
                    fasta_dict[adapt + "_R"].add(rev_seq)

    # if there are no adapters to remove break the pipeline here
    if not fasta_dict:
        raise AttributeError("There are no overrepresented sequences in "
                             "these fastq files.  Please turn off this "
                             "feature and re-run the pipeline")

    with IOTools.openFile(outfile, "w") as outf:
        for key, value in fasta_dict.items():
            outf.write(">%s\n%s\n" % (key, list(value)[0]))
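# "reverseComplement" is called above but not defined in this excerpt; a minimal
# sketch for plain nucleotide sequences (the pipeline's own helper may differ):
def _reverse_complement_sketch(seq):
    complement = {"A": "T", "T": "A", "C": "G", "G": "C", "N": "N",
                  "a": "t", "t": "a", "c": "g", "g": "c", "n": "n"}
    return "".join(complement.get(base, "N") for base in reversed(seq))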
def exportSequencesFromBedFile( infile, outfile, masker = None, mode = "intervals" ):
    '''export sequences for intervals in :term:`bed`-formatted *infile* 
    to :term:`fasta` formatted *outfile*
    '''

    track = P.snip( infile, ".bed.gz" )

    fasta = IndexedFasta.IndexedFasta( os.path.join( PARAMS["genome_dir"], PARAMS["genome"] ) )
    outs = IOTools.openFile( outfile, "w")

    ids, seqs = [], []
    for bed in Bed.setName(Bed.iterator( IOTools.openFile(infile) )):
        lcontig = fasta.getLength( bed.contig )

        if mode == "intervals":
            seqs.append( fasta.getSequence( bed.contig, "+", bed.start, bed.end) )
            ids.append( "%s_%s %s:%i..%i" % (track, bed.name, bed.contig, bed.start, bed.end) )

        elif mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0,bed.start-l), bed.end-l
            ids.append( "%s_%s_l %s:%i..%i" % (track, bed.name, bed.contig, start, end) )
            seqs.append( fasta.getSequence( bed.contig, "+", start, end) )
            
            start, end = bed.start+l, min(lcontig,bed.end+l)
            ids.append( "%s_%s_r %s:%i..%i" % (track, bed.name, bed.contig, start, end) )
            seqs.append( fasta.getSequence( bed.contig, "+", start, end) )
            
    masked = maskSequences( seqs, masker )
    outs.write("\n".join( [ ">%s\n%s" % (x,y) for x,y in zip(ids, masked) ] ) )

    outs.close()
def GATKpreprocessing(infile, outfile):
    '''Reorders BAM according to reference fasta and adds read groups using
       SAMtools, realigns around indels and recalibrates base quality scores
       using GATK'''

    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.getTempDir()
    job_memory = PARAMS["gatk_memory"]

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"],
                           PARAMS["genome"])

    outfile1 = outfile.replace(".bqsr", ".readgroups.bqsr")
    outfile2 = outfile.replace(".bqsr", ".realign.bqsr")

    PipelineExome.GATKReadGroups(infile, outfile1, genome,
                                 PARAMS["readgroup_library"],
                                 PARAMS["readgroup_platform"],
                                 PARAMS["readgroup_platform_unit"])

    PipelineExome.GATKIndelRealign(outfile1, outfile2, genome,
                                   PARAMS["gatk_threads"])

    IOTools.zapFile(outfile1)

    PipelineExome.GATKBaseRecal(outfile2, outfile, genome,
                                PARAMS["gatk_dbsnp"],
                                PARAMS["gatk_solid_options"])
    IOTools.zapFile(outfile2)
def buildCoverageStats(infile, outfile):
    '''Generate coverage statistics for regions of interest from a
       bed file using Picard'''

    # TS check whether this is always required or specific to current baits file

    # baits file requires modification to make picard accept it
    # this is performed before CalculateHsMetrics
    to_cluster = USECLUSTER
    baits = PARAMS["roi_baits"]
    modified_baits = infile + "_temp_baits_final.bed"
    regions = PARAMS["roi_regions"]
    statement = '''samtools view -H %(infile)s > %(infile)s_temp_header.txt;
                awk 'NR>2' %(baits)s |
                awk -F '\\t' 'BEGIN { OFS="\\t" } {print $1,$2,$3,"+",$4;}'
                > %(infile)s_temp_baits.bed;
                cat  %(infile)s_temp_header.txt %(infile)s_temp_baits.bed
                > %(modified_baits)s; checkpoint ;
                rm -rf %(infile)s_temp_baits.bed %(infile)s_temp_header.txt
                '''
    P.run()

    PipelineMappingQC.buildPicardCoverageStats(
        infile, outfile, modified_baits, modified_baits)

    IOTools.zapFile(modified_baits)
Example #14
def buildFastQCSummaryStatus(infiles, outfile):
    '''load fastqc status summaries into a single table.'''

    outf = IOTools.openFile(outfile, "w")
    first = True
    for infile in infiles:
        track = P.snip(infile, ".fastqc")
        filename = os.path.join(
            PARAMS["exportdir"], "fastqc", track + "*_fastqc",
            "fastqc_data.txt")

        for fn in glob.glob(filename):
            names, stats = [], []
            for name, status, header, data in FastqcSectionIterator(
                    IOTools.openFile(fn)):
                stats.append(status)
                names.append(name)

            if first:
                outf.write("track\tfilename\t%s\n" % "\t".join(names))
                first = False

            outf.write("%s\t%s\t%s\n" %
                       (track, os.path.dirname(fn), "\t".join(stats)))
    outf.close()
Example #15
def extractEBioinfo(eBio_ids, vcfs, outfile):
    '''find the number of mutations identified in previous studies (eBio_ids)
    for the mutated genes in the vcfs'''

    genes = set()

    # note: only the first vcf is parsed here (the loop breaks after one file)
    for vcf in vcfs[:1]:
        infile = VCF.VCFFile(IOTools.openFile(vcf))
        for vcf_entry in infile:
            # assumes all vcf entries without "REJECT" are "PASS"
            if vcf_entry.filter != "REJECT":
                info_entries = vcf_entry.info.split(";")
                for entry in info_entries:
                    if "SNPEFF_GENE_NAME" in entry:
                        genes.update((entry.split("=")[1],))

    eBio_ids = IOTools.openFile(eBio_ids, "r")

    tissue_counts = collections.defaultdict(
        lambda: collections.defaultdict(
            lambda: collections.defaultdict(int)))

    for line in eBio_ids:
        tissue, study, table = line.strip().split("\t")
        for gene in genes:
            url = ("http://www.cbioportal.org/webservice.do?cmd=getProfileData&"
                   "case_set_id=%(study)s_all&genetic_profile_id=%(table)s&"
                   "gene_list=%(gene)s" % locals())
            print url
            df = pd.io.parsers.read_csv(url, comment="#", sep="\t",
                                        header=False, index_col=0)

            # check dataframe contains data!
            if df.shape[0] != 0:
                tissue_counts[tissue][gene]["total"] += df.shape[1]-2
                tissue_counts[tissue][gene]["mutations"] += int(df.count(1))-1

    out = IOTools.openFile(outfile, "w")

    tissues = tissue_counts.keys()

    out.write("gene\t%s\n" % "\t".join([
        "%s_frequency" % x.replace(" ", "_") for x in tissues]))

    for gene in genes:
        freq_values = []
        for tissue in tissues:
            total = tissue_counts[tissue][gene]["total"]
            mutations = tissue_counts[tissue][gene]["mutations"]
            print "total: ", total, "mutations: ", mutations
            freq_values.append(np.divide(float(mutations), total))

        out.write("%s\t%s\n" % (gene, "\t".join(map(str, freq_values))))

    out.close()
def intersectBedFiles(infiles, outfile):
    '''merge :term:`bed` formatted *infiles* by intersection
    and write to *outfile*.

    Only intervals that overlap in all files are retained.
    Interval coordinates are given by the first file in *infiles*.

    Bed files are normalized (overlapping intervals within 
    a file are merged) before intersection. 

    Intervals are renumbered starting from 1.
    '''

    if len(infiles) == 1:

        shutil.copyfile(infiles[0], outfile)

    elif len(infiles) == 2:

        if IOTools.isEmpty(infiles[0]) or IOTools.isEmpty(infiles[1]):
            P.touch(outfile)
        else:
            statement = '''
        intersectBed -u -a %s -b %s 
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        | bgzip > %%(outfile)s 
        ''' % (infiles[0], infiles[1])
            P.run()

    else:

        tmpfile = P.getTempFilename(".")

        # need to merge incrementally
        fn = infiles[0]
        if IOTools.isEmpty(infiles[0]):
            P.touch(outfile)
            return

        statement = '''mergeBed -i %(fn)s > %(tmpfile)s'''
        P.run()

        for fn in infiles[1:]:
            if IOTools.isEmpty(fn):
                P.touch(outfile)
                os.unlink(tmpfile)
                return

            statement = '''mergeBed -i %(fn)s | intersectBed -u -a %(tmpfile)s -b stdin > %(tmpfile)s.tmp; mv %(tmpfile)s.tmp %(tmpfile)s'''
            P.run()

        statement = '''cat %(tmpfile)s
        | cut -f 1,2,3,4,5 
        | awk 'BEGIN { OFS="\\t"; } {$4=++a; print;}'
        | bgzip
        > %(outfile)s '''
        P.run()

        os.unlink(tmpfile)
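# A usage sketch (file names are illustrative): output coordinates come from the
# first file, and an interval must be present in every input to be retained.
#
#   intersectBedFiles(["replicate1.bed.gz", "replicate2.bed.gz",
#                      "replicate3.bed.gz"], "consensus.bed.gz")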
Example #17
def fetchProbeFragments(probe_bed, digest_bed, outfile,
                        lookup_out):

    digest_fragments = pysam.TabixFile(digest_bed)
    bed = Bed.Bed()
    with IOTools.openFile(outfile, "w") as outf, \
         IOTools.openFile(lookup_out,"w") as lookup:

        lookup.write("probe\tfragment\n")
        for probe in Bed.iterator(IOTools.openFile(probe_bed)):
            
            frag = digest_fragments.fetch(probe.contig,
                                          probe.start,
                                          probe.end,
                                          parser=pysam.asBed())
            frag = list(frag)
            if not len(frag) == 1:
                E.warn("%i fragments found for probe %s, skipping" %
                       (len(frag), probe.name))
                continue

            frag = frag[0]
            bed.start = frag.start
            bed.end = frag.end
            bed.contig = frag.contig
            bed["name"] = probe.name
            bed["score"] = "."
            bed["strand"] = "+"

            lookup.write("%s\t%s\n" % (probe.name, frag.name))
            outf.write(str(bed) + "\n")
Example #18
def chunk_iterator_regex_split(infile, args, prefix, use_header=False):
    """split where regular expression is true.
    """

    rex = args[0]
    chunk_size = args[2]
    max_lines = args[3]

    nlines = 0
    n = 0
    filename = "%s/%010i.in" % (prefix, n)
    outfile = IOTools.openFile(filename, "w")

    for line in infile:

        if line[0] == "#":
            continue

        if rex.search(line[:-1]):
            if n > 0 and (n % chunk_size == 0 or
                          (max_lines and nlines > max_lines)):
                outfile.close()
                yield filename
                filename = "%s/%010i.in" % (prefix, n)
                outfile = IOTools.openFile(filename, "w")
                nlines = 0

            n += 1

        outfile.write(line)
        nlines += 1

    outfile.close()
    yield filename
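# A usage sketch (paths are hypothetical).  Note that "args" is positional:
# args[0] is a compiled regex marking the start of a record, args[2] the number
# of records per chunk and args[3] an optional line limit (args[1] is not used
# by this iterator).
#
#   rex = re.compile("^>")   # e.g. start a new record at each FASTA header
#   for chunk in chunk_iterator_regex_split(inf, [rex, None, 100, None],
#                                           prefix="chunks.dir"):
#       process_chunk(chunk)   # hypothetical downstream step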
def getProbeFragments(probe_bed, digest_bed, outfile,
                        lookup_out):
    
    # First find the length of the restriction enzyme cut, required to obtain the start and end coordinates
    # from the pregenerated file.
    # First iteration, no comparison
    first_iteration = True
    
    length_RE_cut = 0
    
    last_bed = None
    
    for bed_digest in Bed.iterator(IOTools.openFile(digest_bed)):
                
        if first_iteration:
            first_iteration = False
        else:
            # If they are in the same contig they can be compared
            if bed_digest.contig == last_bed.contig:
                length_RE_cut = bed_digest.start - last_bed.end
                break
        
        last_bed = bed_digest
    
    
    digest_fragments = pysam.TabixFile(digest_bed)
    bed = Bed.Bed()
    with IOTools.openFile(outfile, "w") as outf, \
         IOTools.openFile(lookup_out,"w") as lookup:

        lookup.write("probe\tfragment\n")
        for probe in Bed.iterator(IOTools.openFile(probe_bed)):
            
            frag = digest_fragments.fetch(probe.contig,
                                          probe.start,
                                          probe.end,
                                          parser=pysam.asBed())
            frag = list(frag)
            if not len(frag) == 1:
                E.warn("%i fragments found for probe %s, skipping" %
                       (len(frag), probe.name))
                continue

            frag = frag[0]
            
            # The restriction enzyme cut on the left side of the fragment
            # is the end site of the last restriction enzyme fragment + 1
            # (+1 because according to the manual coordinates are specified
            # in 1-origin for the bed start.)
            
            bed.start = frag.start-length_RE_cut+1
            bed.end = frag.end+length_RE_cut
            bed.contig = frag.contig
            bed["name"] = probe.name
            bed["score"] = "."
            bed["strand"] = "+"

            lookup.write("%s\t%s\n" % (probe.name, frag.name))
            outf.write(str(bed) + "\n")
Example #20
def iterator_psl_intervals(options):
    """iterate over psl file yield an entry together with overlapping entries.

    returns tuples of (match, list(query_intervals), list(target_intervals))
    """

    if options.filename_filter_query:
        intervals_query = readIntervals(
            IOTools.openFile(options.filename_filter_query, "r"), options)
    else:
        intervals_query = None

    if options.filename_filter_target:
        intervals_target = readIntervals(
            IOTools.openFile(options.filename_filter_target, "r"), options)
    else:
        intervals_target = None

    iterator = Blat.BlatIterator(options.stdin)

    ninput = 0

    while 1:

        match = iterator.next()
        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if options.loglevel >= 1 and ninput % options.report_step == 0:
            options.stdlog.write("# progress: ninput=%i\n" % (ninput))
            options.stdlog.flush()

        qx, tx = None, None
        if intervals_query:
            try:
                qx = list(
                    intervals_query.get(match.mQueryId, match.mQueryFrom, match.mQueryTo))
            except KeyError:
                qx = []

        if intervals_target:
            try:
                tx = list(
                    intervals_target.get(match.mSbjctId, match.mSbjctFrom, match.mSbjctTo))
            except KeyError:
                tx = []

        if options.loglevel >= 2:
            options.stdlog.write(
                "###################################################\n")
            options.stdlog.write("# testing match %s\n" % (str(match)))
            options.stdlog.write(
                "###################################################\n")

        yield match, qx, tx
Example #21
    def getRunStatement(self, infile, outfile, controlfile):
        """
        Generate a specific run statement for each peakcaller class
        """
        # select location of the spp script to run
        if self.PARAMS_PEAKCALLER["spp_options_idr_script"] == "default":
            executable = IOTools.which("run_spp.R")
        elif self.PARAMS_PEAKCALLER["spp_options_idr_script"] == "nodups":
            executable = IOTools.which("run_spp_nodups.R")
        else:
            executable = self.PARAMS_PEAKCALLER["spp_options_idr_script"]
            if not os.path.exists(executable):
                raise IOError("SPP script not found: %s" % executable)

        # select the threshold for lax peak calling
        if self.PARAMS_PEAKCALLER["spp_options_npeaks"]:
            if self.PARAMS_PEAKCALLER["spp_options_fdr"]:
                raise Exception("Value specified for both SPP options"
                                " -npeaks and -fdr please select one or"
                                " other option, but not both")
            else:
                threshold = "-npeaks=" + \
                    str(self.PARAMS_PEAKCALLER["spp_options_npeaks"])
        elif self.PARAMS_PEAKCALLER["spp_options_fdr"]:
            threshold = "-fdr=" + \
                str(self.PARAMS_PEAKCALLER["spp_options_fdr"])
        else:
            raise Exception("Must specify a value for either"
                            " spp_options_npeaks or spp_options_fdr,"
                            " but not both")

        # build run statement for spp.
        # -savn is output.npeak.file (passed as NULL,
        #                             means filename based on infile)
        # -out is output.result.file
        # -odir defaults to os.path.dirname( infile )
        # -savn is save narrowpeak file
        # -savr is save regionpeak file
        #  (run_spp.R script throws an error if region peak is not output).
        statement = [("Rscript %(executable)s"
                      " -c=%(infile)s"
                      " -i=%(controlfile)s"
                      " %(threshold)s"
                      " -savn"
                      " -savr")]

        # add additional options
        statement.append(self.PARAMS_PEAKCALLER["spp_options_parameters"])

        # specify outfile
        statement.append(" -rf"
                         " -out=/stats/phantomPeakStatsReps.tab"
                         " >& %(outfile)s")

        statement = (" ".join(statement) % locals())

        return statement
Example #22
def loadFastqc(filename,
               backend="sqlite",
               database="csvdb",
               host="",
               username="",
               password="",
               port=3306):
    '''load FASTQC statistics into database.

    Each section will be uploaded to its own table.

    Arguments
    ----------
    filename : string
        Filename with FASTQC data
    backend : string
        Database backend. Only this is required for an SQLite database.
    database : string
        Name of the database.
    host : string
        Database host name
    username : string
        Database user name
    password : string
        Database password
    port : int
        Database server port.
    '''

    parser = CSV2DB.buildParser()
    (options, args) = parser.parse_args([])

    options.database_backend = backend
    options.database_host = host
    options.database_name = database
    options.database_username = username
    options.database_password = password
    options.database_port = port
    options.allow_empty = True

    for fn in glob.glob(filename):
        prefix = os.path.basename(os.path.dirname(fn))
        results = []

        for name, status, header, data in FastqcSectionIterator(
                IOTools.openFile(fn)):
            # do not collect basic stats, see loadFastQCSummary
            if name == "Basic Statistics":
                continue

            options.tablename = prefix + "_" + re.sub(" ", "_", name)

            inf = StringIO("\n".join([header] + data) + "\n")
            CSV2DB.run(inf, options)
            results.append((name, status))

        # load status table
        options.tablename = prefix + "_status"

        inf = StringIO(
            "\n".join(["name\tstatus"] +
                      ["\t".join(x) for x in results]) + "\n")
        CSV2DB.run(inf, options)
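# A usage sketch (paths are illustrative): the filename argument is passed to
# glob.glob, so a single call can load every FastQC report matching a pattern
# into per-section tables of the default SQLite database.
#
#   loadFastqc("export/fastqc/sample1_*_fastqc/fastqc_data.txt",
#              backend="sqlite", database="csvdb")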
def main(argv=None):
    """script main.
    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-b", "--bamfile", dest="bam", type="string",
                      help="BAM formated alignment file to test. Should have MD and NH tags set")
    parser.add_option("-t", "--quality-threshold", dest="threshold", type="int",
                       default=30,
                       help="minimum quality threshold for a mismatched base to count")
    parser.add_option("-f", "--fasta-path", dest="fastapath", type="string",
                       help="path to indexed fasta file for genome of choice")
    parser.add_option("-p", "--vcf-path", dest="vcfpath", type="string",
                       help="path to indexed vcf file for dataset  of choice")
    parser.add_option("-d", "--sample", dest="samppattern", type="string",
                       help="pattern to match and extract the donor name from the bam file, for use in parsing the vcf file") 
    parser.add_option("-n", "--REDI-path", dest="redipath", type="string",
                       help="path to Bed format REDIportal table containing RNA editing positions")      

    (options, args) = E.Start(parser, argv=argv)

    
    bamfile = pysam.AlignmentFile(options.bam)
    fastafile = IndexedFasta(options.fastapath)
    vcffile = vcf.Reader(open(options.vcfpath,"r"))
    BEDREDI = Bed.readAndIndex(IOTools.openFile(options.redipath), with_values=True)
    options.stdout.write("\t".join(["gene_id",
                                    "strand",
                                    "mismatches",
                                    "bases",
                                    "low_qual",
                                    "a","t","c","g",
                                    "a_to_t","a_to_g","a_to_c",
                                    "t_to_a","t_to_g","t_to_c",
                                    "g_to_a","g_to_t","g_to_c",
                                    "c_to_a","c_to_t","c_to_g",
                                    "indel_count","RNA_editing_events"]) + "\n")
    
    samplepattern = options.samppattern
    (not_reverse_g_to_t, reverse_g_to_t) = 0, 0

    # counters updated by the nested helpers _is_snp and _is_RNA_edit below;
    # they are declared global there, so make sure they exist before use
    global got_snp_pos, wrong_base, got_edit_pos, wrong_edit_base
    got_snp_pos, wrong_base, got_edit_pos, wrong_edit_base = 0, 0, 0, 0
    donorfrombam = re.search(r"%s"%(samplepattern),options.bam,flags=0).group(1)

    # find the donorid:
    vcf_record = vcffile.next()
    samples = vcf_record.samples
    donors = [dnr.sample for dnr in samples]
    donorid = None
    for samp in donors:
        if donorfrombam in samp:
            donorid=samp

    if donorid is None:
        raise ValueError("Donor %s not found in VCF" % donorfrombam)
    
    reversecomplement = {"a":"t","t":"a","c":"g","g":"c"}

    for gene in GTF.flat_gene_iterator(GTF.iterator(options.stdin)):

        start = min(e.start for e in gene)
        end = max(e.end for e in gene)  
        strand = gene[0].strand

        seq = fastafile.getSequence(gene[0].contig, "+", start, end)
        thischr = gene[0].contig.replace("chr","")
        reads = bamfile.fetch(gene[0].contig, start, end)

        if all("chr" in c for c in vcffile.contigs.keys()) == False:            
            contig = (gene[0].contig).replace("chr","")
            if contig == "M":
                contig = contig + "T"
        else:
            contig = gene[0].contig
       
        try:
            vcfregion = vcffile.fetch(contig,start,end)
        except ValueError:
            vcfregion = []
                    

        regionchecker = list(vcfregion)
                
        BEDREDIregion = BEDREDI[gene[0].contig].find(start,end+1)
 
        editpositions = {edit_pos:edit_pos_field for
                         edit_pos,edit_pos_plus,edit_pos_field
                         in BEDREDIregion if edit_pos_field.fields[2] == strand}
              
        gene_id = gene[0].gene_id
        mm_count = 0
        base_count = 0
        skipped = 0
        indel_count = 0
        RNA_edits = 0
        matched_bases = defaultdict(int)
        transition = {"a_to_t":0,"a_to_g":0,"a_to_c":0,"t_to_a":0,"t_to_g":0,
        "t_to_c":0,"g_to_a":0,"g_to_t":0,"g_to_c":0,"c_to_a":0,"c_to_t":0,
        "c_to_g":0}
        
        snp_dict={}
        for snp in regionchecker:
            if snp.genotype(donorid)["GT"] != "0/0":
                snp_dict[snp.POS -1] = snp.ALT
            
        for read in reads:
            if read.is_unmapped:
                continue
            if read.is_duplicate:
                continue
            if read.mate_is_unmapped:
                continue
            if read.get_tag("NH") > 1:
                continue
            qualities = read.query_qualities

            alignmentcigar = read.cigarstring

            indel_count += (alignmentcigar.count("I") + alignmentcigar.count("D"))

            alignment = read.get_aligned_pairs(with_seq=True)

            # keep only positions aligned in both the read and the reference
            # (get_aligned_pairs reports None on either side at indels)
            alignment = [base for base in alignment
                         if base[0] is not None
                         and base[1] is not None]

            # [:] copies the list; kept separately for the debug output below
            testalignment = alignment[:]

            total_alignment = [base for base in alignment
                               if start <= base[1] < end and
                               base[2].lower() != "n"]

            base_count += len(total_alignment)
            for base in total_alignment:
                if seq[(base[1])-start].lower() != base[2].lower():
                    if (testalignment[0][1] is None) or (testalignment[-1][1] is None):
                        E.debug("first or last base of read is None")
                        E.debug("read sequence is %s" %(testalignment))
                        E.debug("position of first base in genome: %s" %testalignment[0][1])
                        E.debug("position of last base in genome: %s" %testalignment[-1][1])
                        E.debug("identity of first base in genome: %s" %testalignment[0][2])
                        E.debug("identity of last base in genome: %s" %testalignment[-1][2])
                        raise ValueError
                    else:    
                        E.debug("identity of error causing base from read sequence: %s" %(read.query_alignment_sequence)[base[0]].lower())
                        E.debug("read sequence: %s" %(read.query_alignment_sequence))
                        E.debug("identity start and end of read as calculated from start and end as described in gtffile and extracted from fasta: %s" %(seq[(testalignment[0][1]-start):(testalignment[-1][1]-start)]))
                        E.debug("section of the read 10 bp downstream and upstream of the sequence containing the error extracted from the fasta: %s" %(seq[((base[1]-10)-start):((base[1]+10)-start)].lower()))
                        E.debug("filename?: %s" %(read.tostring(bamfile)))
                        E.debug("positions of start and end of the gene based on the gtf: %s,%s" %(start, end))
                        E.debug("identity of start of gene extratced from gtf: %s" %(seq[(base[1])-start]))
                        E.debug("identity of error causing base from reference genome: %s" %base[2])
                        E.debug("position of base in read: %s" %base[0])
                        E.debug("position of base in genome: %s" %base[1])
                        E.debug("position of base in read as calculated from position of base in genome and and start from gtf: %s" %(base[1]-start))                      
                        E.debug("identity of error causing base (reference), calculated from fasta and testalignment info: %s" %(seq[(testalignment[0][1]-start):(testalignment[-1][1]-start)].upper()[base[0]]))
                        #E.debug("position of base in read from first alignment genome base minus start plus position of base in in read, should equal position of base in read: %s" %((testalignment[0][1]-start) + base[0]))
                        E.debug("identity of error causing base (reference), calculated from fasta and position of base in genome from aligned pairs: %s" %(seq[(base[1])-start]))
                        #E.debug("position of start base in genome from the alignment minus position of start base in genome from the gtf, should be zero: %s" %(alignment[0][1]-start))
                        E.debug("complete aligned pairs, unfiltered: %s" %(testalignment))
                        E.debug("full fasta sequence of read: %s" %(textwrap.fill(seq,50)))
                        raise ValueError
                           
                else:
                    matched_bases[base[2].lower()] += 1
            try:
                if read.get_tag("NM") == 0:
                    continue
            except KeyError:
                if read.get_tag("nM") == 0:
                    continue
            # mismatches
            

            readseq = read.query_sequence

            def _is_snp(base):
                global got_snp_pos
                global wrong_base
                if snp_dict.has_key(base[1]):
                    read_base = readseq[base[0]].lower()
                    alt_base = snp_dict[base[1]][0].sequence.lower()
                    got_snp_pos += 1
                    if read_base != alt_base:
                        wrong_base += 1
                        return True
                    else:
                        return False                            
                else:
                    return True

            def _is_indel(base):
                if (len(readseq) >= (base[0] + 5)):
                    if (len(seq) < ((base[1] - start) + 5)):
                        upperrange = len(seq)-(base[1]-start)
                        lowerrange = 5 - upperrange
                        readindelwindow = readseq[(base[0] - lowerrange):(base[0] + upperrange)]
                        seqindelwindow = seq[(base[1] - start - lowerrange):(base[1] - start + upperrange)]
                        matchwindows = list()
                        for i in range(len(readindelwindow)):
                            try:
                                matchwindows.append((readindelwindow[i].lower()==seqindelwindow[i].lower()))
                            except IndexError:
                                print i
                                print readindelwindow
                                print seqindelwindow
                                print start
                                print lowerrange
                                print upperrange
                                print base[0]
                                print (base[0] - lowerrange)
                                print (base[0] + upperrange)
                                print base[1]
                                print (base[1] - start)
                                print ((((base[1])-start) - lowerrange)-1)
                                print ((((base[1])-start) + upperrange)-1)
                                print readseq
                                print seq
                                print gene_id
                                print gene[0].contig                            
                                raise
                    elif len(seq) >= (base[1] - start + 5):
                        readindelwindow = readseq[base[0]:(base[0]+5)]
                        seqindelwindow = seq[(base[1] - start):(base[1] - start + 5)]
                        matchwindows = []
                        for i in range(len(readindelwindow)):
                            try: 
                                matchwindows.append(readindelwindow[i].lower()==seqindelwindow[i].lower())
                            except IndexError:
                                print i
                                print readindelwindow
                                print seqindelwindow
                                print start
                                print base[0]
                                print base[1]
                                print (base[1] - start) - 1
                                print ((base[1] - start) + 5) - 1
                                print readseq
                                print seq
                                print gene_id
                                print gene[0].contig
                                raise
                    if matchwindows.count(False) >= 4:
                        return False
                    else:
                        return True
                elif (len(readseq) < (base[0] + 5)):
                    if len(seq) < (((base[1])-start) + 5):
                        readsequpperrange = len(readseq)-base[0]
                        readseqlowerrange = 5 - readsequpperrange
                        sequpperrange = len(seq) - (base[1] - start)
                        seqlowerrange = 5 - sequpperrange
                        if readsequpperrange < sequpperrange:
                            upperrange = readsequpperrange
                            lowerrange = readseqlowerrange
                        elif sequpperrange < readsequpperrange:
                            upperrange = sequpperrange
                            lowerrange = seqlowerrange
                        elif sequpperrange == readsequpperrange:
                            upperrange = sequpperrange
                            lowerrange = seqlowerrange                        
                    elif ((base[1] - start) - 4) < 0:
                        return True
                    else:
                        upperrange = len(readseq)-base[0]
                        lowerrange = 5 - upperrange
                    readindelwindow=readseq[(base[0] - lowerrange):(base[0] + upperrange)]
                    seqindelwindow=seq[(((base[1])-start) - lowerrange):(((base[1])-start)+ upperrange)]                
                    matchwindows=[]
                    for i in range(len(readindelwindow)):
                        try:
                            matchwindows.append((readindelwindow[i].lower()==seqindelwindow[i].lower()))
                        except IndexError:
                            print i
                            print readindelwindow
                            print seqindelwindow
                            print start
                            print lowerrange
                            print upperrange
                            print base[0]
                            print (base[0] - lowerrange)
                            print (base[0] + upperrange)
                            print base[1]
                            print (base[1] - start)
                            print ((((base[1])-start) - lowerrange))
                            print ((((base[1])-start) + upperrange))
                            print readseq
                            print seq
                            print gene_id
                            print gene[0].contig
                            
                            raise
                    if matchwindows.count(False) >= 4:
                        return False
                    else:
                        return True


            def _is_RNA_edit(base,editpositions):
                global got_edit_pos
                global wrong_edit_base
                genomebase = base[2]
                readbase = readseq[base[0]].lower()
                
                if not base[1] in editpositions.keys() or \
                   genomebase == "n" or \
                   readbase == "n" or \
                   not genomebase.islower():
                    return True
                else:
                    got_edit_pos += 1
                    if genomebase == editpositions[base[1]].fields[0].lower() and \
                       readbase == editpositions[base[1]].fields[1].lower():
                        return False
                    else:
                        wrong_edit_base += 1
                        return True


            for base in total_alignment:
                if _is_RNA_edit(base,editpositions) == False:
                    RNA_edits += 1

            mismatches = [base for base in total_alignment
                          if base[2].islower() and
                          qualities[base[0]] >= options.threshold and
                          _is_snp(base) and
                          _is_indel(base) and
                          _is_RNA_edit(base, editpositions) and
                          readseq[base[0]].lower() != "n"]

            
            total_mm = sum(1 for base in total_alignment
                        if base[2].islower() and
                        _is_snp(base) and
                        readseq[base[0]].lower() != "n")
            
            hq_mm = len(mismatches)


            for base in mismatches:
                genomebase = base[2].lower()
                readbase = readseq[base[0]].lower()
                try:
                    if strand == "-":
                        revgenomebase = reversecomplement[genomebase]
                        revreadbase = reversecomplement[readbase]
                        if revgenomebase == "g" and revreadbase == "a":
                            if read.is_reverse:
                                reverse_g_to_t += 1
                            else:
                                not_reverse_g_to_t +=1
                        transition["%s_to_%s"%(revgenomebase, revreadbase)] += 1
                    else:
                        transition["%s_to_%s"%(genomebase, readbase)] += 1
                except KeyError:
                    print transition
                    print read.query_alignment_sequence.upper() 
                    print seq[(alignment[0][1]-start):(alignment[-1][1]-start)].upper()
                    print read.tostring(bamfile)
                    raise

            mm_count += hq_mm
            skipped += total_mm - hq_mm

        outline = "\t".join(map(str,[gene_id,
                                     strand,
                                     mm_count,
                                     base_count,
                                     skipped,
                                     matched_bases['a'],
                                     matched_bases['t'],
                                     matched_bases['c'],
                                     matched_bases['g'],
                                     transition['a_to_t'],
                                     transition['a_to_g'],
                                     transition['a_to_c'],
                                     transition['t_to_a'],
                                     transition['t_to_g'],
                                     transition['t_to_c'],
                                     transition['g_to_a'],
                                     transition['g_to_t'],
                                     transition['g_to_c'],
                                     transition['c_to_a'],
                                     transition['c_to_t'],
                                     transition['c_to_g'],
                                     indel_count,
                                     RNA_edits]))
        options.stdout.write(outline + "\n")

    # write footer and output benchmark information.
    E.info("Out of %i mismatches at snp positions %i were the wrong base" %(got_snp_pos, wrong_base))
    E.info("Out of %i mismatches at RNA edit positions %i were the wrong base" %(got_edit_pos, wrong_edit_base))
    E.info("Out of %i g_to_c transitions on - strand genes, the read was on the + strand %i times" %
           (not_reverse_g_to_t, reverse_g_to_t))
    
    E.Stop()
Example #24
def peekParameters(workingdir,
                   pipeline,
                   on_error_raise=None,
                   prefix=None,
                   update_interface=False,
                   restrict_interface=False):
    '''peek configuration parameters from external pipeline.

    As the parameter dictionary is built at runtime, this method
    executes the pipeline in workingdir, dumping its configuration
    values and reading them into a dictionary.

    If either `pipeline` or `workingdir` are not found, an error is
    raised. This behaviour can be changed by setting `on_error_raise`
    to False. In that case, an empty dictionary is returned.

    Arguments
    ---------
    workingdir : string
       Working directory. This is the directory that the pipeline
       was executed in.
    pipeline : string
       Name of the pipeline script. The pipeline is assumed to live
       in the same directory as the current pipeline.
    on_error_raise : Bool
       If set to a boolean, an error will be raised (or not) if there
       is an error during parameter peeking, for example if
       `workingdir` can not be found. If `on_error_raise` is None, it
       will be set to the default, which is to raise an exception
       unless the calling script is imported or the option
       ``--is-test`` has been passed at the command line.
    prefix : string
       Add a prefix to all parameters. This is useful if the parameters
       are added to the configuration dictionary of the calling pipeline.
    update_interface : bool
       If True, this method will prefix any options in the
       ``[interface]`` section with `workingdir`. This allows
       transparent access to files in the external pipeline.
    restrict_interface : bool
       If True, only interface parameters will be imported.

    Returns
    -------
    config : dict
        Dictionary of configuration values.

    '''
    caller_locals = getCallerLocals()

    # check if we should raise errors
    if on_error_raise is None:
        on_error_raise = not isTest() and \
            "__name__" in caller_locals and \
            caller_locals["__name__"] == "__main__"

    # patch - if --help or -h in command line arguments,
    # do not peek as there might be no config file.
    if "--help" in sys.argv or "-h" in sys.argv:
        return {}

    # Attempt to locate directory with pipeline source code. This is a
    # patch as pipelines might be called within the repository
    # directory or from an installed location
    dirname = PARAMS["pipelinedir"]

    # called without a directory, use current directory
    if dirname == "":
        dirname = os.path.abspath(".")
    else:
        # if not exists, assume we want version located
        # in directory of calling script.
        if not os.path.exists(dirname):
            # directory is path of calling script
            dirname = os.path.dirname(caller_locals['__file__'])

    pipeline = os.path.join(dirname, pipeline)
    if not os.path.exists(pipeline):
        if on_error_raise:
            raise ValueError(
                "can't find pipeline at %s" % (pipeline))
        else:
            return {}

    if workingdir == "":
        workingdir = os.path.abspath(".")

    # patch for the "config" target - use default
    # pipeline directory if directory is not specified
    # working dir is set to "?!"
    if "config" in sys.argv or "check" in sys.argv or "clone" in sys.argv and workingdir == "?!":
        workingdir = os.path.join(PARAMS.get("pipelinedir"),
                                  IOTools.snip(pipeline, ".py"))

    if not os.path.exists(workingdir):
        if on_error_raise:
            raise ValueError(
                "can't find working dir %s" % workingdir)
        else:
            return {}

    statement = "python %s -f -v 0 dump" % pipeline
    process = subprocess.Popen(statement,
                               cwd=workingdir,
                               shell=True,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)

    # process.stdin.close()
    stdout, stderr = process.communicate()
    if process.returncode != 0:
        raise OSError(
            ("Child was terminated by signal %i: \n"
             "Statement: %s\n"
             "The stderr was: \n%s\n"
             "Stdout: %s") %
            (-process.returncode, statement, stderr, stdout))

    # subprocess only accepts encoding argument in py >= 3.6 so
    # decode here.
    stdout = stdout.decode("utf-8").splitlines()
    # remove any log messages
    stdout = [x for x in stdout if x.startswith("{")]
    if len(stdout) > 1:
        raise ValueError("received multiple configurations")
    dump = json.loads(stdout[0])

    # update interface
    if update_interface:
        for key, value in list(dump.items()):
            if key.startswith("interface"):
                dump[key] = os.path.join(workingdir, value)

    # keep only interface if so required
    if restrict_interface:
        dump = dict([(k, v) for k, v in dump.items()
                     if k.startswith("interface")])

    # prefix all parameters
    if prefix is not None:
        dump = dict([("%s%s" % (prefix, x), y) for x, y in list(dump.items())])

    return dump
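# A usage sketch (pipeline and directory names are hypothetical): import the
# interface files of an upstream pipeline into the calling pipeline's PARAMS.
#
#   PARAMS.update(peekParameters("../mapping", "pipeline_mapping.py",
#                                prefix="mapping_",
#                                update_interface=True,
#                                restrict_interface=True))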
Example #25
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    profiles = iCLIP.getters.profiles.keys()
    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-p",
                      "--profile",
                      dest="profile",
                      type="choice",
                      choices=profiles,
                      default="iclip",
                      help="Experiment profile to use. Sets various things"
                      "about obtaining 1-bp position from read. Options are"
                      " %s" % ", ".join(profiles))
    parser.add_option("-c",
                      "--use-centre",
                      dest="centre",
                      action="store_true",
                      default=None,
                      help="Use centre of read rather than frist base."
                      "Overrides profile")
    parser.add_option(
        "-f",
        "--format",
        dest="format",
        choices=[
            "bigWig", "bigwig", "BigWig", "bedGraph", "bg", "bedgraph", "bed",
            "Bed", "BED"
        ],
        help="Output format. Either bigWig (2 files, + and - strand)"
        ", bedGraph (2 files), or bed (1 file, depth in column 5,"
        "strand in column 6",
        default="bigWig")
    parser.add_option("-w",
                      "--wig",
                      dest="output_wig",
                      action="store_true",
                      default=False,
                      help="Write output to bedgraph file rather than bigwig")
    parser.add_option("--dtype",
                      dest="dtype",
                      type="string",
                      default="uint32",
                      help="dtype for storing depths")
    parser.add_option(
        "--cpm",
        dest="cpm",
        action="store_true",
        default=False,
        help=
        "Normalize output depths to number of mapped reads (in millions) in BAM"
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    options.format = options.format.lower()
    if options.format == "bg":
        options.format = "bedgraph"

    profile = iCLIP.getters.profiles[options.profile]

    if options.centre is not None:
        centre = True
    else:
        centre = profile.centre

    if options.stdin == sys.stdin:
        in_bam = pysam.Samfile("-", "rb")

    else:
        fn = options.stdin.name
        options.stdin.close()
        in_bam = pysam.Samfile(fn, "rb")

    getter = iCLIP.make_getter(in_bam, profile=profile, centre=centre)

    if options.cpm:
        scale_factor = sum(contig.mapped
                           for contig in in_bam.get_index_statistics())

        scale_factor = 1000000.0 / scale_factor

    if options.format == "bed":
        bedfile = IOTools.openFile(args[0], "w")
    else:
        plus_wig = tempfile.NamedTemporaryFile(delete=False)
        minus_wig = tempfile.NamedTemporaryFile(delete=False)

    contig_sizes = []

    for chrom, chrom_length in zip(in_bam.references, in_bam.lengths):

        # get depths over chromosome
        pos_depth, neg_depth, counter = getter(chrom,
                                               strand="both",
                                               dtype=options.dtype)
        pos_depth_sorted = pos_depth.sort_index()
        del pos_depth
        neg_depth_sorted = neg_depth.sort_index()
        del neg_depth
        neg_depth_sorted = -1 * neg_depth_sorted

        if options.cpm:
            pos_depth_sorted = pos_depth_sorted * scale_factor
            neg_depth_sorted = neg_depth_sorted * scale_factor

        # output to temporary wig file
        if options.format == "bed":
            output2Bed(pos_depth_sorted, neg_depth_sorted, chrom, bedfile)
        else:
            outputToBG(pos_depth_sorted, chrom, chrom_length, plus_wig)
            outputToBG(neg_depth_sorted, chrom, chrom_length, minus_wig)

        contig_sizes.append([chrom, chrom_length])

        del pos_depth_sorted
        del neg_depth_sorted

    if options.format == "bed":
        bedfile.close()
    else:
        plus_wig_name = plus_wig.name
        minus_wig_name = minus_wig.name
        plus_wig.close()
        minus_wig.close()

    outname_plus = args[0] + "_plus"
    outname_minus = args[0] + "_minus"

    if options.format == "bedgraph":
        E.debug("Outputting to bedGraph")
        shutil.move(plus_wig_name, outname_plus + ".bg")
        shutil.move(minus_wig_name, outname_minus + ".bg")

    elif options.format == "bigwig":
        chrom_sizes_file = tempfile.NamedTemporaryFile(delete=False, dir=".")
        contig_sizes = ["\t".join(map(str, row)) for row in contig_sizes]
        contig_sizes = "\n".join(contig_sizes) + "\n"
        chrom_sizes_file.write(contig_sizes)
        chrom_sizes_filename = chrom_sizes_file.name
        chrom_sizes_file.close()

        outputToBW(plus_wig_name, outname_plus, chrom_sizes_filename)
        outputToBW(minus_wig_name, outname_minus, chrom_sizes_filename)

    # write footer and output benchmark information.
    E.Stop()
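
A small worked example of the --cpm scaling used above, with invented read counts: the scale factor is one million divided by the number of mapped reads, so per-base depths become counts per million.

# hypothetical mapped-read counts per contig, as summed from the BAM index
mapped_per_contig = [750000, 250000]

scale_factor = 1000000.0 / sum(mapped_per_contig)   # 1.0 for these numbers

raw_depth = 12
print(raw_depth * scale_factor)   # 12.0 counts per million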
Example #26
0
def main(argv=sys.argv):

    parser = E.OptionParser(version="%prog version: $Id: mali2mali.py 2782 2009-09-10 11:40:29Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-i", "--input-format", dest="input_format", type="choice",
                      choices=(
                          "plain", "fasta", "clustal", "stockholm", "phylip"),
                      help="input format of multiple alignment [default=%default].")

    parser.add_option("-o", "--output-format", dest="output_format", type="choice",
                      choices=(
                          "plain", "fasta", "stockholm", "phylip", "nexus", "plain-fasta"),
                      help="output format of multiple alignment [default=%default].")

    parser.add_option("--with-ranges", dest="with_ranges", action="store_true",
                      help="output alignment ranges (suffix /from-to after identifier) [default=%default].")

    parser.add_option("--without-ranges", dest="with_ranges", action="store_false",
                      help="do not output alignment ranges (suffix /from-to after identifier) [default=%default].")

    parser.add_option("-u", "--allow-duplicates", dest="allow_duplicates", action="store_true",
                      help="permit duplicate entries [default=%default].")

    parser.add_option("-m", "--method", dest="methods", type="string",
                      help="""methods to apply. Several methods can be specified in a ','-separated list [default=%default]."""  )

    parser.add_option("-p", "--parameters", dest="parameters", type="string",
                      help="parameter stack for methods that require one [default=%default].")

    parser.add_option("-a", "--mask-char", dest="mask_char", type="string",
                      help="character to identify/set masked characters [default=%default].")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        methods="",
        parameters="",
        mask_char="x",
        gap_chars="-.nN",
        with_ranges=True,
        allow_duplicates=False,
    )

    (options, args) = E.Start(parser)

    options.methods = options.methods.split(",")
    options.parameters = options.parameters.split(",")

    # 1. read multiple alignment in various formats
    if options.allow_duplicates:
        mali = Mali.SequenceCollection()
    else:
        mali = Mali.Mali()

    t1 = time.time()

    mali.readFromFile(options.stdin, format=options.input_format)

    E.info("read mali with %i entries in %i seconds." %
           (len(mali), time.time() - t1))

    if len(mali) == 0:
        raise ValueError("empty multiple alignment")

    for method in options.methods:

        t1 = time.time()

        if method == "remove-unaligned-ends":
            mali.removeUnalignedEnds()
        elif method == "remove-end-gaps":
            mali.removeEndGaps()
        elif method == "remove-all-gaps":
            mali.removeGaps(minimum_gaps=len(mali))
        elif method == "remove-any-gaps":
            mali.removeGaps(minimum_gaps=1)
        elif method == "remove-some-gaps":
            minimum_gaps = int(options.parameters[0])
            del options.parameters[0]
            mali.removeGaps(minimum_gaps=minimum_gaps)
        elif method == "remove-empty-sequences":
            mali.removeEmptySequences()
        elif method == "upper":
            mali.upperCase()
        elif method == "lower":
            mali.lowerCase()
        elif method == "mark-codons":
            mali.markCodons()
        elif method == "remove-stops":
            mali.removePattern(lambda x: x.upper() in ("TAG", "TAA", "TGA"),
                               allowed_matches=0,
                               minimum_matches=1,
                               delete_frame=3,
                               search_frame=3)
        elif method == "shift-alignment":
            map_id2offset = IOTools.ReadMap(open(options.parameters[0], "r"),
                                            map_functions=(str, int))
            del options.parameters[0]
            mali.shiftAlignment(map_id2offset)
        elif method == "propagate-masks":
            mali.propagateMasks(mask_char=options.mask_char)

        elif method == "recount":
            mali.recount()

        elif method in ("mark-transitions", "filter-odd-transitions", "filter-even-transitions",
                        "keep-even-segments", "keep-odd-segments"):

            if os.path.exists(options.parameters[0]):
                map_id2transitions = IOTools.readMultiMap(open(options.parameters[0], "r"),
                                                          map_functions=(str, int))
            else:
                map_id2transitions = {}
                r = sorted(map(int, options.parameters[0].split(':')))
                map_id2transitions["mali"] = r

            del options.parameters[0]
            if method == "mark-transitions":
                mali.markTransitions(map_id2transitions)
            elif method in ("filter-odd-transitions", "keep-even-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-odd")
            elif method in ("filter-even-transitions", "keep-odd-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-even")

        elif method == "propagate-transitions":
            mali.propagateTransitions()

        elif method == "map-annotation":
            # map annotations in one mali (stockholm-format) to the annotations in another.
            # Note: the first two sequence identifiers must be shared and the sequence of the
            # same length
            other_mali = Mali.Mali()
            other_mali.readFromFile(
                open(options.parameters[0], "r"), format="stockholm")
            del options.parameters[0]
            mali.copyAnnotations(other_mali)

        elif method == "add-annotation":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            AddAnnotation(mali, annotation_type, annotation_file)

        elif method == "mask-columns":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            maskColumns(mali, annotation_type, annotation_file)

        elif method == "remove-unaligned-pairs":
            removeUnalignedPairs(mali, options)

        elif method == "filter-3rd":
            filterMali(mali, "3rd")

        elif method == "filter-4d":
            filterMali(mali, "4d")

        elif method in ("mask-seg", "mask-bias"):
            a, b = method.split("-")
            maskMali(mali, b)

        elif method == "exclude-with-stop":
            mali.filter(method="with-stop")

        elif method == "exclude-with-stop":
            mali.filter(method="with-frameshift")

        E.info("applied method %s in %i seconds." % (method, time.time() - t1))

    mali.writeToFile(options.stdout,
                     format=options.output_format,
                     write_ranges=options.with_ranges)

    E.Stop()
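
A minimal sketch of how the parameter for the transition-based methods above is interpreted when it is not a filename; the value is invented: a ':'-separated list of positions is parsed, sorted and attached to the pseudo-identifier "mali".

parameter = "40:10:25"            # hypothetical command-line parameter

map_id2transitions = {}
map_id2transitions["mali"] = sorted(map(int, parameter.split(":")))

print(map_id2transitions)         # {'mali': [10, 25, 40]}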
Example #27
0
def loadGLAM2SCAN(infile, outfile):
    '''parse mast file and load into database.

    Parse several motif runs and add them to the same
    table.
    '''
    tablename = outfile[:-len(".load")]
    tmpfile = tempfile.NamedTemporaryFile(delete=False)
    tmpfile.write(
        "motif\tid\tnmatches\tscore\tscores\tncontrols\tmax_controls\n")

    lines = IOTools.openFile(infile).readlines()
    chunks = [x for x in range(len(lines)) if lines[x].startswith("::")]
    chunks.append(len(lines))

    for chunk in range(len(chunks) - 1):

        # use real file, as parser can not deal with a
        # list of lines

        try:
            motif = re.match(":: motif = (\S+) ::",
                             lines[chunks[chunk]]).groups()[0]
        except AttributeError:
            raise P.PipelineError("parsing error in line '%s'" %
                                  lines[chunks[chunk]])

        if chunks[chunk] + 1 == chunks[chunk + 1]:
            L.warn("no results for motif %s - ignored" % motif)
            continue

        tmpfile2 = tempfile.NamedTemporaryFile(delete=False)
        tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
        tmpfile2.close()
        glam = Glam2Scan.parse(IOTools.openFile(tmpfile2.name, "r"))

        os.unlink(tmpfile2.name)

        # collect control data
        full_matches = collections.defaultdict(list)
        controls = collections.defaultdict(list)
        for match in glam.matches:
            m = match.id.split("_")
            track, id = m[:2]
            if len(m) == 2:
                full_matches[id].append(match)
            else:
                controls[id].append(match.score)

        for id, matches in full_matches.iteritems():

            nmatches = len(matches)
            scores = [x.score for x in matches]
            score = max(scores)
            # move to genomic coordinates
            #contig, start, end = re.match( "(\S+):(\d+)..(\d+)", match.id).groups()
            #start, end = int(start), int(end)
            #match.start += start
            #match.end += start
            contig = ""

            if id not in controls:
                P.warn("no controls for %s - increase evalue?" % id)

            c = controls[id]
            if len(c) == 0:
                mmax = ""
            else:
                mmax = max(c)

            tmpfile.write("\t".join(
                map(str, (motif, id, nmatches, score,
                          ",".join(map(str, scores)), len(c), mmax))) + "\n")

    tmpfile.close()
    tmpfilename = tmpfile.name

    statement = '''
   python %(scriptsdir)s/csv2db.py %(csv2db_options)s \
              -b sqlite \
              --index=id \
              --index=motif \
              --index=id,motif \
              --table=%(tablename)s \
              --map=base_qualities:text \
    < %(tmpfilename)s > %(outfile)s
    '''

    P.run()
    os.unlink(tmpfile.name)
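
A compact sketch of the chunking logic above with invented lines: the concatenated GLAM2SCAN output is split on ":: motif = ... ::" separator lines and each block is parsed separately.

import re

lines = [":: motif = m1 ::\n", "hitA\n", "hitB\n",
         ":: motif = m2 ::\n", "hitC\n"]

chunks = [x for x in range(len(lines)) if lines[x].startswith("::")]
chunks.append(len(lines))

for chunk in range(len(chunks) - 1):
    motif = re.match(r":: motif = (\S+) ::", lines[chunks[chunk]]).groups()[0]
    body = lines[chunks[chunk] + 1:chunks[chunk + 1]]
    print(motif, body)
# m1 ['hitA\n', 'hitB\n']
# m2 ['hitC\n']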
Example #28
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--min-overlap", dest="min_overlap", type="float",
                      help="minimum overlap [%default]")

    parser.add_option("-k", "--keep-temp", dest="keep_temp", action="store_true",
                      help="do not delete temporary files [%default]")

    parser.add_option("-a", "--filename-bam", dest="filename_bam", metavar="bam", type="string",
                      help="bam-file to use [%default]")

    parser.add_option("-b", "--filename-bed", dest="filename_bed", metavar="bam", type="string",
                      help="bed-file to use [%default]")

    parser.set_defaults(
        min_overlap=0.5,
        keep_temp=False,
        filename_bam=None,
        filename_bed=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    filename_bam = options.filename_bam
    filename_bed = options.filename_bed

    if filename_bam is None and filename_bed is None:
        if len(args) != 2:
            raise ValueError(
                "please supply a bam and a bed file or two bed-files.")

        filename_bam, filename_bed = args

    if filename_bed is None:
        raise ValueError("please supply a bed file to compare to.")

    if filename_bam is None:
        raise ValueError("please supply a bam file to compare with.")

    E.info("intersecting the two files")

    tmpfile = tempfile.NamedTemporaryFile(delete=False)
    tmpfile.close()
    tmpfilename = tmpfile.name

    min_overlap = options.min_overlap

    options.stdout.write("category\talignments\n")

    # get number of columns of reference bed file
    for bed in Bed.iterator(IOTools.openFile(filename_bed)):
        ncolumns_bed = bed.columns
        break
    E.info("assuming %s is bed%i format" % (filename_bed, ncolumns_bed))

    if ncolumns_bed < 4:
        raise ValueError("please supply a name attribute in the bed file")

    # get information about
    if filename_bam.endswith(".bam"):
        format = "-abam"
        samfile = pysam.Samfile(filename_bam, "rb")
        total = samfile.mapped
        # latest bedtools uses bed12 format when bam is input
        ncolumns_bam = 12
        # count per read
        sort_key = lambda x: x.name
    else:
        format = "-a"
        total = IOTools.getNumLines(filename_bam)
        # get bed format
        ncolumns_bam = 0
        for bed in Bed.iterator(IOTools.openFile(filename_bam)):
            ncolumns_bam = bed.columns
            break

        if ncolumns_bam > 0:
            E.info("assuming %s is bed%i fomat" % (filename_bam, ncolumns_bam))
            if ncolumns_bam == 3:
                # count per interval
                sort_key = lambda x: (x.contig, x.start, x.end)
            else:
                # count per interval category
                sort_key = lambda x: x.name

    # use fields for bam/bed file (regions to count with)
    data_fields = ["contig", "start", "end", "name",
                   "score", "strand", "thickstart", "thickend", "rgb",
                   "blockcount", "blockstarts", "blockends"][:ncolumns_bam]

    # add fields for second bed (regions to count in)
    data_fields.extend(["contig2", "start2", "end2", "name2",
                        "score2", "strand2", "thickstart2", "thickend2", "rgb2",
                        "blockcount2", "blockstarts2", "blockends2"][:ncolumns_bed])

    # add bases overlap
    data_fields.append("bases_overlap")

    data = collections.namedtuple("data", data_fields)

    options.stdout.write("total\t%i\n" % total)

    if total == 0:
        E.warn("no data in %s" % filename_bam)
        return

    # IMS: newer versions of intersectBed have a very high memory requirement unless
    #     passed sorted bed files.
    statement = """intersectBed %(format)s %(filename_bam)s -b <( zcat %(filename_bed)s | sort -k1,1 -k2,2n) -sorted -bed -wo -f %(min_overlap)f > %(tmpfilename)s""" % locals()

    E.info("running %s" % statement)
    retcode = E.run(statement)

    if retcode != 0:
        raise ValueError("error while executing statement %s" % statement)

    infile = open(tmpfilename, "r")
    counts_per_alignment = collections.defaultdict(int)

    E.info("counting")

    take_columns = len(data._fields)

    def iter(infile):
        for line in infile:
            if not line.strip():
                continue
            yield data._make(line[:-1].split()[:take_columns])

    for read, overlaps in itertools.groupby(iter(infile), key=sort_key):
        annotations = [x.name2 for x in overlaps]
        for anno in annotations:
            counts_per_alignment[anno] += 1
    infile.close()

    for key, counts in counts_per_alignment.iteritems():
        options.stdout.write("%s\t%i\n" % (key, counts))

    if not options.keep_temp:
        os.unlink(tmpfilename)

    # write footer and output benchmark information.
    E.Stop()
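
A small sketch of the counting step above with invented rows: the intersectBed output is grouped by read (or interval) key and every annotation within a group increments its counter.

import collections
import itertools

# hypothetical (read, annotation) pairs as produced by intersectBed -wo
rows = [("read1", "geneA"), ("read1", "geneA"),
        ("read2", "geneA"), ("read2", "geneB")]

counts_per_alignment = collections.defaultdict(int)
for read, overlaps in itertools.groupby(rows, key=lambda x: x[0]):
    for _, anno in overlaps:
        counts_per_alignment[anno] += 1

print(dict(counts_per_alignment))   # {'geneA': 3, 'geneB': 1}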
Example #29
0
def buildResults(bedfile, fg_file, control_file, counter, options):
    '''compute densities and peakshape parameters.'''

    options.stdout.write("\t".join(
        ("contig", "start", "end", "name",
         "\t".join(_bam2peakshape.PeakShapeResult._fields))) + "\n")

    if options.window_size:
        # bins are centered at peak-center and then stretching outwards.
        bins = numpy.arange(-options.window_size + options.bin_size // 2,
                            +options.window_size, options.bin_size)

    #contigs = set(pysam_in.references)

    strand_specific = options.strand_specific

    result = []
    c = E.Counter()
    c.input = 0

    for bed in Bed.iterator(IOTools.openFile(bedfile)):
        c.input += 1

        # if bed.contig not in contigs:
        #    c.skipped += 1
        #    continue

        if c.input % options.report_step == 0:
            E.info("iteration: %i" % c.input)

        features = counter.countInInterval(
            fg_file,
            bed.contig,
            bed.start,
            bed.end,
            window_size=options.window_size,
            bins=bins,
            only_interval=options.only_interval,
            centring_method=options.centring_method)

        if control_file:
            control = counter.countAroundPos(control_file,
                                             bed.contig,
                                             features.peak_center,
                                             bins=features.bins)

        else:
            control = None

        if options.random_shift:
            direction = numpy.random.randint(0, 2)
            if direction:
                pos = features.peak_center + 2 * bins[0]
            else:
                pos = features.peak_center + 2 * bins[-1]
            shifted = counter.countAroundPos(fg_file,
                                             bed.contig,
                                             pos,
                                             bins=features.bins)
        else:
            shifted = None

        if strand_specific and bed.strand == "-":
            # _replace returns a new namedtuple, so reassign the result
            features = features._replace(hist=features.hist[::-1])
            if control:
                control = control._replace(hist=control.hist[::-1])
            if shifted:
                shifted = shifted._replace(hist=shifted.hist[::-1])

        result.append((features, bed, control, shifted))
        c.added += 1

    E.info("interval processing: %s" % c)

    return result, bins
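
A worked example of the bin construction above with an invented window and bin size: the bins are centred on the peak centre and stretch outwards in steps of bin_size.

import numpy

window_size, bin_size = 10, 5

bins = numpy.arange(-window_size + bin_size // 2, +window_size, bin_size)
print(bins)   # [-8 -3  2  7]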
Example #30
0
def compileMutationalSignature(infiles, outfiles):
    '''takes a list of mutect output files and compiles per sample mutation
    signatures'''

    delim = ":"

    def lookup(b1, b2):
        '''return lookup key for a pair of bases'''
        return(b1 + delim + b2)

    def breakKey(key):
        '''take a lookup key and return the elements'''
        return key.split(delim)

    def comp(base):
        '''return complementary base'''
        comp_dict = {"C": "G", "G": "C", "A": "T", "T": "A"}
        return comp_dict[base]

    def getID(infile):
        return P.snip(os.path.basename(infile),
                      ".mutect.snp.annotated.filtered.vcf")

    outfile1 = IOTools.openFile(outfiles[0], "w")
    mutations = ["C:T", "C:A", "C:G", "A:C", "A:T", "A:G"]

    outfile1.write("%s\t%s\t%s\t%s\t%s\n" % ("patient_id", "base_change",
                                             "ref", "alt", "frequency"))
    patient_freq = {}

    for infile in infiles:
        patient_id = getID(infile)
        mut_dict = {}
        for comb in mutations:
            mut_dict[comb] = 0

        with IOTools.openFile(infile, "r") as f:
            for line in f.readlines():
                if line.startswith('#'):
                    continue
                values = line.split("\t")
                key = lookup(values[3], values[4])
                if key in mut_dict:
                    mut_dict[key] += 1
                else:
                    comp_key = lookup(
                        comp(values[3]), comp(values[4]))
                    mut_dict[comp_key] += 1

        patient_freq[patient_id] = mut_dict

    for mutation in mutations:
        base1, base2 = breakKey(mutation)
        for infile in infiles:
            patient_id = getID(infile)
            outfile1.write("%s\t%s\t%s\t%s\t%s\n" % (patient_id, mutation,
                                                     base1, base2,
                                                     patient_freq[patient_id]
                                                     [mutation]))
    outfile1.close()

    outfile2 = IOTools.openFile(outfiles[1], "w")
    outfile2.write("%s\t%s\n" % ("patient_id",
                                 "\t".join(mutations)))
    for infile in infiles:
        patient_id = getID(infile)
        frequencies = "\t".join(map(str, [patient_freq[patient_id][x]
                                          for x in mutations]))
        outfile2.write("%s\t%s\n" % (patient_id, frequencies))
    outfile2.close()
Example #31
0
def filterMutect(infile, outfile, logfile,
                 min_t_alt, min_n_depth,
                 max_n_alt_freq, min_t_alt_freq,
                 min_ratio):
    ''' filter MuTect2 snps and indels'''

    reasons = collections.Counter()
    control_id = "NORMAL"
    tumour_id = "TUMOR"

    def comp(base):
        '''return complementary base'''
        comp_dict = {"C": "G", "G": "C", "A": "T", "T": "A"}
        return comp_dict[base]

    with IOTools.openFile(outfile, "w") as outf:
        with IOTools.openFile(infile, "r") as inf:
            for line in inf.readlines():
                # need to find location of control and tumor columns
                if line.startswith('#CHROM'):
                    columns = line.split("\t")
                    for x in range(0, len(columns)):
                        if control_id in columns[x]:
                            control_col = x
                        elif tumour_id in columns[x]:
                            tumor_col = x

                if line.startswith('#'):
                    # write out all comment lines
                    outf.write(line)

                if line.startswith('chr'):
                    values = line.split("\t")

                    if values[6] == "PASS":
                        t_values = values[tumor_col].split(":")
                        t_ref, t_alt = list(
                            map(float, (t_values[1].split(","))))
                        t_depth = t_alt + t_ref
                        n_values = values[control_col].split(":")
                        n_ref, n_alt = list(
                            map(float, (n_values[1].split(","))))
                        n_depth = n_alt + n_ref
                        np.seterr(divide='ignore')

                        t_freq = np.divide(t_alt, t_depth)
                        n_freq = np.divide(n_alt, n_depth)

                        # filter
                        if not t_alt > min_t_alt:
                            reasons["Low_tumour_alt_count"] += 1
                            continue

                        if not t_freq >= min_t_alt_freq:
                            reasons["Low_tumour_alt_freq"] += 1
                            continue

                        if not n_depth >= min_n_depth:
                            reasons["Low_normal_depth"] += 1
                            continue

                        if not n_freq <= max_n_alt_freq:
                            reasons["high_normal_alt_freq"] += 1
                            continue

                        if (np.divide(t_freq, n_freq) >= min_ratio or n_freq == 0):
                            outf.write(line)
                    else:
                        reasons["Mutect_reject"] += 1

    with IOTools.openFile(logfile, "w") as outf:
        outf.write("%s\n" % "\t".join(("reason", "count")))
        for reason in reasons:
            outf.write("%s\t%i\n" % (reason, reasons[reason]))
Example #32
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: csv_intersection.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option("-u",
                      "--unique",
                      dest="unique",
                      action="store_true",
                      help="output rows are uniq.")

    parser.set_defaults(
        remove=False,
        unique=False,
    )

    (options, args) = E.Start(parser, add_csv_options=True)

    if len(args) != 2:
        raise ValueError("please specify two files to join")

    options.filename1, options.filename2 = args

    # NOTE: the remainder of this script is reconstructed - the original
    # referenced several undefined variables. Both files are read with the
    # standard csv module so that rows can be compared directly.
    reader1 = csv.DictReader(IOTools.openFile(options.filename1, "r"),
                             dialect=options.csv_dialect)
    reader2 = csv.DictReader(IOTools.openFile(options.filename2, "r"),
                             dialect=options.csv_dialect)

    if options.unique:
        outfile = UniqueBuffer(sys.stdout)
    else:
        outfile = options.stdout

    # intersect: keep rows from the first file whose values over the
    # shared fields also occur in the second file
    fields1 = reader1.fieldnames
    fields2 = reader2.fieldnames
    common_fields = [x for x in fields1 if x in fields2]

    def row_key(row):
        return tuple(row[x] for x in common_fields)

    rows2 = set(row_key(row) for row in reader2)

    writer = csv.DictWriter(outfile,
                            fields1,
                            dialect=options.csv_dialect,
                            lineterminator=options.csv_lineterminator,
                            extrasaction='ignore')

    # write the header line
    writer.writerow(dict((x, x) for x in fields1))

    for row in reader1:
        if row_key(row) in rows2:
            writer.writerow(IOTools.convertDictionary(row))

    E.Stop()
Example #33
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: compare_clusters.py 2782 2009-09-10 11:40:29Z andreas $"
    )

    parser.add_option("-o",
                      "--output-pattern",
                      dest="output_pattern",
                      type="string",
                      help="output pattern for filenames.")

    parser.set_defaults(
        output_pattern=None,
        format="%5.2f",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if len(args) != 2:
        raise "please supply to filenames with the clusters."

    map_id2cluster1, map_cluster2ids1 = IOTools.ReadMap(open(args[0]),
                                                        both_directions=True)
    map_id2cluster2, map_cluster2ids2 = IOTools.ReadMap(open(args[1]),
                                                        both_directions=True)

    graph = networkx.Graph()

    for a in map_cluster2ids1.keys():
        graph.add_node((1, a))
    for b in map_cluster2ids2.keys():
        graph.add_node((2, b))

    ## build graph between clusters
    for cluster1, ids1 in map_cluster2ids1.items():
        for id1 in ids1:
            if id1 in map_id2cluster2:
                graph.add_edge((1, cluster1), (2, map_id2cluster2[id1]))

    # materialise components as sorted lists so they can be indexed below
    # (newer networkx versions return a generator of sets)
    components = [sorted(c) for c in networkx.connected_components(graph)]

    #######################################################
    #######################################################
    #######################################################
    ## write components and compute counts
    #######################################################
    outfile = getFile("components", options)
    outfile.write("id\ttotal\tn1\tn2\tmembers1\tmembers2\n")
    n = 0
    counts = {}
    subsets = []
    for component in components:

        m1, m2 = [], []

        for x in component:
            if x[0] == 1:
                m1.append(x[1])
            else:
                m2.append(x[1])

        t = len(component)
        n1 = len(m1)
        n2 = len(m2)
        cc = (n1, n2)
        if cc not in counts: counts[cc] = 0
        counts[cc] += 1

        if cc == (1, 1): subsets.append(n)

        n += 1
        outfile.write("%i\t%i\t%i\t%i\t%s\t%s\n" %
                      (n, t, n1, n2, ",".join(m1), ",".join(m2)))

    if outfile != options.stdout:
        outfile.close()
    else:
        outfile.write("//\n")

    #######################################################
    #######################################################
    #######################################################
    ## write counts
    #######################################################
    outfile = getFile("counts", options)
    outfile.write("n1\tn2\tcounts\tpcounts1\tpcounts2\n")
    for cc, c in counts.items():
        outfile.write(
            "%i\t%i\t%i\t%s\t%s\n" %
            (cc[0], cc[1], c, options.format %
             (100.0 * float(c) / len(map_cluster2ids1)), options.format %
             (100.0 * float(c) / len(map_cluster2ids2))))

    if outfile != options.stdout:
        outfile.close()
    else:
        outfile.write("//\n")

    #######################################################
    #######################################################
    #######################################################
    ## analyze subsets - how many of the 1:1 clusters
    ## contain the exact members?
    #######################################################
    outfile = getFile("subsets", options)
    outfile.write("id\tn1\tn2\tunion\tinter\tunique1\tunique2\n")

    ntrue = 0
    nrest1 = 0
    nrest2 = 0
    nother = 0

    for component_id in subsets:
        component = components[component_id]
        if component[0][0] == 1:
            id1, id2 = component[0][1], component[1][1]
        else:
            id1, id2 = component[1][1], component[0][1]

        members1 = set(map_cluster2ids1[id1])
        members2 = set(map_cluster2ids2[id2])

        union = len(members1.union(members2))
        intersection = len(members1.intersection(members2))
        rest1 = len(members1.difference(members2))
        rest2 = len(members2.difference(members1))

        if rest1 == 0 and rest2 == 0:
            ntrue += 1
        elif rest1 == 0:
            nrest1 += 1
        elif rest2 == 0:
            nrest2 += 1
        else:
            nother += 1

        outfile.write("%i\t%i\t%i\t%i\t%i\t%i\t%i\n" %
                      (component_id, len(members1), len(members2), union,
                       intersection, rest1, rest2))

    if outfile != options.stdout:
        outfile.close()
    else:
        outfile.write("//\n")

    ## write subset statistics
    ntotal = len(subsets)
    options.stdout.write("# subset statistics of 1:1 corresponding clusters\n")
    options.stdout.write("class\tcounts\ttotal\n")
    options.stdout.write("%s\t%i\t%s\n" %
                         ("total", ntotal, options.format % 100))
    options.stdout.write("%s\t%i\t%s\n" % ("true", ntrue, options.format %
                                           (100.0 * ntrue / ntotal)))
    options.stdout.write("%s\t%i\t%s\n" % ("unique1", nrest1, options.format %
                                           (100.0 * nrest1 / ntotal)))
    options.stdout.write("%s\t%i\t%s\n" % ("unique2", nrest2, options.format %
                                           (100.0 * nrest2 / ntotal)))
    options.stdout.write("%s\t%i\t%s\n" % ("other", nother, options.format %
                                           (100.0 * nother / ntotal)))

    E.Stop()
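
A small sketch of the cluster-matching idea above with invented clusterings: nodes are (clustering, cluster-id) pairs, an edge links two clusters that share a member, and each connected component is one correspondence group.

import networkx

# hypothetical id -> cluster assignments from two clusterings
map_id2cluster1 = {"g1": "A", "g2": "A", "g3": "B"}
map_id2cluster2 = {"g1": "X", "g2": "X", "g3": "Y", "g4": "Y"}

graph = networkx.Graph()
for gene, cluster1 in map_id2cluster1.items():
    cluster2 = map_id2cluster2.get(gene)
    if cluster2 is not None:
        graph.add_edge((1, cluster1), (2, cluster2))

components = [sorted(c) for c in networkx.connected_components(graph)]
print(components)
# e.g. [[(1, 'A'), (2, 'X')], [(1, 'B'), (2, 'Y')]] (component order may vary)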
Example #34
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-i",
                      "--in-sam",
                      dest="in_sam",
                      action="store_true",
                      help="Input file is in sam format",
                      default=False)
    parser.add_option("-o",
                      "--out-sam",
                      dest="out_sam",
                      action="store_true",
                      help="Output alignments in sam format",
                      default=False)
    parser.add_option("--ignore-umi",
                      dest="ignore_umi",
                      action="store_true",
                      help="Ignore UMI and dedup only on position",
                      default=False)
    parser.add_option("--subset",
                      dest="subset",
                      type="string",
                      help="Use only a fraction of reads, specified by subset",
                      default=1.1)
    parser.add_option("--spliced-is-unique",
                      dest="spliced",
                      action="store_true",
                      help="Treat a spliced read as different to an unspliced"
                      " one",
                      default=False)
    parser.add_option("--soft-clip-threshold",
                      dest="soft",
                      type="float",
                      help="number of bases clipped from 5' end before"
                      "read os counted as spliced",
                      default=4)
    parser.add_option("--edit-distance-theshold",
                      dest="threshold",
                      type="int",
                      help="Edit distance theshold at which to join two UMIs"
                      "when clustering",
                      default=1)
    parser.add_option("--chrom",
                      dest="chrom",
                      type="string",
                      help="Restrict to one chromosome",
                      default=None)
    parser.add_option("--paired",
                      dest="paired",
                      action="store_true",
                      default=False,
                      help="Use second-in-pair position when deduping")
    parser.add_option("--method",
                      dest="method",
                      type="choice",
                      choices=("adjacency", "directional-adjacency",
                               "percentile", "unique", "cluster"),
                      default="directional-adjacency",
                      help="method to use for umi deduping")
    parser.add_option("--output-stats",
                      dest="stats",
                      type="string",
                      default=False,
                      help="Specify location to output stats")
    parser.add_option("--further-stats",
                      dest="further_stats",
                      action="store_true",
                      default=False,
                      help="Output further stats")
    parser.add_option("--per-contig",
                      dest="per_contig",
                      action="store_true",
                      default=False,
                      help=("dedup per contig,"
                            " e.g for transcriptome where contig = gene"))
    parser.add_option(
        "--whole-contig",
        dest="whole_contig",
        action="store_true",
        default=False,
        help="Read whole contig before outputting bundles: guarantees that "
        "no reads are missed, but increases memory usage")
    parser.add_option("--multimapping-detection-method",
                      dest="detection_method",
                      type="choice",
                      choices=("NH", "X0", "XT"),
                      default=None,
                      help=("Some aligners identify multimapping using bam "
                            "tags. Setting this option to NH, X0 or XT will "
                            "use these tags when selecting the best read "
                            "amongst reads with the same position and umi"))
    parser.add_option("--mapping-quality",
                      dest="mapping_quality",
                      type="int",
                      help="Minimum mapping quality for a read to be retained",
                      default=0)
    parser.add_option("--read-length",
                      dest="read_length",
                      action="store_true",
                      default=False,
                      help=("use read length in addition to position and UMI"
                            "to identify possible duplicates"))

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if options.stdin != sys.stdin:
        in_name = options.stdin.name
        options.stdin.close()
    else:
        raise ValueError("Input on standard in not currently supported")

    if options.stdout != sys.stdout:
        out_name = options.stdout.name
        options.stdout.close()
    else:
        out_name = "-"

    if options.in_sam:
        in_mode = "r"
    else:
        in_mode = "rb"

    if options.out_sam:
        out_mode = "w"
    else:
        out_mode = "wb"

    if options.stats:
        if options.ignore_umi:
            raise ValueError("'--output-stats' and '--ignore-umi' options"
                             " cannot be used together")

    if options.further_stats:
        if not options.stats:
            raise ValueError("'--further-stats' options requires "
                             "'--output-stats' option")
        if options.method not in ["cluster", "adjacency"]:
            raise ValueError("'--further-stats' only enabled with 'cluster' "
                             "and 'adjacency' methods")

    infile = pysam.Samfile(in_name, in_mode)
    outfile = pysam.Samfile(out_name, out_mode, template=infile)

    nInput, nOutput = 0, 0

    if options.detection_method:
        bam_features = detect_bam_features(infile.filename)

        if not bam_features[options.detection_method]:
            if sum(bam_features.values()) == 0:
                raise ValueError(
                    "There are no bam tags available to detect multimapping. "
                    "Do not set --multimapping-detection-method")
            else:
                raise ValueError(
                    "The chosen method of detection for multimapping (%s) "
                    "will not work with this bam. Multimapping can be detected"
                    " for this bam using any of the following: %s" %
                    (options.detection_method, ",".join(
                        [x for x in bam_features if bam_features[x]])))

    if options.stats:
        # set up arrays to hold stats data
        stats_pre_df_dict = {"UMI": [], "counts": []}
        stats_post_df_dict = {"UMI": [], "counts": []}
        pre_cluster_stats = []
        post_cluster_stats = []
        pre_cluster_stats_null = []
        post_cluster_stats_null = []
        topology_counts = collections.Counter()
        node_counts = collections.Counter()
        read_gn = random_read_generator(infile.filename)

    for bundle in get_bundles(infile,
                              ignore_umi=options.ignore_umi,
                              subset=float(options.subset),
                              quality_threshold=options.mapping_quality,
                              paired=options.paired,
                              chrom=options.chrom,
                              spliced=options.spliced,
                              soft_clip_threshold=options.soft,
                              per_contig=options.per_contig,
                              whole_contig=options.whole_contig,
                              read_length=options.read_length,
                              detection_method=options.detection_method):

        nOutput += 1
        nInput += sum([bundle[umi]["count"] for umi in bundle])

        if nOutput % 10000 == 0:
            E.debug("Outputted %i" % nOutput)

        if nInput % 1000000 == 0:
            E.debug("Read %i input reads" % nInput)

        if options.stats:
            # generate pre-dedup stats
            average_distance = get_average_umi_distance(bundle.keys())
            pre_cluster_stats.append(average_distance)
            cluster_size = len(bundle)
            random_umis = read_gn.getUmis(cluster_size)
            average_distance_null = get_average_umi_distance(random_umis)
            pre_cluster_stats_null.append(average_distance_null)

        if options.ignore_umi:
            for umi in bundle:
                outfile.write(bundle[umi]["read"])
                # IMS: add paired output for ignore_umi:
                if options.paired:
                    outfile.write(infile.mate(bundle[umi]["read"]))
        else:

            # set up ClusterAndReducer functor with methods specific to
            # specified options.method
            processor = ClusterAndReducer(options.method)

            # dedup using umis and write out deduped bam

            reads, umis, umi_counts, topologies, nodes = processor(
                bundle, options.threshold, options.stats,
                options.further_stats)

            for read in reads:
                outfile.write(read)
                if options.paired:
                    # TS - write out paired end mate
                    outfile.write(infile.mate(read))

            if options.stats:

                # collect pre-dedup stats
                stats_pre_df_dict['UMI'].extend(bundle)
                stats_pre_df_dict['counts'].extend(
                    [bundle[UMI]['count'] for UMI in bundle])

                # collect post-dedup stats
                post_cluster_umis = [x.qname.split("_")[-1] for x in reads]
                stats_post_df_dict['UMI'].extend(umis)
                stats_post_df_dict['counts'].extend(umi_counts)

                average_distance = get_average_umi_distance(post_cluster_umis)
                post_cluster_stats.append(average_distance)

                cluster_size = len(post_cluster_umis)
                random_umis = read_gn.getUmis(cluster_size)
                average_distance_null = get_average_umi_distance(random_umis)
                post_cluster_stats_null.append(average_distance_null)

                if options.further_stats:
                    for c_type, count in topologies.most_common():
                        topology_counts[c_type] += count
                    for c_type, count in nodes.most_common():
                        node_counts[c_type] += count

    if options.stats:

        stats_pre_df = pd.DataFrame(stats_pre_df_dict)
        stats_post_df = pd.DataFrame(stats_post_df_dict)

        # generate histograms of counts per UMI at each position
        UMI_counts_df_pre = pd.DataFrame(
            stats_pre_df.pivot_table(columns=stats_pre_df["counts"],
                                     values="counts",
                                     aggfunc=len))
        UMI_counts_df_post = pd.DataFrame(
            stats_post_df.pivot_table(columns=stats_post_df["counts"],
                                      values="counts",
                                      aggfunc=len))

        UMI_counts_df_pre.columns = ["instances"]
        UMI_counts_df_post.columns = ["instances"]

        UMI_counts_df = pd.merge(UMI_counts_df_pre,
                                 UMI_counts_df_post,
                                 how='left',
                                 left_index=True,
                                 right_index=True,
                                 sort=True,
                                 suffixes=["_pre", "_post"])

        # TS - if count value not observed either pre/post-dedup,
        # merge will leave an empty cell and the column will be cast as a float
        # see http://pandas.pydata.org/pandas-docs/dev/missing_data.html
        # --> Missing data casting rules and indexing
        # so, back fill with zeros and convert back to int
        UMI_counts_df = UMI_counts_df.fillna(0).astype(int)

        UMI_counts_df.to_csv(options.stats + "_per_umi_per_position.tsv",
                             sep="\t")

        # aggregate stats pre/post per UMI
        agg_pre_df = aggregateStatsDF(stats_pre_df)
        agg_post_df = aggregateStatsDF(stats_post_df)

        agg_df = pd.merge(agg_pre_df,
                          agg_post_df,
                          how='left',
                          left_index=True,
                          right_index=True,
                          sort=True,
                          suffixes=["_pre", "_post"])

        # TS - see comment above regarding missing values
        agg_df = agg_df.fillna(0).astype(int)
        agg_df.to_csv(options.stats + "_per_umi.tsv", sep="\t")

        # bin distances into integer bins
        max_ed = int(
            max(
                map(max, [
                    pre_cluster_stats, post_cluster_stats,
                    pre_cluster_stats_null, post_cluster_stats_null
                ])))

        cluster_bins = range(-1, int(max_ed) + 2)

        def bin_clusters(cluster_list, bins=cluster_bins):
            ''' take list of floats and return bins'''
            return np.digitize(cluster_list, bins, right=True)

        def tallyCounts(binned_cluster, max_edit_distance):
            ''' tally counts per bin '''
            return np.bincount(binned_cluster, minlength=max_edit_distance + 3)

        pre_cluster_binned = bin_clusters(pre_cluster_stats)
        post_cluster_binned = bin_clusters(post_cluster_stats)
        pre_cluster_null_binned = bin_clusters(pre_cluster_stats_null)
        post_cluster_null_binned = bin_clusters(post_cluster_stats_null)

        edit_distance_df = pd.DataFrame({
            "unique":
            tallyCounts(pre_cluster_binned, max_ed),
            "unique_null":
            tallyCounts(pre_cluster_null_binned, max_ed),
            options.method:
            tallyCounts(post_cluster_binned, max_ed),
            "%s_null" % options.method:
            tallyCounts(post_cluster_null_binned, max_ed),
            "edit_distance":
            cluster_bins
        })

        # TS - set lowest bin (-1) to "Single_UMI"
        edit_distance_df['edit_distance'][0] = "Single_UMI"

        edit_distance_df.to_csv(options.stats + "_edit_distance.tsv",
                                index=False,
                                sep="\t")

        if options.further_stats:
            with IOTools.openFile(options.stats + "_topologies.tsv",
                                  "w") as outf:
                outf.write("\n".join([
                    "\t".join((x, str(y)))
                    for x, y in topology_counts.most_common()
                ]) + "\n")

            with IOTools.openFile(options.stats + "_nodes.tsv", "w") as outf:
                outf.write("\n".join([
                    "\t".join(map(str, (x, y)))
                    for x, y in node_counts.most_common()
                ]) + "\n")

    # write footer and output benchmark information.
    E.info("Number of reads in: %i, Number of reads out: %i" %
           (nInput, nOutput))
    E.Stop()
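
The edit-distance binning at the end of the stats block above, in isolation with invented distances: average edit distances are digitised into integer bins, with the lowest bin (-1) reserved for single-UMI positions, and then tallied with bincount.

import numpy as np

# hypothetical average edit distances; -1 marks single-UMI positions
distances = [-1, 0.4, 1.2, 2.0, 2.9]
max_ed = 3

cluster_bins = range(-1, max_ed + 2)                    # -1, 0, 1, 2, 3, 4
binned = np.digitize(distances, cluster_bins, right=True)
counts = np.bincount(binned, minlength=max_ed + 3)

print(binned)   # [0 2 3 3 4]
print(counts)   # [1 0 1 2 1 0]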
Example #35
0
def FilterExacCols(infile, exac_suffs, exac_thresh):
    '''
    Returns a set of line indices indicating lines where either of the alleles
    called have a frequency greater than exac_thresh in any of the
    populations specified as exac_suffs.
    Where no data is available an allele frequency of -1 is used.

    Exac provide data as AC_xxx and AN_xxx where AC is the allele count
    - the number of times the allele has been called
    - and AN is chromosome count - the number of
    samples in which the allele could have been called - in population xxx.
    AC / AN = allele frequency.

    exac_suffs are any columns where an AC_xxx and AN_xxx column is provided
    in the VCF, e.g. Adj will calculate allele frequency from the AC_Adj
    and AN_Adj columns

    '''
    # read columns from the input VCF
    exac_suffs = exac_suffs.split(",")
    cols = IOTools.openFile(infile).readline().strip().split("\t")
    nD = dict()
    afdict = dict()
    for e in exac_suffs:
        # find the columns with the appropriate information
        # Allele count
        AC_i = cols.index("AC_%s" % (e))
        # Allele Number
        AN_i = cols.index("AN_%s" % (e))
        # Genotype
        GT_i = cols.index('GT')
        nlist = set()
        n = 0
        AFS = []
        with IOTools.openFile(infile) as input:
            for line in input:
                if n > 1:
                    line = line.strip().split("\t")
                    # At multi-allelic sites, comma delimited AC and AN values
                    # are provided
                    # "." and "NA" indicate no data here - this is represented
                    # as an AF of -1
                    AC = line[AC_i].replace(".", "-1").replace(
                        "NA", "-1").split(",")
                    AN = line[AN_i].replace(".", "1").replace(
                        "NA", "1").split(",")
                    AC = np.array([float(a) for a in AC])
                    AN = np.array([float(a) for a in AN])
                    AF = AC / AN
                    AF2 = [af if af > 0 else 0 for af in AF]
                    AF = np.insert(AF, 0, (1 - sum(AF2)))

                    # The chromosome count is usually shared by all minor
                    # alleles (but not always). If the counts differ per
                    # allele, AC and AN have the same length; otherwise AN
                    # has length 1 and is broadcast to match AC.
                    if len(AC) != len(AN):
                        AN = np.repeat(AN[0], len(AC))

                    # Record the genotype called in this sample for this SNP
                    GT = line[GT_i]
                    GT = GT.replace(".", '0')
                    GT = GT.split("/")
                    GT[0], GT[1] = int(GT[0]), int(GT[1])

                    # If the variant is not in ExAC the ExAC columns show "."
                    # but the site
                    # may still have been called as multi allelic
                    # - use -1 for all frequencies
                    # in this case
                    if max(GT) > (len(AF) - 1):
                        AF = np.array([-1] * (max(GT) + 1))

                    AF1 = AF[GT[0]]
                    AF2 = AF[GT[1]]
                    AFS.append((AF1, AF2))
                    # Remember where both allele frequencies are
                    # greater than exac_thresh
                    if AF1 >= exac_thresh and AF2 >= exac_thresh:
                        nlist.add(n)
                else:
                    AFS.append(('NA', 'NA'))
                n += 1
        afdict[e] = AFS
        nD[e] = nlist

    ns = set.union(*list(nD.values()))
    return afdict, ns
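
A worked sketch of the allele-frequency arithmetic described in the docstring above, with invented ExAC counts for a multi-allelic site: AC / AN gives the alternate allele frequencies and the reference allele frequency is prepended at index 0, so genotype indices select frequencies directly.

import numpy as np

# hypothetical ExAC counts for two alternate alleles at one site
AC = np.array([300.0, 100.0])     # allele counts
AN = np.array([1000.0, 1000.0])   # chromosome counts

AF = AC / AN                          # [0.3, 0.1]
AF = np.insert(AF, 0, 1 - AF.sum())   # prepend reference allele frequency

GT = [0, 2]                       # hypothetical genotype: ref / second alt
print(AF)                         # [0.6 0.3 0.1]
print(AF[GT[0]], AF[GT[1]])       # 0.6 0.1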
Example #36
0
def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that
    all sequences are output and MAST curves can be computed. 

    10000 is a heuristic.
    '''
    to_cluster = True

    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if IOTools.isEmpty(dbfile):
        P.touch(outfile)
        return

    if not os.path.exists(controlfile):
        raise P.PipelineError("control file %s for %s does not exist" %
                              (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    tmpdir = P.getTempDir(".")
    tmpfile = P.getTempFilename(".")

    for motiffile in motiffiles:
        if IOTools.isEmpty(motiffile):
            L.info("skipping empty motif file %s" % motiffile)
            continue

        of = IOTools.openFile(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - foreground ::\n" % motif)
        of.close()

        # mast appears to fail once the number of nucleotides exceeds
        # roughly 2,186,800,982. To avoid this, run the db and control
        # files separately.
        statement = '''
        cat %(dbfile)s 
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run()

        of = IOTools.openFile(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - background ::\n" % motif)
        of.close()

        statement = '''
        cat %(controlfile)s 
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run()

    statement = "gzip < %(tmpfile)s > %(outfile)s"
    P.run()

    shutil.rmtree(tmpdir)
    os.unlink(tmpfile)
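
The ":: motif = <name> - <part> ::" marker lines written above are what allow the concatenated MAST output to be split apart again downstream (loadMAST in Example #38 relies on this convention). A minimal sketch of that parsing, using a hypothetical helper:

import re


def iter_mast_chunks(lines):
    '''yield (motif, part, chunk_lines) for each ":: motif = X - part ::" block.'''
    starts = [i for i, line in enumerate(lines) if line.startswith("::")]
    starts.append(len(lines))
    for i in range(len(starts) - 1):
        motif, part = re.match(r":: motif = (\S+) - (\S+) ::",
                               lines[starts[i]]).groups()
        yield motif, part, lines[starts[i] + 1:starts[i + 1]]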
Example #37
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version=
        "%prog version: $Id: optic/regions2gff.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="pattern to look for sequence filename.")

    parser.add_option(
        "-i",
        "--ids",
        dest="ids",
        type="string",
        help=
        "comma separated list of prediction ids. Use 'all' to use all predictions."
    )

    parser.add_option("-f",
                      "--filename-ids",
                      dest="filename_ids",
                      type="string",
                      help="filename with prediction ids.")

    parser.add_option("-t",
                      "--type",
                      dest="type",
                      type="choice",
                      choices=("genes", "mrnas", "introns", "intronic",
                               "exons", "exonic", "intergenic",
                               "exons-third-codons"),
                      help="type to output.")

    parser.add_option(
        "-e",
        "--extend-region",
        dest="extend_region",
        type="int",
        help="regions are extended by this margin at either end.")

    parser.add_option(
        "-r",
        "--shorten-region",
        dest="shorten_region",
        type="int",
        help="regions are shortened by this margin at either end.")

    parser.add_option("-m",
                      "--min-length",
                      dest="min_length",
                      type="int",
                      help="minimum length of segment.")

    parser.add_option("-s",
                      "--schema",
                      dest="schema",
                      type="string",
                      help="schema to take data from.")

    parser.add_option("-o",
                      "--output-format",
                      dest="output_format",
                      type="choice",
                      choices=("fasta", "table", "region"),
                      help="output formats.")

    parser.add_option("--fasta-format",
                      dest="fasta_format",
                      type="choice",
                      choices=("id-coordinates", "coordinates",
                               "schema-coordinates"),
                      help="output formats for fasta formatted headers.")

    parser.add_option("--orthologs",
                      dest="orthologs",
                      action="store_true",
                      help="lookup up orthologs of prediction ids.")

    parser.add_option("--multiple",
                      dest="multiple",
                      action="store_true",
                      help="""lookup up predictions in multiple species.
                       Identifiers should be given as schema|prediction_id[|additional_fields].
                       Note that the genome file locations have to be consistent."""
                      )

    parser.add_option("--id-format",
                      dest="id_format",
                      type="choice",
                      choices=("id", "schema-id", "full"),
                      help="output format for ids.")

    parser.add_option("--taboo-regions",
                      dest="taboo_regions",
                      type="choice",
                      choices=("same", "both"),
                      help="check for overlap in same/both strands.")

    parser.add_option("--filename-taboo-regions",
                      dest="filename_taboo_regions",
                      type="string",
                      help="filename with information about taboo regions.")

    parser.add_option(
        "--filename-properties",
        dest="filename_properties",
        type="string",
        help=
        "filename with mapping information between features and properties.")

    parser.add_option(
        "--invert-properties",
        dest="invert_properties",
        action="store_true",
        help="instead of printing features which have properties, "
        "print those that do not.")

    parser.add_option(
        "--output-coordinate-format",
        dest="output_coordinate_format",
        type="choice",
        choices=("full", "long"),
        help="output format of coordinates. Output format is "
        "contig:strand:from:to in zero-based/forward/reverse strand "
        "coordinates in open/closed notation. 'long' includes the contig "
        "length as a fifth field.")

    parser.set_defaults(genome_file="genome",
                        identifiers=None,
                        filename_ids="-",
                        ids=None,
                        extend_region=0,
                        shorten_region=0,
                        tablename_predictions="predictions",
                        tablename_exons="exons",
                        tablename_genes="genes",
                        tablename_quality="quality",
                        schema=None,
                        output_format="fasta",
                        fasta_format="id-coordinates",
                        type="mrnas",
                        min_length=1,
                        id_format="id",
                        multiple=False,
                        separator="|",
                        filename_taboo_regions=False,
                        output_coordinate_format="full",
                        filename_properties=None,
                        invert_properties=False,
                        report_step=10000)

    (options, args) = E.Start(parser, add_psql_options=True)

    if options.orthologs:
        options.id_format = "schema-id"

    # database handle for connecting to postgres
    dbhandle = pgdb.connect(options.psql_connection)

    # Step 1 : Input of predictions

    # read identifiers from file, command line arguments or stdin.

    if options.ids in ("all", "nr"):
        prediction_ids = options.ids
        if options.loglevel >= 1:
            options.stdlog.write("# using all prediction ids.\n")
            options.stdlog.flush()
    elif options.ids:
        prediction_ids = options.ids.split(",")
    elif len(args) > 0:
        prediction_ids = args

    elif options.filename_ids:
        prediction_ids = []

        if options.filename_ids == "-":
            prediction_ids += IOTools.ReadList(sys.stdin)[0]
        elif options.filename_ids:
            prediction_ids += IOTools.ReadList(open(options.filename_ids,
                                                    "r"))[0]

        if len(prediction_ids) == 0:
            raise "no prediction identifiers given."

        if options.loglevel >= 1:
            options.stdlog.write("# read %i prediction ids.\n" %
                                 len(prediction_ids))
            options.stdlog.flush()

    if options.filename_taboo_regions:
        # Note: the input has to be in forward coordinates in order for option
        # "both" to work.
        taboo_regions = Regions.RegionFilter()
        if options.taboo_regions == "both":
            ignore_strand = True
        else:
            ignore_strand = False
        taboo_regions.readFromFile(open(options.filename_taboo_regions, "r"),
                                   ignore_strand=ignore_strand)
    else:
        taboo_regions = None

    map_feature2property = getMapFeature2Property(options)

    processPredictions(dbhandle, options.schema, options, prediction_ids,
                       taboo_regions, map_feature2property)

    E.Stop()
Example #38
0
def loadMAST(infile, outfile):
    '''parse mast file and load into database.

    Parse several motif runs and add them to the same
    table.

    Add columns for the control data as well.
    '''

    tablename = P.toTable(outfile)

    tmpfile = P.getTempFile(".")

    tmpfile.write(MAST.Match().header + "\tmotif\tcontig"
                  "\tl_evalue\tl_pvalue\tl_nmatches\tl_length\tl_start\tl_end"
                  "\tr_evalue\tr_pvalue\tr_nmatches\tr_length\tr_start\tr_end"
                  "\tmin_evalue\tmin_pvalue\tmax_nmatches" + "\n")

    lines = IOTools.openFile(infile).readlines()
    chunks = [x for x in range(len(lines)) if lines[x].startswith("::")]
    chunks.append(len(lines))

    def readChunk(lines, chunk):
        # use real file, as MAST parser can not deal with a
        # list of lines
        tmpfile2 = P.getTempFile(".")
        try:
            motif, part = re.match(":: motif = (\S+) - (\S+) ::",
                                   lines[chunks[chunk]]).groups()
        except AttributeError:
            raise P.PipelineError("parsing error in line '%s'" %
                                  lines[chunks[chunk]])

        E.info("reading %s - %s" % (motif, part))

        tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
        tmpfile2.close()

        mast = MAST.parse(IOTools.openFile(tmpfile2.name, "r"))

        os.unlink(tmpfile2.name)

        return motif, part, mast

    def splitId(s, mode):
        '''split background match id

        has three parts: track _ id _ pos

        track might contain '_'.
        '''
        d = s.split("_")
        if mode == "bg":
            return "_".join(d[:-2]), d[-2], d[-1]
        elif mode == "fg":
            return "_".join(d[:-1]), d[-1]

    for chunk in range(0, len(chunks) - 1, 2):

        motif_fg, part, mast_fg = readChunk(lines, chunk)
        assert part == "foreground"
        motif_bg, part, mast_bg = readChunk(lines, chunk + 1)
        assert part == "background"
        assert motif_fg == motif_bg

        # index control data
        controls = collections.defaultdict(dict)
        for match in mast_bg.matches:
            track, id, pos = splitId(match.id, "bg")
            controls[id][pos] = (match.evalue, match.pvalue, match.nmotifs,
                                 match.length, match.start, match.end)

        for match in mast_fg.matches:
            # remove track and pos
            track, match.id = splitId(match.id, "fg")
            # move to genomic coordinates
            contig, start, end = re.match(r"(\S+):(\d+)\.\.(\d+)",
                                          match.description).groups()
            if match.nmotifs > 0:
                start, end = int(start), int(end)
                match.start += start
                match.end += start
                match.positions = [x + start for x in match.positions]

            id = match.id
            if id not in controls:
                P.warn("no controls for %s - increase MAST evalue" % id)

            if "l" not in controls[id]:
                controls[id]["l"] = (float(PARAMS["mast_evalue"]), 1, 0, 0, 0,
                                     0)
            if "r" not in controls[id]:
                controls[id]["r"] = (float(PARAMS["mast_evalue"]), 1, 0, 0, 0,
                                     0)

            min_evalue = min(controls[id]["l"][0], controls[id]["r"][0])
            min_pvalue = min(controls[id]["l"][1], controls[id]["r"][1])
            max_nmatches = max(controls[id]["l"][2], controls[id]["r"][2])

            tmpfile.write(
                str(match) + "\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
                    motif_fg,
                    contig,
                    "\t".join(map(str, controls[id]["l"])),
                    "\t".join(map(str, controls[id]["r"])),
                    str(min_evalue),
                    str(min_pvalue),
                    str(max_nmatches),
                ) + "\n")

    tmpfile.close()
    tmpfilename = tmpfile.name

    statement = '''
    python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
              -b sqlite 
              --index=id 
              --index=motif 
              --index=id,motif 
              --table=%(tablename)s 
              --allow-empty
              --map=base_qualities:text 
    < %(tmpfilename)s > %(outfile)s
    '''

    P.run()
    os.unlink(tmpfile.name)
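
The id-splitting convention used by splitId above (tracks may themselves contain underscores, so ids are split from the right) is easiest to see on made-up ids; the sample values below are hypothetical:

def split_id(s, mode):
    # background ids: <track>_<id>_<pos>; foreground ids: <track>_<id>
    d = s.split("_")
    if mode == "bg":
        return "_".join(d[:-2]), d[-2], d[-1]
    elif mode == "fg":
        return "_".join(d[:-1]), d[-1]


print(split_id("liver_rep1_12_l", "bg"))  # ('liver_rep1', '12', 'l')
print(split_id("liver_rep1_12", "fg"))    # ('liver_rep1', '12')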
Example #39
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-t", "--tags-tsv-file", dest="input_filename_tags",
                      type="string",
                      help="input file with tag counts [default=%default].")

    parser.add_option(
        "--result-tsv-file", dest="input_filename_result",
        type="string",
        help="input file with results (for plotdetagstats) "
        "[default=%default].")

    parser.add_option("-d", "--design-tsv-file", dest="input_filename_design",
                      type="string",
                      help="input file with experimental design "
                      "[default=%default].")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("edger", "deseq2", "mock"),
                      help="differential expression method to apply "
                      "[default=%default].")

    parser.add_option("--deseq-dispersion-method",
                      dest="deseq_dispersion_method",
                      type="choice",
                      choices=("pooled", "per-condition", "blind"),
                      help="dispersion method for deseq [default=%default].")

    parser.add_option("--deseq-fit-type", dest="deseq_fit_type", type="choice",
                      choices=("parametric", "local"),
                      help="fit type for deseq [default=%default].")

    parser.add_option("--deseq-sharing-mode",
                      dest="deseq_sharing_mode",
                      type="choice",
                      choices=("maximum", "fit-only", "gene-est-only"),
                      help="deseq sharing mode [default=%default].")

    parser.add_option("--edger-dispersion",
                      dest="edger_dispersion", type="float",
                      help="dispersion value for edgeR if there are no "
                      "replicates [default=%default].")

    parser.add_option("-f", "--fdr", dest="fdr", type="float",
                      help="fdr to apply [default=%default].")

    parser.add_option("-R", "--output-R-code", dest="save_r_environment",
                      type="string",
                      help="save R environment [default=%default].")

    parser.add_option("-r", "--reference-group", dest="ref_group",
                      type="string",
                      help="Group to use as reference to compute "
                      "fold changes against [default=$default]")

    parser.add_option("--filter-min-counts-per-row",
                      dest="filter_min_counts_per_row",
                      type="int",
                      help="remove rows with less than this "
                      "number of counts in total [default=%default].")

    parser.add_option("--filter-min-counts-per-sample",
                      dest="filter_min_counts_per_sample",
                      type="int",
                      help="remove samples with a maximum count per sample of "
                      "less than this number   [default=%default].")

    parser.add_option("--filter-percentile-rowsums",
                      dest="filter_percentile_rowsums",
                      type="int",
                      help="remove percent of rows with "
                      "lowest total counts [default=%default].")

    parser.add_option("--model",
                      dest="model",
                      type="string",
                      help=("model for GLM"))

    parser.add_option("--contrasts",
                      dest="contrasts",
                      action="append",
                      help=("contrasts for post-hoc testing writen as comma "
                            "seperated list `condition,replicate` etc"))

    parser.set_defaults(
        input_filename_tags="-",
        input_filename_result=None,
        input_filename_design=None,
        output_filename=sys.stdout,
        method="deseq2",
        fdr=0.1,
        deseq_dispersion_method="pooled",
        deseq_fit_type="parametric",
        deseq_sharing_mode="maximum",
        edger_dispersion=0.4,
        ref_group=None,
        save_r_environment=None,
        filter_min_counts_per_row=None,
        filter_min_counts_per_sample=None,
        filter_percentile_rowsums=None,
        spike_foldchange_max=4.0,
        spike_expression_max=5.0,
        spike_expression_bin_width=0.5,
        spike_foldchange_bin_width=0.5,
        spike_max_counts_per_bin=50,
        model=None,
        contrasts=None,
        output_filename_pattern=None
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    #  assert options.input_filename_design and os.path.exists(
    #    options.input_filename_design)

    # assert options.output_filename_pattern, "specify --output-filename-pattern"

    # create Counts object
    if options.input_filename_tags == "-":
        counts = Counts.Counts(pd.io.parsers.read_csv(
            sys.stdin, sep="\t", index_col=0, comment="#"))
    else:
        counts = Counts.Counts(pd.io.parsers.read_csv(
            IOTools.openFile(options.input_filename_tags, "r"),
            sep="\t", index_col=0, comment="#"))

    # create Design object
    design = Expression.ExperimentalDesign(
        pd.read_csv(IOTools.openFile(options.input_filename_design, "r"),
                    sep="\t", index_col=0, comment="#"))

    # validate design against counts and model
    design.validate(counts, options.model)

    # restrict counts to samples in design table
    counts.restrict(design)

    # remove sample with low counts
    if options.filter_min_counts_per_sample:
        counts.removeSamples(
            min_counts_per_sample=options.filter_min_counts_per_sample)

    # remove observations with low counts
    if options.filter_min_counts_per_row:
        counts.removeObservationsFreq(
            min_counts_per_row=options.filter_min_counts_per_row)

    # remove bottom percentile of observations
    if options.filter_percentile_rowsums:
        counts.removeObservationsPerc(
            percentile_rowsums=options.filter_percentile_rowsums)

    # check samples are the same in counts and design following counts
    # filtering and, if not, restrict design table and re-validate
    design.revalidate(counts, options.model)

    # set up experiment and run tests
    outfile_prefix = options.output_filename_pattern + options.method

    if options.method == "ttest":
        experiment = Expression.DEExperiment_TTest()
        results = experiment.run(counts, design)

    elif options.method == "edger":
        experiment = Expression.DEExperiment_edgeR()
        results = experiment.run(counts,
                                 design,
                                 model=options.model,
                                 disperion=options.edger_dispersion,
                                 ref_group=options.ref_group,
                                 contrasts=options.contrasts,
                                 outfile_prefix=outfile_prefix)

    elif options.method == "deseq2":
        experiment = Expression.DEExperiment_DESeq2()
        results = experiment.run(counts,
                                 design,
                                 model=options.model,
                                 contrasts=options.contrasts,
                                 outfile_prefix=outfile_prefix,
                                 fdr=options.fdr)

    results.getResults(fdr=options.fdr)

    results.summariseDEResults()

    for contrast in set(results.table['contrast']):
        results.plotVolcano(contrast, outfile_prefix=outfile_prefix)
        results.plotMA(contrast, outfile_prefix=outfile_prefix)

    results.table.to_csv(sys.stdout, sep="\t", na_rep="NA", index=False)

    # write out summary tables for each comparison/contrast
    for test_group in results.Summary.keys():
        outf = IOTools.openFile("_".join(
            [outfile_prefix, test_group, "summary.tsv"]), "w")
        outf.write("category\tcounts\n%s\n"
                   % results.Summary[test_group].asTable())
        outf.close()

    E.Stop()
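
The three count-filtering steps above are delegated to the Counts class; the sketch below is a rough, hypothetical pandas illustration of what they amount to (not the CGAT implementation): drop samples whose maximum count is too low, drop rows with too few total counts, and drop the bottom percentile of rows by total counts.

import numpy as np
import pandas as pd


def filter_counts(counts, min_per_sample=None, min_per_row=None,
                  percentile_rowsums=None):
    # counts: genes x samples DataFrame
    if min_per_sample is not None:
        counts = counts.loc[:, counts.max(axis=0) >= min_per_sample]
    if min_per_row is not None:
        counts = counts[counts.sum(axis=1) >= min_per_row]
    if percentile_rowsums is not None:
        cutoff = np.percentile(counts.sum(axis=1), percentile_rowsums)
        counts = counts[counts.sum(axis=1) > cutoff]
    return counts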
Example #40
0
def writeSequencesForIntervals(track,
                               filename,
                               dbhandle,
                               full=False,
                               halfwidth=None,
                               maxsize=None,
                               proportion=None,
                               masker=[],
                               offset=0,
                               shuffled=False,
                               num_sequences=None,
                               min_sequences=None,
                               order="peakval",
                               shift=None):
    '''build a sequence set for motif discovery. Intervals are taken from
    the table <track>_intervals in the database *dbhandle* and saved to
    *filename* in :term:`fasta` format.

    If *shuffled* is set, the sequences are shuffled before being
    written. Shuffling is performed on the unmasked sequences;
    masking is applied after shuffling.

    If *full* is set, the whole intervals will be output, otherwise
    only the region around the peak given by *halfwidth* is output.

    If *maxsize* is set, the output is truncated at *maxsize* characters
    in order to avoid creating jobs that take too long.

    If proportion is set, only the top *proportion* intervals are output
    (sorted by peakval).

    If *num_sequences* is set, the first *num_sequences* will be used.

    *masker* can be a combination of
        * dust, dustmasker: apply dustmasker
        * softmask: mask softmasked genomic regions

    *order* is the order by which peaks should be sorted. Possible
    values are 'peakval' (peak value, descending order), 'score' (peak
    score, descending order)

    If *shift* is set, intervals will be shifted. ``leftright``
    creates two intervals on the left and right of the actual
    interval. The intervals will be centered around the mid-point and
    truncated the same way as the main intervals.

    '''

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    cc = dbhandle.cursor()

    if order == "peakval":
        orderby = " ORDER BY peakval DESC"
    elif order == "max":
        orderby = " ORDER BY score DESC"
    else:
        raise ValueError(
            "Unknown value passed as order parameter, check your ini file")

    tablename = "%s_intervals" % P.quote(track)
    statement = '''SELECT contig, start, end, interval_id, peakcenter 
                       FROM %(tablename)s 
                       ''' % locals() + orderby

    cc.execute(statement)
    data = cc.fetchall()
    cc.close()

    if proportion:
        cutoff = int(len(data) * proportion) + 1
        if min_sequences:
            cutoff = max(cutoff, min_sequences)
    elif num_sequences:
        cutoff = num_sequences
    else:
        cutoff = len(data)

    L.info(
        "writeSequencesForIntervals %s: using at most %i sequences "
        "for pattern finding" % (track, cutoff))

    data = data[:cutoff]

    L.info("writeSequencesForIntervals %s: masker=%s" % (track, str(masker)))

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    # modify the ranges
    if shift:
        if shift == "leftright":
            new_data = [(contig, start - (end - start), start,
                         str(interval_id) + "_left", peakcenter)
                        for contig, start, end, interval_id, peakcenter in data]
            new_data.extend([
                (contig, end, end + (end - start), str(interval_id) + "_right",
                 peakcenter)
                for contig, start, end, interval_id, peakcenter in data
            ])
            data = new_data
        else:
            raise ValueError("unknown shift option '%s'" % shift)

    if halfwidth:
        # center around peakcenter, add halfwidth on either side
        data = [(contig, peakcenter - halfwidth, peakcenter + halfwidth,
                 interval_id)
                for contig, start, end, interval_id, peakcenter in data]
    else:
        # remove peakcenter
        data = [(contig, start, end, interval_id)
                for contig, start, end, interval_id, peakcenter in data]

    # get the sequences - cut at number of nucleotides
    sequences = []
    current_size, nseq = 0, 0
    new_data = []
    for contig, start, end, interval_id in data:
        lcontig = fasta.getLength(contig)
        start, end = max(0, start + offset), min(end + offset, lcontig)
        if start >= end:
            L.info(
                "writeSequencesForIntervals %s: sequence %s is empty: "
                "start=%i, end=%i, offset=%i - ignored"
                % (track, interval_id, start, end, offset))
            continue
        seq = fasta.getSequence(contig, "+", start, end)
        sequences.append(seq)
        new_data.append((start, end, interval_id, contig))
        current_size += len(seq)
        if maxsize and current_size >= maxsize:
            L.info(
                "writeSequencesForIntervals %s: maximum size (%i) reached - only %i sequences output (%i ignored)"
                % (track, maxsize, nseq, len(data) - nseq))
            break
        nseq += 1

    data = new_data

    if shuffled:
        # note that shuffling is done on the unmasked sequences
        # Otherwise N's would be interspersed with real sequence
        # messing up motif finding unfairly. Instead, masking is
        # done on the shuffled sequence.
        sequences = [list(x) for x in sequences]
        for sequence in sequences:
            random.shuffle(sequence)
        sequences = maskSequences(["".join(x) for x in sequences], masker)

    c = E.Counter()
    outs = IOTools.openFile(filename, "w")
    for m in masker:
        if m not in ("unmasked", "none", None):
            sequences = maskSequences(sequences, m)

    for sequence, d in zip(sequences, data):
        c.input += 1
        if len(sequence) == 0:
            c.empty += 1
            continue
        start, end, id, contig = d
        id = "%s_%s %s:%i-%i" % (track, str(id), contig, start, end)
        outs.write(">%s\n%s\n" % (id, sequence))
        c.output += 1
    outs.close()

    E.info("%s" % c)

    return c.output
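
The coordinate arithmetic for the "leftright" shift and the *halfwidth* re-centring above is easiest to check on a single made-up interval; all values below are hypothetical:

# hypothetical interval: (contig, start, end, interval_id, peakcenter)
contig, start, end, interval_id, peakcenter = ("chr1", 1000, 1400, 7, 1150)

# "leftright": two flanking intervals of the same length as the original
left = (contig, start - (end - start), start, str(interval_id) + "_left", peakcenter)
right = (contig, end, end + (end - start), str(interval_id) + "_right", peakcenter)

# halfwidth: re-centre around the peak
halfwidth = 100
centred = (contig, peakcenter - halfwidth, peakcenter + halfwidth, interval_id)

print(left)     # ('chr1', 600, 1000, '7_left', 1150)
print(right)    # ('chr1', 1400, 1800, '7_right', 1150)
print(centred)  # ('chr1', 1050, 1250, 7)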
Example #41
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-m",
        "--method",
        dest="methods",
        type="choice",
        action="append",
        choices=(
            "geneprofile",
            "tssprofile",
            "utrprofile",
            "intervalprofile",
            "midpointprofile",
            "geneprofilewithintrons",
            "geneprofileabsolutedistancefromthreeprimeend",
        ),
        help="counters to use. Counters describe the meta-gene structure "
        "to use [%default]. Note that using geneprofilewithintrons or "
        "geneprofileabsolutedistancefromthreeprimeend will automatically "
        "turn on the --base-accuracy option.")

    parser.add_option(
        "-b",
        "--bamfile",
        "--bedfile",
        "--bigwigfile",
        dest="infiles",
        metavar="BAM",
        type="string",
        action="append",
        help="BAM/bed/bigwig files to use. Do not mix different types"
        "[%default]")

    parser.add_option(
        "-c",
        "--controlfile",
        dest="controlfiles",
        metavar="BAM",
        type="string",
        action="append",
        help=
        "control/input to use. Should be of the same type as the bam/bed/bigwig file"
        " [%default]")

    parser.add_option("-g",
                      "--gtffile",
                      dest="gtffile",
                      type="string",
                      metavar="GTF",
                      help="GTF file to use. "
                      "[%default]")

    parser.add_option(
        "-n",
        "--normalization",
        dest="normalization",
        type="choice",
        choices=("none", "max", "sum", "total-max", "total-sum"),
        help=
        "normalization to apply on each transcript profile before adding to meta-gene profile. "
        "[%default]")

    parser.add_option(
        "-p",
        "--normalize-profile",
        dest="profile_normalizations",
        type="choice",
        action="append",
        choices=("all", "none", "area", "counts", "background"),
        help="normalization to apply on meta-gene profile normalization. "
        "[%default]")

    parser.add_option(
        "-r",
        "--reporter",
        dest="reporter",
        type="choice",
        choices=("gene", "transcript"),
        help="report results for genes or transcripts."
        " When 'genes` is chosen, exons across all transcripts for"
        " a gene are merged. When 'transcript' is chosen, counts are"
        " computed for each transcript separately with each transcript"
        " contributing equally to the meta-gene profile."
        " [%default]")

    parser.add_option(
        "-i",
        "--shift",
        dest="shifts",
        type="int",
        action="append",
        help=
        "shift reads in :term:`bam` formatted file before computing densities (ChIP-Seq). "
        "[%default]")

    parser.add_option(
        "-a",
        "--merge-pairs",
        dest="merge_pairs",
        action="store_true",
        help="merge pairs in :term:`bam` formatted file before computing"
        " densities (ChIP-Seq)."
        "[%default]")

    parser.add_option(
        "-u",
        "--base-accuracy",
        dest="base_accuracy",
        action="store_true",
        help="compute densities with base accuracy. The default is to"
        " only use the start and end of the aligned region (RNA-Seq)"
        " [%default]")

    parser.add_option(
        "-e",
        "--extend",
        dest="extends",
        type="int",
        action="append",
        help="extend reads in :term:`bam` formatted file (ChIP-Seq). "
        "[%default]")

    parser.add_option("--resolution-upstream",
                      dest="resolution_upstream",
                      type="int",
                      help="resolution of upstream region in bp "
                      "[%default]")

    parser.add_option("--resolution-downstream",
                      dest="resolution_downstream",
                      type="int",
                      help="resolution of downstream region in bp "
                      "[%default]")

    parser.add_option("--resolution-upstream-utr",
                      dest="resolution_upstream_utr",
                      type="int",
                      help="resolution of upstream UTR region in bp "
                      "[%default]")

    parser.add_option("--resolution-downstream-utr",
                      dest="resolution_downstream_utr",
                      type="int",
                      help="resolution of downstream UTR region in bp "
                      "[%default]")

    parser.add_option("--resolution-cds",
                      dest="resolution_cds",
                      type="int",
                      help="resolution of cds region in bp "
                      "[%default]")

    parser.add_option("--resolution-introns",
                      dest="resolution_introns",
                      type="int",
                      help="resolution of introns region in bp "
                      "[%default]")

    parser.add_option(
        "--resolution-exons-absolute-distance-topolya",
        dest="resolution_exons_absolute_distance_topolya",
        type="int",
        help="resolution of exons absolute distance topolya in bp "
        "[%default]")

    parser.add_option(
        "--resolution-introns-absolute-distance-topolya",
        dest="resolution_introns_absolute_distance_topolya",
        type="int",
        help="resolution of introns absolute distance topolya in bp "
        "[%default]")

    parser.add_option(
        "--extension-exons-absolute-distance-topolya",
        dest="extension_exons_absolute_distance_topolya",
        type="int",
        help=
        "extension for exons from the absolute distance from the topolya in bp"
        "[%default]")

    parser.add_option(
        "--extension-introns-absolute-distance-topolya",
        dest="extension_introns_absolute_distance_topolya",
        type="int",
        help=
        "extension for introns from the absolute distance from the topolya in bp"
        "[%default]")

    parser.add_option("--extension-upstream",
                      dest="extension_upstream",
                      type="int",
                      help="extension upstream from the first exon in bp"
                      "[%default]")

    parser.add_option("--extension-downstream",
                      dest="extension_downstream",
                      type="int",
                      help="extension downstream from the last exon in bp"
                      "[%default]")

    parser.add_option("--extension-inward",
                      dest="extension_inward",
                      type="int",
                      help="extension inward from a TSS start site in bp"
                      "[%default]")

    parser.add_option("--extension-outward",
                      dest="extension_outward",
                      type="int",
                      help="extension outward from a TSS start site in bp"
                      "[%default]")

    parser.add_option("--scale-flank-length",
                      dest="scale_flanks",
                      type="int",
                      help="scale flanks to (integer multiples of) gene length"
                      "[%default]")

    parser.add_option(
        "--matrix-format",
        dest="matrix_format",
        type="choice",
        choices=("multiple", "single"),
        help="matrix output format, either 'multiple' files or a 'single' file "
        "[%default]")

    parser.add_option(
        "--control-factor",
        dest="control_factor",
        type="float",
        help="factor for normalizing control and fg data. Computed from data "
        "if not set. "
        "[%default]")

    parser.add_option(
        "--output-all-profiles",
        dest="output_all_profiles",
        action="store_true",
        help="keep individual profiles for each transcript and output. "
        "[%default]")

    parser.add_option(
        "--input-filename-counts",
        dest="input_filename_counts",
        type="string",
        help="filename with count data for each transcript. Use this instead "
        "of recomputing the profile. Useful for plotting the meta-gene profile "
        "from previously computed counts "
        "[%default]")

    parser.add_option(
        "--background-region",
        dest="background_region",
        type="int",
        help="number of bins on either side of the profile to be considered "
        "for background meta-gene normalization "
        "[%default]")

    parser.set_defaults(
        remove_rna=False,
        ignore_pairs=False,
        force_output=False,
        bin_size=10,
        extends=[],
        shifts=[],
        sort=[],
        reporter="transcript",
        resolution_cds=1000,
        resolution_introns=1000,
        # 3 kb is a good balance: long enough to see 3' bias without
        # omitting too many genes. Tim, 31st Aug 2013
        resolution_exons_absolute_distance_topolya=3000,
        extension_exons_absolute_distance_topolya=3000,
        # introns are only used to assess the noise level, so a long
        # region is not needed; a long region would omit more genes.
        # Tim, 31st Aug 2013
        resolution_introns_absolute_distance_topolya=500,
        extension_introns_absolute_distance_topolya=500,
        resolution_upstream_utr=1000,
        resolution_downstream_utr=1000,
        resolution_upstream=1000,
        resolution_downstream=1000,
        # mean length of transcripts: about 2.5 kb
        extension_upstream=2500,
        extension_downstream=2500,
        extension_inward=3000,
        extension_outward=3000,
        plot=True,
        methods=[],
        infiles=[],
        controlfiles=[],
        gtffile=None,
        profile_normalizations=[],
        normalization=None,
        scale_flanks=0,
        merge_pairs=False,
        min_insert_size=0,
        max_insert_size=1000,
        base_accuracy=False,
        matrix_format="single",
        control_factor=None,
        output_all_profiles=False,
        background_region=10,
        input_filename_counts=None,
    )

    ## add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    # Keep for backwards compatibility
    if len(args) == 2:
        infile, gtf = args
        options.infiles.append(infile)
        options.gtffile = gtf

    if not options.gtffile:
        raise ValueError("no GTF file specified")

    if options.gtffile == "-":
        options.gtffile = options.stdin
    else:
        options.gtffile = IOTools.openFile(options.gtffile)

    if len(options.infiles) == 0:
        raise ValueError("no bam/wig/bed files specified")

    for methodsRequiresBaseAccuracy in [
            "geneprofilewithintrons",
            "geneprofileabsolutedistancefromthreeprimeend",
    ]:
        # Methods for which spliced-out introns or exons should not appear
        # to be covered by non-existent reads should imply --base-accuracy;
        # add them to this list.
        if methodsRequiresBaseAccuracy in options.methods:
            options.base_accuracy = True

    if options.reporter == "gene":
        gtf_iterator = GTF.flat_gene_iterator(GTF.iterator(options.gtffile))
    elif options.reporter == "transcript":
        gtf_iterator = GTF.transcript_iterator(GTF.iterator(options.gtffile))

    # Select rangecounter based on file type
    if len(options.infiles) > 0:
        if options.infiles[0].endswith(".bam"):
            bamfiles = [pysam.Samfile(x, "rb") for x in options.infiles]

            if options.controlfiles:
                controlfiles = [
                    pysam.Samfile(x, "rb") for x in options.controlfiles
                ]
            else:
                controlfiles = None

            format = "bam"
            if options.merge_pairs:
                range_counter = _bam2geneprofile.RangeCounterBAM(
                    bamfiles,
                    shifts=options.shifts,
                    extends=options.extends,
                    merge_pairs=options.merge_pairs,
                    min_insert_size=options.min_insert_size,
                    max_insert_size=options.max_insert_size,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)
            elif options.shifts or options.extends:
                range_counter = _bam2geneprofile.RangeCounterBAM(
                    bamfiles,
                    shifts=options.shifts,
                    extends=options.extends,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)

            elif options.base_accuracy:
                range_counter = _bam2geneprofile.RangeCounterBAMBaseAccuracy(
                    bamfiles,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)
            else:
                range_counter = _bam2geneprofile.RangeCounterBAM(
                    bamfiles,
                    controlfiles=controlfiles,
                    control_factor=options.control_factor)

        elif options.infiles[0].endswith(".bed.gz"):
            bedfiles = [pysam.Tabixfile(x) for x in options.infiles]

            if options.controlfiles:
                controlfiles = [
                    pysam.Tabixfile(x) for x in options.controlfiles
                ]
            else:
                controlfiles = None

            format = "bed"
            range_counter = _bam2geneprofile.RangeCounterBed(
                bedfiles,
                controlfiles=controlfiles,
                control_factor=options.control_factor)

        elif options.infiles[0].endswith(".bw"):
            wigfiles = [BigWigFile(file=open(x)) for x in options.infiles]
            format = "bigwig"
            range_counter = _bam2geneprofile.RangeCounterBigWig(wigfiles)

        else:
            raise NotImplementedError("can't determine file type for %s" %
                                      options.infiles[0])

    counters = []
    for method in options.methods:
        if method == "utrprofile":
            counters.append(
                _bam2geneprofile.UTRCounter(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_upstream_utr,
                    options.resolution_cds,
                    options.resolution_downstream_utr,
                    options.resolution_downstream,
                    options.extension_upstream,
                    options.extension_downstream,
                ))

        elif method == "geneprofile":
            counters.append(
                _bam2geneprofile.GeneCounter(
                    range_counter, options.resolution_upstream,
                    options.resolution_cds, options.resolution_downstream,
                    options.extension_upstream, options.extension_downstream,
                    options.scale_flanks))

        elif method == "geneprofilewithintrons":
            counters.append(
                _bam2geneprofile.GeneCounterWithIntrons(
                    range_counter, options.resolution_upstream,
                    options.resolution_cds, options.resolution_introns,
                    options.resolution_downstream, options.extension_upstream,
                    options.extension_downstream, options.scale_flanks))

        elif method == "geneprofileabsolutedistancefromthreeprimeend":
            counters.append(
                _bam2geneprofile.GeneCounterAbsoluteDistanceFromThreePrimeEnd(
                    range_counter,
                    options.resolution_upstream,
                    options.resolution_downstream,
                    options.resolution_exons_absolute_distance_topolya,
                    options.resolution_introns_absolute_distance_topolya,
                    # Note (Tim, 31st Aug 2013): resolution options relative
                    # to the start site (e.g. *_absolute_distance_tostartsite)
                    # could be added as a future feature if 5' bias is of
                    # interest. It would need a separate counter class; unlike
                    # the existing TSS profile, introns would be skipped.
                    options.extension_upstream,
                    options.extension_downstream,
                    options.extension_exons_absolute_distance_topolya,
                    options.extension_introns_absolute_distance_topolya,
                    # (see the note above regarding possible
                    # *_absolute_distance_tostartsite extension options)
                    options.scale_flanks))

        elif method == "tssprofile":
            counters.append(
                _bam2geneprofile.TSSCounter(range_counter,
                                            options.extension_outward,
                                            options.extension_inward))

        elif method == "intervalprofile":
            counters.append(
                _bam2geneprofile.RegionCounter(range_counter,
                                               options.resolution_upstream,
                                               options.resolution_cds,
                                               options.resolution_downstream,
                                               options.extension_upstream,
                                               options.extension_downstream))

        elif method == "midpointprofile":
            counters.append(
                _bam2geneprofile.MidpointCounter(range_counter,
                                                 options.resolution_upstream,
                                                 options.resolution_downstream,
                                                 options.extension_upstream,
                                                 options.extension_downstream))

    # set normalization
    for c in counters:
        c.setNormalization(options.normalization)
        if options.output_all_profiles:
            c.setOutputProfiles(
                IOTools.openFile(
                    E.getOutputFile(c.name) + ".profiles.tsv.gz", "w"))

    if options.input_filename_counts:
        # read counts from file
        E.info("reading counts from %s" % options.input_filename_counts)
        all_counts = pandas.read_csv(IOTools.openFile(
            options.input_filename_counts),
                                     sep='\t',
                                     header=0,
                                     index_col=0)

        if len(counters) != 1:
            raise NotImplementedError(
                'counting from matrix only implemented for 1 counter.')
        # build counter based on reference counter
        counter = _bam2geneprofile.UnsegmentedCounter(counters[0])
        counters = [counter]
        _bam2geneprofile.countFromCounts(counters, all_counts)

    else:
        E.info("starting counting with %i counters" % len(counters))
        _bam2geneprofile.countFromGTF(counters, gtf_iterator)

    # output matrices
    if not options.profile_normalizations:
        options.profile_normalizations.append("none")
    elif "all" in options.profile_normalizations:
        options.profile_normalizations = [
            "none", "area", "counts", "background"
        ]

    for method, counter in zip(options.methods, counters):
        if options.matrix_format == "multiple":
            # output multiple files, each containing results of one normalization
            for norm in options.profile_normalizations:
                with IOTools.openFile(
                        E.getOutputFile(counter.name) + ".%s.tsv.gz" % norm,
                        "w") as outfile:
                    counter.writeMatrix(
                        outfile,
                        normalize=norm,
                        background_region=options.background_region)

        elif options.matrix_format == "single":
            # build a single output
            matrices = []
            for norm in options.profile_normalizations:
                # build matrix, apply normalization
                matrix = counter.buildMatrix(
                    normalize=norm,
                    background_region=options.background_region)
                nrows, ncols = matrix.shape
                matrix.shape = (nrows * ncols, 1)
                matrices.append(matrix)

            for x in range(1, len(matrices)):
                assert matrices[0].shape == matrices[x].shape

            # build a single matrix
            matrix = numpy.hstack(matrices)
            nrows, ncols = matrix.shape
            with IOTools.openFile(
                    E.getOutputFile(counter.name) + ".matrix.tsv.gz",
                    "w") as outfile:
                outfile.write( "bin\tregion\tregion_bin\t%s\n" % "\t".join( \
                        options.profile_normalizations) )
                fields = []
                bins = []
                for field, nbins in zip(counter.fields, counter.nbins):
                    fields.extend([field] * nbins)
                    bins.extend(list(range(nbins)))

                for row, cols in enumerate(zip(fields, bins, matrix)):
                    outfile.write("%i\t%s\t" %
                                  (row, "\t".join([str(x)
                                                   for x in cols[:-1]])))
                    outfile.write("%s\n" %
                                  ("\t".join([str(x) for x in cols[-1]])))

        with IOTools.openFile(
                E.getOutputFile(counter.name) + ".lengths.tsv.gz",
                "w") as outfile:
            counter.writeLengthStats(outfile)

        if options.output_all_profiles:
            counter.closeOutputProfiles()

    if options.plot:

        import matplotlib
        # avoid Tk or any X
        matplotlib.use("Agg")
        import matplotlib.pyplot as plt

        for method, counter in zip(options.methods, counters):

            if method in ("geneprofile", "geneprofilewithintrons",
                          "geneprofileabsolutedistancefromthreeprimeend",
                          "utrprofile", "intervalprofile"):

                plt.figure()
                plt.subplots_adjust(wspace=0.05)
                max_scale = max([max(x) for x in counter.aggregate_counts])

                for x, counts in enumerate(counter.aggregate_counts):
                    plt.subplot(5, 1, x + 1)
                    plt.plot(range(len(counts)), counts)
                    plt.title(counter.fields[x])
                    plt.ylim(0, max_scale)

                figname = counter.name + ".full"

                fn = E.getOutputFile(figname) + ".png"
                plt.savefig(os.path.expanduser(fn))

                plt.figure()

                points = []
                cuts = []
                for x, counts in enumerate(counter.aggregate_counts):
                    points.extend(counts)
                    cuts.append(len(counts))

                plt.plot(range(len(points)), points)
                xx, xxx = 0, []
                for x in cuts:
                    xxx.append(xx + x // 2)
                    xx += x
                    plt.axvline(xx, color="r", ls="--")

                plt.xticks(xxx, counter.fields)

                figname = counter.name + ".detail"

                fn = E.getOutputFile(figname) + ".png"
                plt.savefig(os.path.expanduser(fn))

            elif method == "tssprofile":

                plt.figure()
                plt.subplot(1, 3, 1)
                plt.plot(
                    range(-options.extension_outward,
                          options.extension_inward),
                    counter.aggregate_counts[0])
                plt.title(counter.fields[0])
                plt.subplot(1, 3, 2)
                plt.plot(
                    range(-options.extension_inward,
                          options.extension_outward),
                    counter.aggregate_counts[1])
                plt.title(counter.fields[1])
                plt.subplot(1, 3, 3)
                plt.title("combined")
                plt.plot(
                    range(-options.extension_outward,
                          options.extension_inward),
                    counter.aggregate_counts[0])
                plt.plot(
                    range(-options.extension_inward,
                          options.extension_outward),
                    counter.aggregate_counts[1])
                plt.legend(counter.fields[:2])

                fn = E.getOutputFile(counter.name) + ".png"
                plt.savefig(os.path.expanduser(fn))

            elif method == "midpointprofile":

                plt.figure()
                plt.plot(numpy.arange(-options.resolution_upstream, 0),
                         counter.aggregate_counts[0])
                plt.plot(numpy.arange(0, options.resolution_downstream),
                         counter.aggregate_counts[1])

                fn = E.getOutputFile(counter.name) + ".png"
                plt.savefig(os.path.expanduser(fn))

    ## write footer and output benchmark information.
    E.Stop()
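
In the "single" matrix format above, each normalisation is flattened to one column and the columns are stacked side by side, with the bin/region/region_bin index rebuilt from the counter's fields and bin counts. A toy sketch of that layout, using made-up fields and values:

import numpy as np

# two toy normalisations of a profile with two regions of three bins each
fields, nbins = ["upstream", "cds"], [3, 3]
norm_none = np.arange(6, dtype=float).reshape(6, 1)
norm_area = norm_none / norm_none.sum()
matrix = np.hstack([norm_none, norm_area])

row_fields, row_bins = [], []
for field, n in zip(fields, nbins):
    row_fields.extend([field] * n)
    row_bins.extend(range(n))

print("bin\tregion\tregion_bin\tnone\tarea")
for row, (field, b, values) in enumerate(zip(row_fields, row_bins, matrix)):
    print("%i\t%s\t%i\t%s" % (row, field, b,
                              "\t".join(str(v) for v in values)))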
Example #42
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(version="%prog version: $Id",
                                   usage=globals()["__doc__"])

    parser.add_option("--bin",
                      dest="bin",
                      action="store_true",
                      help="output average in bins across the interval")
    parser.add_option("-n",
                      "--num-bins",
                      dest="bin_number",
                      type=int,
                      help="number of bins for coverage profile")
    parser.add_option("-o",
                      "--output-filename-prefix",
                      dest="output_filename_prefix",
                      help="pattern to write coverage bins to")

    parser.set_defaults(bin=False, bin_number=10)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    inf = options.stdin

    coverage_result = collections.defaultdict(list)
    E.info("reading in coverage data")
    for line in inf.readlines():
        data = line[:-1].split("\t")
        contig, coverage = data[0], data[2]
        coverage_result[contig].append(coverage)
    E.info("read %i contigs" % len(list(coverage_result.keys())))

    options.stdout.write("contig\tcov_mean\tcov_sd\n")
    if options.bin:
        outf = IOTools.openFile(options.output_filename_prefix + ".binned",
                                "w")
        outf.write(
            "%s" %
            "\t".join([str(i)
                       for i in range(1, options.bin_number + 1, 1)]) + "\n")
    for contig, coverage in coverage_result.items():
        coverage = list(map(float, coverage))
        options.stdout.write(
            "%s\t%s\t%s\n" %
            (contig, str(np.mean(coverage)), str(np.std(coverage))))
        if options.bin:
            bin_means = []
            bins = np.linspace(0, len(coverage), options.bin_number + 1)
            if len(coverage) < len(bins) - 1:
                E.warn("will not calculate coverage means for %s: too short" %
                       contig)
                continue
            for i in range(len(bins) - 1):
                bin_mean = np.mean(coverage[int(bins[i]):int(bins[i + 1])])
                bin_means.append(bin_mean)
            outf.write(contig + "\t" + "\t".join(map(str, bin_means)) + "\n")
    outf.close()

    # write footer and output benchmark information.
    E.Stop()
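A minimal, self-contained sketch of the binning step above, assuming numpy is available: np.linspace produces bin_number + 1 boundaries across the contig, and the mean coverage is taken between adjacent boundaries. The coverage values here are made up for illustration.

import numpy as np

coverage = [float(x) for x in range(1, 21)]   # hypothetical per-base coverage
bin_number = 5

# bin_number + 1 boundaries spanning the contig; adjacent pairs define a bin
bins = np.linspace(0, len(coverage), bin_number + 1)
bin_means = [np.mean(coverage[int(bins[i]):int(bins[i + 1])])
             for i in range(len(bins) - 1)]
print("\t".join(map(str, bin_means)))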
Example #43
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                                   usage=globals()["__doc__"])

    parser.add_option("-g", "--gi-accessions", dest="gi_accessions", type="string",
                      help="list of gi accession numbers")
    parser.add_option("-m", "--ncbi-map", dest="ncbi_map", type="string",
                      help="ncbi.map file downloaded from the MEGAN website")
    parser.add_option("-n", "--nucl-map", dest="nucl_map", type="string",
                      help="gi mapping to tax id downloaded from ncbi website")
    parser.add_option("-c", "--taxa-code", dest="taxa_code", type="string",
                      help="code for different levels of the taxonomy downloaded from the MEGAN website")
    parser.add_option("-t", "--tree", dest="tree", type="string",
                      help="description of parents in the taxonomy")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    E.info("reading gi accession numbers")
    gi_accessions = set()
    for line in IOTools.openFile(options.gi_accessions).readlines():
        gi_accessions.add(line[:-1])
    E.info("read gi accession numbers")

    E.info("building gi2taxid map")
    gi2taxid = {}
    c_gi = 0
    for line in IOTools.openFile(options.nucl_map).readlines():
        data = line[:-1].split("\t")
        if data[0] not in gi_accessions:
            continue
        else:
            c_gi += 1
            gi2taxid[data[0]] = data[1]
    E.info("built gi2taxid for %i gi accession numbers" % c_gi)

    E.info("building code map")
    code2taxa = {}

    for line in IOTools.openFile(options.taxa_code).readlines():
        data = line[:-1].split("\t")
        code2taxa[data[0]] = data[1]
    E.info("built taxa code map")

    E.info("building taxa2name map")
    taxid2name = {}
    for line in IOTools.openFile(options.ncbi_map).readlines():
        data = line[:-1].split("\t")
        # keep the taxa code
        taxid2name[data[0]] = (data[1], data[3])
    E.info("built taxa2name map")

    E.info("build taxid2parentmap")
    taxid2parents = {}
    for line in IOTools.openFile(options.tree).readlines():
        data = line[:-1].split("\t")
        data = [x for x in data if x != "|"]
        taxid2parents[data[0]] = data[1]
    E.info("built taxid2parentmap")

    E.info("retrieving parents for each gi accession number")
    options.stdout.write(
        "gi\tsub_species\tspecies\tgenus\tfamily\torder\tclass\tphylum\n")
    for gi, taxid in gi2taxid.items():
        # this will be the sub species id
        # walk through the parents
        parents = {}
        sub_species = taxid2name[taxid][0]
        for i in range(len(list(code2taxa.keys()))):
            parent_taxid = taxid2parents[taxid]
            parent_name = taxid2name[parent_taxid][0]
            parent_code = taxid2name[parent_taxid][1]
            # ignore codes that we are not interested in
            if parent_code not in list(code2taxa.keys()):
                continue
            parent_taxa = code2taxa[parent_code]
            parents[parent_taxa] = parent_name
            taxid = parent_taxid

        if "genus" not in parents:
            genus = "NA"
        else:
            genus = parents["genus"]
        if "family" not in parents:
            family = "NA"
        else:
            family = parents["family"]
        if "order" not in parents:
            order = "NA"
        else:
            order = parents["order"]
        if "class" not in parents:
            _class = "NA"
        else:
            _class = parents["class"]
        if "phylum" not in parents:
            phylum = "NA"
        else:
            phylum = parents["phylum"]
            if phylum.find("<phylum>") != -1:
                phylum = phylum.replace(" <phylum>", "")
        if "species" not in parents:
            species = "NA"
        else:
            species = parents["species"]
        options.stdout.write("\t".join([gi, sub_species.replace(" ", "_"), species.replace(
            " ", "_"), genus, family, order, _class, phylum]) + "\n")

    # write footer and output benchmark information.
    E.Stop()
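The rank-assignment loop above repeatedly replaces a taxid by its parent and records the ranks it passes through. A minimal sketch of that walk over a made-up toy taxonomy (all ids, names and rank codes below are purely illustrative):

# toy taxid -> parent taxid map (hypothetical ids)
taxid2parents = {"11": "10", "10": "9", "9": "1"}
# toy taxid -> (name, rank code) map
taxid2name = {"11": ("E. coli K-12", "no rank"),
              "10": ("Escherichia coli", "S"),
              "9": ("Escherichia", "G"),
              "1": ("root", "no rank")}
# rank codes of interest, as in code2taxa
code2taxa = {"S": "species", "G": "genus"}

taxid = "11"
parents = {}
for _ in range(len(code2taxa)):
    parent_taxid = taxid2parents[taxid]
    name, code = taxid2name[parent_taxid]
    if code in code2taxa:
        parents[code2taxa[code]] = name
    taxid = parent_taxid

print(parents)   # {'species': 'Escherichia coli', 'genus': 'Escherichia'}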
Example #44
0
def main(args=sys.argv):
    """command line control function for a pipeline.

    This method defines command line options for the pipeline and
    updates the global configuration dictionary correspondingly.

    It then provides a command parser to execute particular tasks
    using the ruffus pipeline control functions. See the generated
    command line help for usage.

    To use it, add::

        import CGAT.Pipeline as P

        if __name__ == "__main__":
            sys.exit(P.main(sys.argv))

    to your pipeline script.

    Arguments
    ---------
    args : list
        List of command line arguments.

    """

    global GLOBAL_OPTIONS
    global GLOBAL_ARGS

    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=USAGE)

    parser.add_option("--pipeline-action", dest="pipeline_action",
                      type="choice",
                      choices=(
                          "make", "show", "plot", "dump", "config", "clone",
                          "check", "regenerate", "printconfig"),
                      help="action to take [default=%default].")

    parser.add_option("--pipeline-format", dest="pipeline_format",
                      type="choice",
                      choices=("dot", "jpg", "svg", "ps", "png"),
                      help="pipeline format [default=%default].")

    parser.add_option("-n", "--dry-run", dest="dry_run",
                      action="store_true",
                      help="perform a dry run (do not execute any shell "
                      "commands) [default=%default].")

    parser.add_option("-f", "--force-output", dest="force",
                      action="store_true",
                      help="force running the pipeline even if there "
                      "are uncommited changes "
                      "in the repository [default=%default].")

    parser.add_option("-p", "--multiprocess", dest="multiprocess", type="int",
                      help="number of parallel processes to use on "
                      "submit host "
                      "(different from number of jobs to use for "
                      "cluster jobs) "
                      "[default=%default].")

    parser.add_option("-e", "--exceptions", dest="log_exceptions",
                      action="store_true",
                      help="echo exceptions immediately as they occur "
                      "[default=%default].")

    parser.add_option("-i", "--terminate", dest="terminate",
                      action="store_true",
                      help="terminate immediately at the first exception "
                      "[default=%default].")

    parser.add_option("-d", "--debug", dest="debug",
                      action="store_true",
                      help="output debugging information on console, "
                      "and not the logfile "
                      "[default=%default].")

    parser.add_option("-s", "--set", dest="variables_to_set",
                      type="string", action="append",
                      help="explicitly set paramater values "
                      "[default=%default].")

    parser.add_option("-c", "--checksums", dest="ruffus_checksums_level",
                      type="int",
                      help="set the level of ruffus checksums"
                      "[default=%default].")

    parser.add_option("-t", "--is-test", dest="is_test",
                      action="store_true",
                      help="this is a test run"
                      "[default=%default].")

    parser.add_option("--rabbitmq-exchange", dest="rabbitmq_exchange",
                      type="string",
                      help="RabbitMQ exchange to send log messages to "
                      "[default=%default].")

    parser.add_option("--rabbitmq-host", dest="rabbitmq_host",
                      type="string",
                      help="RabbitMQ host to send log messages to "
                      "[default=%default].")

    parser.add_option("--input-validation", dest="input_validation",
                      action="store_true",
                      help="perform input validation before starting "
                      "[default=%default].")

    parser.set_defaults(
        pipeline_action=None,
        pipeline_format="svg",
        pipeline_targets=[],
        multiprocess=40,
        logfile="pipeline.log",
        dry_run=False,
        force=False,
        log_exceptions=False,
        exceptions_terminate_immediately=False,
        debug=False,
        variables_to_set=[],
        is_test=False,
        ruffus_checksums_level=0,
        rabbitmq_host="saruman",
        rabbitmq_exchange="ruffus_pipelines",
        input_validation=False)

    (options, args) = E.Start(parser,
                              add_cluster_options=True)

    GLOBAL_OPTIONS, GLOBAL_ARGS = options, args
    E.info("Started in: %s" % PARAMS.get("workingdir"))
    # At this point, the PARAMS dictionary has already been
    # built. It now needs to be updated with selected command
    # line options as these should always take precedence over
    # configuration files.

    PARAMS["dryrun"] = options.dry_run
    PARAMS["input_validation"] = options.input_validation

    # use cli_cluster_* keys in PARAMS to ensure highest priority
    # of cluster_* options passed with the command-line
    if options.cluster_memory_default is not None:
        PARAMS["cli_cluster_memory_default"] = options.cluster_memory_default
        PARAMS["cluster_memory_default"] = options.cluster_memory_default
    if options.cluster_memory_resource is not None:
        PARAMS["cli_cluster_memory_resource"] = options.cluster_memory_resource
        PARAMS["cluster_memory_resource"] = options.cluster_memory_resource
    if options.cluster_num_jobs is not None:
        PARAMS["cli_cluster_num_jobs"] = options.cluster_num_jobs
        PARAMS["cluster_num_jobs"] = options.cluster_num_jobs
    if options.cluster_options is not None:
        PARAMS["cli_cluster_options"] = options.cluster_options
        PARAMS["cluster_options"] = options.cluster_options
    if options.cluster_parallel_environment is not None:
        PARAMS["cli_cluster_parallel_environment"] = options.cluster_parallel_environment
        PARAMS["cluster_parallel_environment"] = options.cluster_parallel_environment
    if options.cluster_priority is not None:
        PARAMS["cli_cluster_priority"] = options.cluster_priority
        PARAMS["cluster_priority"] = options.cluster_priority
    if options.cluster_queue is not None:
        PARAMS["cli_cluster_queue"] = options.cluster_queue
        PARAMS["cluster_queue"] = options.cluster_queue
    if options.cluster_queue_manager is not None:
        PARAMS["cli_cluster_queue_manager"] = options.cluster_queue_manager
        PARAMS["cluster_queue_manager"] = options.cluster_queue_manager

    PARAMS["ruffus_checksums_level"] = options.ruffus_checksums_level

    for variables in options.variables_to_set:
        variable, value = variables.split("=")
        PARAMS[variable.strip()] = IOTools.str2val(value.strip())

    if args:
        options.pipeline_action = args[0]
        if len(args) > 1:
            options.pipeline_targets.extend(args[1:])

    # see inputValidation function in Parameters.py
    if options.input_validation:
        inputValidation(PARAMS, sys.argv[0])

    if options.pipeline_action == "check":
        counter, requirements = Requirements.checkRequirementsFromAllModules()
        for requirement in requirements:
            E.info("\t".join(map(str, requirement)))
        E.info("version check summary: %s" % str(counter))
        E.Stop()
        return

    elif options.pipeline_action == "debug":
        # create the session proxy
        startSession()

        method_name = options.pipeline_targets[0]
        caller = getCaller()
        method = getattr(caller, method_name)
        method(*options.pipeline_targets[1:])

    elif options.pipeline_action in ("make", "show", "svg", "plot",
                                     "touch", "regenerate"):

        # set up extra file logger
        handler = logging.FileHandler(filename=options.logfile,
                                      mode="a")
        handler.setFormatter(
            MultiLineFormatter(
                '%(asctime)s %(levelname)s %(module)s.%(funcName)s.%(lineno)d %(message)s'))
        logger = logging.getLogger()
        logger.addHandler(handler)
        messenger = None

        try:
            if options.pipeline_action == "make":

                # get tasks to be done. This essentially replicates
                # the state information within ruffus.
                stream = io.StringIO()
                pipeline_printout(
                    stream,
                    options.pipeline_targets,
                    verbose=5,
                    checksum_level=options.ruffus_checksums_level)

                messenger = LoggingFilterRabbitMQ(
                    stream.getvalue(),
                    project_name=getProjectName(),
                    pipeline_name=getPipelineName(),
                    host=options.rabbitmq_host,
                    exchange=options.rabbitmq_exchange)

                logger.addFilter(messenger)

                if not options.without_cluster and HAS_DRMAA:
                    global task
                    # use threading instead of multiprocessing in order to
                    # limit the number of concurrent jobs by using the
                    # GIL
                    #
                    # Note that threading might cause problems with rpy.
                    task.Pool = ThreadPool

                    # create the session proxy
                    startSession()

                #
                #   make sure we are not logging at the same time in
                #   different processes
                #
                # session_mutex = manager.Lock()
                E.info(E.GetHeader())
                E.info("code location: %s" % PARAMS["pipeline_scriptsdir"])
                E.info("Working directory is: %s" % PARAMS["workingdir"])

                pipeline_run(
                    options.pipeline_targets,
                    multiprocess=options.multiprocess,
                    logger=logger,
                    verbose=options.loglevel,
                    log_exceptions=options.log_exceptions,
                    exceptions_terminate_immediately=options.exceptions_terminate_immediately,
                    checksum_level=options.ruffus_checksums_level,
                )

                E.info(E.GetFooter())

                closeSession()

            elif options.pipeline_action == "show":
                pipeline_printout(
                    options.stdout,
                    options.pipeline_targets,
                    verbose=options.loglevel,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "touch":
                pipeline_run(
                    options.pipeline_targets,
                    touch_files_only=True,
                    verbose=options.loglevel,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "regenerate":
                pipeline_run(
                    options.pipeline_targets,
                    touch_files_only=options.ruffus_checksums_level,
                    verbose=options.loglevel)

            elif options.pipeline_action == "svg":
                pipeline_printout_graph(
                    options.stdout.buffer,
                    options.pipeline_format,
                    options.pipeline_targets,
                    checksum_level=options.ruffus_checksums_level)

            elif options.pipeline_action == "plot":
                outf, filename = tempfile.mkstemp()
                pipeline_printout_graph(
                    os.fdopen(outf, "wb"),
                    options.pipeline_format,
                    options.pipeline_targets,
                    checksum_level=options.ruffus_checksums_level)
                execute("inkscape %s" % filename)
                os.unlink(filename)

        except ruffus_exceptions.RethrownJobError as value:

            if not options.debug:
                E.error("%i tasks with errors, please see summary below:" %
                        len(value.args))
                for idx, e in enumerate(value.args):
                    task, job, error, msg, traceback = e

                    if task is None:
                        # these appear to be errors originating within
                        # ruffus itself, such as a missing dependency;
                        # msg then contains a RethrownJobError
                        msg = str(msg)
                    else:
                        task = re.sub("__main__.", "", task)
                        job = re.sub("\s", "", job)

                    if messenger:
                        messenger.send_error(task, job, error, msg)

                    # display only single line messages
                    if len([x for x in msg.split("\n") if x != ""]) > 1:
                        msg = ""

                    E.error("%i: Task=%s Error=%s %s: %s" %
                            (idx, task, error, job, msg))

                E.error("full traceback is in %s" % options.logfile)

                # write full traceback to log file only by removing the stdout
                # handler
                lhStdout = logger.handlers[0]
                logger.removeHandler(lhStdout)
                logger.error("start of error messages")
                logger.error(value)
                logger.error("end of error messages")
                logger.addHandler(lhStdout)

                # raise error
                raise ValueError(
                    "pipeline failed with %i errors" % len(value.args))
            else:
                raise

    elif options.pipeline_action == "dump":
        print(json.dumps(PARAMS))

    elif options.pipeline_action == "printconfig":
        print("Printing out pipeline parameters: ")
        for k in sorted(PARAMS):
            print(k, "=", PARAMS[k])
        printConfigFiles()

    elif options.pipeline_action == "config":
        f = sys._getframe(1)
        caller = f.f_globals["__file__"]
        pipeline_path = os.path.splitext(caller)[0]
        general_path = os.path.join(os.path.dirname(pipeline_path),
                                    "configuration")
        writeConfigFiles(pipeline_path, general_path)

    elif options.pipeline_action == "clone":
        clonePipeline(options.pipeline_targets[0])

    else:
        raise ValueError("unknown pipeline action %s" %
                         options.pipeline_action)

    E.Stop()
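The precedence handling above boils down to copying selected command-line values over whatever the configuration files put into PARAMS, and splitting each --set key=value argument on "=". A minimal sketch of that behaviour, independent of the pipeline machinery; the PARAMS contents and option values here are made up, and string values are kept as strings rather than converted with IOTools.str2val:

# hypothetical configuration values loaded from ini files
PARAMS = {"cluster_queue": "all.q", "threads": "4"}

# hypothetical command-line values; None means "not given"
cli = {"cluster_queue": "highmem.q", "cluster_priority": None}
variables_to_set = ["threads=8", "genome=hg38"]

# command-line cluster_* options take precedence over the ini files
for key, value in cli.items():
    if value is not None:
        PARAMS["cli_" + key] = value
        PARAMS[key] = value

# -s/--set key=value assignments override everything else
for assignment in variables_to_set:
    variable, value = assignment.split("=", 1)
    PARAMS[variable.strip()] = value.strip()

print(PARAMS)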
Example #45
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id",
                            usage=globals()["__doc__"])

    parser.add_option("-f",
                      "--target-format",
                      dest="change_format",
                      type="choice",
                      choices=('sanger', 'solexa', 'phred64', 'integer'),
                      help="set quality scores to format "
                      "[default=%default].")

    parser.add_option(
        "--guess-format",
        dest="guess_format",
        type="choice",
        choices=('sanger', 'solexa', 'phred64', 'integer'),
        help="quality score format to assume if ambiguous [default=%default].")

    parser.add_option("--pattern-identifier",
                      dest="pattern",
                      type="string",
                      help="filename prefix [default=%default].")

    parser.set_defaults(change_format=None, guess_format=None, pattern="%s.gz")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    outfile_seq = IOTools.openFile(options.pattern % "csfasta", "w")
    outfile_qual = IOTools.openFile(options.pattern % "qual", "w")

    # "fastq_iterator" avoids shadowing the builtin ``iter``
    if options.change_format:
        fastq_iterator = Fastq.iterate_convert(options.stdin,
                                               format=options.change_format,
                                               guess=options.guess_format)
    else:
        fastq_iterator = Fastq.iterate(options.stdin)

    for record in fastq_iterator:
        c.input += 1
        outfile_seq.write(">%s\n%s\n" % (record.identifier, record.seq))
        outfile_qual.write(">%s\n%s\n" % (record.identifier, record.quals))
        c.output += 1

    outfile_seq.close()
    outfile_qual.close()

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
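The --pattern-identifier option above is just a filename template that the script fills once with "csfasta" and once with "qual". A small sketch of that naming and of writing one record to the paired sequence and quality files; the record class and data below are hypothetical stand-ins for a Fastq record:

import gzip

class Record:
    # stand-in for a Fastq record with only the fields used above
    def __init__(self, identifier, seq, quals):
        self.identifier, self.seq, self.quals = identifier, seq, quals

pattern = "%s.gz"                            # default of --pattern-identifier
records = [Record("read1", "ACGT", "IIII")]  # made-up data

with gzip.open(pattern % "csfasta", "wt") as outfile_seq, \
        gzip.open(pattern % "qual", "wt") as outfile_qual:
    for record in records:
        # sequence and qualities go to separate fasta-style files
        outfile_seq.write(">%s\n%s\n" % (record.identifier, record.seq))
        outfile_qual.write(">%s\n%s\n" % (record.identifier, record.quals))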
Example #46
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.set_defaults()

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    # expect a bam-file followed by two output fastq filenames
    assert len(args) == 3, "expected three command line arguments"

    fastqfile1, fastqfile2 = args[1], args[2]

    # only output compressed data
    if not fastqfile1.endswith(".gz"):
        fastqfile1 += ".gz"
    if not fastqfile2.endswith(".gz"):
        fastqfile2 += ".gz"

    samfile = pysam.Samfile(args[0], "rb")

    tmpdir = tempfile.mkdtemp()

    outtemp1 = os.path.join(tmpdir, "pair1.gz")
    outtemp2 = os.path.join(tmpdir, "pair2.gz")

    outstream1 = IOTools.openFile(outtemp1, "w")
    outstream2 = IOTools.openFile(outtemp2, "w")

    E.info('writing fastq files to temporary directory %s' % tmpdir)

    found1, found2 = set(), set()
    read1_qlen, read2_qlen = 0, 0

    c = E.Counter()
    for read in samfile.fetch():
        c.input += 1
        if read.is_read1:
            if read.qname not in found1:
                outstream1.write("\t".join((read.qname, read.seq, read.qual)) +
                                 "\n")
                found1.add(read.qname)
                if not read1_qlen:
                    read1_qlen = read.qlen
                c.output1 += 1
        elif read.is_read2:
            if read.qname not in found2:
                outstream2.write("\t".join((read.qname, read.seq, read.qual)) +
                                 "\n")
                found2.add(read.qname)
                if not read2_qlen:
                    read2_qlen = read.qlen
                c.output2 += 1

    for qname in found2.difference(found1):
        outstream1.write("\t".join((qname, "N" * read1_qlen,
                                    "B" * read1_qlen)) + "\n")
        c.extra1 += 1

    for qname in found1.difference(found2):
        outstream2.write("\t".join((qname, "N" * read2_qlen,
                                    "B" * read2_qlen)) + "\n")
        c.extra2 += 1

    E.info("%s" % str(c))

    outstream1.close()
    outstream2.close()

    E.info("sorting fastq files")
    statement = '''zcat %s
                   | sort -k1,1
                   | awk '{printf("@%%s\\n%%s\\n+\\n%%s\\n", $1,$2,$3)}'
                   | gzip > %s'''

    E.run(statement % (outtemp1, fastqfile1))
    E.run(statement % (outtemp2, fastqfile2))

    # write footer and output benchmark information.
    E.Stop()
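The shell statement above re-sorts the tab-separated (name, sequence, quality) records by read name and reformats them into four-line FASTQ blocks. A rough pure-Python equivalent of that reformatting step, assuming uncompressed input for simplicity; the filenames in the usage comment are placeholders:

def tab_to_fastq(infile_name, outfile_name):
    """Sort tab-separated (qname, seq, qual) lines by read name and
    write them out as 4-line FASTQ records."""
    with open(infile_name) as inf:
        records = [line.rstrip("\n").split("\t")
                   for line in inf if line.strip()]
    records.sort(key=lambda fields: fields[0])
    with open(outfile_name, "w") as outf:
        for qname, seq, qual in records:
            outf.write("@%s\n%s\n+\n%s\n" % (qname, seq, qual))

# usage with placeholder filenames:
# tab_to_fastq("pair1.txt", "pair1.fastq")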
Example #47
0
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-s", "--species", dest="species", type="string",
                      help="schema of master species.")

    parser.set_defaults(
        tablename_orthologs = "orthology_pairwise1v5.orthologlinks_first",
        filename_ids = "-",
        schemas = None,
        species = None,
    )

    (options, args) = E.Start( parser, add_psql_options = True )

    dbhandle = pgdb.connect( options.psql_connection )

    if options.filename_ids == "-":
        ids, errors = IOTools.ReadList(sys.stdin)

    extra_options = ["schema1 = '%s'" % options.species,
                     "prediction_id1 IN ('%s')" % "','".join( ids ) ]
    
    if options.schemas:
        extra_options.append( "schema2 IN ('%s')" % "','".join(options.schemas))
        
    statement = """SELECT prediction_id1, schema2, prediction_id2, gene_id2, gd1, gd2, td1, td2
    FROM %s
    WHERE schema1 != schema2 AND %s
    ORDER BY prediction_id1""" % (options.tablename_orthologs,
                                  " AND ".join(extra_options))

    cc = dbhandle.cursor()
    cc.execute(statement)
    result = cc.fetchall()
    cc.close()

    if options.schemas:
        schemas = options.schemas
    else:
        schemas = set( map( lambda x: x[1], result) )

    ## compute counts
    degeneracies = {}
    for x in ids:
        degeneracies[x] = {}
        for s in schemas:
            degeneracies[x][s] = (0,0,0,0)
            
    for prediction_id1, schema2, prediction_id2, gene_id2, gd1, gd2, td1, td2 in result:
        degeneracies[prediction_id1][schema2] = (gd1, gd2, td1, td2)

    ## output
    options.stdout.write("%s\t%s\n" % ("prediction_id", "\t".join(schemas)))
    for x in ids:
        options.stdout.write("%s" % x)
        for s in schemas:
            options.stdout.write("\t%s:%s:%s:%s" % degeneracies[x][s])
        options.stdout.write("\n")
    
    E.Stop()
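The output loop above pivots the flat query result into a matrix with one row per prediction id and one column per schema, defaulting to zero counts. A small sketch of that pivot with made-up rows (only the degeneracy columns are kept here):

# hypothetical rows: (prediction_id1, schema2, gd1, gd2, td1, td2)
result = [("p1", "schemaA", 1, 0, 2, 0),
          ("p2", "schemaB", 0, 1, 0, 3)]
ids = ["p1", "p2"]
schemas = ["schemaA", "schemaB"]

degeneracies = {x: {s: (0, 0, 0, 0) for s in schemas} for x in ids}
for prediction_id, schema, gd1, gd2, td1, td2 in result:
    degeneracies[prediction_id][schema] = (gd1, gd2, td1, td2)

print("prediction_id\t" + "\t".join(schemas))
for x in ids:
    row = ["%s:%s:%s:%s" % degeneracies[x][s] for s in schemas]
    print(x + "\t" + "\t".join(row))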
Example #48
0
def intersectionHeatmap(infiles, outfile):
    ''' calculate the intersection between the infiles and plot'''

    pandas2ri.activate()

    name2genes = {}
    df = pd.DataFrame(columns=["id_1", "id_2", "intersection", "perc"])

    ix = 0
    for inf in infiles:

        name = P.snip(os.path.basename(inf)).split(".")[0]
        name = name.replace(".", "_")

        with IOTools.openFile(inf, "r") as f:
            genes = set()

            for line in f:
                if line[0] == "#":
                    continue

                values = line.strip().split("\t")
                info = values[7].split(";")

                gene_name = None
                for x in info:
                    if x.split("=")[0] == "SNPEFF_GENE_NAME":
                        gene_name = x.split("=")[1]
                        break

                # lines without a gene name annotation are skipped
                if gene_name:
                    genes.update((gene_name,))

        name2genes[name] = genes
        df.loc[ix] = [name, name, len(genes), 1.0]
        ix += 1

    for pair in itertools.permutations(list(name2genes.keys()), 2):
        id_1, id_2 = pair
        intersection = len(name2genes[id_1].intersection(name2genes[id_2]))
        not_intersecting = len(
            name2genes[id_1].symmetric_difference(name2genes[id_2]))
        intersection_perc = float(intersection) / (intersection +
                                                   not_intersecting)

        df.loc[ix] = [id_1, id_2, intersection, intersection_perc]
        ix += 1

    variant = os.path.basename(outfile).replace(
        "overlap_", "").replace("_heatmap.png", "")

    plotIntersectionHeatmap = R('''
    function(df){
    library(ggplot2)
    m_txt = element_text(size=15)
    m_txt_90 = element_text(size=15, angle=90, vjust=0.5, hjust=1)
    l_txt = element_text(size=20)

    p = ggplot(df, aes(id_1, id_2, fill=100*perc)) +
    geom_tile() +
    geom_text(aes(label=intersection), size=3) +
    scale_fill_gradient(name="Intersection (%%)", limits=c(0,100),
                       low="yellow", high="dodgerblue4") +
    theme(axis.text.x = m_txt_90, axis.text.y = m_txt,
          legend.text = m_txt, legend.title = m_txt,
          aspect.ratio=1) +
    xlab("") + ylab("") +
    ggtitle("%(variant)s")

    ggsave("%(outfile)s", width=10, height=10)
    }''' % locals())

    plotIntersectionHeatmap(df)
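The percentage computed above is intersection / (intersection + symmetric difference), which equals the Jaccard index |A intersect B| / |A union B| of the two gene sets. A quick numeric check with two toy sets of gene names:

genes_a = {"TP53", "KRAS", "BRAF"}
genes_b = {"TP53", "EGFR"}

intersection = len(genes_a & genes_b)                          # 1
not_intersecting = len(genes_a.symmetric_difference(genes_b))  # 3
perc = float(intersection) / (intersection + not_intersecting)

# identical to the Jaccard index
assert perc == len(genes_a & genes_b) / float(len(genes_a | genes_b))
print(perc)   # 0.25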
Example #49
0
def filterQuality(infile, qualstr, qualfilter, outfiles):
    '''
    Filter variants based on quality.  Columns to filter on and
    how they should be filtered can be specified in the pipeline.ini.
    Currently only implemented to filter numeric columns.  "." is assumed
    to mean pass.
    '''
    columns = IOTools.openFile(infile).readline()
    columns = columns.split("\t")
    qualparams = qualstr.split(",")
    qualdict = dict()
    fdict = dict()
    for param in qualparams:
        param = param.split("'")

        # column to filter on
        col = param[0]
        # string of >, <, >= or <= depending how the column should
        # be filtered
        lessmore = param[1]

        # score to filter by
        score = float(param[2])

        assert col in columns, "column %s not in variant table" % col

        ind = columns.index(col)
        i = 0
        iset = set([0, 1])
        with IOTools.openFile(infile) as input:
            for line in input:
                # rows one and two are headers
                if i > 1:
                    line = line.strip().split("\t")
                    if line[ind] == ".":
                        iset.add(i)
                    elif lessmore == ">":
                        if float(line[ind]) > score:
                            iset.add(i)
                    elif lessmore == ">=":
                        if float(line[ind]) >= score:
                            iset.add(i)
                    elif lessmore == "<":
                        if float(line[ind]) < score:
                            iset.add(i)
                    elif lessmore == "<=":
                        if float(line[ind]) <= score:
                            iset.add(i)
                    if i not in iset:
                        fdict.setdefault(i, [])
                        fdict[i].append("%s=%s" % (col, line[ind]))
                i += 1
        qualdict[col] = iset
    if qualfilter == "all":
        allqual = set.intersection(*list(qualdict.values()))
    elif qualfilter == "any":
        allqual = set.union(*list(qualdict.values()))
    i = 0
    out = IOTools.openFile(outfiles[0], "w")
    out2 = IOTools.openFile(outfiles[1], "w")
    with IOTools.openFile(infile) as input:
        for line in input:
            if i in allqual:
                out.write(line)
            else:
                line = line.strip()
                out2.write("%s\t%s\n" % (line, ",".join(fdict[i])))
            i += 1
    out.close()
    out2.close()
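The quality string splits on commas into one filter per column, and each filter splits on single quotes into (column, comparator, threshold). A minimal sketch of that parsing with a hypothetical qualstr value; the column names and thresholds are made up, not taken from any particular pipeline.ini:

# hypothetical value: keep rows with QD >= 2 and FS <= 60
qualstr = "QD'>='2,FS'<='60"

for param in qualstr.split(","):
    col, lessmore, score = param.split("'")
    print(col, lessmore, float(score))
# QD >= 2.0
# FS <= 60.0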
Example #50
0
def extractEBioinfo(eBio_ids, vcfs, outfile):
    '''find the number of mutations identified in previous studies (eBio_ids)
    for the mutated genes in the vcfs'''

    genes = set()

    for vcf in vcfs:
        infile = VCF.VCFFile(IOTools.openFile(vcf))
        for vcf_entry in infile:
            # assumes all vcf entries without "REJECT" are "PASS"
            if vcf_entry.filter != "REJECT":
                info_entries = vcf_entry.info.split(";")
                for entry in info_entries:
                    if "SNPEFF_GENE_NAME" in entry:
                        genes.update((entry.split("=")[1],))

    eBio_ids = IOTools.openFile(eBio_ids, "r")

    tissue_counts = collections.defaultdict(
        lambda: collections.defaultdict(
            lambda: collections.defaultdict(int)))

    def chunks(l, n):
        ''' Yield successive n-sized chunks from l '''
        for i in range(0, len(l), n):
            yield l[i:i + n]

    E.info("number of genes: %i" % len(genes))

    for line in eBio_ids:
        tissue, study, table = line.strip().split("\t")

        n = 0

        # query in batches of 250 genes using the chunks() helper above
        for genes_chunk in chunks(list(genes), 250):

            # TS sporadic error when querying with a single gene at a time
            # "urllib2.URLError: <urlopen error [Errno 110] Connection timed out>"
            # max URL length appears to be 8200 characters,
            # try doing 250 genes at a time?

            gene_list = "+".join(list(genes_chunk))

            n += len(genes_chunk)

            E.info("number of genes processed: %i" % n)

            url = ("http://www.cbioportal.org/webservice.do?cmd=getProfileData&"
                   "case_set_id=%(study)s_all&genetic_profile_id=%(table)s&"
                   "gene_list=%(gene_list)s" % locals())

            df = pd.io.parsers.read_csv(
                url, comment="#", sep="\t", index_col=0)

            for gene in genes_chunk:

                tmp_df = df[df['COMMON'] == gene]

                # check dataframe contains data!
                if tmp_df.shape[0] != 0:
                    # seem to be having issues with gene set containing duplicates!
                    # --> dataframe with repeated instances of gene after selection
                    # so splice to first row and recreate dataframe from series
                    if tmp_df.shape[0] > 1:
                        tmp_df = pd.DataFrame(tmp_df.iloc[0]).T

                    tissue_counts[tissue][gene]["total"] += tmp_df.shape[1] - 2
                    tissue_counts[tissue][gene][
                        "mutations"] += int(tmp_df.count(1)) - 1

    out = IOTools.openFile(outfile, "w")

    tissues = list(tissue_counts.keys())

    out.write("gene\t%s\n" % "\t".join([
        "%s_frequency" % x.replace(" ", "_") for x in tissues]))

    for gene in genes:
        freq_values = []
        for tissue in tissues:
            total = tissue_counts[tissue][gene]["total"]
            mutations = tissue_counts[tissue][gene]["mutations"]
            freq_values.append(round(np.divide(float(mutations), total), 4))

        out.write("%s\t%s\n" % (gene, "\t".join(map(str, freq_values))))

    out.close()
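The gene list is queried in batches because very long URLs time out, and each batch is joined with "+" for the gene_list parameter of the webservice URL. A small sketch of the batching using the chunks() helper defined above and a made-up gene list; no request is sent here:

def chunks(l, n):
    '''Yield successive n-sized chunks from l.'''
    for i in range(0, len(l), n):
        yield l[i:i + n]

genes = ["GENE%i" % i for i in range(7)]   # hypothetical gene names

for genes_chunk in chunks(genes, 3):
    gene_list = "+".join(genes_chunk)
    # this string would be substituted into the webservice URL
    print(gene_list)
# GENE0+GENE1+GENE2
# GENE3+GENE4+GENE5
# GENE6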
Example #51
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("--set-nh", dest="set_nh", action="store_true",
                      help="sets the NH flag. The file needs to be "
                      "sorted by readname [%default]")

    parser.add_option("--unset-unmapped-mapq", dest="unset_unmapped_mapq",
                      action="store_true",
                      help="sets the mapping quality of unmapped "
                      "reads to 0 [%default]")

    parser.add_option("--set-sequence", dest="set_sequence",
                      action="store_true",
                      help="sets the sequence to 'A's (a valid base) and "
                      "the quality to 'F's "
                      ",which is defined in all fastq scoring schemes "
                      "[%default]")

    parser.add_option("--strip", dest="strip", type="choice",
                      choices=("sequence", "quality", "match"),
                      help = "remove parts of the bam-file. Note that "
                      "stripping the sequence will "
                      "also strip the quality values [%default]")

    parser.add_option("--unstrip", dest="unstrip", action="store_true",
                      help="add sequence and quality into bam file [%default]")

    parser.add_option("--filter", dest="filter",
                      action="append", type="choice",
                      choices=('NM', 'CM', 'mapped', 'unique', "non-unique"),
                      help = "filter bam file. The option denotes "
                      "the property that is  "
                      "used to determine better match [%default]")

    parser.add_option("--reference-bam", dest="reference_bam", type="string",
                      help="bam-file to filter with [%default]")

    parser.add_option("--force", dest="force", action="store_true",
                      help="force processing. Some methods such "
                      "as strip/unstrip will stop processing if "
                      "they think it not necessary "
                      "[%default]")

    parser.add_option("--sam", dest="output_sam", action="store_true",
                      help="output in sam format [%default]")

    parser.add_option("--inplace", dest="inplace", action="store_true",
                      help="modify bam files in-place. Bam files need "
                      "to be given "
                      "as arguments. Temporary bam files are written "
                      "to /tmp [%default]")

    parser.add_option("--fastq1", "-1", dest="fastq_pair1", type="string",
                      help="fastq file with read information for first "
                      "in pair or unpaired [%default]")

    parser.add_option("--fastq2", "-2", dest="fastq_pair2", type="string",
                      help="fastq file with read information for second "
                      "in pair [%default]")

    parser.add_option("--keep-first-base", dest="keep_first_base",
                      action="store_true",
                      help="keep first base of reads such that gtf2table.py "
                      "will only consider the "
                      "first base in its counts.")

    parser.set_defaults(
        filter=[],
        set_nh=False,
        unset_unmapped_mapq=False,
        output_sam=False,
        reference_bam=None,
        strip=None,
        unstrip=None,
        force=False,
        set_sequence=False,
        inplace=False,
        fastq_pair1=None,
        fastq_pair2=None,
        keep_first_base=False
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    bamfiles = []

    if options.stdin != sys.stdin:
        bamfiles.append(options.stdin.name)

    if options.inplace:
        bamfiles.extend(args)
        if len(bamfiles) == 0:
            raise ValueError(
                "please one or more bam-files as command line arguments")

        if "-" in bamfiles:
            raise ValueError(
                "can not read from stdin if ``--inplace`` is selected")

    if len(bamfiles) == 0:
        bamfiles = ["-"]

    for bamfile in bamfiles:

        E.info('processing %s' % bamfile)

        if os.path.islink(bamfile):
            E.warn('ignoring link %s' % bamfile)
            continue

        if IOTools.isEmpty(bamfile):
            E.warn('ignoring empty file %s' % bamfile)
            continue

        # reading bam from stdin does not work with only the "r" tag
        pysam_in = pysam.Samfile(bamfile, "rb")

        if bamfile == "-":
            if options.output_sam:
                pysam_out = pysam.Samfile("-", "wh", template=pysam_in)
            else:
                pysam_out = pysam.Samfile("-", "wb", template=pysam_in)
        else:
            if IOTools.isEmpty(bamfile):
                E.warn('skipping empty file %s' % bamfile)
                continue
            tmpfile = tempfile.NamedTemporaryFile(delete=False, prefix="ctmp")
            tmpfile.close()

            E.debug("writing temporary bam-file to %s" % tmpfile.name)
            pysam_out = pysam.Samfile(tmpfile.name, "wb", template=pysam_in)

        if options.filter:

            remove_mismatches, colour_mismatches = False, False

            if "NM" in options.filter:
                remove_mismatches = True

            elif "CM" in options.filter:
                remove_mismatches = True
                colour_mismatches = True

            if remove_mismatches:
                if not options.reference_bam:
                    raise ValueError(
                        "requiring reference bam file for removing by "
                        "mismatches")

                pysam_ref = pysam.Samfile(options.reference_bam, "rb")
            else:
                pysam_ref = None

            # filter and flags are the opposite way around
            c = _bam2bam.filter_bam(
                pysam_in, pysam_out, pysam_ref,
                remove_nonunique="unique" in options.filter,
                remove_unique="non-unique" in options.filter,
                remove_contigs=None,
                remove_unmapped="mapped" in options.filter,
                remove_mismatches=remove_mismatches,
                colour_mismatches=colour_mismatches)

            options.stdlog.write("category\tcounts\n%s\n" % c.asTable())
        else:

            # set up the modifying iterators
            it = pysam_in.fetch(until_eof=True)

            # function to check if processing should start
            pre_check_f = lambda x: None

            if options.unset_unmapped_mapq:
                def unset_unmapped_mapq(i):
                    for read in i:
                        if read.is_unmapped:
                            read.mapq = 0
                        yield read
                it = unset_unmapped_mapq(it)

            if options.set_nh and False:
                def set_nh(i):

                    for key, reads in itertools.groupby(i, lambda x: x.qname):
                        l = list(reads)
                        nh = len(l)
                        for read in l:
                            if not read.is_unmapped:
                                t = dict(read.tags)
                                t['NH'] = nh
                                read.tags = list(t.iteritems())
                            yield read
                it = set_nh(it)

            if options.set_sequence:
                def set_sequence(i):
                    for read in i:
                        # can't get at length of unmapped reads
                        if read.is_unmapped:
                            read.seq = "A"
                            read.qual = "F"
                        else:
                            read.seq = "A" * read.inferred_length
                            read.qual = "F" * read.inferred_length

                        yield read
                it = set_sequence(it)

            if options.strip is not None:
                def strip_sequence(i):
                    for read in i:
                        read.seq = None
                        yield read

                def check_sequence(reads):
                    if reads[0].seq is None:
                        return 'no sequence present'
                    return None

                def strip_quality(i):
                    for read in i:
                        read.qual = None
                        yield read

                def check_quality(reads):
                    if reads[0].qual is None:
                        return 'no quality information present'
                    return None

                def strip_match(i):
                    for read in i:
                        try:
                            nm = read.opt('NM')
                        except KeyError:
                            nm = 1
                        if nm == 0:
                            read.seq = None
                        yield read

                if options.strip == "sequence":
                    it = strip_sequence(it)
                    pre_check_f = check_sequence
                elif options.strip == "quality":
                    it = strip_quality(it)
                    pre_check_f = check_quality
                elif options.strip == "match":
                    it = strip_match(it)

            if options.unstrip:
                def buildReadDictionary(filename):
                    if not os.path.exists(filename):
                        raise OSError("file not found: %s" % filename)
                    fastqfile = pysam.Fastqfile(filename)
                    fastq2sequence = {}
                    for x in fastqfile:
                        if x.name in fastq2sequence:
                            raise ValueError(
                                "read %s duplicate - can not unstrip" % x.name)

                        fastq2sequence[x.name] = (x.sequence, x.quality)
                    return fastq2sequence

                if not options.fastq_pair1:
                    raise ValueError(
                        "please supply fastq file(s) for unstripping")
                fastq2sequence1 = buildReadDictionary(options.fastq_pair1)
                if options.fastq_pair2:
                    fastq2sequence2 = buildReadDictionary(options.fastq_pair2)

                def unstrip_unpaired(i):
                    for read in i:
                        read.seq, read.qual = fastq2sequence1[read.qname]
                        yield read

                def unstrip_pair(i):
                    for read in i:
                        if read.is_read1:
                            read.seq, read.qual = fastq2sequence1[read.qname]
                        else:
                            read.seq, read.qual = fastq2sequence2[read.qname]
                        yield read

                if options.fastq_pair2:
                    it = unstrip_pair(it)
                else:
                    it = unstrip_unpaired(it)

            if options.set_nh:
                it = _bam2bam.SetNH(it)

            # keep first base of reads by changing the cigarstring to
            # '1M' and, in reads mapping to the reverse strand,
            # changes the pos to aend - 1
            if options.keep_first_base:
                def keep_first_base(i):
                    for read in i:
                        if read.is_reverse:
                            read.pos = read.aend - 1
                            read.cigarstring = '1M'
                        elif not read.is_unmapped:
                            read.cigarstring = '1M'
                        yield read
                it = keep_first_base(it)

            # read first read and check if processing should continue
            # only possible when not working from stdin
            if bamfile != "-":
                # get first read for checking pre-conditions
                first_reads = list(pysam_in.head(1))

                msg = pre_check_f(first_reads)
                if msg is not None:
                    if options.force:
                        E.warn('processing continues, though: %s' % msg)
                    else:
                        E.warn('processing not started: %s' % msg)
                        pysam_in.close()
                        pysam_out.close()
                        continue

            # continue processing till end
            for read in it:
                pysam_out.write(read)

            pysam_in.close()
            pysam_out.close()

        if options.inplace:
            # set date and file permissions according to original
            # Note: currently it will not update user and group.
            original = os.stat(bamfile)
            os.utime(tmpfile.name, (original.st_atime, original.st_mtime))
            os.chmod(tmpfile.name, original.st_mode)
            # move new file over original copy
            shutil.move(tmpfile.name, bamfile)
            # re-index
            pysam.index(bamfile)

    # write footer and output benchmark information.
    E.Stop()
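The filters above are composed by wrapping one generator around another: each modifier takes an iterator of reads, yields (possibly altered) reads, and the chain is built up with it = modifier(it). A stripped-down sketch of that pattern using a stand-in read class instead of pysam:

class Read(object):
    """Stand-in for a pysam read; only the fields used below."""
    def __init__(self, qname, is_unmapped=False, mapq=30, seq="ACGT"):
        self.qname = qname
        self.is_unmapped = is_unmapped
        self.mapq = mapq
        self.seq = seq

def unset_unmapped_mapq(reads):
    for read in reads:
        if read.is_unmapped:
            read.mapq = 0
        yield read

def strip_sequence(reads):
    for read in reads:
        read.seq = None
        yield read

reads = [Read("r1"), Read("r2", is_unmapped=True)]

# build the processing chain exactly as the script does: it = modifier(it)
it = iter(reads)
it = unset_unmapped_mapq(it)
it = strip_sequence(it)

for read in it:
    print(read.qname, read.mapq, read.seq)
# r1 30 None
# r2 0 None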