def exportSequencesFromBedFile( infile, outfile, masker = None, mode = "intervals" ):
    '''export sequences for intervals in :term:`bed`-formatted *infile*
    to :term:`fasta`-formatted *outfile*
    '''

    track = P.snip( infile, ".bed.gz" )

    fasta = IndexedFasta.IndexedFasta( os.path.join( PARAMS["genome_dir"], PARAMS["genome"] ) )
    outs = IOTools.openFile( outfile, "w")

    ids, seqs = [], []
    for bed in Bed.setName(Bed.iterator( IOTools.openFile(infile) )):
        lcontig = fasta.getLength( bed.contig )

        if mode == "intervals":
            seqs.append( fasta.getSequence( bed.contig, "+", bed.start, bed.end) )
            ids.append( "%s_%s %s:%i..%i" % (track, bed.name, bed.contig, bed.start, bed.end) )

        elif mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0,bed.start-l), bed.end-l
            ids.append( "%s_%s_l %s:%i..%i" % (track, bed.name, bed.contig, start, end) )
            seqs.append( fasta.getSequence( bed.contig, "+", start, end) )
            
            start, end = bed.start+l, min(lcontig,bed.end+l)
            ids.append( "%s_%s_r %s:%i..%i" % (track, bed.name, bed.contig, start, end) )
            seqs.append( fasta.getSequence( bed.contig, "+", start, end) )
            
    masked = maskSequences( seqs, masker )
    outs.write("\n".join( [ ">%s\n%s" % (x,y) for x,y in zip(ids, masked) ] ) )

    outs.close()
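# maskSequences() is called above but is not included in this excerpt; the
# stand-in below is only a minimal sketch (an assumption, not the original
# CGAT helper) so that the masker=None default still works.  Real maskers
# such as dustmasker would require the original implementation.
def maskSequences(sequences, masker=None):
    '''return *sequences* unchanged when no masker is requested.'''
    if masker is None:
        return sequences
    raise NotImplementedError("masker '%s' requires the original helper" % masker)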
def annotate(infile, outfile, geneset):
    '''
    annotate NOGs with functional categories
    '''
    annotation = {}
    E.info("loading geneset")
    anno = IOTools.openFile(geneset)
    for line in anno.readlines():
        data = line[:-1].split("\t")
        nog, funccat = data[1], data[3]
        annotation[nog] = funccat
    E.info("finished loading gene set")

    E.info("annotating infile")
    inf = IOTools.openFile(infile)
    header = inf.readline()
    outf = IOTools.openFile(outfile, "w")
    outf.write(header[:-1] + "\tfunccat\n")
    for line in inf.readlines():
        data = line[:-1].split("\t")
        nog = data[0]
        try:
            funccat = annotation[nog]
        except KeyError:
            funccat = "Function unknown"
        outf.write(line[:-1] + "\t" + funccat + "\n")
    outf.close()
Example #3
def fetchProbeFragments(probe_bed, digest_bed, outfile,
                        lookup_out):
    '''map each probe to the single restriction fragment it overlaps,
    writing the fragments in :term:`bed` format and a probe-to-fragment
    lookup table.'''

    digest_fragments = pysam.TabixFile(digest_bed)
    bed = Bed.Bed()
    with IOTools.openFile(outfile, "w") as outf, \
         IOTools.openFile(lookup_out,"w") as lookup:

        lookup.write("probe\tfragment\n")
        for probe in Bed.iterator(IOTools.openFile(probe_bed)):
            
            frag = digest_fragments.fetch(probe.contig,
                                          probe.start,
                                          probe.end,
                                          parser=pysam.asBed())
            frag = list(frag)
            if not len(frag) == 1:
                E.warn("%i fragments found for probe %s, skipping" %
                       (len(frag), probe.name))
                continue

            frag = frag[0]
            bed.start = frag.start
            bed.end = frag.end
            bed.contig = frag.contig
            bed["name"] = probe.name
            bed["score"] = "."
            bed["strand"] = "+"

            lookup.write("%s\t%s\n" % (probe.name, frag.name))
            outf.write(str(bed) + "\n")
def buildForegroundSets(infiles, outfile):
    '''
    build sets of genes that are differentially
    expressed based on cluster assignments
    '''
    clusters, probe2gene_file = infiles

    # read probe 2 gene map
    probe2gene = {}
    probe2gene_file = IOTools.openFile(probe2gene_file)
    for line in probe2gene_file.readlines():
        data = line[:-1].split("\t")
        probe, gene = [x.replace('"', '') for x in data]
        probe2gene[probe] = gene
        
    # read probe 2 cluster map
    probe2cluster = {}
    clusters = IOTools.openFile(clusters)
    clusters.readline()
    for line in clusters.readlines():
        data = line[:-1].split("\t")
        probe, cluster = data
        probe2cluster[probe] = cluster
    
    # output genes in each cluster
    for c in set(probe2cluster.values()):
        outname = "pathways.dir/C%s.foreground" % c
        outf = IOTools.openFile(outname, "w")
        for probe, cluster in probe2cluster.iteritems():
            if cluster == c:
                outf.write("%s\n" % probe2gene[probe])
            else:
                continue
        outf.close()
Example #5
def extractEBioinfo(eBio_ids, vcfs, outfile):
    '''find the number of mutations identified in previous studies (eBio_ids)
    for the mutated genes in the vcfs'''

    genes = set()

    n = 0
    for vcf in vcfs:
        if n > 0:
            break
        else:
            n += 1
        infile = VCF.VCFFile(IOTools.openFile(vcf))
        for vcf_entry in infile:
            # assumes all vcf entries without "REJECT" are "PASS"
            if vcf_entry.filter != "REJECT":
                info_entries = vcf_entry.info.split(";")
                for entry in info_entries:
                    if "SNPEFF_GENE_NAME" in entry:
                        genes.update((entry.split("=")[1],))

    eBio_ids = IOTools.openFile(eBio_ids, "r")

    tissue_counts = collections.defaultdict(
        lambda: collections.defaultdict(
            lambda: collections.defaultdict(int)))

    for line in eBio_ids:
        tissue, study, table = line.strip().split("\t")
        for gene in genes:
            url = ("http://www.cbioportal.org/webservice.do?cmd=getProfileData&"
                   "case_set_id=%(study)s_all&genetic_profile_id=%(table)s&"
                   "gene_list=%(gene)s" % locals())
            print url
            df = pd.io.parsers.read_csv(url, comment="#", sep="\t",
                                        header=False, index_col=0)

            # check dataframe contains data!
            if df.shape[0] != 0:
                tissue_counts[tissue][gene]["total"] += df.shape[1]-2
                tissue_counts[tissue][gene]["mutations"] += int(df.count(1))-1

    out = IOTools.openFile(outfile, "w")

    tissues = tissue_counts.keys()

    out.write("gene\t%s\n" % "\t".join([
        "%s_frequency" % x.replace(" ", "_") for x in tissues]))

    for gene in genes:
        freq_values = []
        for tissue in tissues:
            total = tissue_counts[tissue][gene]["total"]
            mutations = tissue_counts[tissue][gene]["mutations"]
            print "total: ", total, "mutations: ", mutations
            freq_values.append(np.divide(float(mutations), total))

        out.write("%s\t%s\n" % (gene, "\t".join(map(str, freq_values))))

    out.close()
Example #6
def filterGTF(gtf, filterstring, tempout):
    '''filter a GTF file according to *filterstring* and write the
    records that pass to *tempout*.'''

    if "!=" in filterstring:
        column, value = filterstring.split("!=")
        value = value.split("+")
        filtertype = "notin"

    elif "=" in filterstring:
        column, value = filterstring.split("=")
        value = value.split("+")
        filtertype = "in"

    elif "-in_file-" in filterstring:
        column, value = filterstring.split("-in_file-")
        value = [line.strip() for line in IOTools.openFile(value)]
        filtertype = "in_file"

    elif "-notin_file-" in filterstring:
        column, value = filterstring.split("-notin_file-")
        value = [line.strip() for line in IOTools.openFile(value)]
        filtertype = "notin_file"

    elif "-morethan-" in filterstring:
        column, value = filterstring.split("-morethan-")
        value = float(value)
        filtertype = "morethan"

    elif "-lessthan-" in filterstring:
        column, value = filterstring.split("-lessthan-")
        value = float(value)
        filtertype = "lessthan"

    gfile = IOTools.openFile(gtf)
    G = GTF.iterator(gfile)

    out = IOTools.openFile(tempout, "w")
    for item in G:
        D = item.asDict()
        D['contig'] = item.contig
        D['source'] = item.source
        D['feature'] = item.feature
        D['start'] = item.start
        D['end'] = item.end
        D['strand'] = item.strand
        D['frame'] = item.frame

        if filtertype == "in" or filtertype == 'in_file':
            if D[column] in value:
                out.write("%s\n" % str(item))
        elif filtertype == "notin" or filtertype == 'notin_file':
            if D[column] not in value:
                out.write("%s\n" % str(item))
        elif filtertype == "morethan":
            if float(D[column]) > value:
                out.write("%s\n" % str(item))
        elif filtertype == "lessthan":
            if float(D[column]) < value:
                out.write("%s\n" % str(item))
    out.close()
    gfile.close()
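# A hedged usage sketch for filterGTF; file names and attribute values below
# are illustrative assumptions, not part of the original pipeline.  The
# filterstring forms recognised above are:
#   "gene_biotype=protein_coding+lincRNA"        keep records whose attribute matches
#   "gene_biotype!=pseudogene"                   drop records whose attribute matches
#   "gene_id-in_file-keep_ids.txt"               keep ids listed one-per-line in a file
#   "gene_id-notin_file-drop_ids.txt"            drop ids listed in a file
#   "FPKM-morethan-1.0" / "FPKM-lessthan-1.0"    numeric thresholds on an attribute
# filterGTF("transcripts.gtf.gz", "gene_biotype=protein_coding", "filtered.gtf")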
Example #7
def buildFastQCSummaryStatus(infiles, outfile):
    '''load fastqc status summaries into a single table.'''

    outf = IOTools.openFile(outfile, "w")
    first = True
    for infile in infiles:
        track = P.snip(infile, ".fastqc")
        filename = os.path.join(
            PARAMS["exportdir"], "fastqc", track + "*_fastqc",
            "fastqc_data.txt")

        for fn in glob.glob(filename):
            prefix = os.path.basename(os.path.dirname(fn))
            results = []

            names, stats = [], []
            for name, status, header, data in FastqcSectionIterator(
                    IOTools.openFile(fn)):
                stats.append(status)
                names.append(name)

            if first:
                outf.write("track\tfilename\t%s\n" % "\t".join(names))
                first = False

            outf.write("%s\t%s\t%s\n" %
                       (track, os.path.dirname(fn), "\t".join(stats)))
    outf.close()
Example #8
def chunk_iterator_lines(infile, args, prefix, use_header=False):
    """split by lines."""

    chunk_size = args[0]
    n = 0
    filename = "%s/%010i.in" % (prefix, n)
    outfile = IOTools.openFile(filename, "w")
    header = None

    for line in infile:
        if line[0] == "#":
            continue

        if not header and n == 0 and use_header:
            header = line
            outfile.write(header)
            continue

        n += 1

        if n % chunk_size == 0:
            outfile.close()
            yield filename
            filename = "%s/%010i.in" % (prefix, n)
            outfile = IOTools.openFile(filename, "w")
            if header:
                outfile.write(header)

        outfile.write(line)
    outfile.close()
    yield filename
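# A minimal usage sketch (the input file, output directory and chunk size are
# illustrative assumptions): args is a one-element tuple holding the number of
# lines per chunk, and the generator yields one chunk filename at a time.
# with IOTools.openFile("input.tsv.gz") as inf:
#     for chunk in chunk_iterator_lines(inf, (100000,), "chunks.dir",
#                                       use_header=True):
#         pass  # submit or process each chunk file here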
Example #9
def summarizeAllProcessing( infiles, outfile ):
    '''summarize processing information.'''

    outf = IOTools.openFile( outfile, "w" )
    data = []
    for infile in infiles:
        inf = IOTools.openFile( infile )
        for line in inf:
            track, step, pair, ninput, noutput = line[:-1].split("\t")
            if track == "track": continue
            data.append( (track, step, pair, ninput, noutput) )
            
    # sort by track, pair, input
    data.sort( key = lambda x: (x[0], x[2], -int(x[3])))
    first = True
    for key, v in itertools.groupby( data, lambda x: (x[0], x[2])):
        vals = list(v)
        track,pair = key
        ninput = int(vals[0][3])
        outputs = [int(x[4]) for x in vals]
        if first:
            outf.write( "track\tpair\tninput\t%s\t%s\t%s\t%s\n" % ("\t".join( [x[1] for x in vals] ),
                                                                   "noutput",
                                                                   "\t".join( ["percent_%s" % x[1] for x in vals] ),
                                                                   "percent_output" ))
            first = False
        outf.write( "%s\t%s\t%i\t%s\t%i\t%s\t%s\n" % ( track, pair, ninput, 
                                                       "\t".join( map(str,outputs)),
                                                       outputs[-1], 
                                                       "\t".join( [ "%5.2f" % (100.0 * x / ninput) for x in outputs ] ),
                                                       "%5.2f" % (100.0 * outputs[-1] / ninput)))
    outf.close()
Example #10
def buildFastQCSummaryStatus(infiles, outfile, datadir):
    '''load fastqc status summaries into a single table.'''

    outf = IOTools.openFile(outfile, "w")
    names = set()
    results = []
    for infile in infiles:
        track = P.snip(os.path.basename(infile), ".fastqc")
        filename = os.path.join(datadir,
                                track + "*_fastqc",
                                "fastqc_data.txt")
        
        # there can be missing sections
        for fn in glob.glob(filename):
            stats = collections.defaultdict(str)
            for name, status, header, data in FastqcSectionIterator(
                    IOTools.openFile(fn)):
                stats[name] = status

            results.append((track, fn, stats))
            names.update(stats.keys())
            
    names = list(names)
    outf.write("track\tfilename\t%s\n" % "\t".join(names))
    for track, fn, stats in results:
        outf.write("%s\t%s\t%s\n" %
                   (track, os.path.dirname(fn),
                    "\t".join(stats[x] for x in names)))
    outf.close()
Example #11
def chunk_iterator_regex_split(infile, args, prefix, use_header=False):
    """split where regular expression is true.
    """

    rex = args[0]
    chunk_size = args[2]
    max_lines = args[3]

    nlines = 0
    n = 0
    filename = "%s/%010i.in" % (prefix, n)
    outfile = IOTools.openFile(filename, "w")

    for line in infile:

        if line[0] == "#":
            continue

        if rex.search(line[:-1]):
            if n > 0 and (n % chunk_size == 0 or
                          (max_lines and nlines > max_lines)):
                outfile.close()
                yield filename
                filename = "%s/%010i.in" % (prefix, n)
                outfile = IOTools.openFile(filename, "w")
                nlines = 0

            n += 1

        outfile.write(line)
        nlines += 1

    outfile.close()
    yield filename
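# A hedged usage sketch: the args tuple is indexed by the function above as
# (regex, <unused>, chunk_size, max_lines); the regex and values below are
# illustrative assumptions.  A new chunk is started when the regex matches and
# the current chunk has reached chunk_size matches or max_lines lines.
# rex = re.compile("^>")   # e.g. count FASTA header lines
# for chunk in chunk_iterator_regex_split(IOTools.openFile("input.fasta.gz"),
#                                         (rex, None, 100, 1000000),
#                                         "chunks.dir"):
#     pass  # process each chunk file here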
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $", 
                             usage = globals()["__doc__"] )

    parser.add_option("-a", "--fastq1", dest="fastq1", type="string",
                      help="supply read1 fastq file"  )
    parser.add_option("-b", "--fastq2", dest="fastq2", type="string",
                      help="supply read2 fastq file"  )

    ## add common options (-h/--help, ...) and parse command line 
    (options, args) = E.Start( parser, argv = argv )

    fastq1 = IOTools.openFile(options.fastq1)
    fastq2 = IOTools.openFile(options.fastq2)

    E.info("iterating over fastq files")
    f1_count = 0
    for f1, f2 in itertools.izip_longest(Fastq.iterate(fastq1), Fastq.iterate(fastq2)):
        if not (f1 and f2) or (not f2 and f1):
            try:
                raise PairedReadError("unpaired reads detected. Are files sorted? are files of equal length?")
            except PairedReadError, e:
                raise PairedReadError(e), None, sys.exc_info()[2]
        else:
            assert f1.identifier.endswith("/1") and f2.identifier.endswith("/2"), "Reads in file 1 must end with /1 and reads in file 2 with /2"
            options.stdout.write(">%s\n%s\n>%s\n%s\n" % (f1.identifier, f1.seq, f2.identifier, f2.seq))
            f1_count += 1
Example #13
def buildInputFiles(infile, outfiles):
    '''
    build input file based on parameters and fasta sequences
    that primers are to be designed for
    '''
    PARAMS["constraints_primer_mispriming_library"] = glob.glob("mispriming.dir/*.lib")[0]

    fasta, identifiers = infile[0], "identifiers.tsv"
    inf = IOTools.openFile(fasta)
    
    E.info("Reading ids for primer design")
    ids = readIdentifiers(identifiers)
    
    E.info("collecting sequences")
    for f in FastaIterator.iterate(IOTools.openFile(fasta)):
        if f.title in ids:
            outf = IOTools.openFile(os.path.join(
                "input.dir",f.title.replace(" ", "_").replace("/","_") + ".input").replace('"', ''), "w")
            seq = f.sequence
            outf.write("SEQUENCE_ID=%s\n" % f.title)
            for key, value in PARAMS.iteritems():
                if "constraints" in key:
                    outf.write("%s=%s\n" % (key.replace("constraints_", "").upper(), value))
            outf.write("SEQUENCE_TEMPLATE=%s\n=\n" % seq)
            outf.close()
    def copy(src, dst, name):

        # remove "template" and the pipeline type from file/directory
        # names.
        fn_dest = os.path.join(
            destination_dir,
            dst,
            rx_type.sub("", rx_file.sub(name, src)))

        fn_src = os.path.join(srcdir,
                              "pipeline_template_data", src)

        E.debug("fn_src=%s, fn_dest=%s, src=%s, dest=%s" %
                (fn_src, fn_dest, src, dst))

        if os.path.exists(fn_dest) and not options.force:
            raise OSError(
                "file %s already exists - not overwriting." % fn_dest)

        if fn_src.endswith(".png"):
            shutil.copyfile(fn_src, fn_dest)
        else:
            with IOTools.openFile(fn_dest, "w") as outfile:
                with IOTools.openFile(fn_src) as infile:
                    for line in infile:
                        outfile.write(rx_reportdir.sub(reportdir,
                                                       rx_template.sub(name, line)))
def mergeAdaptorFasta(infiles, outfile):
    '''
    Merge fasta files of adapter contamination,
    include reverse complement, remove duplicate sequences
    '''

    fasta_dict = {}
    for each in infiles:
        with IOTools.openFile(each, "r") as inf:
            for line in inf:
                if line[0] == '>':
                    adapt = line.lstrip(">").rstrip("\n")
                    fasta_dict[adapt] = set()
                    fasta_dict[adapt + "_R"] = set()
                else:
                    seq = line.rstrip("\n")
                    rev_seq = reverseComplement(seq)
                    fasta_dict[adapt].add(seq)
                    fasta_dict[adapt + "_R"].add(rev_seq)

    # if there are no adapters to remove break the pipeline here
    if not len(fasta_dict):
        raise AttributeError("There are no overrepresented sequences in "
                             "these fastq files.  Please turn off this "
                             "feature and re-run the pipeline")
    with IOTools.openFile(outfile, "w") as outf:
        for key, value in fasta_dict.items():
            outf.write(">%s\n%s\n" % (key, list(value)[0]))
def annotateCpGIslands( infiles, outfile ):
    '''annotate transcripts by presence/absence of CpG islands
    '''
    cpgfile, tssfile = infiles
    cpg = Bed.readAndIndex( IOTools.openFile( cpgfile ) )
    
    extension_upstream = PARAMS["cpg_search_upstream"]
    extension_downstream = PARAMS["cpg_search_downstream"]

    c = E.Counter()
    outf = IOTools.openFile( outfile, "w" )
    outf.write("transcript_id\tstrand\tstart\tend\trelative_start\trelative_end\n" )

    for tss in Bed.iterator(IOTools.openFile( tssfile ) ):
        c.tss_total += 1

        if tss.strand == "+":
            start, end = tss.start - extension_upstream, tss.start + extension_downstream
        else:
            start, end = tss.end - extension_downstream, tss.end + extension_upstream

        try:
            matches = list(cpg[tss.contig].find( start, end ))
        except KeyError:
            c.promotor_without_matches += 1
            continue

        if len(matches) == 0:
            c.promotor_without_matches += 1
            continue

        c.promotor_output += 1
        for match in matches:
            c.matches_total += 1
            genome_start, genome_end, x = match

            l = genome_end - genome_start

            # get relative location of match
            if tss.strand == "+":
                relative_start = genome_start - tss.start 
            else:
                relative_start = tss.end - genome_end
            
            relative_end = relative_start + l

            outf.write( "\t".join( map(str, (
                            tss.name, tss.strand,
                            genome_start, genome_end,
                            relative_start, relative_end ))) + "\n" )
            c.matches_output += 1

    outf.close()
            
    with IOTools.openFile( outfile + ".summary", "w" ) as outf:
        outf.write ("category\tcounts\n" )
        outf.write( c.asTable() + "\n" )
    
    E.info( c )
def CleanVariantTables(genes, variants, cols, outfile):
    '''tidy a variant table: derive alleles, genotype and HOM/HET status
    from the GT field and annotate each variant with its overlapping gene.'''
    variants = pd.read_csv(variants, sep="\t")
    variants = variants.drop(0)

    vp1 = copy.copy(variants[['CHROM',
                              'POS', 'QUAL', 'ID', 'REF1', 'ALT', 'GT']])
    alleles = vp1['REF1'].str.cat(
        vp1['ALT'].str.strip(), sep=",").str.split(",")

    vp1['GT'] = vp1['GT'].str.replace(r"\.", "0")
    inds1 = vp1['GT'].str.get(0).astype(int).values
    inds2 = vp1['GT'].str.get(-1).astype(int).values
    x = 0
    a1s = []
    a2s = []
    gts = []
    homhet = []
    for allele in alleles:
        i1 = int(inds1[x])
        i2 = int(inds2[x])
        a1 = allele[i1]
        a2 = allele[i2]
        a1s.append(a1)
        a2s.append(a2)
        if a1 == a2:
            homhet.append("HOM")
        else:
            homhet.append("HET")
        gts.append("%s%s" % (a1, a2))
        x += 1
    vp1['HOMHET'] = homhet
    vp1['Allele1'] = a1s
    vp1['Allele2'] = a2s
    vp1['Genotype'] = gts
    vp1 = vp1.drop(['REF1', 'ALT', 'GT'], 1)
    vp1[cols] = copy.copy(variants[cols])

    Ls = []
    for gene in [line.strip()
                 for line in IOTools.openFile(genes[0]).readlines()]:
        cp = []
        with IOTools.openFile(genes[1]) as infile:
            for line in infile:
                r = re.search(gene, line)
                if r:
                    line = line.strip().split("\t")
                    chrom = line[0]
                    pos = line[1]
                    cp.append("%s_%s" % (chrom, pos))
        cp = set(cp)
        for c in cp:
            Ls.append((gene, c.split("_")))
    df = pd.DataFrame(Ls)
    df['CHROM'] = df[1].str.get(0)
    df['POS'] = df[1].str.get(1)
    df = df.drop(1, 1)
    df.columns = ['gene', 'CHROM', 'POS']
    variants = vp1.merge(df, 'left')
    variants.to_csv(outfile, sep="\t")
def FilterFreqCols(infile, thresh, fcols):
    '''
    Returns a dictionary of the pair of allele frequencies observed on each
    line for every column in fcols, plus the set of line indices where both
    called alleles have a frequency of at least thresh in at least one of the
    columns specified in fcols.
    No information - assigned allele frequency of -1.
    '''
    fcols = fcols.split(",")
    # read the column headings from the variant table
    cols = IOTools.openFile(infile).readline().strip().split("\t")
    # store allele frequency columns
    AFdict = dict()
    # store low frequency indices
    nD = dict()
    for col in fcols:
        ind = cols.index(col)
        GT_i = cols.index('GT')
        n = 0
        nlist = set()
        AFS = []
        with IOTools.openFile(infile) as input:
            for line in input:
                if n > 1:
                    line = line.strip().split("\t")
                    GT = line[GT_i].replace(".", "0").split("/")
                    af = line[ind].split(",")
                    AF = []
                    # where the allele frequency is not numeric
                    # "." or "NA" use -1 to indicate no data
                    for a in af:
                        try:
                            AF.append(float(a))
                        except ValueError:
                            AF.append(float(-1))
                    AF2 = [l if l > 0 else 0 for l in AF]
                    AF = np.array(AF)
                    AF = np.insert(AF, 0, 1 - sum(AF2))
                    GT[0] = int(GT[0])
                    GT[1] = int(GT[1])
                    # If the variant is not in database the column shows "."
                    # but the site
                    # may still have been called as multi allelic
                    # - use -1 for all frequencies
                    # in this case
                    if max(GT[0], GT[1]) > (len(AF) - 1):
                        AF = [float(-1)] * (max(GT[0], GT[1]) + 1)
                    AF1 = AF[GT[0]]
                    AF2 = AF[GT[1]]
                    if AF1 >= thresh and AF2 >= thresh:
                        nlist.add(n)
                    AFS.append((AF1, AF2))
                else:
                    AFS.append(('NA', 'NA'))
                n += 1
        AFdict[col] = AFS
        nD[col] = nlist

    ns = set.union(*list(nD.values()))
    return AFdict, ns
Example #19
def iterator_psl_intervals(options):
    """iterate over psl file yield an entry together with overlapping entries.

    returns tuples of (match, list(query_intervals), list(target_intervals))
    """

    if options.filename_filter_query:
        intervals_query = readIntervals(
            IOTools.openFile(options.filename_filter_query, "r"), options)
    else:
        intervals_query = None

    if options.filename_filter_target:
        intervals_target = readIntervals(
            IOTools.openFile(options.filename_filter_target, "r"), options)
    else:
        intervals_target = None

    iterator = Blat.BlatIterator(options.stdin)

    ninput = 0

    while 1:

        match = iterator.next()
        if not match:
            break

        ninput += 1
        if options.test and ninput >= options.test:
            break

        if options.loglevel >= 1 and ninput % options.report_step == 0:
            options.stdlog.write("# progress: ninput=%i\n" % (ninput))
            options.stdlog.flush()

        qx, tx = None, None
        if intervals_query:
            try:
                qx = list(
                    intervals_query.get(match.mQueryId, match.mQueryFrom, match.mQueryTo))
            except KeyError:
                qx = []

        if intervals_target:
            try:
                tx = list(
                    intervals_target.get(match.mSbjctId, match.mSbjctFrom, match.mSbjctTo))
            except KeyError:
                tx = []

        if options.loglevel >= 2:
            options.stdlog.write(
                "###################################################\n")
            options.stdlog.write("# testing match %s\n" % (str(match)))
            options.stdlog.write(
                "###################################################\n")

        yield match, qx, tx
def getProbeFragments(probe_bed, digest_bed, outfile,
                        lookup_out):
    
    # First find the length of the restriction enzyme cut, required to obtain the start and end coordinates
    # from the pregenerated file.
    # First iteration, no comparison
    first_iteration = True
    
    length_RE_cut = 0
    
    last_bed = None
    
    for bed_digest in Bed.iterator(IOTools.openFile(digest_bed)):
                
        if first_iteration:
            first_iteration = False
        else:
            # If they are in the same contig they can be compared
            if bed_digest.contig == last_bed.contig:
                length_RE_cut = bed_digest.start - last_bed.end
                break
        
        last_bed = bed_digest
    
    
    digest_fragments = pysam.TabixFile(digest_bed)
    bed = Bed.Bed()
    with IOTools.openFile(outfile, "w") as outf, \
         IOTools.openFile(lookup_out,"w") as lookup:

        lookup.write("probe\tfragment\n")
        for probe in Bed.iterator(IOTools.openFile(probe_bed)):
            
            frag = digest_fragments.fetch(probe.contig,
                                          probe.start,
                                          probe.end,
                                          parser=pysam.asBed())
            frag = list(frag)
            if not len(frag) == 1:
                E.warn("%i fragments found for probe %s, skipping" %
                       (len(frag), probe.name))
                continue

            frag = frag[0]
            
            # The restriction enzyme cut on the left side of the fragment
            # is the end site of the last restriction enzyme fragment + 1
            # (+1 because according to the manual coordinates are specified
            # in 1-origin for the bed start.)
            
            bed.start = frag.start-length_RE_cut+1
            bed.end = frag.end+length_RE_cut
            bed.contig = frag.contig
            bed["name"] = probe.name
            bed["score"] = "."
            bed["strand"] = "+"

            lookup.write("%s\t%s\n" % (probe.name, frag.name))
            outf.write(str(bed) + "\n")
Example #21
def imputeGO(infile_go, infile_paths, outfile):
    """impute GO accessions.

    Output a list of gene-to-GO associations for genes that includes
    ancestral terms.

    Arguments
    ---------
    infile_go : string
        Filename with gene-to-GO associations for genes
    infile_paths : string
        Filename with paths of term to ancestor (see go2fmt.pl).
    outfile : string
         Output filename

    """

    c = E.Counter()

    term2ancestors = collections.defaultdict(set)
    with IOTools.openFile(infile_paths) as inf:
        for line in inf:
            parts = line[:-1].split()
            term = parts[0]
            ancestors = [parts[x] for x in range(2, len(parts), 2)]
            # there can be multiple paths
            term2ancestors[term].update(ancestors)

    goid2description = {}
    gene2goids = collections.defaultdict(list)
    goid2type = {}
    with IOTools.openFile(infile_go) as inf:
        for line in inf:
            if line.startswith("go_type"):
                continue
            go_type, gene_id, goid, description, evidence = line[:-1].split("\t")
            gene2goids[gene_id].append(goid)
            goid2description[goid] = description
            goid2type[goid] = go_type

    outf = IOTools.openFile(outfile, "w")
    for gene_id, in_goids in gene2goids.iteritems():
        c.genes += 1
        out_goids = set(in_goids)
        for goid in in_goids:
            out_goids.update(term2ancestors[goid])
        if len(in_goids) != len(out_goids):
            c.increased += 1
        else:
            c.complete += 1

        for goid in out_goids:
            outf.write("\t".join((goid2type.get(goid, ""), gene_id, goid, goid2description.get(goid, ""), "NA")) + "\n")
            c.associations += 1

    outf.close()

    E.info("%s" % str(c))
Example #22
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                            usage=globals()["__doc__"])

    parser.add_option("-f", "--change-format", dest="change_format", type="choice",
                      choices=('sanger', 'solexa', 'phred64', 'integer'),
                      help="guess quality score format and set quality scores to format [default=%default].")

    parser.add_option("--guess-format", dest="guess_format", type="choice",
                      choices=('sanger', 'solexa', 'phred64', 'integer'),
                      help="quality score format to assume if ambiguous [default=%default].")

    parser.add_option("--pattern", dest="pattern", type="string",
                      help="filename prefix [default=%default].")

    parser.set_defaults(
        change_format=None,
        guess_format=None,
        pattern="%s.gz"
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    c = E.Counter()

    outfile_seq = IOTools.openFile(options.pattern % "csfasta", "w")
    outfile_qual = IOTools.openFile(options.pattern % "qual", "w")

    if options.change_format:
        iter = Fastq.iterate_convert(options.stdin,
                                     format=options.change_format,
                                     guess=options.guess_format)
    else:
        iter = Fastq.iterate(options.stdin)

    for record in iter:
        c.input += 1
        outfile_seq.write(">%s\n%s\n" % (record.identifier, record.seq))
        outfile_qual.write(">%s\n%s\n" % (record.identifier, record.quals))
        c.output += 1

    outfile_seq.close()
    outfile_qual.close()

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.Stop()
def outputAllWindows( infile, outfile ):
    '''output all windows as a bed file with the l2fold change
    as the score.
    '''
    outf = IOTools.openFile( outfile, "w" )
    for line in IOTools.iterate( IOTools.openFile( infile ) ):
        outf.write( "\t".join( (line.contig, line.start, line.end, "%6.4f" % float(line.l2fold ))) + "\n" ) 

    outf.close()
Example #24
def buildGenomicFunctionalAnnotation(gtffile, dbh, outfiles):
    '''output a bed file with genomic regions with functional annotations.

    The regions for each gene are given in the gtf file.

    Each bed entry is a gene territory. Bed entries are labeled
    by functional annotations associated with a gene.

    Ambiguities in territories are resolved by outputting
    annotations for all genes within a territory.

    The output file contains annotations for both GO and GOSlim. These
    are prefixed by ``go:`` and ``goslim:``.
    '''
    territories_file = gtffile

    outfile_bed, outfile_tsv = outfiles

    gene2region = {}
    for gtf in GTF.iterator(IOTools.openFile(gtffile, "r")):
        gid = gtf.gene_id.split(":")
        for g in gid:
            gene2region[g] = (gtf.contig, gtf.start, gtf.end, gtf.strand)

    cc = dbh.cursor()

    outf = P.getTempFile(".")
    c = E.Counter()
    term2description = {}
    for db in ('go', 'goslim'):
        for gene_id, go_id, description in cc.execute(
                "SELECT gene_id, go_id, description FROM %s_assignments" % db):
            try:
                contig, start, end, strand = gene2region[gene_id]
            except KeyError:
                c.notfound += 1
                continue
            outf.write(
                "\t".join(map(str, (
                    contig, start, end,
                    "%s:%s" % (db, go_id), 1, strand))) + "\n")
            term2description["%s:%s" % (db, go_id)] = description
    outf.close()
    tmpfname = outf.name
    statement = '''sort -k1,1 -k2,2n  < %(tmpfname)s | uniq
    | gzip > %(outfile_bed)s'''

    P.run()

    outf = IOTools.openFile(outfile_tsv, "w")
    outf.write("term\tdescription\n")
    for term, description in term2description.iteritems():
        outf.write("%s\t%s\n" % (term, description))
    outf.close()

    os.unlink(tmpfname)
Example #25
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: bed2graph.py 2861 2010-02-23 17:36:32Z andreas $", usage=globals()["__doc__"])

    parser.add_option("-o", "--output-section", dest="output", type="choice",
                      choices=("full", "name"),
                      help="output either ``full`` overlapping entries, only the ``name``s. [default=%default].")

    parser.set_defaults(
        output="full",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError("two arguments required")

    if args[0] == "-":
        infile1 = options.stdin
    else:
        infile1 = IOTools.openFile(args[0], "r")

    infile2 = IOTools.openFile(args[1], "r")

    idx = Bed.readAndIndex(infile2, with_values=True)

    output = options.output
    outfile = options.stdout

    if output == "name":
        outfile.write("name1\tname2\n")
        outf = lambda x: x.fields[0]
    else:
        outf = str

    for bed in Bed.iterator(infile1):
        try:
            overlaps = idx[bed.contig].find(bed.start, bed.end)
        except (KeyError, IndexError):
            # ignore missing contig and zero length intervals
            continue

        for o in overlaps:
            outfile.write("\t".join((outf(bed), outf(o[2]))) + "\n")

    E.Stop()
def calculateSplicingIndex(bamfile, gtffile, outfile):
    '''count, for each intron, reads that are spliced (exon-exon) or
    unspliced (exon-intron / intron-exon) and write the genome-wide totals.'''

    bamfile = pysam.AlignmentFile(bamfile)

    counts = E.Counter()

    for transcript in GTF.transcript_iterator(
            GTF.iterator(IOTools.openFile(gtffile))):

        introns = GTF.toIntronIntervals(transcript)
        E.debug("Gene %s (%s), Transcript: %s, %i introns" %
                (transcript[0].gene_id,
                 transcript[0].contig,
                 transcript[0].transcript_id,
                 len(introns)))

        for intron in introns:
            reads = bamfile.fetch(
                reference=transcript[0].contig,
                start=intron[0], end=intron[1])
            
            for read in reads:
                if 'N' in read.cigarstring:
                    blocks = read.get_blocks()
                    starts, ends = zip(*blocks)
                    if intron[0] in ends and intron[1] in starts:
                        counts["Exon_Exon"] += 1
                    else:
                        counts["spliced_uncounted"] += 1
                elif (read.reference_start <= intron[0] - 3
                      and read.reference_end >= intron[0] + 3):
                    if transcript[0].strand == "+":
                        counts["Exon_Intron"] += 1
                    else:
                        counts["Intron_Exon"] += 1
                elif (read.reference_start <= intron[1] - 3
                      and read.reference_end >= intron[1] + 3):
                    if transcript[0].strand == "+":
                        counts["Intron_Exon"] += 1
                    else:
                        counts["Exon_Intron"] += 1
                else:
                    counts["unspliced_uncounted"] += 1

        E.debug("Done, counts are: " + str(counts))
    header = ["Exon_Exon",
              "Exon_Intron",
              "Intron_Exon",
              "spliced_uncounted",
              "unspliced_uncounted"]

    with IOTools.openFile(outfile, "w") as outf:

        outf.write("\t".join(header)+"\n")
        outf.write("\t".join(map(str, [counts[col] for col in header]))
                   + "\n")
Example #27
    def __call__(self, track, slice=None):

        c_transcript = []
        c_gene = []
        for transcript in GTF.transcript_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            c_transcript.append(len(transcript))
        for gene in GTF.flat_gene_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            c_gene.append(len(gene))

        return odict((("transcript", np.mean(c_transcript)), ("gene", np.mean(c_gene))))
def makeCpgIslandsBed(outfile):
    infile = PARAMS["methylation_summary_cpgislands"]
    out = IOTools.openFile(outfile, "w")
    with IOTools.openFile(infile, "r") as f:
        for line in f.readlines():
            # this assumes location of req. values
            contig, start, end = line.split()[1:4]
            if not contig == "chrom":
                out.write("%s\t%s\t%s\n" % (contig, start, end))
    out.close()
Example #29
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser( version = "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $", 
                                    usage = globals()["__doc__"] )

    parser.set_defaults(
        )

    ## add common options (-h/--help, ...) and parse command line 
    (options, args) = E.Start( parser, argv = argv )

    ## do sth
    assert len(args) == 3, "expected three command line arguments" 

    samfile = pysam.Samfile( args[0], "rb" )
    outstream1 = IOTools.openFile( args[1], "w" )
    outstream2 = IOTools.openFile( args[2], "w" )

    c = E.Counter()
    for read in samfile.fetch():
        c.input += 1
        if read.mate_is_unmapped:
            c.is_unmapped += 1
            if read.is_read1:
                seq1, seq2 = read.seq, "N" * read.qlen 
                qual1, qual2 = read.qual, "B" * read.qlen
            else:
                seq1, seq2 = "N" * read.qlen, read.seq
                qual1, qual2 = "B" * read.qlen, read.qual
        else:
            if read.is_read2: continue
            try:
                mate = samfile.mate( read )
                c.found += 1
            except ValueError, msg:
                mate = None
                c.failed += 1
                
            if mate:
                seq1, seq2 = read.seq, mate.seq
                qual1, qual2 = read.qual, mate.qual
            else:
                seq1, seq2 = read.seq, "N" * read.qlen 
                qual1, qual2 = read.qual, "B" * read.qlen

        c.output += 1
        outstream1.write( "@%s\n%s\n+\n%s\n" % (read.qname, seq1, qual1 ) )
        outstream2.write( "@%s\n%s\n+\n%s\n" % (read.qname, seq2, qual2 ) )
def getPathwayGenes(infiles, outfile):
    '''
    get genes that are associated with diff pathways
    '''
    pathway_file, clusters_file, geneset_file, probe2gene_file = [infiles[0]] + infiles[1]
    pathways = IOTools.openFile(pathway_file)

    # get which cluster it was
    cluster = os.path.basename(pathway_file)[1]
    
    # get genes associated with cluster
    probes = [x[:-1].split("\t")[0] for x in IOTools.openFile(clusters_file) if x[:-1].split("\t")[1] == cluster]
    
    # header
    pathways.readline()

    # get pathways
    sig_pathways = [x.split("\t")[11] for x in pathways.readlines()
                    if x.split("\t")[0] == "+"]

    # probe2gene map
    probe2gene_file = IOTools.openFile(probe2gene_file)
    probe2gene = {}
    for line in probe2gene_file:
        data = line[:-1].split("\t")
        probe, gene = data
        probe = probe.replace('"', '')
        gene = gene.replace('"', '')
        if probe in probes:
            probe2gene[probe] = gene
        else:
            continue

    # pathway2genes
    pathways_geneset = IOTools.openFile(geneset_file)
    pathway2genes = collections.defaultdict(set)
    for line in pathways_geneset.readlines():
        data = line[:-1].split("\t")
        pathway, gene = data[2], data[1]
        if pathway in sig_pathways:
            
            if gene in probe2gene.values():
                pathway2genes[pathway].add(gene)
            else:
                continue
        else:
            continue

    # output
    outf = open(outfile, "w")
    outf.write("pathway\tgene\n")
    for pathway, genes in pathway2genes.iteritems():
        for gene in genes:
            outf.write("%s\t%s\n" % (pathway, gene))
    outf.close()
Example #31
def getBarcodeCG(table, outfile):
    ''' Annotate barcode use statistics with %GC '''

    statement = " SELECT * FROM %(table)s" % locals()

    umi_stats = PUtils.fetch_DataFrame(statement)

    def _GC(x):
        return float(x.count("G") + x.count("C"))/len(x)

    barcode_gc = umi_stats.Barcode.apply(_GC)
    sample_gc = umi_stats.Sample.apply(_GC)
    umi_gc = umi_stats.UMI.apply(_GC)

    gc_stats= pandas.DataFrame({"Barcode":umi_stats.Barcode,
                                "barcode_gc": barcode_gc,
                                "sample_gc": sample_gc,
                                "umi_gc": umi_gc})

    gc_stats.to_csv(IOTools.openFile(outfile,"w"), sep="\t", index=False)
Example #32
    def __init__(self, filename, *args, **kwargs):

        assert filename is not None,\
            "please supply filename for CounterOverlap"

        Counter.__init__(self, *args, **kwargs)

        self.filename = filename

        E.info("reading intervals from %s" % self.filename)

        self.index = Bed.readAndIndex(IOTools.openFile(self.filename, "r"),
                                      per_track=True)

        E.info("read intervals for %s tracks" % len(self.index))

        self.tracks = list(self.index.keys())
        self.headers = []
        for track in self.tracks:
            self.headers.extend(["%s_nover" % track, "%s_bases" % track])
Example #33
def loadLncRNAClass(infile, outfile):
    '''
    load the lncRNA classifications
    '''
    tablename = os.path.basename(filenameToTablename(P.snip(infile,
                                                            ".gtf.gz")))

    to_cluster = False
    # just load each transcript with its classification
    temp = P.getTempFile()
    for transcript in GTF.transcript_iterator(
            GTF.iterator(IOTools.openFile(infile))):
        temp.write("%s\t%s\t%s\n" %
                   (transcript[0].transcript_id, transcript[0].gene_id,
                    transcript[0].source))
    temp.close()

    inf = temp.name
    statement = '''python %(scriptsdir)s/csv2db.py -t %(tablename)s --log=%(outfile)s.log --header=transcript_id,gene_id,class < %(inf)s > %(outfile)s'''
    P.run()
def runFastqScreen(infiles, outfile):
    '''run FastqScreen on input files.'''

    # variables required for statement built by FastqScreen()
    tempdir = P.getTempDir(".")
    outdir = os.path.join(PARAMS["exportdir"], "fastq_screen")

    # Create fastq_screen config file in temp directory
    # using parameters from Pipeline.ini
    with IOTools.openFile(os.path.join(tempdir, "fastq_screen.conf"),
                          "w") as f:
        for i, k in list(PARAMS.items()):
            if i.startswith("fastq_screen_database"):
                f.write("DATABASE\t%s\t%s\n" % (i[22:], k))

    m = PipelineMapping.FastqScreen()
    statement = m.build((infiles, ), outfile)
    P.run()
    shutil.rmtree(tempdir)
    P.touch(outfile)
Example #35
def collectFastQCSections(infiles, section):
    '''iterate over all fastqc files and extract a particular section.'''
    results = []

    for infile in infiles:

        track = P.snip(infile, ".fastqc")

        filename = os.path.join(
            PARAMS["exportdir"], "fastqc", track + "*_fastqc",
            "fastqc_data.txt")

        for fn in glob.glob(filename):
            prefix = os.path.basename(os.path.dirname(fn))
            for name, status, header, data in FastqcSectionIterator(
                    IOTools.openFile(fn)):
                if name == section:
                    results.append((track, status, header, data))

    return results
Example #36
def getPeakShiftFromZinba(infile):
    '''get peak shift for filename infile (.zinba output file).

    returns None if no shift found
    '''

    shift = None

    # search for
    # $offset
    # [1] 125

    with IOTools.openFile(infile, "r") as ins:
        lines = ins.readlines()
        for i, line in enumerate(lines):
            if line.startswith("$offset"):
                shift = int(lines[i + 1].split()[1])
                break

    return shift
Example #37
def buildMatrixFromTables(infiles,
                          column,
                          column_header=0,
                          dtype=numpy.float,
                          default=None):
    '''build a matrix from a column called *column* in a series of input files.

    The values in the column given by *column_header* are taken as the
    names of the rows.

    The columns are given by order of the input files.

    returns matrix, row_headers
    '''

    lists = []
    for infile in infiles:
        data = pandas.read_table(IOTools.openFile(infile))
        lists.append(list(zip(list(data[column_header]), list(data[column]))))

    return buildMatrixFromLists(lists, dtype=dtype, default=default)
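# buildMatrixFromLists() is referenced above but not part of this excerpt; the
# sketch below captures the assumed behaviour (one column per input list of
# (row_name, value) pairs, missing cells filled with *default*).  It is an
# illustration only, not the original implementation; numpy is assumed to be
# imported as in the function above.
def buildMatrixFromLists(lists, dtype=numpy.float, default=None):
    # collect row names in order of first appearance across all inputs
    row_headers = []
    seen = set()
    for pairs in lists:
        for key, _ in pairs:
            if key not in seen:
                seen.add(key)
                row_headers.append(key)
    fill = numpy.nan if default is None else default
    matrix = numpy.full((len(row_headers), len(lists)), fill, dtype=dtype)
    row_index = dict((key, i) for i, key in enumerate(row_headers))
    for column, pairs in enumerate(lists):
        for key, value in pairs:
            matrix[row_index[key], column] = value
    return matrix, row_headers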
Example #38
def makeIntervalCorrelation(infiles, outfile, field, reference):
    '''compute correlation of interval properties between sets
    '''

    dbhandle = sqlite3.connect(PARAMS["database_name"])

    tracks, idx = [], []
    for infile in infiles:
        track = P.snip(infile, ".bed")
        tablename = "%s_intervals" % P.tablequote(track)
        cc = dbhandle.cursor()
        statement = "SELECT contig, start, end, %(field)s FROM %(tablename)s" % locals(
        )
        cc.execute(statement)
        ix = IndexedGenome.IndexedGenome()
        for contig, start, end, peakval in cc:
            ix.add(contig, start, end, peakval)
        idx.append(ix)
        tracks.append(track)
    outs = IOTools.openFile(outfile, "w")
    outs.write("contig\tstart\tend\tid\t" + "\t".join(tracks) + "\n")

    for bed in Bed.iterator(infile=open(reference, "r")):

        row = []
        for ix in idx:
            try:
                intervals = list(ix.get(bed.contig, bed.start, bed.end))
            except KeyError:
                row.append("")
                continue

            if len(intervals) == 0:
                peakval = ""
            else:
                peakval = str((max([x[2] for x in intervals])))
            row.append(peakval)

        outs.write(str(bed) + "\t" + "\t".join(row) + "\n")

    outs.close()
def buildExperimentReadQuality(infiles, outfile, datadir):
    """build per-experiment read quality summary.

    Arguments
    ---------
    infiles : list
        List of filenames with fastqc output (logging information). The
        track name is derived from that.
    outfile : string
        Output filename in :term:`tsv` format.
    datadir : string
        Location of actual Fastqc output to be parsed.

    """
    data = collectFastQCSections(infiles,
                                 "Per sequence quality scores",
                                 datadir)
    first = True

    if len(data) == 0:
        raise ValueError("received no data")

    for track, status, header, rows in data:
        T = track.replace("-", "_").replace(".", "_")
        rows = [list(map(float, x.split("\t"))) for x in rows]
        header = header.split("\t")
        if first:
            first = False
            df_out = pd.DataFrame(rows)
            df_out.columns = header
            df_out.rename(columns={"Count": T}, inplace=True)
        else:
            df = pd.DataFrame(rows)
            df.columns = header
            df.rename(columns={"Count": T}, inplace=True)
            df_out = df_out.merge(df, how="outer", on="Quality", sort=True)

    # SLV: is this really required?
    #df_out = pd.DataFrame(df_out.sum(axis=1))

    df_out.to_csv(IOTools.openFile(outfile, "w"), sep="\t", index=False)
def buildRefcodingGeneSetStats(infile, outfile):
    '''
    counts:
    no. of transcripts
    no. genes
    average number of exons per transcript
    average number of exons per gene
    no. multi-exon transcripts
    no. single exon transcripts
    no. multi-exon genes
    no. single exon genes

    in the coding and lncRNA genesets
    '''

    # calculate exon status for refcoding genes.
    tmpf = P.getTempFilename(".") + ".gz"
    PipelineLncRNA.flagExonStatus(infile, tmpf)

    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join(["no_transcripts",
                          "no_genes",
                          "no_exons_per_transcript",
                          "no_exons_per_gene",
                          "no_single_exon_transcripts",
                          "no_multi_exon_transcripts",
                          "no_single_exon_genes",
                          "no_multi_exon_genes"]) + "\n")
    outf.write("\t".join(map(str, [
        PipelineLncRNA.CounterTranscripts(tmpf).count(),
        PipelineLncRNA.CounterGenes(tmpf).count(),
        PipelineLncRNA.CounterExonsPerTranscript(tmpf).count(),
        PipelineLncRNA.CounterExonsPerGene(tmpf).count(),
        PipelineLncRNA.CounterSingleExonTranscripts(tmpf).count(),
        PipelineLncRNA.CounterMultiExonTranscripts(tmpf).count(),
        PipelineLncRNA.CounterSingleExonGenes(tmpf).count(),
        PipelineLncRNA.CounterMultiExonGenes(tmpf).count()])))

    os.unlink(tmpf)
    os.unlink(tmpf + ".log")
    os.unlink(P.snip(tmpf, ".gz"))
Example #41
    def readLarge(self, fname=None):
        """ read a potentially huge file.
        """
        try:
            # get stderr, allowing for case where it's very large
            tmp = IOTools.openFile(fname, 'rb')
            s = ''
            buffsize = 1048576
            try:
                while True:
                    more = tmp.read(buffsize)
                    if len(more) > 0:
                        s += more
                    else:
                        break
            except OverflowError:
                pass
            tmp.close()
        except Exception as e:
            stop_err('Read Large Exception : %s' % str(e))
        return s
Example #42
    def readChunk(lines, chunk):
        # use real file, as MAST parser can not deal with a
        # list of lines
        tmpfile2 = P.getTempFile(".")
        try:
            motif, part = re.match(":: motif = (\S+) - (\S+) ::",
                                   lines[chunks[chunk]]).groups()
        except AttributeError:
            raise ValueError("parsing error in line '%s'" %
                             lines[chunks[chunk]])

        E.info("reading %s - %s" % (motif, part))

        tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
        tmpfile2.close()

        mast = MAST.parse(IOTools.openFile(tmpfile2.name, "r"))

        os.unlink(tmpfile2.name)

        return motif, part, mast
def buildPathwayGenes(infiles, outfile):
    '''
    build sets of genes that are differentially expressed and
    fall into the pathways identified in the top 10 lists
    '''
    goresult, gene2pathways = infiles[0], infiles[1]

    if "_up" in os.path.basename(goresult):
        contrast = os.path.basename(goresult).replace("_up", ",").split(",")[0]
    if "_down" in os.path.basename(goresult):
        contrast = os.path.basename(goresult).replace("_down",
                                                      ",").split(",")[0]
    dbh = connect()
    result_table = contrast

    # catch where there is no sig
    if len(IOTools.openFile(goresult).readlines()) == 1:
        P.touch(outfile)
    else:
        PipelineIlmnArray.buildPathwayGenes(goresult, gene2pathways, contrast,
                                            dbh, result_table, outfile)
Example #44
def preprocessIdba(infile, outfile):
    '''
    preprocess pooled reads for IDBA
    '''
    # check for second read in the pair
    if infile.endswith(".fastq.gz"):
        E.info("converting fastq file to fasta file")
        outf = open(outfile, "w")
        for fastq in Fastq.iterate(IOTools.openFile(infile)):
            outf.write("%s\n%s\n" % (">" + fastq.identifier, fastq.seq))
        outf.close()
    elif infile.endswith(".1.gz"):
        read2 = P.snip(infile, ".1.gz") + ".2.gz"
        assert os.path.exists(read2), "file does not exist %s" % read2

        statement = '''python %(scriptsdir)s/fastqs2fasta.py
                   -a %(infile)s
                   -b %(read2)s
                   --log=%(infile)s.log
                   > %(outfile)s'''
        P.run()
def fetchGenomeAssemblyAndAnnotations(infile, outfiles):
    '''Download all assemblies using rsync, as per ncbi recommendation'''
    
    outdir = P.snip(infile, '.txt') + '.dir'
    if os.path.exists(outdir):
        shutil.rmtree(outdir)
    os.mkdir(outdir)

    for genome in IOTools.openFile(infile):
        genome_path = genome.split().pop()
        genome_path = re.sub('ftp', 'rsync', genome_path)
        gff_path = P.snip(genome_path, '.fna.gz') + '.gff.gz'

        statement = (" rsync --copy-links --quiet"
                     "  %(genome_path)s"
                     "  %(outdir)s &&"
                     " rsync --copy-links --quiet"
                     "  %(gff_path)s"
                     "  %(outdir)s")
        to_cluster = False
        P.run()
Example #46
    def __call__(self, track, slice=None):

        fn = os.path.join(
            DATADIR, "peakshapes.dir",
            "%(track)s.peakshape.tsv.gz.matrix_%(slice)s.gz" % locals())
        if not os.path.exists(fn): return

        matrix, rownames, colnames = IOTools.readMatrix(IOTools.openFile(fn))

        nrows = len(rownames)
        if nrows < 2: return

        if nrows > 1000:
            take = numpy.array(numpy.floor(numpy.arange(
                0, nrows, nrows / 1000)),
                               dtype=int)
            rownames = [rownames[x] for x in take]
            matrix = matrix[take]

        return odict(
            (('matrix', matrix), ('rows', rownames), ('columns', colnames)))
Example #47
def buildFDRStats(infile, outfile, method):
    '''compute number of windows called at different FDR.
    '''

    data = pandas.read_csv(IOTools.openFile(infile), sep="\t", index_col=0)

    assert data['treatment_name'].iloc[0] == data['treatment_name'].iloc[-1]
    assert data['control_name'].iloc[0] == data['control_name'].iloc[-1]

    treatment_name = data['treatment_name'].iloc[0]
    control_name = data['control_name'].iloc[0]

    fdrs = (0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0)

    # write one row per FDR threshold; the output layout is not specified
    # in the original, so a minimal tab-separated table is used here
    outf = IOTools.openFile(outfile, "w")
    outf.write("treatment\tcontrol\tfdr\tnwindows\n")
    for fdr in fdrs:
        take = data['qvalue'] <= fdr
        significant = sum(take)
        outf.write("%s\t%s\t%s\t%i\n" %
                   (treatment_name, control_name, str(fdr), significant))
    outf.close()
Example #48
def getPeakShiftFromMacs(infile):
    '''get peak shift for filename infile (.macs output file).

    returns None if no shift found'''

    shift = None
    with IOTools.openFile(infile, "r") as ins:
        rx = re.compile("#2 predicted fragment length is (\d+) bps")
        r2 = re.compile("#2 Use (\d+) as shiftsize, \d+ as fragment length")
        for line in ins:
            x = rx.search(line)
            if x:
                shift = int(x.groups()[0])
                break
            x = r2.search(line)
            if x:
                shift = int(x.groups()[0])
                E.warn("shift size was set automatically - see MACS logfiles")
                break

    return shift
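A hedged, self-contained illustration of the two log-line formats that the regular expressions above are written against; the sample lines are invented and the exact wording of real MACS output varies between versions.

# Illustrative only: made-up .macs log lines matching the two patterns above.
import re

example_log = [
    "INFO #2 predicted fragment length is 146 bps",
    "INFO #2 Use 73 as shiftsize, 146 as fragment length",
]
rx = re.compile(r"#2 predicted fragment length is (\d+) bps")
r2 = re.compile(r"#2 Use (\d+) as shiftsize, \d+ as fragment length")
for line in example_log:
    x = rx.search(line) or r2.search(line)
    if x:
        print(int(x.groups()[0]))   # -> 146, then 73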
Example #49
    def __call__(self, track):

        length = {}
        for transcript in GTF.transcript_iterator(
                GTF.iterator(IOTools.openFile("gtfs/lncrna_filtered.gtf.gz"))):
            length[transcript[0].transcript_id] = sum(
                [gtf.end - gtf.start for gtf in transcript])

        score = {}
        dbh = sqlite3.connect("csvdb")
        cc = dbh.cursor()
        for data in cc.execute(
                "SELECT transcript_id, CP_score FROM lncrna_filtered_cpc_result"
        ):
            score[data[0]] = data[1]

        result = {"length": [], "score": []}
        for transcript, value in length.items():
            result["length"].append(np.log10(length[transcript]))
            result["score"].append(score[transcript])
        return result
Example #50
def getGODescriptions(infile):
    '''build dictionary mapping GOids to types and descriptions.

    Arguments
    ---------
    infile : string
        Filename of table with GO assignments

    Returns
    -------
    mapping : dict
        Dictionary mapping GOid to GOtype and GOdescription.
    '''

    with IOTools.openFile(infile) as inf:
        fields, table = CSV.readTable(inf, as_rows=False)

    return dict([
        (y, (x, z)) for x, y, z in zip(table[fields.index("go_type")], table[
            fields.index("go_id")], table[fields.index("description")])
    ])
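A small worked example of the zip/dict construction used above, with invented values; it simply demonstrates that the returned mapping is keyed by go_id.

# Hedged worked example with made-up values; columns are stored column-wise,
# matching what CSV.readTable(..., as_rows=False) returns above.
fields = ["go_type", "go_id", "description"]
table = [["biol_process", "molec_function"],
         ["GO:0008150", "GO:0003674"],
         ["biological_process", "molecular_function"]]
mapping = dict([
    (y, (x, z)) for x, y, z in zip(table[fields.index("go_type")],
                                   table[fields.index("go_id")],
                                   table[fields.index("description")])])
assert mapping["GO:0008150"] == ("biol_process", "biological_process")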
Example #51
def loadLncRNAClass(infile, outfile):
    '''
    load the lncRNA classifications
    '''

    # just load each transcript with its classification
    temp = P.getTempFile(".")
    inf = IOTools.openFile(infile)
    for transcript in GTF.transcript_iterator(GTF.iterator(inf)):
        temp.write("%s\t%s\t%s\n" %
                   (transcript[0].transcript_id, transcript[0].gene_id,
                    transcript[0].source))
    temp.close()

    P.load(temp.name,
           outfile,
           options="--header-names=transcript_id,gene_id,class "
           "--add-index=transcript_id "
           "--add-index=gene_id")

    os.unlink(temp.name)
    def unstowSetDict(self, infile):
        '''
        Regenerates a dictionary using a file generated by
        stowSetDict.
        Reads a flat file where column one holds dictionary keys and column
        two a comma-delimited list of values for that key, and builds a
        dictionary of sets,
        e.g. A    a,b,c would become {"A": set(["a", "b", "c"])}
        '''
        D = dict()
        with IOTools.openFile(infile) as inf:
            # skip the header line
            for i, line in enumerate(inf):
                if i == 0:
                    continue
                line = line.strip().split("\t")
                if line[0] not in D and len(line) > 1:
                    D[line[0]] = set()
                    for part in line[1].split(","):
                        D[line[0]].add(part)
        return D
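The matching writer is not part of this example; the sketch below is only an assumption of what stowSetDict could look like, inferred from the header-skipping and tab/comma parsing in unstowSetDict above.

    # Hypothetical counterpart, not taken from the original code base:
    # writes one header line, then "key<tab>comma-separated-values" rows,
    # which is the layout unstowSetDict expects.
    def stowSetDict(self, D, outfile):
        '''flatten a dictionary of sets into a two-column table'''
        with IOTools.openFile(outfile, "w") as outf:
            outf.write("key\tvalues\n")
            for key, values in D.items():
                outf.write("%s\t%s\n" % (key, ",".join(sorted(values))))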
Example #53
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version=
        "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g",
                      "--genome-dir",
                      dest="genome_dir",
                      type="string",
                      help="supply help")

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv)

    contigs_map = {}
    for genome in glob.glob(os.path.join(options.genome_dir, "*")):
        for fasta in FastaIterator.iterate(IOTools.openFile(genome)):
            identifier = fasta.title.split("|")
            gi = identifier[1]
            contigs_map[gi] = fasta.title

    for line in options.stdin.readlines():
        data = line[:-1].split("\t")
        gi = data[1]
        assert gi in contigs_map, "cannot find genome with id gi|%s in genomes directory" % gi

        options.stdout.write("%s\t%s\n" % (data[0], contigs_map[gi]))

    # write footer and output benchmark information.
    E.Stop()
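A hedged illustration of how an NCBI-style fasta title is keyed by its gi number in contigs_map above; the title shown is invented.

# Illustrative only: a made-up NCBI-style fasta title and the gi key
# extracted from it, mirroring the split("|") logic in main().
title = "gi|556503834|ref|NC_000913.3| Escherichia coli str. K-12"
identifier = title.split("|")
gi = identifier[1]
assert gi == "556503834"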
Example #54
    def __call__(self, track, slice=None):

        classes = [
            "antisense", "antisense_upstream", "antisense_downstream",
            "sense_upstream", "sense_downstream", "intergenic",
            "sense_intronic", "antisense_intronic"
        ]

        coding_set = {}
        for gtf in GTF.iterator(
                IOTools.openFile("gtfs/lncrna_filtered.class.gtf.gz")):
            coding_set[gtf.transcript_id] = gtf.source

        result = {"noncoding": {}, "coding": collections.defaultdict(int)}
        total_nc = float(
            self.getValue(
                "SELECT COUNT(*) FROM %(track)s_cpc_result WHERE C_NC = 'noncoding'"
            ))
        for c in classes:
            result["noncoding"][c] = (float(
                self.getValue(
                    """SELECT COUNT(*) FROM lncrna_final_class as a, %s_cpc_result as b WHERE a.class = '%s' 
                                                              AND b.C_NC = 'noncoding' 
                                                              AND a.transcript_id = b.transcript_id"""
                    % (track, c))) / total_nc) * 100

        total_c = len(coding_set)
        ids = self.getValues(
            "SELECT transcript_id FROM %(track)s_cpc_result WHERE C_NC = 'coding'"
        )
        for i in ids:
            c = coding_set.get(i)
            if c in classes:
                result["coding"][c] += 1

        for x, y in result["coding"].items():
            result["coding"][x] = (float(y) / total_c) * 100

        return result
Example #55
    def _count(self, filename, idx):
        '''count filename against idx.'''

        overlapping_genes = set()
        genes = set()

        # iterate over exons
        infile = IOTools.openFile(filename, "r")
        it = Bed.bed_iterator(infile)

        nexons, nexons_overlapping = 0, 0
        nbases, nbases_overlapping = 0, 0
        for this in it:
            nexons += 1
            nbases += this.end - this.start

            try:
                intervals = list(idx[this.contig].find(max(0, this.start),
                                                       this.end))
            except KeyError:
                continue
            except Exception as msg:
                raise Exception("error while processing %s, msg=%s" %
                                (filename, msg))
            if len(intervals) == 0:
                continue

            nexons_overlapping += 1
            start, end = this.start, this.end
            counts = numpy.zeros(end - start, numpy.int)
            for other_start, other_end, other_value in intervals:
                for x in range(
                        max(start, other_start) - start,
                        min(end, other_end) - start):
                    counts[x] += 1
            nbases_overlapping += sum([1 for x in counts if x > 0])

        infile.close()

        return nexons, nexons_overlapping, nbases, nbases_overlapping
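A hedged mini-example of the per-base overlap count used in _count above: a 10 bp exon overlapped over its last 4 bp yields 4 overlapping bases, regardless of how many intervals pile up there.

# Self-contained illustration with invented coordinates.
import numpy

start, end = 100, 110                    # exon of 10 bp
intervals = [(106, 120, None)]           # one overlapping interval
counts = numpy.zeros(end - start, dtype=int)
for other_start, other_end, other_value in intervals:
    for x in range(max(start, other_start) - start,
                   min(end, other_end) - start):
        counts[x] += 1
print(sum([1 for x in counts if x > 0]))   # -> 4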
Example #56
def collectCpGIslands(infile, outfile):
    '''select CpG islands from UCSC and write them to *outfile*
    as a tab-separated table.
    '''

    dbhandle = PipelineGeneset.connectToUCSC()

    # CpG islands are stored in the single ``cpgIslandExt`` table,
    # so a single statement against the UCSC mysql database suffices.
    cc = dbhandle.cursor()
    table = "cpgIslandExt"
    sql = """SELECT chrom, chromStart, chromEnd, name, obsExp
               FROM %(table)s
    """ % locals()
    E.debug("executing sql statement: %s" % sql)
    cc.execute(sql)
    outf = IOTools.openFile(outfile, "w")
    for data in cc.fetchall():
        outf.write("\t".join(map(str, data)) + "\n")

    outf.close()
Example #57
def loadFastqc(infile, outfile):
    '''load FASTQC stats.'''

    track = P.snip(infile, ".fastqc")

    filename = os.path.join(PARAMS["exportdir"], "fastqc", track + "*_fastqc",
                            "fastqc_data.txt")

    for fn in glob.glob(filename):
        prefix = os.path.basename(os.path.dirname(fn))
        results = []

        for name, status, header, data in FastqcSectionIterator(
                IOTools.openFile(fn)):
            # do not collect basic stats, see loadFastQCSummary
            if name == "Basic Statistics":
                continue

            parser = CSV2DB.buildParser()
            (options, args) = parser.parse_args([])
            options.tablename = prefix + "_" + re.sub(" ", "_", name)
            options.allow_empty = True

            inf = cStringIO.StringIO("\n".join([header] + data) + "\n")
            CSV2DB.run(inf, options)
            results.append((name, status))

        # load status table
        parser = CSV2DB.buildParser()
        (options, args) = parser.parse_args([])
        options.tablename = prefix + "_status"
        options.allow_empty = True

        inf = cStringIO.StringIO("\n".join(["name\tstatus"] +
                                           ["\t".join(x)
                                            for x in results]) + "\n")
        CSV2DB.run(inf, options)

    P.touch(outfile)
    def readOntFile(self):
        '''
        Reads an obo formatted file.
        Generates a dictionary - self.TermsToOnt
        Keys are terms, values are all immediate parents of that term - the
        "is_a" argument in the obo file.
        '''
        TermsToOnt = dict()
        Tname = None
        isas = set()
        with IOTools.openFile(self.options['ont']) as infile:
            for line in infile:
                line = line.strip()
                if line.startswith("id"):
                    if Tname is not None:
                        TermsToOnt[Tname] = isas
                        isas = set()
                    Tname = line.split(": ")[1]
                elif line.startswith("is_a"):
                    isas.add(":".join(line.split(": ")[1:]).split(" ! ")[0])
        TermsToOnt[Tname] = isas
        return TermsToOnt
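An illustration of the is_a extraction above applied to a toy obo stanza held in memory; real obo files carry many more fields, which the parser simply ignores.

# Illustrative only: toy obo lines, not a real ontology file.
toy_obo = [
    "[Term]",
    "id: GO:0006915",
    "name: apoptotic process",
    "is_a: GO:0012501 ! programmed cell death",
]
isas = set()
for line in toy_obo:
    if line.startswith("is_a"):
        isas.add(":".join(line.split(": ")[1:]).split(" ! ")[0])
print(isas)   # -> set(['GO:0012501'])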
Example #59
def GetAndTranslateAllGenes(outfile):
    '''
    This step is required.
    1. All Entrez gene IDs are downloaded from entrez gene.
    2. Corresponding ensembl gene, ensembl transcript and ensembl protein
       IDs are downloaded from mygene.info
    3. Corresponding gene symbols are downloaded from mygene.info
    4. These are loaded into the database
    5. A list of all gene Entrez IDs is stored as 'allgenes.tsv'

    Tables:
    ensemblg2entrez$geneid - ensemblg to entrez ID
    ensemblg2ensemblt$other - ensemblg to ensembl transcript
    ensemblg2ensemblp$other - ensemblg to ensembl protein
    ensemblg2symbol_xxx$geneid - ensemblg to symbol in species xxx
    '''
    GeneAnnot = PipelineGeneInfo.EntrezGeneAnnotation(PARAMS['db_name'],
                                                      PARAMS['entrez_email'])
    genelist = GeneAnnot.download_all(PARAMS['entrez_host'])

    # Generate an EnsemblAnnotation object
    Ens = PipelineGeneInfo.EnsemblAnnotation(PARAMS['my_gene_info_source'],
                                             PARAMS['db_name'])
    # Get Ensembl annotations
    PipelineGeneInfo.runall(Ens, genelist, ['ensembl'], submit=True)

    # Generate a SymbolAnnotation object
    Sym = PipelineGeneInfo.SymbolAnnotation(PARAMS['my_gene_info_source'],
                                            PARAMS['db_name'],
                                            PARAMS['entrez_host'],
                                            PARAMS['entrez_sciname'])
    # Get Symbol Annotations
    PipelineGeneInfo.runall(Sym, genelist, ['symbol'], submit=True)

    # Make output gene list
    outf = IOTools.openFile(outfile, "w")
    for gene in genelist:
        outf.write("%s\n" % gene)
    outf.close()
Example #60
def buildOptimalPrimerSet(infiles, outfile):
    '''
    build a set of optimal primer pairs across sequences
    '''
    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join([
        "name", "forward_seq", "forward_gc (%)", "forward_tm",
        "forward_length (bp)", "reverse_seq", "reverse_gc (%)",
        "reverse_tm", "reverse_length (bp)", "fragment_length (bp)"]) + "\n")
    for infile in infiles:
        primerset = PrimerSet()
        name = primerset.readName(infile)
        size = primerset.readSize(infile)
        forward = primerset.readForward(infile)
        reverse = primerset.readReverse(infile)
        primerset = primerset.parse(name, size, *forward + reverse)
        outf.write("\t".join([
            primerset.name, primerset.forwardseq, primerset.forwardgc,
            primerset.forwardtm, primerset.forwardlength, primerset.reverseseq,
            primerset.reversegc, primerset.reversetm, primerset.reverselength,
            primerset.size
        ]) + "\n")
    outf.close()