def mergeAndLoad(infiles, outfile, suffix):
    """load categorical tables (two columns) into a database.

    The tables are merged and entered row-wise.
    """
    # One quoted table name per input file, derived by stripping *suffix*.
    header = ",".join([P.tablequote(P.snip(x, suffix)) for x in infiles])

    # Gzipped inputs are streamed through zcat, plain ones through cat;
    # only the first two columns of each file are used.
    reader = "zcat" if suffix.endswith(".gz") else "cat"
    filenames = " ".join(
        ["<( %s %s | cut -f 1,2 )" % (reader, x) for x in infiles])

    tablename = P.toTable(outfile)

    # P.run() interpolates the %(...)s placeholders from this frame's
    # locals, so header/filenames/tablename must keep these names.
    statement = """python %(scriptsdir)s/combine_tables.py --header-names=%(header)s --missing-value=0 --ignore-empty %(filenames)s | perl -p -e "s/bin/track/" | python %(scriptsdir)s/table2table.py --transpose | python %(scriptsdir)s/csv2db.py --add-index=track --table=%(tablename)s > %(outfile)s """
    P.run()
def mergeAndLoad(infiles, outfile, suffix):
    """load categorical tables (two columns) into a database.

    The tables are merged and entered row-wise.
    """
    # NOTE(review): a second definition of mergeAndLoad exists in this file;
    # at import time the later one shadows the earlier.
    # One quoted table name per input file, derived by stripping *suffix*.
    header = ",".join([P.tablequote(P.snip(x, suffix)) for x in infiles])
    # Stream the first two columns of each input; decompress .gz on the fly.
    if suffix.endswith(".gz"):
        filenames = " ".join(["<( zcat %s | cut -f 1,2 )" % x for x in infiles])
    else:
        filenames = " ".join(["<( cat %s | cut -f 1,2 )" % x for x in infiles])
    tablename = P.toTable(outfile)
    # P.run() interpolates the %(...)s placeholders from this frame's
    # locals (plus pipeline PARAMS such as scriptsdir), so the names
    # header/filenames/tablename/outfile must not change.
    statement = """python %(scriptsdir)s/combine_tables.py --header-names=%(header)s --missing-value=0 --ignore-empty %(filenames)s | perl -p -e "s/bin/track/" | python %(scriptsdir)s/table2table.py --transpose | python %(scriptsdir)s/csv2db.py --add-index=track --table=%(tablename)s > %(outfile)s """
    P.run()
def makeIntervalCorrelation(infiles, outfile, field, reference):
    '''compute correlation of interval properties between sets.

    For every interval in *reference* (a bed file), write one row with the
    per-track value of *field* (e.g. peakval) taken from the overlapping
    intervals of each track's ``<track>_intervals`` table.
    '''
    dbhandle = sqlite3.connect(PARAMS["database_name"])
    # Build one indexed-genome lookup per track, keyed on interval position.
    tracks, idx = [], []
    for infile in infiles:
        track = P.snip(infile, ".bed.gz")
        tablename = "%s_intervals" % P.tablequote(track)
        cc = dbhandle.cursor()
        # field/tablename are pipeline-internal values, not user input.
        statement = "SELECT contig, start, end, %(field)s FROM %(tablename)s" % locals()
        cc.execute(statement)
        ix = IndexedGenome.IndexedGenome()
        for contig, start, end, peakval in cc:
            ix.add(contig, start, end, peakval)
        idx.append(ix)
        tracks.append(track)
    outs = IOTools.openFile(outfile, "w")
    outs.write("contig\tstart\tend\tid\t" + "\t".join(tracks) + "\n")
    for bed in Bed.iterator(infile=IOTools.openFile(reference, "r")):
        row = []
        for ix in idx:
            try:
                intervals = list(ix.get(bed.contig, bed.start, bed.end))
            except KeyError:
                # contig absent from this track's index
                row.append("")
                continue
            if len(intervals) == 0:
                peakval = ""
            else:
                # several intervals may overlap: report the maximum value
                peakval = str((max([x[2] for x in intervals])))
            row.append(peakval)
        outs.write(str(bed) + "\t" + "\t".join(row) + "\n")
    outs.close()
def loadBAMStats(infiles, outfile):
    """import BAM statistics into the database.

    Loads a merged summary table from the ``.readstats`` files, then one
    histogram table each for the ``.nm`` and ``.nh`` companion files.
    """
    # one quoted column/table name per input, minus the .readstats suffix
    header = ",".join([P.tablequote(P.snip(x, ".readstats")) for x in infiles])
    # stream only the first two columns of each stats file
    filenames = " ".join(["<( cut -f 1,2 < %s)" % x for x in infiles])
    tablename = P.toTable(outfile)
    E.info("loading bam stats - summary")
    # P.run() interpolates %(...)s placeholders from this frame's locals,
    # so header/filenames/tablename must keep these names.
    statement = """python %(scriptsdir)s/combine_tables.py --header-names=%(header)s --missing-value=0 --ignore-empty %(filenames)s | perl -p -e "s/bin/track/" | perl -p -e "s/unique/unique_alignments/" | python %(scriptsdir)s/table2table.py --transpose | python %(scriptsdir)s/csv2db.py --add-index=track --table=%(tablename)s > %(outfile)s """
    P.run()
    # per-suffix histogram tables, loaded as <tablename>_nm / <tablename>_nh
    for suffix in ("nm", "nh"):
        E.info("loading bam stats - %s" % suffix)
        filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles])
        tname = "%s_%s" % (tablename, suffix)
        statement = """python %(scriptsdir)s/combine_tables.py --header-names=%(header)s --skip-titles --missing-value=0 --ignore-empty %(filenames)s | perl -p -e "s/bin/%(suffix)s/" | python %(scriptsdir)s/csv2db.py --table=%(tname)s >> %(outfile)s """
        P.run()
def loadBAMStats(infiles, outfile):
    '''import BAM statistics into the database.

    Loads a merged summary table from the ``.readstats`` files, then one
    histogram table each for the ``.nm`` and ``.nh`` companion files.
    '''
    # NOTE(review): duplicate definition of loadBAMStats in this file;
    # this later one shadows the earlier definition at import time.
    # one quoted column/table name per input, minus the .readstats suffix
    header = ",".join([P.tablequote(P.snip(x, ".readstats")) for x in infiles])
    # stream only the first two columns of each stats file
    filenames = " ".join(["<( cut -f 1,2 < %s)" % x for x in infiles])
    tablename = P.toTable(outfile)
    E.info("loading bam stats - summary")
    # P.run() interpolates %(...)s placeholders from this frame's locals,
    # so header/filenames/tablename must keep these names.
    statement = """python %(scriptsdir)s/combine_tables.py --header-names=%(header)s --missing-value=0 --ignore-empty %(filenames)s | perl -p -e "s/bin/track/" | perl -p -e "s/unique/unique_alignments/" | python %(scriptsdir)s/table2table.py --transpose | python %(scriptsdir)s/csv2db.py --add-index=track --table=%(tablename)s > %(outfile)s """
    P.run()
    # per-suffix histogram tables, loaded as <tablename>_nm / <tablename>_nh
    for suffix in ("nm", "nh"):
        E.info("loading bam stats - %s" % suffix)
        filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles])
        tname = "%s_%s" % (tablename, suffix)
        statement = """python %(scriptsdir)s/combine_tables.py --header-names=%(header)s --skip-titles --missing-value=0 --ignore-empty %(filenames)s | perl -p -e "s/bin/%(suffix)s/" | python %(scriptsdir)s/csv2db.py --table=%(tname)s >> %(outfile)s """
        P.run()
def loadTranscriptomeValidation(infiles, outfile):
    """load transcriptome validation data into database."""
    # flag read by P.run() from this frame's locals
    to_cluster = USECLUSTER
    # one quoted column name per input BAM, minus the .accepted.bam suffix
    headers = ",".join([P.tablequote(P.snip(x, ".accepted.bam")) for x in infiles])
    # the data actually loaded are the per-BAM .log files
    infiles = " ".join(["%s.log" % x for x in infiles])
    tablename = P.toTable(outfile)
    # P.run() interpolates %(...)s placeholders from this frame's locals,
    # so headers/infiles/tablename must keep these names.
    statement = """ python %(scriptsdir)s/combine_tables.py --header-names=%(headers)s %(infiles)s | python %(scriptsdir)s/table2table.py --transpose | perl -p -e "s/bin/track/" | python %(scriptsdir)s/csv2db.py --table=%(tablename)s > %(outfile)s """
    P.run()
def loadTranscriptomeValidation(infiles, outfile):
    '''load transcriptome validation data into database.'''
    # NOTE(review): duplicate definition of loadTranscriptomeValidation;
    # the last definition in the file wins at import time.
    # flag read by P.run() from this frame's locals
    to_cluster = USECLUSTER
    # one quoted column name per input BAM, minus the .accepted.bam suffix
    headers = ",".join(
        [P.tablequote(P.snip(x, ".accepted.bam")) for x in infiles])
    # the data actually loaded are the per-BAM .log files
    infiles = " ".join(["%s.log" % x for x in infiles])
    tablename = P.toTable(outfile)
    # P.run() interpolates %(...)s placeholders from this frame's locals,
    # so headers/infiles/tablename must keep these names.
    statement = ''' python %(scriptsdir)s/combine_tables.py --header-names=%(headers)s %(infiles)s | python %(scriptsdir)s/table2table.py --transpose | perl -p -e "s/bin/track/" | python %(scriptsdir)s/csv2db.py --table=%(tablename)s > %(outfile)s '''
    P.run()
def loadTranscriptomeValidation(infiles, outfile):
    '''load transcriptome validation data into database.'''
    # NOTE(review): this variant invokes the installed ``cgat`` command-line
    # tools rather than scripts under %(scriptsdir)s; it is one of several
    # same-named definitions in this file and the last one wins.
    # flag read by P.run() from this frame's locals
    to_cluster = USECLUSTER
    # one quoted column name per input BAM, minus the .accepted.bam suffix
    headers = ",".join([P.tablequote(P.snip(x, ".accepted.bam")) for x in infiles])
    # the data actually loaded are the per-BAM .log files
    infiles = " ".join(["%s.log" % x for x in infiles])
    tablename = P.toTable(outfile)
    # P.run() interpolates %(...)s placeholders from this frame's locals,
    # so headers/infiles/tablename must keep these names.
    statement = ''' cgat combine_tables --header-names=%(headers)s %(infiles)s | cgat table2table --transpose | perl -p -e "s/bin/track/" | cgat csv2db --table=%(tablename)s > %(outfile)s '''
    P.run()
def exportIntervalsAsBed(infile, outfile):
    '''export macs peaks as bed files.

    Reads the ``<track>_intervals`` table and writes a 5-column bed file;
    if *outfile* ends in ``.gz`` the bed file is bgzip-compressed and
    tabix-indexed afterwards.
    '''
    # NOTE(review): the *infile* argument is unused here; the track is
    # derived from *outfile* instead.
    dbhandle = sqlite3.connect(PARAMS["database_name"])
    if outfile.endswith(".gz"):
        compress = True
        track = P.snip(outfile, ".bed.gz")
    else:
        compress = False
        track = P.snip(outfile, ".bed")
    tablename = "%s_intervals" % P.tablequote(track)
    cc = dbhandle.cursor()
    statement = "SELECT contig, start, end, interval_id, peakval FROM %s ORDER by contig, start" % tablename
    cc.execute(statement)
    outs = IOTools.openFile("%s.bed" % track, "w")
    for result in cc:
        contig, start, end, interval_id, peakval = result
        # peakval is truncated at a 1000 as this is the maximum permitted
        # score in a bed file.
        peakval = int(min(peakval, 1000))
        outs.write("%s\t%i\t%i\t%s\t%i\n" %
                   (contig, start, end, str(interval_id), peakval))
    cc.close()
    outs.close()
    if compress:
        E.info("compressing and indexing %s" % outfile)
        # NOTE(review): possibly intended as ``to_cluster`` (the name
        # P.run() inspects elsewhere in this file) — confirm.
        use_cluster = True
        # P.run() interpolates %(track)s / %(outfile)s from this frame.
        statement = 'bgzip -f %(track)s.bed; tabix -f -p bed %(outfile)s'
        P.run()
def writeSequencesForIntervals(track,
                               filename,
                               dbhandle,
                               full=False,
                               halfwidth=None,
                               maxsize=None,
                               proportion=None,
                               masker=None,
                               offset=0,
                               shuffled=False,
                               num_sequences=None,
                               min_sequences=None,
                               order="peakval",
                               shift=None):
    '''build a sequence set for motif discovery.

    Intervals are taken from the table <track>_intervals in the database
    *dbhandle* and saved to *filename* in :term:`fasta` format.

    The sequences are masked before shuffling (is this appropriate?)

    If *full* is set, the whole intervals will be output, otherwise only
    the region around the peak given by *halfwidth*.

    If *maxsize* is set, the output is truncated at *maxsize* characters
    in order not to create jobs that take too long.

    If *proportion* is set, only the top *proportion* intervals are
    output (sorted by peakval).

    If *num_sequences* is set, the first *num_sequences* will be used.

    *masker* can be a combination of

    * dust, dustmasker: apply dustmasker
    * softmask: mask softmasked genomic regions

    *order* is the order by which peaks should be sorted. Possible
    values are 'peakval' (peak value, descending order), 'max'
    (peak score, descending order).

    If *shift* is set, intervals will be shifted. ``leftright`` creates
    two intervals on the left and right of the actual interval. The
    intervals will be centered around the mid-point and truncated the
    same way as the main intervals.

    Returns the number of sequences written.
    '''
    # fix: avoid a shared mutable default argument (was ``masker=[]``)
    if masker is None:
        masker = []

    cc = dbhandle.cursor()
    if order == "peakval":
        orderby = " ORDER BY peakval DESC"
    elif order == "max":
        orderby = " ORDER BY score DESC"
    else:
        raise ValueError(
            "Unknown value passed as order parameter, check your ini file")

    tablename = "%s_intervals" % P.tablequote(track)
    statement = '''SELECT contig, start, end, interval_id, peakcenter
    FROM %(tablename)s ''' % locals() + orderby
    cc.execute(statement)
    data = cc.fetchall()
    cc.close()

    # decide how many of the (sorted) intervals to keep
    if proportion:
        cutoff = int(len(data) * proportion) + 1
        if min_sequences:
            cutoff = max(cutoff, min_sequences)
    elif num_sequences:
        cutoff = num_sequences
    else:
        cutoff = len(data)
    L.info(
        "writeSequencesForIntervals %s: using at most %i sequences for pattern finding" %
        (track, cutoff))
    data = data[:cutoff]

    L.info("writeSequencesForIntervals %s: masker=%s" % (track, str(masker)))

    # fix: the genome index was previously opened twice; open it once
    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    # modify the ranges
    if shift == "leftright":
        # duplicate each interval into equally-sized left/right flanks
        new_data = [(contig, start - (end - start), start,
                     str(interval_id) + "_left", peakcenter)
                    for contig, start, end, interval_id, peakcenter in data]
        new_data.extend(
            [(contig, end, end + (end - start),
              str(interval_id) + "_right", peakcenter)
             for contig, start, end, interval_id, peakcenter in data])
        data = new_data

    if halfwidth:
        # center around peakcenter, add halfwidth on either side
        data = [(contig, peakcenter - halfwidth, peakcenter + halfwidth,
                 interval_id)
                for contig, start, end, interval_id, peakcenter in data]
    else:
        # remove peakcenter
        data = [(contig, start, end, interval_id)
                for contig, start, end, interval_id, peakcenter in data]

    # get the sequences - cut at number of nucleotides
    sequences = []
    current_size, nseq = 0, 0
    new_data = []
    for contig, start, end, interval_id in data:
        lcontig = fasta.getLength(contig)
        # clip the offset-shifted coordinates to the contig boundaries
        start, end = max(0, start + offset), min(end + offset, lcontig)
        if start >= end:
            # fix: report the interval id (previously formatted the ``id``
            # builtin, producing "<built-in function id>")
            L.info(
                "writeSequencesForIntervals %s: sequence %s is empty: start=%i, end=%i, offset=%i - ignored" %
                (track, interval_id, start, end, offset))
            continue
        seq = fasta.getSequence(contig, "+", start, end)
        sequences.append(seq)
        new_data.append((start, end, interval_id, contig))
        current_size += len(seq)
        if maxsize and current_size >= maxsize:
            L.info(
                "writeSequencesForIntervals %s: maximum size (%i) reached - only %i sequences output (%i ignored)" %
                (track, maxsize, nseq, len(data) - nseq))
            break
        nseq += 1
    data = new_data

    if shuffled:
        # note that shuffling is done on the unmasked sequences
        # Otherwise N's would be interspersed with real sequence
        # messing up motif finding unfairly. Instead, masking is
        # done on the shuffled sequence.
        sequences = [list(x) for x in sequences]
        for sequence in sequences:
            random.shuffle(sequence)
        sequences = maskSequences(["".join(x) for x in sequences], masker)

    c = E.Counter()
    outs = IOTools.openFile(filename, "w")
    # fix: do not rebind the ``masker`` parameter while iterating over it
    for mask in masker:
        if mask not in ("unmasked", "none", None):
            sequences = maskSequences(sequences, mask)
    for sequence, d in zip(sequences, data):
        c.input += 1
        if len(sequence) == 0:
            c.empty += 1
            continue
        start, end, interval_id, contig = d
        # fix: local renamed from ``id`` to avoid shadowing the builtin
        title = "%s_%s %s:%i-%i" % (track, str(interval_id),
                                    contig, start, end)
        outs.write(">%s\n%s\n" % (title, sequence))
        c.output += 1
    outs.close()
    E.info("%s" % c)
    return c.output
def writeSequencesForIntervals(track,
                               filename,
                               dbhandle,
                               full=False,
                               halfwidth=None,
                               maxsize=None,
                               proportion=None,
                               masker=None,
                               offset=0,
                               shuffled=False,
                               num_sequences=None,
                               min_sequences=None,
                               order="peakval",
                               shift=None,
                               stranded=False):
    '''build a sequence set for motif discovery.

    Intervals are taken from the table <track>_intervals in the database
    *dbhandle* and saved to *filename* in :term:`fasta` format.

    The sequences are masked before shuffling (is this appropriate?)

    If *full* is set, the whole intervals will be output, otherwise only
    the region around the peak given by *halfwidth*.

    If *maxsize* is set, the output is truncated at *maxsize* characters
    in order not to create jobs that take too long.

    If *proportion* is set, only the top *proportion* intervals are
    output (sorted by peakval).

    If *num_sequences* is set, the first *num_sequences* will be used.

    *masker* can be a combination of

    * dust, dustmasker: apply dustmasker
    * softmask: mask softmasked genomic regions

    *order* is the order by which peaks should be sorted. Possible
    values are 'peakval' (peak value, descending order), 'max'
    (peak score, descending order), 'random'.

    If *shift* is set, intervals will be shifted. ``leftright`` creates
    two intervals on the left and right of the actual interval. The
    intervals will be centered around the mid-point and truncated the
    same way as the main intervals.

    If *stranded* is set, sequences are retrieved strand-aware.

    Returns the number of sequences written (None if no intervals).
    '''
    # fix: avoid a shared mutable default argument (was ``masker=[]``)
    if masker is None:
        masker = []

    cc = dbhandle.cursor()

    orderby = ""
    if order == "peakval":
        orderby = " ORDER BY peakval DESC"
    elif order == "max":
        orderby = " ORDER BY score DESC"
    elif order != "random":
        raise ValueError(
            "Unknown value passed as order parameter, check your ini file")

    tablename = "%s_intervals" % P.tablequote(track)
    statement = '''SELECT contig, start, end, interval_id, score, strand,
    peakcenter FROM %(tablename)s ''' % locals() + orderby
    cc.execute(statement)
    data = cc.fetchall()
    cc.close()

    E.debug("Got %s intervals for track %s" % (len(data), track))
    if len(data) == 0:
        # no intervals: create an empty output so the pipeline can proceed
        P.touch(filename)
        return

    data = truncateList(data, track,
                        proportion, min_sequences, num_sequences,
                        order == "random")

    beds = bedsFromList(data)

    L.info("writeSequencesForIntervals %s: masker=%s" % (track, str(masker)))

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    # At the moment the pipeline retrieves bed regions from the DB and they
    # will always be on the positive strand, but if this were to change,
    # regions retrieved from the negative strand would be counted from the
    # end of the chromosome rather than the beginning without this
    # converter. This should be tested.
    fasta.setConverter(IndexedFasta.getConverter("zero-single-open"))

    # modify the ranges
    if shift == "leftright":
        # NOTE(review): the helper's name is misspelled ("shitfBeds") at
        # its definition site; renaming requires a coordinated change there.
        beds = shitfBeds(beds)

    if halfwidth and not full:
        beds = centreAndCrop(beds, halfwidth)

    sequences = getFASTAFromBed(beds, fasta, stranded, offset, maxsize)

    if shuffled:
        sequences = shuffleFasta(sequences)

    c = E.Counter()
    # fix: do not rebind the ``masker`` parameter while iterating over it
    for mask in masker:
        if mask not in ("unmasked", "none", None):
            ids, seqs = zip(*[(x.title, x.sequence) for x in sequences])
            seqs = maskSequences(seqs, mask)
            sequences = (FastaRecord(title, seq)
                         for title, seq in izip(ids, seqs))

    # fix: the output file was previously opened twice (the first handle
    # leaked) and closed twice after the ``with`` block; open it exactly
    # once via the context manager.
    with IOTools.openFile(filename, "w") as outs:
        for sequence in sequences:
            c.input += 1
            if len(sequence.sequence) == 0:
                c.empty += 1
                continue
            # (removed an unreachable ``len(...) < 0`` "too_short" branch)
            outs.write(">%s\n%s\n" % (sequence.title, sequence.sequence))
            c.output += 1

    E.info("%s" % c)
    return c.output
def writeSequencesForIntervals(track,
                               filename,
                               dbhandle,
                               full=False,
                               halfwidth=None,
                               maxsize=None,
                               proportion=None,
                               masker=None,
                               offset=0,
                               shuffled=False,
                               num_sequences=None,
                               min_sequences=None,
                               order="peakval",
                               shift=None):
    '''build a sequence set for motif discovery.

    Intervals are taken from the table <track>_intervals in the database
    *dbhandle* and saved to *filename* in :term:`fasta` format.

    The sequences are masked before shuffling (is this appropriate?)

    If *full* is set, the whole intervals will be output, otherwise only
    the region around the peak given by *halfwidth*.

    If *maxsize* is set, the output is truncated at *maxsize* characters
    in order not to create jobs that take too long.

    If *proportion* is set, only the top *proportion* intervals are
    output (sorted by peakval).

    If *num_sequences* is set, the first *num_sequences* will be used.

    *masker* can be a combination of

    * dust, dustmasker: apply dustmasker
    * softmask: mask softmasked genomic regions

    *order* is the order by which peaks should be sorted. Possible
    values are 'peakval' (peak value, descending order), 'max'
    (peak score, descending order).

    If *shift* is set, intervals will be shifted. ``leftright`` creates
    two intervals on the left and right of the actual interval. The
    intervals will be centered around the mid-point and truncated the
    same way as the main intervals.

    Returns the number of sequences written.
    '''
    # fix: avoid a shared mutable default argument (was ``masker=[]``)
    if masker is None:
        masker = []

    cc = dbhandle.cursor()
    if order == "peakval":
        orderby = " ORDER BY peakval DESC"
    elif order == "max":
        orderby = " ORDER BY score DESC"
    else:
        raise ValueError(
            "Unknown value passed as order parameter, check your ini file")

    tablename = "%s_intervals" % P.tablequote(track)
    statement = '''SELECT contig, start, end, interval_id, peakcenter
    FROM %(tablename)s ''' % locals() + orderby
    cc.execute(statement)
    data = cc.fetchall()
    cc.close()

    # decide how many of the (sorted) intervals to keep
    if proportion:
        cutoff = int(len(data) * proportion) + 1
        if min_sequences:
            cutoff = max(cutoff, min_sequences)
    elif num_sequences:
        cutoff = num_sequences
    else:
        cutoff = len(data)
    L.info(
        "writeSequencesForIntervals %s: using at most %i sequences for pattern finding" %
        (track, cutoff))
    data = data[:cutoff]

    L.info("writeSequencesForIntervals %s: masker=%s" % (track, str(masker)))

    # fix: the genome index was previously opened twice; open it once
    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    # modify the ranges
    if shift == "leftright":
        # duplicate each interval into equally-sized left/right flanks
        new_data = [(contig, start - (end - start), start,
                     str(interval_id) + "_left", peakcenter)
                    for contig, start, end, interval_id, peakcenter in data]
        new_data.extend(
            [(contig, end, end + (end - start),
              str(interval_id) + "_right", peakcenter)
             for contig, start, end, interval_id, peakcenter in data])
        data = new_data

    if halfwidth:
        # center around peakcenter, add halfwidth on either side
        data = [(contig, peakcenter - halfwidth, peakcenter + halfwidth,
                 interval_id)
                for contig, start, end, interval_id, peakcenter in data]
    else:
        # remove peakcenter
        data = [(contig, start, end, interval_id)
                for contig, start, end, interval_id, peakcenter in data]

    # get the sequences - cut at number of nucleotides
    sequences = []
    current_size, nseq = 0, 0
    new_data = []
    for contig, start, end, interval_id in data:
        lcontig = fasta.getLength(contig)
        # clip the offset-shifted coordinates to the contig boundaries
        start, end = max(0, start + offset), min(end + offset, lcontig)
        if start >= end:
            # fix: report the interval id (previously formatted the ``id``
            # builtin, producing "<built-in function id>")
            L.info(
                "writeSequencesForIntervals %s: sequence %s is empty: start=%i, end=%i, offset=%i - ignored" %
                (track, interval_id, start, end, offset))
            continue
        seq = fasta.getSequence(contig, "+", start, end)
        sequences.append(seq)
        new_data.append((start, end, interval_id, contig))
        current_size += len(seq)
        if maxsize and current_size >= maxsize:
            L.info(
                "writeSequencesForIntervals %s: maximum size (%i) reached - only %i sequences output (%i ignored)" %
                (track, maxsize, nseq, len(data) - nseq))
            break
        nseq += 1
    data = new_data

    if shuffled:
        # note that shuffling is done on the unmasked sequences
        # Otherwise N's would be interspersed with real sequence
        # messing up motif finding unfairly. Instead, masking is
        # done on the shuffled sequence.
        sequences = [list(x) for x in sequences]
        for sequence in sequences:
            random.shuffle(sequence)
        sequences = maskSequences(["".join(x) for x in sequences], masker)

    c = E.Counter()
    outs = IOTools.openFile(filename, "w")
    # fix: do not rebind the ``masker`` parameter while iterating over it
    for mask in masker:
        if mask not in ("unmasked", "none", None):
            sequences = maskSequences(sequences, mask)
    for sequence, d in zip(sequences, data):
        c.input += 1
        if len(sequence) == 0:
            c.empty += 1
            continue
        start, end, interval_id, contig = d
        # fix: local renamed from ``id`` to avoid shadowing the builtin
        title = "%s_%s %s:%i-%i" % (track, str(interval_id),
                                    contig, start, end)
        outs.write(">%s\n%s\n" % (title, sequence))
        c.output += 1
    outs.close()
    E.info("%s" % c)
    return c.output