def __getitem__(self, x=None):
    """
    The indexing function. Returns row *x* wrapped in a FieldSet.

    :param x: an int. Negative values count from the end of the table,
        as for regular Python sequences.
    :raises GTFtkError: if *x* is not an int or lies outside the table.

    :Example:

    >>> from pygtftk.utils import get_example_file
    >>> from pygtftk.gtf_interface import GTF
    >>> a_file = get_example_file()[0]
    >>> a_tab = GTF(a_file).extract_data("transcript_id,gene_id")
    >>> assert len(a_tab) ==70
    >>> a_tab = GTF(a_file).extract_data("seqid,gene_id")
    >>> assert list(a_tab[len(a_tab)-1]) == ['chr1', 'G0010']
    >>> assert list(a_tab[-1]) == ['chr1', 'G0010']

    """
    if not isinstance(x, int):
        raise GTFtkError("This object contains FieldSet objects. "
                         "Index it with integers")

    # NOTE(review): self._data.data looks like raw (cffi) storage, which
    # does not bounds-check. Both ends of the range are therefore validated
    # here so that a negative index can never reach it unchecked.
    row = x + self.nrows if x < 0 else x

    if not 0 <= row < self.nrows:
        raise GTFtkError("Index out of range.")

    return FieldSet(self._data.data[row],
                    size=self.ncols,
                    ft_type=self.colnames)
def get_5p_end(self):
    """Return the one-based coordinate of the feature 5' end.

    This is 'start' for a feature on the '+' strand and 'end' for a
    feature on the '-' strand.

    :raises GTFtkError: if the strand is neither '+' nor '-'.

    :Example:

    >>> from pygtftk.utils import get_example_feature
    >>> feat = get_example_feature()
    >>> assert feat.get_5p_end() == 100
    >>> from pygtftk.Line import Feature
    >>> from collections import OrderedDict
    >>> a_feat = Feature.from_list(['1','pygtftk', 'exon', '100', '200', '0', '-', '1', OrderedDict()])
    >>> assert a_feat.get_5p_end() == 200
    >>> a_feat = Feature.from_list(['1','pygtftk', 'exon', '100', '200', '0', '+', '1', OrderedDict()])
    >>> assert a_feat.get_5p_end() == 100

    """
    # Unstranded features have no defined 5' end.
    if self.strand not in ('+', '-'):
        raise GTFtkError(
            "Can not retrieve 5'end from an unstranded features.")

    return self.start if self.strand == '+' else self.end
def __init__(self, ptr=None, size=None, alist=None, ft_type=None):
    """
    Build the set of fields either from a C pointer or from a Python list.

    :param ptr: A pointer to an array of C strings (one per field).
    :param size: The number of fields to read from ptr.
    :param alist: A list of values to use when no pointer is provided.
    :param ft_type: Optional field names. When given, the n-th field is also exposed as an attribute carrying the n-th name.

    :Example:

    >>> from pygtftk.utils import get_example_file
    >>> from pygtftk.gtf_interface import GTF
    >>> a_file = get_example_file()[0]
    >>> a_tab = GTF(a_file).extract_data("gene_id,start")
    >>> n = 0
    >>> for i in a_tab: n += 1
    >>> assert len(i) == 2
    >>> assert i[0] == 'G0010'
    >>> assert i[1] == '184'
    >>> from pygtftk.Line import FieldSet
    >>> assert isinstance(i, FieldSet)
    >>> assert isinstance(list(i), list)

    """
    if ptr is None and alist is None:
        raise GTFtkError('Unsupported type.')

    # The pointer takes precedence when both sources are provided.
    if ptr is not None:
        self.fields = [ffi.string(ptr[pos]).decode() for pos in range(size)]
    else:
        self.fields = list(alist)

    if ft_type is not None:
        for pos, field_name in enumerate(ft_type):
            setattr(self, field_name, self.fields[pos])

    # Set last so that it always reflects the number of stored fields.
    self.size = len(self.fields)
def as_simple_list(self, which_col=0):
    """Convert the selected column of a TAB object into a list.

    :param which_col: The column number (zero-based).
    :return: A list holding, for each row, the value found in column which_col.
    :raises GTFtkError: if which_col does not refer to an existing column.

    :Example:

    >>> from pygtftk.utils import get_example_file
    >>> from pygtftk.gtf_interface import GTF
    >>> a_file = get_example_file()[0]
    >>> a_gtf = GTF(a_file)[("feature", "gene")]
    >>> a_tab = a_gtf.extract_data("seqid")
    >>> assert list(set(a_tab.as_simple_list(0))) == ['chr1']

    """
    # Columns are zero-based: the last valid index is ncols - 1, so
    # which_col == ncols is already out of range.
    if which_col >= self.ncols:
        raise GTFtkError(
            "which_col is greater than the number of contained fields.")

    return [row[which_col] for row in self]
def __init__(self, ptr=None, alist=None):
    """
    Build a Feature from a pointer to a gtf line or from a list.

    :param ptr: A pointer to a gtf line. Takes precedence over alist.
    :param alist: A list of nine elements if one wants to construct a Feature from a list: the eight GTF columns followed by a dict of attributes (stored as is, not copied).

    :Example:

    >>> from pygtftk.Line import Feature
    >>> from pygtftk.utils import get_example_file
    >>> from pygtftk.gtf_interface import GTF
    >>> from collections import OrderedDict
    >>> alist = ['chr1','Unknown','transcript', 100, 200]
    >>> d = OrderedDict()
    >>> d['transcript_id'] = 'g1t1'
    >>> d['gene_id'] = 'g1'
    >>> alist +=['.','+','.',d]
    >>> a = Feature.from_list(alist)
    >>> assert a.get_tx_id() == 'g1t1'
    >>> assert a.get_gn_id() == 'g1'
    >>> a_file = get_example_file()[0]
    >>> a_gtf = GTF(a_file)
    >>> for i in a_gtf: pass
    >>> assert type(i) == Feature

    """
    if ptr is None and alist is None:
        raise GTFtkError('Arguments alist or ptr should be set.')

    if ptr is not None:

        def as_text(c_string):
            # C strings are handed over as bytes and stored as str.
            return ffi.string(c_string).decode()

        self.rank = ptr.rank
        self.nb_key = ptr.attributes.nb
        self.chrom = as_text(ptr.field[0])
        self.src = as_text(ptr.field[1])
        self.ft_type = as_text(ptr.field[2])
        self.start = int(as_text(ptr.field[3]))
        self.end = int(as_text(ptr.field[4]))
        self.score = as_text(ptr.field[5])
        self.strand = as_text(ptr.field[6])
        self.frame = as_text(ptr.field[7])

        # Key/value pairs are inserted in the order they are stored.
        self.attr = OrderedDict()
        for pos in range(self.nb_key):
            pair = ptr.attributes.attr[pos]
            self.attr[as_text(pair.key)] = as_text(pair.value)
    else:
        attributes = alist[8]
        self.rank = 1
        self.nb_key = len(attributes)
        self.chrom, self.src, self.ft_type = alist[0], alist[1], alist[2]
        self.start = int(alist[3])
        self.end = int(alist[4])
        self.score, self.strand, self.frame = alist[5], alist[6], alist[7]
        self.attr = attributes
def write_bed_5p_end(self, name=None, format='bed6', outputfile=None):
    """Write the 5p end coordinates of the feature in bed format (Zero-based).

    The written interval is one nucleotide long and ends at the 5' end.

    :param name: The string to be written in the 'name' column (column 4). Mandatory unless format is 'bed3'.
    :param format: Format should be one of 'bed/bed6' or 'bed3'. Default to bed6.
    :param outputfile: the output file.
    :raises GTFtkError: if the format is not supported, or if a name is required and missing.

    :Example:

    >>> from pygtftk.utils import get_example_feature
    >>> from pygtftk.utils import make_tmp_file
    >>> from pygtftk.utils import TAB
    >>> from pygtftk.utils import simple_line_count
    >>> tmp_file = make_tmp_file()
    >>> feat = get_example_feature()
    >>> feat.write_bed_5p_end(name='foo', outputfile=tmp_file)
    >>> tmp_file.close()
    >>> for line in open(tmp_file.name): line= line.split(TAB)
    >>> assert line[1] == '99'
    >>> assert line[2] == '100'
    >>> assert simple_line_count(tmp_file) == 1

    """
    if format not in ['bed6', 'bed', 'bed3']:
        raise GTFtkError('Unsupported bed format')

    chrom_out = self.chrom
    if pygtftk.utils.ADD_CHR == 1:
        chrom_out = "chr" + chrom_out

    # bed is 0-based (-1 on start)
    five_prime = self.get_5p_end()
    columns = [chrom_out, str(int(five_prime) - 1), str(five_prime)]

    if format in ('bed6', 'bed'):
        if name is None:
            raise GTFtkError(
                "Need a name (column 4) to write a BED6 format.")
        columns.extend([name, str(self.score), self.strand])

    pygtftk.utils.write_properly('\t'.join(columns), outputfile)
def get_attr_value(self, attr_name, upon_none='continue'):
    """Get the value of a basic or extended attribute.

    Basic attributes are the GTF columns ('chrom'/'seqname'/'seqid',
    'src'/'source', 'feature'/'ft_type', 'start', 'end', 'score', 'strand',
    'frame'). Any other name is searched in the key/value attributes.

    :param attr_name: Name of the attribute/key, or a list/tuple of names.
    :param upon_none: What to do when a value is not found: 'continue' (keep None in the result), 'raise' an error or 'set_na' (use '.').
    :return: A list with one value per requested name.
    :raises GTFtkError: if attr_name is not a str, a list or a tuple.
    :raises KeyError: if a value is missing and upon_none is 'raise' or is not one of the supported choices.

    :Example:

    >>> from pygtftk.utils import get_example_feature
    >>> feat = get_example_feature()
    >>> assert feat.get_attr_value('transcript_id') == ['g1t1']
    >>> assert feat.get_attr_value('chrom') == ['chr1']
    >>> assert feat.get_attr_value('end') == [200]
    >>> assert feat.get_attr_value('strand') == ['+']
    >>> assert feat.get_attr_value('bla', upon_none='continue') == [None]
    >>> assert feat.get_attr_value('bla', upon_none='set_na') == ['.']

    """
    if isinstance(attr_name, str):
        attr_name = [attr_name]

    if not isinstance(attr_name, (list, tuple)):
        raise GTFtkError('Unsupported type.')

    # Accepted names for the basic columns, mapped onto the instance
    # attribute that stores them. 'strand' was previously missing, so it
    # was looked up in the key/value attributes and came back as None.
    basic_columns = {
        'chrom': 'chrom',
        'seqname': 'chrom',
        'seqid': 'chrom',
        'feature': 'ft_type',
        'ft_type': 'ft_type',
        'start': 'start',
        'end': 'end',
        'src': 'src',
        'source': 'src',
        'score': 'score',
        'strand': 'strand',
        'frame': 'frame',
    }

    val_list = []

    for key in attr_name:
        if key in basic_columns:
            val_cur = getattr(self, basic_columns[key])
        else:
            val_cur = self.attr.get(key, None)

        # upon_none is only inspected when a value is actually missing.
        if val_cur is None:
            if upon_none == 'set_na':
                val_cur = '.'
            elif upon_none == 'raise':
                raise KeyError('Key not found')
            elif upon_none != 'continue':
                raise KeyError('upon_none argument should be'
                               ' continue, raise or set_na.')

        val_list.append(val_cur)

    return val_list
def write_bed(self, name=None, format='bed6', outputfile=None):
    """Write the Feature instance in bed format (Zero-based).

    :param name: A string to use as name (Column 4). Mandatory unless format is 'bed3'.
    :param format: Format should be one of 'bed/bed6' or 'bed3'. Default to bed6.
    :param outputfile: the file object where data should be printed.
    :raises GTFtkError: if the format is not supported, or if a name is required and missing.

    :Example:

    >>> from pygtftk.utils import get_example_feature
    >>> from pygtftk.utils import make_tmp_file
    >>> from pygtftk.utils import TAB
    >>> from pygtftk.utils import simple_line_count
    >>> tmp_file = make_tmp_file()
    >>> feat = get_example_feature()
    >>> feat.write_bed(name="foo", outputfile=tmp_file)
    >>> tmp_file.close()
    >>> for line in open(tmp_file.name): line= line.split(TAB)
    >>> assert line[3] == 'foo'
    >>> assert simple_line_count(tmp_file) == 1

    """
    if format not in ['bed6', 'bed', 'bed3']:
        raise GTFtkError('Unsupported bed format')

    chrom_out = self.chrom
    if pygtftk.utils.ADD_CHR == 1:
        chrom_out = "chr" + chrom_out

    # GTF starts are one-based whereas bed starts are zero-based.
    bed_start = int(self.start) - 1
    columns = [chrom_out, str(bed_start), str(self.end)]

    if format in ('bed6', 'bed'):
        if name is None:
            raise GTFtkError(
                "Need a name (column 4) to write a BED6 format.")
        columns.extend([name, str(self.score), self.strand])

    pygtftk.utils.write_properly('\t'.join(columns), outputfile)
def get_3p_end(self):
    """Return the one-based coordinate of the feature 3' end.

    This is 'end' for a feature on the '+' strand and 'start' for a
    feature on the '-' strand.

    :raises GTFtkError: if the strand is neither '+' nor '-'.

    :Example:

    >>> from pygtftk.utils import get_example_feature
    >>> feat = get_example_feature()
    >>> assert feat.get_3p_end() == 200

    """
    strand = self.strand

    if strand == '+':
        return self.end

    if strand == '-':
        return self.start

    # Neither '+' nor '-': the 3' end is undefined.
    raise GTFtkError(
        "Can not retrieve 3'end from an unstranded features.")
def __init__(self, ptr=None, alist=None, feat="transcript", rev_comp=False):
    """
    Build a FastaSequence from a pointer to a fasta sequence or from a list.

    :param ptr: A pointer to a fasta sequence. Takes precedence over alist.
    :param alist: A list if one wants to construct a FastaSequence from a list. Its first eight elements are read, in this order: header, sequence, chrom, strand, gene_id, transcript_id, start and end.
    :param feat: The feature type stored in the 'feat' attribute. Only used with alist; the value is always "transcript" when ptr is provided.
    :param rev_comp: Not used by the constructor.

    :Example:

    >>> from pygtftk.Line import FastaSequence
    >>> a = FastaSequence(alist=['>bla', 'AATACAGAGAT','chr21','+', 'BLA', 'NM123', 123, 456, 'transcript'])

    """
    if ptr is None and alist is None:
        raise GTFtkError('Unsupported type.')

    if ptr is not None:

        def as_text(c_string):
            return ffi.string(c_string).decode()

        self.header = as_text(ptr.header)
        self.chrom = as_text(ptr.seqid)
        # The strand is stored as provided by the pointer (no decoding).
        self.strand = ptr.strand
        self.gene_id = as_text(ptr.gene_id)
        self.transcript_id = as_text(ptr.transcript_id)
        self.sequence = as_text(ptr.sequence)
        # Coordinates coming from the pointer are stored as strings.
        self.start = str(ptr.start)
        self.end = str(ptr.end)
        self.feat = "transcript"
        return

    slots = ('header', 'sequence', 'chrom', 'strand',
             'gene_id', 'transcript_id', 'start', 'end')

    for pos, slot in enumerate(slots):
        setattr(self, slot, alist[pos])

    self.feat = feat
def _binned_region_coverage(bigwig, region, bin_nb, pc, zero_to_na, stat):
    """Return the per-bin coverage ('mean' or 'sum', plus pseudo-count) of one region."""
    bw_cov = bigwig.values(region.chrom, region.start, region.end)
    size = region.end - region.start

    out = []

    for range_curr in intervals(list(range(size)), bin_nb, silent=True):
        first, last = range_curr[0], range_curr[1]
        interval_cur = bw_cov[first:last]

        if not zero_to_na:
            # Undefined positions count as zero coverage.
            interval_cur = [0 if np.isnan(k) else k for k in interval_cur]

        total = sum(interval_cur)

        if stat == 'mean':
            out.append(round(total / (last - first), 6))
        else:
            out.append(round(total, 6))

    # A bin that is still NaN here contained at least one undefined position.
    undefined = 'NA' if zero_to_na else pc

    return [undefined if np.isnan(k) else k + pc for k in out]


def _big_wig_coverage_worker(input_values):
    """
    This function computes bigwig coverage.

    The input_values argument is a tuple that contains the input parameters
    listed below, in this order. 'span' is a tuple that corresponds to a
    fraction (from, to) of the bed file to be processed. Each worker will
    process all bigwig files but it will only process a fraction (span) of
    the bed file regions.

    :param span: the fraction (lines) of the bed file [from, to] to be processed.
    :param bw_list: the list of bigWig files to be processed.
    :param region_bed_file_name: the bed file containing the regions for which coverage is to be computed.
    :param bin_nb: The number of bins into which the region should be split. Regions shorter than bin_nb are reported as undefined and a warning is printed.
    :param pseudo_count: A value for the pseudo_count.
    :param n_highest: compute the score based on the n highest values in the bins (defaults to bin_nb when None).
    :param profile: compute a coverage profile, not a single coverage value (mean).
    :param stranded: controls whether the profile should be ordered based on strand.
    :param type: This string will be added to the output to indicate the type of region (e.g tss, promoter...).
    :param label: Bigwig labels (i.e short name version).
    :param zero_to_na: Use NA not zero when region is undefined in bigwig.
    :param stat: mean (default) or sum.
    :param verbose: run in verbose mode (not used by the worker).

    :return: In profile mode, the name of the file in which the profiles were written (one line per bigwig and region: label, chrom, start, end, name, strand, then one value per bin). Otherwise a list of bed-like lines (chrom, start, end, label|name, score, strand).
    :raises GTFtkError: if stat is neither 'mean' nor 'sum'.
    """
    (span, bw_list, region_bed_file_name, bin_nb, pseudo_count, n_highest,
     profile, stranded, region_type, label, zero_to_na, stat,
     _) = input_values

    # Validated once, up front. This check used to sit inside the
    # per-region 'try' block, whose handler swallowed the error and
    # silently reported every region as undefined.
    if stat not in ('mean', 'sum'):
        raise GTFtkError("Stat should be 'sum' or 'mean'.")

    pc = pseudo_count

    if not profile:
        if n_highest is None:
            n_highest = bin_nb
        results = list()
    else:
        if bin_nb < 1:
            bin_nb = 1
        matrix_file = make_tmp_file_pool(prefix="worker_coverage_",
                                         suffix=".txt")

    # The fraction of bed file to be processed
    (from_here, to_here) = span
    nb_to_do = to_here - from_here

    for cpt, big_wig in enumerate(bw_list):

        bigwig = None

        try:
            bigwig = pyBigWig.open(big_wig)
            is_bigwig = bigwig.isBigWig()
        except Exception:
            is_bigwig = False

        if not is_bigwig:
            # NOTE(review): message(..., type="ERROR") is presumably fatal;
            # confirm. If it returns, the regions below are reported as
            # undefined, as before.
            message("Not a bigwig file :" + big_wig, type="ERROR")

        mesg = "Computing coverage for %s (chunks : #%s , type : %s, lab : %s)."
        mesg = mesg % (os.path.basename(big_wig),
                       str(span[1] - span[0]),
                       region_type,
                       label[cpt])
        message(mesg, type="INFO")

        # Load the regions for which the coverage is to be processed.
        tx_bed = BedTool(region_bed_file_name)

        nb = 0

        for i in tx_bed[slice(from_here, to_here)]:

            nb += 1

            if nb == nb_to_do:
                p_name = str(multiprocessing.current_process().name)
                message(p_name + " has processed " + str(nb) + " regions")

            if (i.end - i.start) < bin_nb:

                # Only warn once (module-level flag).
                if pygtftk.utils.WARN_REGION_SIZE:
                    pygtftk.utils.WARN_REGION_SIZE = False
                    message("Encountered regions shorter than bin number.",
                            type="WARNING")
                    message(i.name + " has length : " + str(i.end - i.start),
                            type="WARNING")
                    message(
                        "They will be set to NA or --pseudo-count depending on --zero-to-na.",
                        type="WARNING")
                    message("Filter them out please.", type="WARNING")

                out = ['NA'] * bin_nb if zero_to_na else [pc] * bin_nb

            else:
                try:
                    out = _binned_region_coverage(bigwig, i, bin_nb, pc,
                                                  zero_to_na, stat)
                except Exception:
                    # Best effort: a region that can not be read from the
                    # bigWig file is reported as undefined.
                    if pygtftk.utils.WARN_UNDEF:
                        pygtftk.utils.WARN_UNDEF = False
                        mesg = "Encountered regions undefined in bigWig file."
                        message(mesg, type="WARNING")
                        mesg = '%s:%s-%s' % (i.chrom, str(i.start), str(i.end))
                        message(mesg)

                    out = ['NA'] * bin_nb if zero_to_na else [pc] * bin_nb

            # Prepare output
            if i.name in ["", "."]:
                name = "|".join([i.chrom, str(i.start), str(i.end)])
            else:
                name = i.name

            if i.strand == "":
                strand = "."
            else:
                strand = i.strand

            # Print profiles
            if profile:

                # Data should be oriented in 5' -> 3'
                if stranded:
                    if i.strand == '-':
                        out = out[::-1]

                out = [str(x) for x in out]

                out_text = [label[cpt],
                            i.chrom,
                            str(i.start),
                            str(i.end),
                            str(i.name),
                            i.strand]
                out_text = out_text + out
                out_text = "\t".join(out_text)
                matrix_file.write(out_text + "\n")

            else:
                # 'NA' strings and numbers can not be ordered against each
                # other (TypeError), so any undefined bin makes the whole
                # score undefined.
                if 'NA' in out:
                    out = 'NA'
                else:
                    out = sorted(out, reverse=True)[0:n_highest]
                    out = sum(out) / len(out)

                results.append("\t".join([i.chrom,
                                          str(i.start),
                                          str(i.end),
                                          label[cpt] + "|" + name,
                                          str(out),
                                          strand]) + "\n")

        # Release the file handle before moving on to the next bigWig.
        if bigwig is not None:
            bigwig.close()

    if profile:
        matrix_file.close()
        return matrix_file.name
    else:
        return results
def shift(inputfile=None,
          outputfile=None,
          shift_value=None,
          chrom_info=None,
          stranded=False,
          allow_outside=False):
    """Shift coordinates in 3' or 5' direction.

    :param inputfile: The GTF file to read.
    :param outputfile: The file in which shifted features are written.
    :param shift_value: The value added to the start and end coordinates.
    :param chrom_info: The chrom-info file (passed to chrom_info_as_dict); it must provide a size for every chromosome of the GTF.
    :param stranded: If True, the shift is subtracted instead of added for features on the '-' strand.
    :param allow_outside: If False, a feature crossing a chromosome boundary is moved back against that boundary and keeps its size. If True, it is truncated at the boundary, and a feature lying entirely outside the chromosome is not written.
    :raises GTFtkError: if a chromosome of the GTF is missing from chrom_info.
    """
    gtf = GTF(inputfile, check_ensembl_format=False)

    chrom_list_gtf = gtf.get_chroms(nr=True)
    chrom_info = chrom_info_as_dict(chrom_info)

    for chrom in chrom_list_gtf:
        if chrom not in chrom_info:
            raise GTFtkError("Chromosome " + chrom +
                             " was not found in chrom-info file.")

    for i in gtf:
        size = i.end - i.start + 1
        chrom_len = int(chrom_info[i.chrom])

        # In stranded mode the direction is mirrored on the minus strand.
        if stranded and i.strand == "-":
            new_start = i.start - shift_value
            new_end = i.end - shift_value
        else:
            new_start = i.start + shift_value
            new_end = i.end + shift_value

        if not allow_outside:
            # Feature is going outside genome in left direction
            if new_start < 1:
                new_start = 1
                new_end = size

            # Feature is going outside genome in right direction
            if new_end > chrom_len:
                new_end = chrom_len
                new_start = new_end - size + 1
        else:
            # Feature is going outside genome in left direction
            if new_start < 1:
                new_start = 1
                if new_end < 1:
                    # Entirely left of the chromosome: discard.
                    new_end = None

            # Feature is going outside genome in right direction.
            # new_end may have been discarded just above; comparing None
            # with an int would raise a TypeError.
            if new_end is not None and new_end > chrom_len:
                new_end = chrom_len
                if new_start > chrom_len:
                    # Entirely right of the chromosome: discard.
                    new_start = None

        if new_start is not None and new_end is not None:
            i.start = new_start
            i.end = new_end
            i.write(outputfile)

    # NOTE(review): kept from the original. This switches automatic garbage
    # collection off for the whole process; confirm it is intentional.
    gc.disable()
    close_properly(outputfile, inputfile)