Esempio n. 1
0
    def __getitem__(self, x=None):
        """
        Return the row at position *x* wrapped in a FieldSet.

        Negative indexes count from the last row, as for built-in sequences
        (index -1 is the last row).

        :param x: an int (the row index).
        :raises GTFtkError: if *x* is not an int or falls outside the rows.

        :Example:

        >>> from  pygtftk.utils import get_example_file
        >>> from pygtftk.gtf_interface import GTF
        >>> a_file = get_example_file()[0]
        >>> a_tab = GTF(a_file).extract_data("transcript_id,gene_id")
        >>> assert len(a_tab) ==70
        >>> a_tab = GTF(a_file).extract_data("seqid,gene_id")
        >>> assert list(a_tab[len(a_tab)-1]) == ['chr1', 'G0010']
        >>> assert list(a_tab[-1]) == ['chr1', 'G0010']
        """

        if not isinstance(x, int):
            raise GTFtkError("This object contains FieldSet objects. "
                             "Index it with integers")

        # Normalise negative indexes before touching the underlying buffer.
        # NOTE(review): self._data.data looks like a raw C array (cffi), where
        # a negative offset would not wrap around but read before the first
        # row -- confirm against the class definition.
        if x < 0:
            x += self.nrows

        if not 0 <= x < self.nrows:
            raise GTFtkError("Index out of range.")

        return FieldSet(self._data.data[x],
                        size=self.ncols,
                        ft_type=self.colnames)
Esempio n. 2
0
    def get_5p_end(self):
        """Return the one-based coordinate of the 5' end of the feature.

        This is 'start' for a feature on the '+' strand and 'end' for a
        feature on the '-' strand.

        :raises GTFtkError: if the strand is neither '+' nor '-'.

        :Example:


        >>> from pygtftk.utils import get_example_feature
        >>> feat = get_example_feature()
        >>> assert feat.get_5p_end() == 100
        >>> from pygtftk.Line import Feature
        >>> from collections import OrderedDict
        >>> a_feat = Feature.from_list(['1','pygtftk', 'exon', '100', '200', '0', '-', '1', OrderedDict()])
        >>> assert a_feat.get_5p_end() == 200
        >>> a_feat = Feature.from_list(['1','pygtftk', 'exon', '100', '200', '0', '+', '1', OrderedDict()])
        >>> assert a_feat.get_5p_end() == 100

        """

        strand = self.strand

        # Guard clause: without a strand there is no notion of 5' end.
        if strand not in ('+', '-'):
            raise GTFtkError(
                "Can not retrieve 5'end from an unstranded features.")

        return self.start if strand == '+' else self.end
Esempio n. 3
0
    def __init__(self, ptr=None, size=None, alist=None, ft_type=None):
        """
        Build the set of fields either from a C array of strings or from a
        Python list. 'ptr' takes precedence over 'alist' when both are given.

        :param ptr: An indexable of C strings; items 0..size-1 are decoded.
        :param size: The number of items to read from 'ptr'.
        :param alist: An iterable whose items are copied as the fields.
        :param ft_type: Optional names; the n-th name becomes an attribute \
        of the instance holding the n-th field.
        :raises GTFtkError: if neither 'ptr' nor 'alist' is provided.

        :Example:

        >>> from  pygtftk.utils import get_example_file
        >>> from pygtftk.gtf_interface import GTF
        >>> a_file = get_example_file()[0]
        >>> a_tab = GTF(a_file).extract_data("gene_id,start")
        >>> n = 0
        >>> for i in a_tab: n += 1
        >>> assert len(i) == 2
        >>> assert i[0] == 'G0010'
        >>> assert i[1] == '184'
        >>> from pygtftk.Line import FieldSet
        >>> assert isinstance(i, FieldSet)
        >>> assert isinstance(list(i), list)

        """
        if ptr is None and alist is None:
            raise GTFtkError('Unsupported type.')

        if ptr is not None:
            self.fields = [ffi.string(ptr[pos]).decode()
                           for pos in range(size)]
        else:
            self.fields = list(alist)

        # Expose each field under its column name, when names are provided.
        if ft_type is not None:
            for pos, col_name in enumerate(ft_type):
                setattr(self, col_name, self.fields[pos])

        self.size = len(self.fields)
Esempio n. 4
0
    def as_simple_list(self, which_col=0):
        """Convert the selected column of a TAB object into a list.

        :param which_col: The zero-based column number. Valid values are \
        strictly lower than the number of columns.
        :return: A list holding, for each row, the item found in the \
        requested column.
        :raises GTFtkError: if which_col is not lower than the number of \
        columns.

        :Example:

        >>> from  pygtftk.utils import get_example_file
        >>> from pygtftk.gtf_interface import GTF
        >>> a_file = get_example_file()[0]
        >>> a_gtf = GTF(a_file)[("feature", "gene")]
        >>> a_tab = a_gtf.extract_data("seqid")
        >>> assert list(set(a_tab.as_simple_list(0))) == ['chr1']
        """

        # Columns are zero-based, so an index equal to ncols is already
        # one past the last column.
        if which_col >= self.ncols:
            raise GTFtkError(
                "which_col is greater than the number of contained fields.")

        return [row[which_col] for row in self]
Esempio n. 5
0
    def __init__(self, ptr=None, alist=None):
        """
        Build a Feature either from a C record or from a 9-item list.
        'ptr' takes precedence over 'alist' when both are given.

        :param ptr: A pointer to a gtf line (exposes 'rank', 'field' and \
        'attributes').
        :param alist: A list laid out as chrom, source, feature type, start, \
        end, score, strand, frame and a dict of attributes. The dict is \
        stored as is (not copied) and the rank is set to 1.
        :raises GTFtkError: if neither 'ptr' nor 'alist' is provided.

        :Example:


        >>> from pygtftk.Line import Feature
        >>> from  pygtftk.utils import get_example_file
        >>> from pygtftk.gtf_interface import GTF
        >>> from collections import OrderedDict
        >>> alist = ['chr1','Unknown','transcript', 100, 200]
        >>> d = OrderedDict()
        >>> d['transcript_id'] = 'g1t1'
        >>> d['gene_id'] = 'g1'
        >>> alist +=['.','+','.',d]
        >>> a = Feature.from_list(alist)
        >>> assert a.get_tx_id() == 'g1t1'
        >>> assert a.get_gn_id() == 'g1'
        >>> a_file = get_example_file()[0]
        >>> a_gtf = GTF(a_file)
        >>> for i in a_gtf: pass
        >>> assert type(i) == Feature
        """

        if ptr is None and alist is None:
            raise GTFtkError('Arguments alist or ptr should be set.')

        if ptr is not None:

            def c_text(c_string):
                # C string -> Python str.
                return ffi.string(c_string).decode()

            self.rank = ptr.rank
            self.nb_key = ptr.attributes.nb

            columns = ptr.field
            self.chrom = c_text(columns[0])
            self.src = c_text(columns[1])
            self.ft_type = c_text(columns[2])
            # Coordinates are the only columns stored as integers.
            self.start = int(c_text(columns[3]))
            self.end = int(c_text(columns[4]))
            self.score = c_text(columns[5])
            self.strand = c_text(columns[6])
            self.frame = c_text(columns[7])

            # Key/value pairs are stored in their original order.
            self.attr = OrderedDict()
            for pos in range(self.nb_key):
                pair = ptr.attributes.attr[pos]
                self.attr[c_text(pair.key)] = c_text(pair.value)

        else:
            self.rank = 1
            self.nb_key = len(alist[8])
            self.chrom = alist[0]
            self.src = alist[1]
            self.ft_type = alist[2]
            self.start = int(alist[3])
            self.end = int(alist[4])
            self.score = alist[5]
            self.strand = alist[6]
            self.frame = alist[7]
            self.attr = alist[8]
Esempio n. 6
0
    def write_bed_5p_end(self, name=None, format='bed6', outputfile=None):
        """Write the 5' end of the feature as a one-nucleotide BED record
        (zero-based start).

        :param name: The string written in the 'name' column (column 4). \
        Mandatory unless format is 'bed3'.
        :param format: One of 'bed', 'bed6' or 'bed3'. Defaults to bed6.
        :param outputfile: The output file, handed to write_properly().
        :raises GTFtkError: if the format is not supported, if the feature \
        is unstranded, or if no name is given for a 'bed'/'bed6' output.

        :Example:

        >>> from pygtftk.utils import get_example_feature
        >>> from pygtftk.utils import make_tmp_file
        >>> from pygtftk.utils import TAB
        >>> from pygtftk.utils import  simple_line_count
        >>> tmp_file =  make_tmp_file()
        >>> feat = get_example_feature()
        >>> feat.write_bed_5p_end(name='foo', outputfile=tmp_file)
        >>> tmp_file.close()
        >>> for line in open(tmp_file.name): line= line.split(TAB)
        >>> assert line[1] == '99'
        >>> assert line[2] == '100'
        >>> assert simple_line_count(tmp_file) == 1

        """

        if format not in ('bed6', 'bed', 'bed3'):
            raise GTFtkError('Unsupported bed format')

        # Optionally prefix the chromosome name (global setting).
        add_prefix = pygtftk.utils.ADD_CHR == 1
        chrom_out = "chr" + self.chrom if add_prefix else self.chrom

        # One-based position p becomes the BED interval [p - 1, p).
        five_prime = self.get_5p_end()
        columns = [chrom_out, str(int(five_prime) - 1), str(five_prime)]

        if format in ('bed6', 'bed'):
            if name is None:
                raise GTFtkError(
                    "Need a name (column 4) to write a BED6 format.")
            columns.extend([name, str(self.score), self.strand])

        pygtftk.utils.write_properly('\t'.join(columns), outputfile)
Esempio n. 7
0
    def get_attr_value(self, attr_name, upon_none='continue'):
        """Get the value of a basic or extended attribute.

        Basic attributes are the GTF columns stored on the instance: 'chrom'
        (aliases 'seqname', 'seqid'), 'src' (alias 'source'), 'ft_type'
        (alias 'feature'), 'start', 'end', 'score', 'strand' and 'frame'.
        Any other name is looked up among the key/value attributes.

        :param attr_name: Name of the attribute/key (a string) or a list of \
        names.
        :param  upon_none: What to do when a key/value attribute is missing: \
        'continue' (store None), 'raise' an error or 'set_na' (store '.'). \
        Only examined when a key is actually missing.
        :return: A list with one value per requested name, in the same order.
        :raises GTFtkError: if attr_name is neither a string nor a list.
        :raises KeyError: if a key is missing and upon_none is 'raise', or \
        if a key is missing and upon_none is not a supported value.

        :Example:

        >>> from pygtftk.utils import get_example_feature
        >>> feat = get_example_feature()
        >>> assert feat.get_attr_value('transcript_id') == ['g1t1']
        >>> assert feat.get_attr_value('chrom') == ['chr1']
        >>> assert feat.get_attr_value('end') == [200]
        >>> assert feat.get_attr_value('strand') == ['+']
        >>> assert feat.get_attr_value('bla', upon_none='continue') == [None]
        >>> assert feat.get_attr_value('bla', upon_none='set_na') == ['.']

        """

        if isinstance(attr_name, str):
            attr_name = [attr_name]

        if not isinstance(attr_name, list):
            raise GTFtkError('Unsupported type.')

        # Accepted names (and aliases) of the basic columns, mapped to the
        # instance attribute that holds them.
        basic_columns = {
            'chrom': 'chrom',
            'seqname': 'chrom',
            'seqid': 'chrom',
            'feature': 'ft_type',
            'ft_type': 'ft_type',
            'start': 'start',
            'end': 'end',
            'src': 'src',
            'source': 'src',
            'score': 'score',
            'strand': 'strand',
            'frame': 'frame',
        }

        val_list = []

        for key in attr_name:

            if key in basic_columns:
                val_list.append(getattr(self, basic_columns[key]))
                continue

            val_cur = self.attr.get(key, None)

            if val_cur is None:
                if upon_none == 'raise':
                    raise KeyError('Key not found')
                elif upon_none == 'set_na':
                    val_cur = '.'
                elif upon_none != 'continue':
                    raise KeyError('upon_none argument should be'
                                   ' continue, raise or set_na.')

            val_list.append(val_cur)

        return val_list
Esempio n. 8
0
    def write_bed(self, name=None, format='bed6', outputfile=None):
        """Write the Feature instance as a BED record (zero-based start).

        :param name: A string to use as name (Column 4). Mandatory unless \
        format is 'bed3'.
        :param format: One of 'bed', 'bed6' or 'bed3'. Defaults to bed6.
        :param outputfile: The file object where data should be printed \
        (handed to write_properly()).
        :raises GTFtkError: if the format is not supported or if no name is \
        given for a 'bed'/'bed6' output.

        :Example:

        >>> from pygtftk.utils import get_example_feature
        >>> from pygtftk.utils import make_tmp_file
        >>> from pygtftk.utils import TAB
        >>> from pygtftk.utils import  simple_line_count
        >>> tmp_file =  make_tmp_file()
        >>> feat = get_example_feature()
        >>> feat.write_bed(name="foo", outputfile=tmp_file)
        >>> tmp_file.close()
        >>> for line in open(tmp_file.name): line= line.split(TAB)
        >>> assert line[3] == 'foo'
        >>> assert simple_line_count(tmp_file) == 1
        """

        if format not in ('bed6', 'bed', 'bed3'):
            raise GTFtkError('Unsupported bed format')

        # Optionally prefix the chromosome name (global setting).
        add_prefix = pygtftk.utils.ADD_CHR == 1
        chrom_out = "chr" + self.chrom if add_prefix else self.chrom

        # GTF start is one-based while BED start is zero-based; the end
        # coordinate is written unchanged.
        bed_start = int(self.start) - 1
        columns = [chrom_out, str(bed_start), str(self.end)]

        if format in ('bed6', 'bed'):
            if name is None:
                raise GTFtkError(
                    "Need a name (column 4) to write a BED6 format.")
            columns.extend([name, str(self.score), self.strand])

        pygtftk.utils.write_properly('\t'.join(columns), outputfile)
Esempio n. 9
0
    def get_3p_end(self):
        """Return the one-based coordinate of the 3' end of the feature.

        This is 'end' for a feature on the '+' strand and 'start' for a
        feature on the '-' strand.

        :raises GTFtkError: if the strand is neither '+' nor '-'.

        :Example:

        >>> from pygtftk.utils import get_example_feature
        >>> feat = get_example_feature()
        >>> assert feat.get_3p_end() == 200

        """

        strand = self.strand

        # Guard clause: without a strand there is no notion of 3' end.
        if strand not in ('+', '-'):
            raise GTFtkError(
                "Can not retrieve 3'end from an unstranded features.")

        return self.end if strand == '+' else self.start
Esempio n. 10
0
    def __init__(self,
                 ptr=None,
                 alist=None,
                 feat="transcript",
                 rev_comp=False):
        """
        Build a FastaSequence either from a C record or from a list.
        'ptr' takes precedence over 'alist' when both are given.

        :param ptr: A pointer to a fasta sequence.
        :param alist: A list laid out as header, sequence, chromosome, \
        strand, gene id, transcript id, start and end (items 0 to 7).
        :param feat: The feature type stored when building from 'alist'. \
        When building from 'ptr' the feature type is always "transcript".
        :param rev_comp: Not used by this constructor.
        :raises GTFtkError: if neither 'ptr' nor 'alist' is provided.

        :Example:

        >>> from pygtftk.Line import FastaSequence
        >>> a = FastaSequence(alist=['>bla', 'AATACAGAGAT','chr21','+', 'BLA', 'NM123', 123, 456, 'transcript'])

        """

        if ptr is None and alist is None:
            raise GTFtkError('Unsupported type.')

        if ptr is not None:

            def c_text(c_string):
                # C string -> Python str.
                return ffi.string(c_string).decode()

            self.header = c_text(ptr.header)
            self.chrom = c_text(ptr.seqid)
            # The strand is stored as provided by the C record (no decoding).
            self.strand = ptr.strand
            self.gene_id = c_text(ptr.gene_id)
            self.transcript_id = c_text(ptr.transcript_id)
            self.sequence = c_text(ptr.sequence)

            # Coordinates coming from the C record are stored as strings.
            self.start = str(ptr.start)
            self.end = str(ptr.end)
            self.feat = "transcript"

        else:
            self.header = alist[0]
            self.sequence = alist[1]
            self.chrom = alist[2]
            self.strand = alist[3]
            self.gene_id = alist[4]
            self.transcript_id = alist[5]
            self.start = alist[6]
            self.end = alist[7]
            self.feat = feat
Esempio n. 11
0
def _bin_region_coverage(bigwig, region, bin_nb, pc, zero_to_na, stat):
    """Return the per-bin coverage of one region, pseudo-count included.

    'stat' is expected to be 'mean' or 'sum' (anything but 'mean' is handled
    as 'sum'). Exceptions raised while querying the bigWig are left to the
    caller.
    """

    bw_cov = bigwig.values(region.chrom, region.start, region.end)
    size = region.end - region.start

    out = []

    for range_curr in intervals(list(range(size)), bin_nb, silent=True):

        interval_cur = bw_cov[range_curr[0]:range_curr[1]]

        if not zero_to_na:
            interval_cur = [0 if np.isnan(k) else k for k in interval_cur]

        value = sum(interval_cur)

        if stat == 'mean':
            value = value / (range_curr[1] - range_curr[0])

        out.append(round(value, 6))

    if zero_to_na:
        return ['NA' if np.isnan(k) else k + pc for k in out]

    return [pc if np.isnan(k) else k + pc for k in out]


def _big_wig_coverage_worker(input_values):
    """
    This function computes bigwig coverage. The input_values argument is a
    tuple that contains the input parameters listed below, in that order.
    'span' is a tuple that corresponds to a fraction (from, to) of the bed
    file to be processed. Each worker will process all bigwig files but it
    will only process a fraction (span) of the bed file regions.

    Regions shorter than bin_nb, and regions for which the bigwig can not be
    queried, are filled with 'NA' (zero_to_na) or with the pseudo-count.

    :param span: the fraction (lines) of the bed file [from, to] to be processed.
    :param bw_list: the list of bigWig files to be processed.
    :param region_bed_file_name: the bed file containing the region for which coverage is to be computed.
    :param bin_nb: The number of bin into which the region should be splitted.
    If the number of nucleotides is < nbBin a warning is printed.
    :param pseudo_count: A value for the pseudo_count.
    :param n_highest: compute the score based on the n highest values in the bins (defaults to bin_nb).
    :param profile: compute coverage profile not a single coverage value (mean).
    :param stranded: controls whether the profile should be ordered based on
    strand.
    :param type: This string will be added to the log message to indicate the type
    of region (e.g tss, promoter...).
    :param label: Bigwig labels (i.e short name version)
    :param zero_to_na: Use NA not zero when region is undefined in bigwig.
    :param stat: mean (default) or sum.
    :param verbose: run in verbose mode (ignored by this function).

    :return: when 'profile' is set, the name of a temporary file holding one
    tab-separated line per bigwig and region; otherwise a list of
    tab-separated, newline-terminated lines (one per bigwig and region).
    :raises GTFtkError: if stat is neither 'mean' nor 'sum'.

    """

    # The local is named 'region_type' to avoid shadowing the builtin 'type'.
    (span, bw_list, region_bed_file_name, bin_nb, pseudo_count, n_highest,
     profile, stranded, region_type, label, zero_to_na, stat, _) = input_values

    # Checked up front: performing this check inside the per-region error
    # handler would silently turn a wrong argument into undefined regions.
    if stat not in ('mean', 'sum'):
        raise GTFtkError("Stat should be 'sum' or 'mean'.")

    pc = pseudo_count

    # The fraction of bed file
    # to be processed
    (from_here, to_here) = span
    nb_to_do = to_here - from_here

    if not profile:
        if n_highest is None:
            n_highest = bin_nb
        results = list()
    else:
        if bin_nb < 1:
            bin_nb = 1
        matrix_file = make_tmp_file_pool(prefix="worker_coverage_",
                                         suffix=".txt")

    for cpt, big_wig in enumerate(bw_list):

        bigwig = None

        try:
            bigwig = pyBigWig.open(big_wig)
            is_big_wig = bigwig.isBigWig()
        except Exception:
            is_big_wig = False

        if not is_big_wig:
            # NOTE(review): message(..., type="ERROR") is presumably fatal;
            # confirm, since processing continues below otherwise.
            message("Not a bigwig file :" + big_wig, type="ERROR")

        mesg = "Computing coverage for %s (chunks : #%s , type : %s, lab : %s)."
        mesg = mesg % (os.path.basename(big_wig), str(nb_to_do), region_type,
                       label[cpt])
        message(mesg, type="INFO")

        # Load the regions for which the coverage is to be processed.
        tx_bed = BedTool(region_bed_file_name)

        nb = 0

        try:
            for region in tx_bed[slice(from_here, to_here)]:

                nb += 1

                if nb == nb_to_do:
                    p_name = str(multiprocessing.current_process().name)
                    message(p_name + " has processed " + str(nb) + " regions")

                # Value used for bins that can not be computed.
                undefined = 'NA' if zero_to_na else pc

                if (region.end - region.start) < bin_nb:

                    # Warn only once (module-level flag).
                    if pygtftk.utils.WARN_REGION_SIZE:
                        pygtftk.utils.WARN_REGION_SIZE = False
                        message("Encountered regions shorter than bin number.",
                                type="WARNING")
                        message(region.name + " has length : " +
                                str(region.end - region.start),
                                type="WARNING")
                        message(
                            "They will be set to NA or --pseudo-count depending on --zero-to-na.",
                            type="WARNING")
                        message("Filter them out please.", type="WARNING")

                    out = [undefined] * bin_nb

                else:

                    try:
                        out = _bin_region_coverage(bigwig, region, bin_nb, pc,
                                                   zero_to_na, stat)
                    except Exception:
                        # Best effort: a region that can not be queried is
                        # reported as undefined, with a single warning.
                        if pygtftk.utils.WARN_UNDEF:
                            pygtftk.utils.WARN_UNDEF = False

                            mesg = "Encountered regions undefined in bigWig file."
                            message(mesg, type="WARNING")
                            mesg = '%s:%s-%s' % (region.chrom,
                                                 str(region.start),
                                                 str(region.end))
                            message(mesg)

                        out = [undefined] * bin_nb

                # Prepare output
                if region.name in ["", "."]:
                    name = "|".join(
                        [region.chrom, str(region.start), str(region.end)])
                else:
                    name = region.name

                if region.strand == "":
                    strand = "."
                else:
                    strand = region.strand

                # Print profiles
                if profile:

                    # Data should be oriented in 5' -> 3'
                    if stranded and region.strand == '-':
                        out = out[::-1]

                    out_text = [
                        label[cpt], region.chrom,
                        str(region.start),
                        str(region.end),
                        str(region.name), region.strand
                    ]
                    out_text = out_text + [str(x) for x in out]
                    matrix_file.write("\t".join(out_text) + "\n")

                else:

                    # 'NA' is tested before sorting: strings and numbers can
                    # not be ordered against each other.
                    if 'NA' in out:
                        score = 'NA'
                    else:
                        highest = sorted(out, reverse=True)[0:n_highest]
                        score = sum(highest) / len(highest)

                    results.append("\t".join([
                        region.chrom,
                        str(region.start),
                        str(region.end), label[cpt] + "|" + name,
                        str(score), strand
                    ]) + "\n")

        finally:
            # Release the bigwig handle whatever happened above.
            if bigwig is not None:
                bigwig.close()

    if profile:
        matrix_file.close()
        return matrix_file.name

    return results
Esempio n. 12
0
def shift(inputfile=None,
          outputfile=None,
          shift_value=None,
          chrom_info=None,
          stranded=False,
          allow_outside=False):
    """Shift coordinates in 3' or 5' direction.

    :param inputfile: The GTF input, handed to GTF().
    :param outputfile: The output, handed to the write() method of each \
    feature.
    :param shift_value: The value added to 'start' and 'end'.
    :param chrom_info: The chromosome information, handed to \
    chrom_info_as_dict(); the resulting mapping is read as \
    chromosome -> size.
    :param stranded: If set, shift_value is subtracted (not added) for \
    features on the '-' strand.
    :param allow_outside: If False, a feature crossing a chromosome \
    boundary is moved back against that boundary with its size preserved. \
    If True, it is truncated at the boundary, and discarded when it lies \
    entirely outside the chromosome.
    :raises GTFtkError: if a chromosome of the GTF is absent from \
    chrom_info.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    chrom_list_gtf = gtf.get_chroms(nr=True)
    chrom_info = chrom_info_as_dict(chrom_info)

    for chrom in chrom_list_gtf:
        if chrom not in chrom_info:
            raise GTFtkError("Chromosome " + chrom +
                             " was not found in chrom-info file.")

    for i in gtf:
        size = i.end - i.start + 1

        # In stranded mode the shift is reversed on the '-' strand.
        if stranded and i.strand == "-":
            offset = -shift_value
        else:
            offset = shift_value

        new_start = i.start + offset
        new_end = i.end + offset

        chrom_size = int(chrom_info[i.chrom])

        if not allow_outside:
            # Feature is going outside genome in left direction
            if new_start < 1:
                new_start = 1
                new_end = size

            # Feature is going outside genome in right direction
            if new_end > chrom_size:
                new_end = chrom_size
                new_start = new_end - size + 1
        else:
            # Entirely outside the chromosome: nothing to write. Decided
            # before any clamping so that no coordinate is ever compared
            # while unset.
            if new_end < 1 or new_start > chrom_size:
                continue

            # Partially outside: truncate at the chromosome boundaries.
            new_start = max(new_start, 1)
            new_end = min(new_end, chrom_size)

        i.start = new_start
        i.end = new_end
        i.write(outputfile)

    # NOTE(review): the garbage collector is disabled here and never
    # re-enabled; the reason is not visible from this function -- confirm.
    gc.disable()
    close_properly(outputfile, inputfile)