コード例 #1
0
def apropos(keyword="", notes=False):
    """
    Search command descriptions for a user-defined keyword.

    Every command registered in ``CmdManager.cmd_obj_list`` whose description
    (and, optionally, whose notes) contains *keyword* is reported. Matching
    command names are printed in alphabetical order.

    :param keyword: The substring to look for.
    :param notes: If True, also search in the notes of each command.
    """

    matches = set()

    for cmd_name in CmdManager.cmd_obj_list:
        cmd_obj = CmdManager.cmd_obj_list[cmd_name]

        if keyword in cmd_obj.desc:
            matches.add(cmd_name)

        if notes:
            try:
                if keyword in cmd_obj.notes:
                    matches.add(cmd_name)
            except (AttributeError, TypeError):
                # Best effort: the command has no notes attribute, or its
                # notes are not searchable (e.g. None). Skip it silently.
                pass

    if matches:
        message(">> Keyword '" + keyword +
                "' was found in the following command:",
                force=True)
        # Sort so that the listing is stable from one run to the next.
        for cmd_name in sorted(matches):
            print("\t- " + cmd_name + ".")
    else:
        message(">> Keyword '" + keyword + "' was not found.", force=True)
コード例 #2
0
ファイル: intronic.py プロジェクト: stefanucci-luca/pygtftk
def intronic(inputfile=None,
             outputfile=None,
             names='transcript_id',
             separator="_",
             intron_nb_in_name=False,
             no_feature_name=False,
             by_transcript=False):
    """
    Extract intronic regions from a GTF and write them to outputfile.

    :param inputfile: The input GTF (handed to the GTF constructor).
    :param outputfile: The output file.
    :param names: Comma-separated keys, forwarded as ``name`` to get_introns (by_transcript only).
    :param separator: Forwarded as ``sep`` to get_introns (by_transcript only).
    :param intron_nb_in_name: Forwarded to get_introns (by_transcript only).
    :param no_feature_name: Its negation is forwarded as ``feat_name`` (by_transcript only).
    :param by_transcript: If True, request introns on a per-transcript basis.
    """

    message("Searching for intronic regions.")

    # The GTF has to be loaded explicitly (it may come from <stdin>).
    gtf = GTF(inputfile, check_ensembl_format=False)

    if by_transcript:
        introns_bo = gtf.get_introns(by_transcript=True,
                                     name=names.split(","),
                                     sep=separator,
                                     intron_nb_in_name=intron_nb_in_name,
                                     feat_name=not no_feature_name)
    else:
        introns_bo = gtf.get_introns()

    # Both modes are written the same way: one chomped record per line.
    for intron in introns_bo:
        write_properly(chomp(str(intron)), outputfile)

    gc.disable()
    close_properly(outputfile, inputfile)
コード例 #3
0
def bed_to_gtf(inputfile=None,
               outputfile=None,
               ft_type="transcript",
               source="Unknown"):
    """
    Convert a BED file into a (mostly empty) GTF file.

    Each BED record becomes one GTF line of type *ft_type*. Missing strand,
    name and score are replaced by ".", "feature_<n>" and "0" respectively.

    :param inputfile: The input BED file (may be <stdin>).
    :param outputfile: The output GTF file.
    :param ft_type: The feature type written in the third GTF column.
    :param source: The source written in the second GTF column.
    """

    message("Converting the bed file into GTF file.")

    if inputfile.name == '<stdin>':
        # Copy <stdin> to a temporary file whose path is given to BedTool.
        tmp_file = make_tmp_file(prefix="input_bed", suffix=".bed")
        for bed_line in inputfile:
            write_properly(chomp(str(bed_line)), tmp_file)

        tmp_file.close()
        inputfile.close()

        bed_path = tmp_file.name
    else:
        bed_path = inputfile.name

    bed_obj = BedTool(bed_path)

    for feature_nb, record in enumerate(bed_obj, start=1):

        # Fill in the fields left empty in the BED record.
        if record.strand == "":
            record.strand = "."
        if record.name == "":
            record.name = "feature_" + str(feature_nb)
        if record.score == "":
            record.score = "0"

        # gene_id is always written; transcript_id for everything but genes;
        # exon_id only for exons.
        attributes = ["gene_id \"" + record.name + "\";"]
        if ft_type != "gene":
            attributes.append("transcript_id \"" + record.name + "\";")
        if ft_type == "exon":
            attributes.append("exon_id \"" + record.name + "\";")
        key_value = " ".join(attributes)

        if pygtftk.utils.ADD_CHR == 1:
            chrom_out = "chr" + record.chrom
        else:
            chrom_out = record.chrom

        # GTF start is written as BED start + 1; end is unchanged.
        gtf_fields = [
            chrom_out, source, ft_type,
            str(record.start + 1),
            str(record.end),
            str(record.score), record.strand, ".", key_value
        ]

        write_properly("\t".join(gtf_fields), outputfile)

    gc.disable()
    close_properly(outputfile)
コード例 #4
0
ファイル: dict_learning.py プロジェクト: dputhier/pygtftk
    def find_interesting_combinations(self):
        """
        MAIN FUNCTION. Run all the steps in order and return the best words.

        Unless debugging (``utils.VERBOSITY >= 2``), the PYTHONWARNINGS
        environment variable is set to "ignore" while the steps run. It is
        put back to its previous value afterwards (or to 'default' if it was
        not set), even if one of the steps raises.

        :return: The content of ``self.best_words`` once all steps have run.
        """

        # Remember the current setting so it can be restored afterwards.
        previous_warning_level = os.environ.get("PYTHONWARNINGS", 'default')

        if utils.VERBOSITY < 2:  # Only if not debugging
            os.environ["PYTHONWARNINGS"] = "ignore"
            message("Filtering out sklearn warnings.")

        try:
            ## Now call the functions
            self.generate_candidate_words()
            self.filter_library()
            self.select_best_words_from_library()
        finally:
            # Re-enable warnings, whether or not a step failed.
            os.environ["PYTHONWARNINGS"] = previous_warning_level

        return self.best_words
コード例 #5
0
ファイル: dict_learning.py プロジェクト: dputhier/pygtftk
    def select_best_words_from_library(self):
        """
        Step 2: keep the best words of the candidate library.

        The number of requested words is first capped (see below), then
        ``modl_subroutines.build_best_dict_from_library`` is called with the
        parameters stored on this object and its result is saved in
        ``self.best_words``.
        """

        # One cannot ask for more words than the library holds, nor for more
        # than there are unique rows in the data.
        max_words = min(self.number_of_words_in_library,
                        len(np.unique(self.data, axis=0)))

        if self.queried_words_nb > max_words:
            self.queried_words_nb = max_words
            message("Requesting too many words, reducing to " +
                    str(self.queried_words_nb))
        # NOTE It will actually be +1 to make room for the root (0,0,0,...)
        # word, but this is added later

        # Register the best dictionary, built from the stored parameters.
        self.best_words = modl_subroutines.build_best_dict_from_library(
            self.data,  # Data
            self.library,  # Library of candidates
            self.queried_words_nb,  # N best words
            self.error_function,  # Potential custom error function
            self.nb_threads,
            self.normalize_words,  # Normalize words by sum of square
            self.step_2_alpha)  # Sparsity control
コード例 #6
0
ファイル: midpoints.py プロジェクト: stefanucci-luca/pygtftk
def midpoints(inputfile=None,
              outputfile=None,
              ft_type="transcript",
              names="transcript_id",
              separator="|"):
    """
    Get the midpoint coordinates for the requested feature.

    Input read from <stdin> is treated as GTF. Otherwise the file is opened
    with BedTool and treated as GTF when its file_type is 'gff', and as a
    plain region file in any other case.

    :param inputfile: The input file (GTF or regions).
    :param outputfile: The output file.
    :param ft_type: The feature type to select (GTF input only).
    :param names: Comma-separated keys, forwarded as ``name`` to get_midpoints (GTF input only).
    :param separator: Forwarded as ``sep`` to get_midpoints (GTF input only).
    """

    message("Loading input file...")
    if inputfile.name == '<stdin>':
        is_gtf = True
    else:
        region_bo = BedTool(inputfile.name)
        if len(region_bo) == 0:
            message("Unable to find requested regions", type="ERROR")

        is_gtf = region_bo.file_type == 'gff'

    if not is_gtf:
        for region in region_bo:

            span = region.end - region.start
            half_span = span // 2

            if span % 2 == 0:
                # Even length: no real center, take both central positions.
                # e.g 10-14 (zero based) -> 11-14 one based
                # midpoint is 12-13 (one-based) -> 11-13 (zero based)
                region.start = region.start + half_span - 1
                region.end = region.start + 2
            else:
                # Odd length: a single central position.
                # e.g 10-13 (zero based) -> 11-13 one based
                # midpoint is 12 (one-based) -> 11-12 (zero based)
                region.end = region.start + half_span + 1
                region.start = region.end - 1

            outputfile.write(str(region))

    else:
        gtf = GTF(inputfile.name, check_ensembl_format=False)

        selected = gtf.select_by_key("feature", ft_type)
        bed_obj = selected.get_midpoints(name=names.split(","),
                                         sep=separator)
        for record in bed_obj:
            write_properly(chomp(str(record)), outputfile)

    gc.disable()
    close_properly(outputfile, inputfile)
コード例 #7
0
def negbin_pval(k, mean, var, precision=320, ft_type="Unknown"):
    r"""
    P-value for a negative binomial distribution of the given moments (mean, var).

    The returned value is the minimum of the left-sided and right-sided
    p-values.

    NOTE : To keep r strictly positive (and avoid a division by zero), a mean
    below 1 is replaced by 1, and a variance lower than or equal to the mean
    is replaced by mean + 1. A warning is sent in both cases.

    NOTE : This function sets ``mpmath.mp.dps`` to *precision*.

    :param k: the critical value for which the pvalue is computed.
    :param mean: The mean for the negative binomial model.
    :param var: The variance for the negative binomial model.
    :param precision: Floating point precision (dps) of mpmath. Defaults to 320.
    :param ft_type: The name of the feature to be tested (just for meaningful messages).

    >>> from pygtftk.stats.negbin_fit import negbin_pval
    >>> mean = 18400
    >>> var = 630200
    >>> k = 65630
    >>> pval = negbin_pval(k, mean, var)
    >>> import math
    >>> assert(math.isclose(pval,1.1999432787236828e-307))

    """

    if mean < 1:
        mean = 1
        message("Computing log(p-val) for a Neg Binom with mean < 1 ; mean was set to 1 (" + ft_type + ")",
                type='WARNING')

    # r must be above 0, which requires var > mean.
    if mean >= var:
        var = mean + 1
        message("Computing log(p-val) for a Neg Binom with mean >= var ; var was set to mean+1 (" + ft_type + ")",
                type='WARNING')

    # Floating point precision of mpmath.
    mpmath.mp.dps = precision

    # Parameters r and p of the distribution, derived from mean and var.
    nb_r = mpmath.mpf(mean**2 / (var - mean))
    nb_p = mpmath.mpf(1 / (mean / nb_r + 1))

    # To circumvent scipy floating point precision issues, a custom
    # calculation is used (see 'beta.py' for details).
    beta_calc = BetaCalculator(use_log=True,
                               precision=precision,
                               ft_type=ft_type)
    incomplete_beta = beta_calc.betainc(a=nb_r, b=k + 1, x=nb_p)
    complete_beta = beta_calc.beta(a=nb_r, b=k + 1)

    # Take the minimum of both sides.
    one_sided = 1 - (incomplete_beta / complete_beta)
    other_side = 1 - one_sided

    # Convert back to Python float and return
    return float(min(one_sided, other_side))
コード例 #8
0
ファイル: del_attr.py プロジェクト: stefanucci-luca/pygtftk
def del_attr(inputfile=None,
             outputfile=None,
             key="transcript_id",
             reg_exp=False,
             invert_match=False):
    """
    Delete extended attributes in the target gtf file.

    :param inputfile: The input GTF file.
    :param outputfile: The output GTF file.
    :param key: A comma-separated list of attribute names or, if reg_exp is True, a regular expression.
    :param reg_exp: If True, *key* is a regular expression searched in each extended attribute name.
    :param invert_match: If True, delete every extended attribute except the selected ones.
    """

    # ----------------------------------------------------------------------
    # Read the GTF and get the list of attributes
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)

    attr_list = gtf.attr_extended

    # ----------------------------------------------------------------------
    # If regExp, select the corresponding keys
    # ----------------------------------------------------------------------

    if reg_exp:

        try:
            rgxp = re.compile(key)
        except (re.error, TypeError):
            message("Check the regular expression please.", type="ERROR")
            # Only reached if message() returns: do not go on without a
            # compiled expression.
            raise

        key_list = [attr for attr in attr_list if rgxp.search(attr)]
    else:
        key_list = key.split(",")

    # ----------------------------------------------------------------------
    # If invert-match select all but the selected
    # ----------------------------------------------------------------------

    if invert_match:
        selected = set(key_list)
        key_to_del = [attr for attr in attr_list if attr not in selected]
    else:
        key_to_del = key_list

    # ----------------------------------------------------------------------
    # Delete the keys
    # ----------------------------------------------------------------------

    # key_to_del (not key_list) must be used here, otherwise invert_match
    # has no effect.
    gtf = gtf.del_attr(feat="*", keys=",".join(key_to_del), force=True)
    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
コード例 #9
0
ファイル: cmd_manager.py プロジェクト: dputhier/pygtftk
    def __call__(self, parser, namespace, values, option_string=None):
        """
        Request a plugin reload at next startup, then exit.

        An empty file named "reload" is created in ``CmdManager.config_dir``.
        If that directory does not exist, the user is asked to run
        ``gtftk -h`` first. The program exits in both cases.
        """
        if not os.path.exists(CmdManager.config_dir):
            message("Please run gtftk -h before adding additional plugins",
                    force=True)
            sys.exit(0)

        # Create (or truncate) the flag file and close it right away.
        with open(os.path.join(CmdManager.config_dir, "reload"), "w"):
            pass

        message("Plugins will be updated at next startup.", force=True)
        sys.exit()
コード例 #10
0
ファイル: cmd_manager.py プロジェクト: dputhier/pygtftk
    def dump_plugins(self):
        """
        Save the plugins into a pickle object.

        The tuple ``(self.cmd_obj_list, self.parser)`` is serialized with
        cloudpickle into ``CmdManager.dumped_plugin_path``.
        """

        message("Dumping plugins", force=True)

        # The context manager closes the file even if pickling fails.
        with open(CmdManager.dumped_plugin_path, "wb") as f_handler:
            pick = cloudpickle.CloudPickler(f_handler)
            pick.dump((self.cmd_obj_list, self.parser))
コード例 #11
0
def add_exon_nb(inputfile=None, outputfile=None, exon_numbering_key=None):
    """
    Add the exon number to each exon (based on 5' to 3' orientation).

    :param inputfile: The input GTF file (its ``name`` is given to GTF).
    :param outputfile: The output GTF file.
    :param exon_numbering_key: Forwarded to ``add_exon_number``.
    """

    message("Calling nb_exons.", type="DEBUG")

    gtf = GTF(inputfile.name, check_ensembl_format=False)
    numbered_gtf = gtf.add_exon_number(exon_numbering_key)
    numbered_gtf.write(outputfile, gc_off=True)

    close_properly(inputfile, outputfile)
コード例 #12
0
ファイル: beta.py プロジェクト: dputhier/pygtftk
    def contfractbeta(self, a, b, x):
        """
        Evaluates the continued fraction form of the incomplete Beta function.

        Code translated from: GNU Scientific Library

        Uses the modified Lentz's method.

        You can see a representation of this form in the Digital Library of
        Mathematical functions <https://dlmf.nist.gov/8.17#SS5.p1>.
        The goal of the method is to calculate the successive 'd' terms,
        separately for odd and even.

        At most ``self.itermax + 1`` iterations are run, each applying two
        steps of the recurrence. Iteration stops early once the last
        multiplicative update differs from 1 by less than
        ``2 * self.epsilon``. If this never happens, a warning is sent and
        the current estimate is returned.

        :param a: First parameter (converted to mpmath.mpf).
        :param b: Second parameter (converted to mpmath.mpf).
        :param x: Point of evaluation (converted to mpmath.mpf).
        :return: The value of the continued fraction (best guess if not converged).
        """

        # Work with mpmath numbers from here on.
        a, b, x = mpmath.mpf(a), mpmath.mpf(b), mpmath.mpf(x)

        # Initial numerator and (inverted) denominator terms.
        num_term = 1.0
        den_term = 1.0 - (a + b) * x / (a + 1.0)
        den_term = 1.0 / den_term

        # Running value of the continued fraction.
        cf = den_term

        for i in range(self.itermax + 1):
            # k is the 1-based index of the current pair of steps.
            k = i + 1
            coeff = k * (b - k) * x / (((a - 1.0) + 2 * k) * (a + 2 * k))

            # First step of the recurrence
            den_term = 1.0 + coeff * den_term
            num_term = 1.0 + coeff / num_term
            den_term = 1.0 / den_term

            # Multiplicative update of the fraction for this step.
            delta_frac = den_term * num_term
            cf *= delta_frac

            coeff = -(a + k) * (a + b + k) * x / ((a + 2 * k) *
                                                  (a + 2 * k + 1.0))

            # Second step
            den_term = 1.0 + coeff * den_term
            num_term = 1.0 + coeff / num_term
            den_term = 1.0 / den_term

            delta_frac = den_term * num_term
            cf *= delta_frac

            # Are we done ?
            # Convergence is only tested on the update of the second step.
            if (abs(delta_frac - 1.0) < 2.0 * self.epsilon):
                return cf

        # If failed to converge, return our best guess but send a warning
        msg = 'a or b too large or given itermax too small for computing incomplete'
        msg += ' beta function ; pval may be slightly erroneous for feature (' + self.ft_type + ').'
        message(msg, type='WARNING')
        return cf
コード例 #13
0
ファイル: biomart.py プロジェクト: stefanucci-luca/pygtftk
    def get_datasets(self, database=None):
        """
        Fill ``self.datasets`` with the datasets of the given database.

        The response is read as tab-separated lines: for each line with more
        than one field, the second field is the key and the remaining fields
        (from the third on) are the value.

        :param database: A name expected to be in ``self.databases``.
        """
        message("Listing available datasets", type="DEBUG")

        if database not in self.databases:
            message("Database not found.")
            return

        self.query(query={'type': 'datasets', 'mart': database})

        for row in self.response.text.split("\n"):
            columns = row.split("\t")
            if len(columns) > 1:
                self.datasets[columns[1]] = columns[2:]
コード例 #14
0
def select_by_max_exon_nb(inputfile=None, outputfile=None):
    """
    For each gene, select the transcript with the highest number of exons.

    :param inputfile: The input GTF file.
    :param outputfile: The output GTF file.
    """

    message("Selecting transcript with the highest number of exon for each gene.")

    input_gtf = GTF(inputfile, check_ensembl_format=False)
    selected_gtf = input_gtf.select_by_max_exon_nb()

    selected_gtf.write(outputfile, gc_off=True)
コード例 #15
0
ファイル: biomart.py プロジェクト: stefanucci-luca/pygtftk
    def _get_databases(self):
        """
        Query the registry and store the database names.

        The XML response is parsed and the ``name`` attribute of every
        top-level 'MartURLLocation' element is added to ``self.databases``.
        """

        message("Listing available databases", type="DEBUG")
        try:
            self.query(query={'type': 'registry'})
        except ConErr:
            message("Raised a connection Error.", type="ERROR")

        registry = ElementTree.fromstring(self.response.content)

        for entry in registry:
            # Only 'MartURLLocation' elements describe a database.
            if entry.tag != 'MartURLLocation':
                continue
            self.databases += [entry.attrib['name']]
コード例 #16
0
def select_most_5p_tx(inputfile=None, outputfile=None, keep_gene_lines=False):
    """
    Select the most 5' transcript of each gene.

    :param inputfile: The input GTF file.
    :param outputfile: The output GTF file.
    :param keep_gene_lines: If False, the result is additionally filtered on feature "gene".
    """

    message("Selecting the most 5' transcript of each gene.")

    gtf = GTF(inputfile).select_5p_transcript()

    if not keep_gene_lines:
        # NOTE(review): the third positional argument (1) is presumably
        # invert_match, i.e. gene lines are discarded -- confirm.
        gtf = gtf.select_by_key("feature", "gene", 1)

    gtf.write(outputfile, gc_off=True)
コード例 #17
0
ファイル: cmd_manager.py プロジェクト: dputhier/pygtftk
    def _find_plugins():
        """
        Load every Python plugin found in the user and system plugin directories.

        The user directory is read from the "plugin_path" entry of the YAML
        configuration file (``CmdManager.config_file``); the system directory
        is the "plugins" directory of the pygtftk package. Both are appended
        to ``sys.path``. Loading a plugin should force its code to create a
        cmdObject that will be added to the CmdManager. A plugin that fails
        to load only triggers a warning.

        Finally ``CmdManager.reload`` is set to False and the "reload" flag
        file of the configuration directory is deleted if present.
        """

        message("Searching plugins", force=True)
        config_file = CmdManager.config_file

        # User plugins
        # The handle is closed as soon as the configuration is parsed.
        with open(config_file, "r") as config_handle:
            plugin_dir_user = yaml.load(config_handle,
                                        Loader=yaml.FullLoader)["plugin_path"]
        sys.path.append(plugin_dir_user)
        plugins_user = [os.path.join(plugin_dir_user, x)
                        for x in sorted(os.listdir(plugin_dir_user))]

        # System wide plugins (those declared in the plugins directory of
        # pygtftk source)
        plugin_dir_base = os.path.join(pygtftk.__path__[0], "plugins")
        sys.path.append(plugin_dir_base)
        plugins_system = [os.path.join(plugin_dir_base, x)
                          for x in sorted(os.listdir(plugin_dir_base))]

        for plug in plugins_user + plugins_system:
            # 'plug' is a full path: compare its basename, otherwise
            # __init__.py files are never skipped.
            if plug.endswith(".py") and os.path.basename(plug) != "__init__.py":

                # Turn the path into a dotted module name such as
                # pygtftk.plugins.tss_dist
                module_name = re.sub(r"\.py$", "", plug)
                module_name = re.sub("/", ".", module_name)
                module_name = re.sub(".*pygtftk", "pygtftk", module_name)

                try:
                    SourceFileLoader(module_name, plug).load_module()
                except Exception as e:
                    # A broken plugin must not prevent the others from loading.
                    message("Failed to load plugin :" + plug, type="WARNING")
                    print(e)

            # Files ending with ".R" are currently not loaded.

        CmdManager.reload = False

        reload_flag = os.path.join(CmdManager.config_dir, "reload")
        if os.path.exists(reload_flag):
            os.remove(reload_flag)
コード例 #18
0
def make_tmp_file_pool(prefix='tmp',
                       suffix='',
                       store=True,
                       dir=None):
    """
    Create a named temporary file that is not deleted when closed.

    The file is created in *dir* if provided, else in
    ``pygtftk.utils.TMP_DIR`` if set, else in the default temporary
    directory. The target directory is created if it does not exist.

    :param prefix: Prefix of the file name ("_pygtftk_" is appended to it).
    :param suffix: Suffix of the file name.
    :param store: If True, the file name is appended to TMP_FILE_POOL_MANAGER.
    :param dir: The directory in which the file is created.
    :return: The temporary file object, opened in 'w' mode.

    :Example:

    >>> from pygtftk.utils import make_tmp_file_pool
    >>> tmp_file = make_tmp_file_pool()
    >>> assert os.path.exists(tmp_file.name)
    >>> tmp_file = make_tmp_file_pool(prefix="pref")
    >>> assert os.path.exists(tmp_file.name)
    >>> tmp_file = make_tmp_file_pool(suffix="suf")
    >>> assert os.path.exists(tmp_file.name)

    """

    # An explicit directory takes precedence over the global TMP_DIR.
    dir_target = dir if dir is not None else pygtftk.utils.TMP_DIR

    # The target directory must be used whether or not it already exists
    # (it used to be taken into account only when it had just been created).
    if dir_target is not None and not os.path.exists(dir_target):
        msg = "Creating directory {d}."
        message(msg.format(d=dir_target), type="INFO")
        # exist_ok: another process may create it in the meantime.
        os.makedirs(dir_target, exist_ok=True)

    tmp_file = NamedTemporaryFile(delete=False,
                                  mode='w',
                                  prefix=prefix + "_pygtftk_",
                                  suffix=suffix,
                                  dir=dir_target)

    if store:
        TMP_FILE_POOL_MANAGER.append(tmp_file.name)

    return tmp_file
コード例 #19
0
    def __call__(self, minibatch_len, seed, id):
        """
        Compute the intersections for one minibatch and return them.

        :param minibatch_len: Forwarded to compute_all_intersections_minibatch.
        :param seed: Forwarded as ``seed`` to compute_all_intersections_minibatch.
        :param id: The minibatch number (only used in the completion message).
        :return: The result of compute_all_intersections_minibatch.
        """
        intersections = compute_all_intersections_minibatch(
            self.Lr1, self.Li1,
            self.Lrs, self.Lis,
            self.all_chrom1, self.all_chrom2,
            minibatch_len,
            self.use_markov_shuffling,
            self.keep_intact_in_shuffling,
            self.nb_threads,
            seed=seed)

        completion_msg = "--- Minibatch nb. : " + str(id) + " is complete."
        message(completion_msg)

        return intersections
コード例 #20
0
def select_by_tx_size(inputfile=None,
                      outputfile=None,
                      min_size=None,
                      max_size=None):
    """
    Select transcripts whose size lies in the requested range.

    :param inputfile: The input GTF file.
    :param outputfile: The output GTF file.
    :param min_size: Forwarded as first argument to select_by_transcript_size.
    :param max_size: Forwarded as second argument to select_by_transcript_size.
    """

    msg = "Selecting 'mature/spliced transcript by size (range: [{m},{M}])."
    message(msg.format(m=str(min_size), M=str(max_size)))

    gtf = GTF(inputfile)
    selected_gtf = gtf.select_by_transcript_size(min_size, max_size)
    selected_gtf.write(outputfile, gc_off=True)
コード例 #21
0
def select_by_nb_exon(inputfile=None,
                      outputfile=None,
                      min_exon_number=None,
                      max_exon_number=None):
    """
    Select transcripts whose number of exons lies in the requested range.

    :param inputfile: The input GTF file.
    :param outputfile: The output GTF file.
    :param min_exon_number: Forwarded as first argument to select_by_number_of_exons.
    :param max_exon_number: Forwarded as second argument to select_by_number_of_exons.
    """

    msg = "Selecting transcript by exon number (range: [{m},{M}])"
    message(msg.format(m=str(min_exon_number), M=str(max_exon_number)))

    input_gtf = GTF(inputfile, check_ensembl_format=False)
    selected_gtf = input_gtf.select_by_number_of_exons(min_exon_number,
                                                       max_exon_number)
    selected_gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
コード例 #22
0
def exon_sizes(inputfile=None, outputfile=None, key_name=None):
    """
    Add a new key to transcript features containing a comma-separated list of exon sizes.

    Sizes are computed as end - start + 1 and their order is reversed for
    transcripts on the "-" strand.

    :param inputfile: The input GTF file.
    :param outputfile: The output GTF file.
    :param key_name: The name of the new key.
    """

    gtf = GTF(inputfile)

    all_tx_ids = gtf.get_tx_ids(nr=True)

    # transcript_id -> list of exon starts
    exons_starts = gtf.select_by_key("feature", "exon").extract_data(
        "transcript_id,start",
        as_dict_of_merged_list=True,
        no_na=True,
        nr=False)

    if len(exons_starts) == 0:
        message("No exon found.", type="ERROR")

    # transcript_id -> list of exon ends
    exons_ends = gtf.select_by_key("feature", "exon").extract_data(
        "transcript_id,end",
        as_dict_of_merged_list=True,
        no_na=True,
        nr=False)

    # transcript_id -> strand
    strands = gtf.select_by_key("feature", "transcript").extract_data(
        "transcript_id,strand",
        as_dict_of_values=True,
        no_na=True,
        nr=True,
        hide_undef=True)

    tx_to_size_list = dict()

    for tx_id in all_tx_ids:
        sizes = [str(int(end) - int(start) + 1)
                 for start, end in zip(exons_starts[tx_id],
                                       exons_ends[tx_id])]
        if strands[tx_id] == "-":
            sizes.reverse()
        tx_to_size_list[tx_id] = ",".join(sizes)

    if tx_to_size_list:
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=tx_to_size_list,
                                     new_key=key_name)

    gtf.write(outputfile, gc_off=True)
    close_properly(outputfile, inputfile)
コード例 #23
0
def select_by_go(inputfile=None,
                 outputfile=None,
                 go_id=None,
                 https_proxy=None,
                 http_proxy=None,
                 list_datasets=None,
                 species=None,
                 invert_match=False):
    """
    Select lines from a GTF file using a Gene Ontology ID (e.g GO:0050789).

    :param inputfile: The input GTF file.
    :param outputfile: The output file.
    :param go_id: The GO ID, with or without its "GO:" prefix. Not needed when list_datasets is set.
    :param https_proxy: Forwarded to Biomart.
    :param http_proxy: Forwarded to Biomart.
    :param list_datasets: If true, write the available dataset names (without "_gene_ensembl") and exit.
    :param species: The dataset name without its "_gene_ensembl" suffix.
    :param invert_match: Forwarded as third argument to select_by_key.
    """

    bm = Biomart(http_proxy=http_proxy,
                 https_proxy=https_proxy)

    bm.get_datasets('ENSEMBL_MART_ENSEMBL')

    if list_datasets:
        for i in sorted(bm.datasets):
            write_properly(i.replace("_gene_ensembl", ""), outputfile)
        sys.exit()

    # format() rather than '+' so that a missing species (None) ends up in
    # the error message below instead of raising a TypeError.
    if "{s}_gene_ensembl".format(s=species) not in bm.datasets:
        message("Unknow dataset/species.", type="ERROR")

    # The GO ID is only needed from here on: listing the datasets must work
    # without one.
    if not go_id.startswith("GO:"):
        go_id = "GO:" + go_id

    bm.query({'query': XML.format(species=species, go=go_id)})

    # One identifier per non-empty line; the OrderedDict drops duplicates
    # while keeping the order of the response.
    is_associated = OrderedDict()

    for line in bm.response.content.decode().split("\n"):
        if line != '':
            is_associated[line] = 1

    gtf = GTF(inputfile)

    gtf_associated = gtf.select_by_key("gene_id",
                                       ",".join(is_associated),
                                       invert_match)

    gtf_associated.write(outputfile,
                         gc_off=True)
0
ファイル: random_tx.py プロジェクト: stefanucci-luca/pygtftk
def random_tx(inputfile=None,
              outputfile=None,
              max_transcript=None,
              seed_value=None):
    """
    Select randomly up to max_transcript transcripts for each gene.

    :param inputfile: The input GTF file.
    :param outputfile: The output GTF file.
    :param max_transcript: The maximum number of transcripts kept per gene.
    :param seed_value: If not None, used to seed the random number generator.
    """

    message("loading the GTF.")

    gtf = GTF(inputfile).select_by_key("feature", "gene", invert_match=True)

    message("Getting gene_id and transcript_id")

    gene2tx = gtf.extract_data("gene_id,transcript_id",
                               as_dict_of_merged_list=True,
                               no_na=True,
                               nr=True)

    message("Selecting random transcript")

    if seed_value is not None:
        random.seed(seed_value, version=1)

    tx_to_delete = []

    for gn_id in gene2tx:
        gene_tx = gene2tx[gn_id]
        nb_tx = len(gene_tx)
        nb_to_keep = min(max_transcript, nb_tx)

        # Draw the positions to keep; every other transcript is discarded.
        kept_positions = set(random.sample(range(nb_tx), nb_to_keep))
        tx_to_delete.extend(tx for pos, tx in enumerate(gene_tx)
                            if pos not in kept_positions)

    message("Printing results")

    message("Selecting transcript.")
    selected_gtf = gtf.select_by_key("transcript_id",
                                     ",".join(tx_to_delete),
                                     invert_match=True)
    selected_gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
コード例 #25
0
def join_multi_file(inputfile=None,
                    outputfile=None,
                    target_feature=None,
                    key_to_join=None,
                    matrix_files=()):
    """
    Join attributes from a set of tabulated files.

    :param inputfile: The input GTF file.
    :param outputfile: The output GTF file.
    :param target_feature: Comma-separated feature types ("*" is accepted). If None, all feature types of the GTF.
    :param key_to_join: Forwarded as ``key`` to add_attr_from_matrix_file.
    :param matrix_files: The files to join (their ``name`` is used).
    """

    # -----------------------------------------------------------
    #  load the GTF
    # -----------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)

    # -----------------------------------------------------------
    #  Check target feature
    # -----------------------------------------------------------

    feat_list = gtf.get_feature_list(nr=True)

    if target_feature is None:
        target_feature = ",".join(feat_list)
    else:
        accepted_features = feat_list + ["*"]

        for feat in target_feature.split(","):
            if feat not in accepted_features:
                message("Feature " + feat + " not found.", type="ERROR")

    # -----------------------------------------------------------
    #  Do it
    # -----------------------------------------------------------

    for matrix_file in matrix_files:
        gtf = gtf.add_attr_from_matrix_file(feat=target_feature,
                                            key=key_to_join,
                                            inputfile=matrix_file.name)
    gtf.write(outputfile, gc_off=True)

    gc.disable()
    close_properly(outputfile, inputfile)
コード例 #26
0
ファイル: intergenic.py プロジェクト: dputhier/pygtftk
def intergenic(inputfile=None, outputfile=None, chrom_info=None):
    """
    Extract intergenic regions.

    Regions are named "region_1", "region_2", ... in the order in which
    they are returned.

    :param inputfile: The input GTF file.
    :param outputfile: The output file.
    :param chrom_info: Forwarded to get_intergenic.
    """

    message("Searching for intergenic regions.")

    intergenic_regions = GTF(inputfile).get_intergenic(chrom_info)

    for region_nb, region in enumerate(intergenic_regions, start=1):
        region.name = "region_" + str(region_nb)
        write_properly(chomp(str(region)), outputfile)

    gc.disable()
    close_properly(outputfile, inputfile)
コード例 #27
0
ファイル: cmd_manager.py プロジェクト: dputhier/pygtftk
    def parse_cmd_args(cls):
        """
        Parse arguments of all declared commands.

        If no subcommand is given, a warning and the help are printed and
        the program exits with status 0. For Python commands called with a
        temporary directory, this directory is created if needed and stored
        in ``pygtftk.utils.TMP_DIR``.

        :return: The parsed arguments (also stored in ``CmdManager.args``).
        """

        CmdManager.args = cls.parser.parse_args(None)
        args = CmdManager.args
        cmd_name = args.command

        if cmd_name is None:
            message("Please provide a subcommand or argument (e.g. -h)",
                    type="WARNING",
                    force=True)
            CmdManager.parser.print_help()
            # sys.exit rather than the exit() helper, which is injected by
            # the site module and meant for interactive use.
            sys.exit(0)

        lang = cls.cmd_obj_list[cmd_name].lang

        if lang == 'Python' and args.tmp_dir is not None:

            if not os.path.exists(args.tmp_dir):
                msg = "Creating directory {d}."
                message(msg.format(d=args.tmp_dir), type="INFO")
                mkdir_p(args.tmp_dir)
            if not os.path.isdir(args.tmp_dir):
                msg = "{d} is not a directory."
                message(msg.format(d=args.tmp_dir), type="ERROR")

            pygtftk.utils.TMP_DIR = args.tmp_dir

        return args
コード例 #28
0
ファイル: biomart.py プロジェクト: stefanucci-luca/pygtftk
    def query(self, query):
        """
        Send a GET request to ``self.url`` and store the response.

        The response is stored in ``self.response``. An error is reported if
        the HTTP status is not OK, or if the response text says that the
        requested service is currently unavailable.

        :param query: The parameters of the request.
        """
        message("Sending query", type="DEBUG")
        self.response = requests.get(self.url, query, proxies=self.proxies)

        message("Checking http response", type="DEBUG")

        if self.response.status_code != requests.codes.ok:
            msg = "HTTP response status code: {c}. {m}"
            msg = msg.format(c=str(self.response.status_code),
                             m=self.response.reason)
            message(msg, type="ERROR")

        # Raw string: '\.' and '\w' are regex escapes, not string escapes.
        downtime_rgxp = r"([ \.\w]+ service you requested is currently unavailable[ \.\w]+)"
        hit = re.search(downtime_rgxp, self.response.text)

        if hit:
            message(hit.group(1).strip(), type="WARNING")
            message(
                "More information about this downtime "
                "may be available on http://www.ensembl.info/",
                type="ERROR")
コード例 #29
0
def nb_exons(inputfile=None,
             outputfile=None,
             key_name=None,
             text_format=False):
    """
    Count the number of exons of each transcript in the gtf file.

    :param inputfile: The input GTF file.
    :param outputfile: The output file.
    :param key_name: The name of the new key holding the count (GTF output only).
    :param text_format: If True, write "<transcript_id><TAB><count><TAB>transcript" lines instead of a GTF.
    """

    gtf = GTF(inputfile)
    n_exons = defaultdict(int)

    # -------------------------------------------------------------------------
    # Computing number of exon for each transcript in input GTF file
    # -------------------------------------------------------------------------

    message("Computing number of exons for each transcript in input GTF file.")

    exon_gtf = gtf.select_by_key("feature", "exon")

    # One row per exon; its first field is the transcript_id.
    for row in exon_gtf.extract_data("transcript_id"):
        n_exons[row[0]] += 1

    if not text_format:
        if n_exons:
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=n_exons,
                                         new_key=key_name)
        gtf.write(outputfile, gc_off=True)
    else:
        for tx_id, exon_count in n_exons.items():
            outputfile.write(tx_id + "\t" + str(exon_count) +
                             "\ttranscript\n")

    close_properly(outputfile, inputfile)
コード例 #30
0
    def produce_dot_for_node(node, graph):
        """
        Add to graph the given node, its children and the edges between them.

        :param node: The parent node; its ``children`` are iterated.
        :param graph: The graph to which nodes and edges are added.
        :return: Always 1.
        """
        # NOTE(review): the presence tests below read `s.body`, where `s`
        # comes from the enclosing scope, not `graph.body` -- presumably
        # both are the same graph object; confirm.
        for child in node.children:
            parent_label = node_to_combi_string(node, features_names)
            child_label = node_to_combi_string(child, features_names)

            message("Drawing " + parent_label + ' --> ' + child_label, type='DEBUG')

            ## Add nodes
            # A node is only added if not already present. If present, the
            # graph's 'body' contains the combi string prefixed with a tab
            # character.

            ## Parent
            if '\t' + parent_label not in s.body:
                parent_text = format_node_string(parent_label, node.s, node.pval, node.fc)
                graph.node(parent_label, parent_text)

            ## Child
            if '\t' + child_label not in s.body:
                child_text = format_node_string(child_label, child.s, child.pval, child.fc)
                graph.node(child_label, child_text)

            graph.edge(parent_label + ':s', child_label + ':n')

        return 1