Exemple #1
0
def midpoints(inputfile=None,
              outputfile=None,
              ft_type="transcript",
              names="transcript_id",
              separator="|"):
    """
    Write the midpoint coordinates of the requested features.

    Input read from stdin is always handled as GTF. For a regular file the
    format is taken from BedTool: 'gff' input goes through
    GTF.get_midpoints(), anything else is processed interval by interval.
    """

    message("Loading input file...")

    # stdin cannot be inspected with BedTool, so it is considered GTF.
    is_gtf = True
    if inputfile.name != '<stdin>':
        region_bo = BedTool(inputfile.name)
        if len(region_bo) == 0:
            message("Unable to find requested regions", type="ERROR")
        is_gtf = region_bo.file_type == 'gff'

    if is_gtf:
        gtf = GTF(inputfile.name, check_ensembl_format=False)
        selected = gtf.select_by_key("feature", ft_type)
        bed_obj = selected.get_midpoints(name=names.split(","), sep=separator)

        for record in bed_obj:
            write_properly(chomp(str(record)), outputfile)

    else:
        for interval in region_bo:
            span = interval.end - interval.start
            half = int(span // 2)

            if span % 2 == 0:
                # Even length: there is no single central base, keep the
                # two central ones.
                # e.g. 10-14 (zero based) -> 11-13 (zero based)
                interval.start = interval.start + half - 1
                interval.end = interval.start + 2
            else:
                # Odd length: keep the single central base.
                # e.g. 10-13 (zero based) -> 11-12 (zero based)
                interval.end = interval.start + half + 1
                interval.start = interval.end - 1

            outputfile.write(str(interval))

    gc.disable()
    close_properly(outputfile, inputfile)
Exemple #2
0
def random_list(inputfile=None,
                outputfile=None,
                number=None,
                ft_type=None,
                seed_value=None):
    """
    Select a random list of genes or transcripts.

    :param inputfile: The input GTF, handed to GTF().
    :param outputfile: The destination handed to GTF.write().
    :param number: Number of identifiers to draw. When larger than the \
    number of available identifiers, a warning is issued and all of them \
    are used.
    :param ft_type: 'gene' to sample gene_id values; any other value \
    (including None) samples transcript_id values.
    :param seed_value: Optional seed passed to random.seed() so that the \
    selection can be reproduced.
    """

    message("loading the GTF.")

    gtf = GTF(inputfile)

    message("Getting ID list.")

    # A single key drives both the extraction and the final selection, so
    # the sampled identifiers are always looked up in the attribute they
    # were read from (and ft_type=None no longer breaks the key lookup).
    id_key = "gene_id" if ft_type == 'gene' else "transcript_id"

    id_list = gtf.extract_data(id_key,
                               as_list=True,
                               nr=True,
                               hide_undef=True,
                               no_na=True)

    if number > len(id_list):
        message("To much feature. Using : " + str(len(id_list)),
                type="WARNING")
        number = len(id_list)

    if seed_value is not None:
        random.seed(seed_value, version=1)

    id_list = random.sample(id_list, number)

    message("Printing.")

    gtf.select_by_key(id_key, ",".join(id_list)).write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Exemple #3
0
def exon_sizes(inputfile=None, outputfile=None, key_name=None):
    """
    Annotate transcript features with the sizes of their exons.

    The sizes are stored as a comma-separated list under the attribute
    *key_name*. For transcripts on the "-" strand the list is written in
    reverse order.
    """

    gtf = GTF(inputfile)
    all_tx_ids = gtf.get_tx_ids(nr=True)

    # transcript_id -> list of exon starts
    exons_starts = gtf.select_by_key("feature", "exon").extract_data(
        "transcript_id,start",
        as_dict_of_merged_list=True,
        no_na=True,
        nr=False)

    if len(exons_starts) == 0:
        message("No exon found.", type="ERROR")

    # transcript_id -> list of exon ends
    exons_ends = gtf.select_by_key("feature", "exon").extract_data(
        "transcript_id,end",
        as_dict_of_merged_list=True,
        no_na=True,
        nr=False)

    # transcript_id -> strand
    strands = gtf.select_by_key("feature", "transcript").extract_data(
        "transcript_id,strand",
        as_dict_of_values=True,
        no_na=True,
        nr=True,
        hide_undef=True)

    tx_to_size_list = {}

    for tx_id in all_tx_ids:
        # Coordinates are inclusive, hence the +1.
        sizes = [
            str(int(end) - int(start) + 1)
            for start, end in zip(exons_starts[tx_id], exons_ends[tx_id])
        ]
        if strands[tx_id] == "-":
            sizes.reverse()
        tx_to_size_list[tx_id] = ",".join(sizes)

    if len(tx_to_size_list) > 0:
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=tx_to_size_list,
                                     new_key=key_name)

    gtf.write(outputfile, gc_off=True)
    close_properly(outputfile, inputfile)
Exemple #4
0
def random_tx(inputfile=None,
              outputfile=None,
              max_transcript=None,
              seed_value=None):
    """
    Keep at most *max_transcript* randomly chosen transcripts per gene.

    Gene lines are discarded when the GTF is loaded. The transcripts that
    were not drawn are removed from the output.
    """

    message("loading the GTF.")

    gtf = GTF(inputfile).select_by_key("feature", "gene", invert_match=True)

    message("Getting gene_id and transcript_id")

    gene2tx = gtf.extract_data("gene_id,transcript_id",
                               as_dict_of_merged_list=True,
                               no_na=True,
                               nr=True)

    message("Selecting random transcript")

    if seed_value is not None:
        random.seed(seed_value, version=1)

    tx_to_delete = []

    for gn_id in gene2tx:
        candidates = gene2tx[gn_id]
        nb_tx = len(candidates)
        quota = min(max_transcript, nb_tx)
        # Positions (not ids) are sampled; every other position is dropped.
        kept = set(random.sample(list(range(nb_tx)), quota))
        tx_to_delete.extend(tx for pos, tx in enumerate(candidates)
                            if pos not in kept)

    message("Printing results")

    message("Selecting transcript.")
    remaining = gtf.select_by_key("transcript_id",
                                  ",".join(tx_to_delete),
                                  invert_match=True)
    remaining.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Exemple #5
0
def select_by_go(inputfile=None,
                 outputfile=None,
                 go_id=None,
                 https_proxy=None,
                 http_proxy=None,
                 list_datasets=None,
                 species=None,
                 invert_match=False):
    """
    Select GTF lines whose gene_id is returned by a Biomart query for a
    Gene Ontology ID (e.g GO:0050789).

    When *list_datasets* is set, the available dataset names are written
    instead and the program exits.
    """

    # Accept the GO identifier with or without its "GO:" prefix.
    if not go_id.startswith("GO:"):
        go_id = "GO:" + go_id

    bm = Biomart(http_proxy=http_proxy,
                 https_proxy=https_proxy)
    bm.get_datasets('ENSEMBL_MART_ENSEMBL')

    if list_datasets:
        for dataset in sorted(bm.datasets):
            write_properly(dataset.replace("_gene_ensembl", ""), outputfile)
        sys.exit()

    if species + "_gene_ensembl" not in bm.datasets:
        message("Unknow dataset/species.", type="ERROR")

    bm.query({'query': XML.format(species=species, go=go_id)})

    # One non-empty response line per identifier; insertion order is kept
    # and duplicates collapse.
    is_associated = OrderedDict()
    for entry in bm.response.content.decode().split("\n"):
        entry = entry.rstrip("\n")
        if entry == '':
            continue
        is_associated[entry] = 1

    gtf = GTF(inputfile)

    gene_ids = ",".join(list(is_associated.keys()))
    gtf_associated = gtf.select_by_key("gene_id", gene_ids, invert_match)

    gtf_associated.write(outputfile, gc_off=True)
Exemple #6
0
def intron_sizes(
        inputfile=None,
        outputfile=None,
        key_name=None):
    """
    Annotate transcript features with the sizes of their introns.

    The sizes are stored as a comma-separated list under the attribute
    *key_name*, reversed for transcripts on the "-" strand. Transcripts
    without any intron get the value "0".
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    all_tx_ids = gtf.get_tx_ids(nr=True)
    intron_bo = gtf.get_introns(by_transcript=True,
                                name=["transcript_id"],
                                intron_nb_in_name=False,
                                feat_name=False)

    transcripts = gtf.select_by_key("feature", "transcript")
    strands = transcripts.extract_data("transcript_id,strand",
                                       as_dict_of_values=True,
                                       no_na=True,
                                       nr=True,
                                       hide_undef=True)

    # Every transcript starts with an empty list so that intron-less
    # transcripts are still reported.
    intron_size = {tx: [] for tx in all_tx_ids}

    for bed_line in intron_bo:
        intron_size[bed_line.name].append(str(bed_line.end - bed_line.start))

    # Replace each list by its string representation (values only, the
    # set of keys is left untouched).
    for tx_id, sizes in intron_size.items():
        if not sizes:
            intron_size[tx_id] = "0"
        elif strands[tx_id] == "-":
            intron_size[tx_id] = ",".join(reversed(sizes))
        else:
            intron_size[tx_id] = ",".join(sizes)

    if intron_size:
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=intron_size,
                                     new_key=key_name)

    gtf.write(outputfile, gc_off=True)
    close_properly(outputfile, inputfile)
def short_long(inputfile=None,
               outputfile=None,
               longs=None,
               keep_gene_lines=False):
    """
    Keep one transcript per gene: the shortest one, or the longest one
    when *longs* is set.

    Unless *keep_gene_lines* is set, the result is additionally filtered
    on the "gene" feature with the inverted-match flag.
    """

    gtf = GTF(inputfile, check_ensembl_format=False)

    pick = (gtf.select_longuest_transcripts
            if longs else gtf.select_shortest_transcripts)
    gtf = pick()

    if not keep_gene_lines:
        # Third positional argument is the invert_match flag.
        gtf = gtf.select_by_key("feature", "gene", 1)

    gtf.write(outputfile, gc_off=True)
Exemple #8
0
def nb_exons(inputfile=None,
             outputfile=None,
             key_name=None,
             text_format=False):
    """
    Count the exons of each transcript of the GTF file.

    With *text_format*, one "transcript_id<TAB>count<TAB>transcript" line
    is written per transcript. Otherwise the count is added to transcript
    features under the attribute *key_name* and the GTF is written.
    """

    gtf = GTF(inputfile)

    message("Computing number of exons for each transcript in input GTF file.")

    # One extracted row per exon line; its first field is the
    # transcript_id.
    n_exons = defaultdict(int)
    exon = gtf.select_by_key("feature", "exon")
    for row in exon.extract_data("transcript_id"):
        n_exons[row[0]] += 1

    if not text_format:
        if len(n_exons):
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=n_exons,
                                         new_key=key_name)
        gtf.write(outputfile, gc_off=True)
    else:
        for tx_id in n_exons:
            fields = [tx_id, str(n_exons[tx_id]), "transcript"]
            outputfile.write("\t".join(fields) + "\n")

    close_properly(outputfile, inputfile)
def select_by_key(inputfile=None,
                  outputfile=None,
                  key=None,
                  value=None,
                  invert_match=False,
                  file_with_values=None,
                  col=0,
                  select_transcripts=False,
                  select_genes=False,
                  select_exons=False,
                  select_cds=False,
                  select_start_codon=False,
                  bed_format=False,
                  log=False,
                  separator="|",
                  names="transcript_id"):
    """Select lines from a GTF file based on attributes and
    associated values.

    :param inputfile: The input GTF, handed to GTF().
    :param outputfile: The output file (GTF, or BED when bed_format is set).
    :param key: The attribute/column to test.
    :param value: Comma-separated list of accepted values.
    :param invert_match: Forwarded to GTF.select_by_key().
    :param file_with_values: An open, tab-separated file providing the \
    values (mutually exclusive with value).
    :param col: Column of file_with_values holding the values. Index \
    col - 1 is read here, so the default 0 reads the last column.
    :param select_transcripts: Shortcut for key="feature", value="transcript".
    :param select_genes: Shortcut for key="feature", value="gene".
    :param select_exons: Shortcut for key="feature", value="exon".
    :param select_cds: Shortcut for key="feature", value="CDS".
    :param select_start_codon: Shortcut for key="feature", value="start_codon".
    :param bed_format: Write a 6-column BED instead of a GTF.
    :param log: Report the fraction of selected features and the requested \
    values that do not exist in the GTF.
    :param separator: Separator used to join the name fields (BED output).
    :param names: Comma-separated keys used to build the BED name column.
    """

    # ----------------------------------------------------------------------
    # Check mode
    # ----------------------------------------------------------------------

    if select_transcripts:
        key = "feature"
        value = "transcript"

    elif select_cds:
        key = "feature"
        value = "CDS"

    elif select_start_codon:
        key = "feature"
        value = "start_codon"

    elif select_genes:
        key = "feature"
        value = "gene"

    elif select_exons:
        key = "feature"
        value = "exon"

    elif file_with_values is None:
        if key is None or value is None:
            message(
                "Key and value are mandatory. Alternatively use -e/t/g/f or -f with -k.",
                type="ERROR")

    else:
        if key is None:
            message("Please set -k.", type="ERROR")
        if value is not None:
            message("The -f and -v arguments are mutually exclusive.",
                    type="ERROR")

    # ----------------------------------------------------------------------
    # Load file with value
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)
    all_values = gtf.extract_data(key, as_list=True, no_na=True, nr=True)

    if log:
        feat_before = len(gtf)

    if not file_with_values:
        value_list = value.split(",")
        gtf = gtf.select_by_key(key, value, invert_match)
    else:
        value_list = []

        for line in file_with_values:
            # Drop the line terminator first: otherwise a value taken from
            # the last column keeps its newline and is wrongly reported
            # as "not found" in the log below.
            cols = line.rstrip("\r\n").split("\t")
            value_list.append(cols[col - 1])

        # The file has been consumed above; reopen it so that the GTF
        # selection reads it from the start.
        file_with_values.close()
        file_with_values = open(file_with_values.name)

        gtf = gtf.select_by_key(key=key,
                                invert_match=invert_match,
                                file_with_values=file_with_values,
                                col=col)

    if log:

        # Sorted so that the reported list is stable between runs.
        not_found = sorted(set(value_list) - set(all_values))
        feat_after = len(gtf)
        # An empty input GTF has nothing to select from: report 0%.
        pct = feat_after / feat_before * 100 if feat_before else 0.0

        message("Number of features before selection: %d" % feat_before)
        message("Fraction of feature selected: %.2f%%" % pct)

        if not_found:
            # Only the first 50 characters of the list are displayed.
            nfj = ",".join(not_found)
            etc = "..." if len(nfj) > 50 else ""
            message("Values not found: [" + nfj[:50] + etc + "].")
        else:
            message("Values not found: [].")

    # ----------------------------------------------------------------------
    # Write GTF file
    # ----------------------------------------------------------------------

    if not bed_format:

        gtf.write(outputfile, gc_off=True)

    else:
        nb_tokens = len(names.split(","))
        keys = "seqid,start,end," + names + ",score,strand"
        nb_fields = len(keys.split(","))

        for i in gtf.extract_data_iter_list(keys, zero_based=True):
            # Fields 3..3+nb_tokens are the requested names; score and
            # strand are the last two fields.
            outputfile.write("\t".join([
                i[0],
                i[1],
                i[2],
                separator.join(i[3:(3 + nb_tokens)]),
                i[nb_fields - 2],
                i[nb_fields - 1],
            ]) + "\n")

    close_properly(outputfile, inputfile)
Exemple #10
0
def mk_matrix(inputfile=None,
              outputfile=None,
              bigwiglist=None,
              ft_type=None,
              pseudo_count=0,
              upstream=1000,
              downstream=1000,
              bin_around_frac=0.1,
              chrom_info=None,
              bin_nb=100,
              nb_proc=None,
              labels=None,
              no_stranded=False,
              zero_to_na=False):
    """
    Create a matrix to be used by 'profile' and 'heatmap' commands.

    Coverage is computed with bw_profile_mp() over the main regions and,
    for 'transcript' and 'user_regions', over the upstream and downstream
    flanking regions. The per-region tables are merged with pandas, written
    after a '#' comment line, zipped to '<output>.zip', and the uncompressed
    and intermediate files are deleted.

    :param inputfile: Input file object (GTF or BED); '<stdin>' is read as GTF.
    :param outputfile: Output file object; a trailing '.zip' in its name is \
    removed before writing.
    :param bigwiglist: List of bigwig file objects (their name is used).
    :param ft_type: 'transcript', 'promoter' or 'tts' for a GTF input; \
    'user_regions' or 'single_nuc' for a BED input.
    :param pseudo_count: Forwarded to bw_profile_mp().
    :param upstream: Size of the upstream extension.
    :param downstream: Size of the downstream extension.
    :param bin_around_frac: Fraction of bin_nb used as the number of bins of \
    the flanking regions (at least one bin).
    :param chrom_info: File object whose name is passed as genome file (g=) \
    to slop()/flank().
    :param bin_nb: Number of bins of the main region.
    :param nb_proc: Forwarded to bw_profile_mp().
    :param labels: Comma-separated labels, one per bigwig. Defaults to the \
    bigwig file base names without extension.
    :param no_stranded: When set, coverage is computed without strand \
    (stranded=not no_stranded).
    :param zero_to_na: Forwarded to bw_profile_mp().
    """

    # -------------------------------------------------------------------------
    # Check argument consistency
    #
    # -------------------------------------------------------------------------

    if ft_type in ['single_nuc', 'promoter', 'tts']:
        region_size = upstream + downstream + 1
        if region_size < bin_nb:
            message(
                "The region (-u/-d) needs to be extended given the number "
                "of bins (--bin-nb)",
                type="ERROR")

    # -------------------------------------------------------------------------
    # Ensure the output file name does not end with .zip
    # (the extension is added back when compressing).
    # -------------------------------------------------------------------------

    if outputfile.name.endswith(".zip"):
        # Strip the trailing extension only: str.replace() would also
        # alter any ".zip" found earlier in the path.
        outfn = outputfile.name[:-len(".zip")]
        outputfile = open(outfn, "w")

    # -------------------------------------------------------------------------
    # Check input file is in bed or GTF format
    #
    # -------------------------------------------------------------------------

    message("Loading input file...")
    if inputfile.name == '<stdin>':
        gtf = GTF(inputfile.name)
        is_gtf = True
        if ft_type == 'user_regions':
            message(
                "--ft-type can not be set to user_regions"
                " when a gtf is provided.",
                type="ERROR")
    else:
        try:

            region_bo = BedTool(inputfile.name)
            len(region_bo)
        except IndexError:
            message("Unable to read the input file. Check format",
                    type="ERROR")
        if len(region_bo) == 0:
            message("Unable to find requested regions", type="ERROR")

        if region_bo.file_type == 'gff':
            message('Loading the GTF file.')
            gtf = GTF(inputfile.name)
            is_gtf = True
        else:
            is_gtf = False

            if ft_type != 'user_regions' and ft_type != 'single_nuc':
                message(
                    "Set --ft-type to 'user_regions' or 'single_nuc'"
                    " when using input bed file.",
                    type="ERROR")
            # Check that the strand is provided and
            # check it is located in the right column
            # (not checked by BedTool...).
            if region_bo.field_count() < 6:
                if not no_stranded:
                    message("Strand is undefined. Use -nst.", type="ERROR")
            else:
                region_name = dict()
                for i in region_bo:
                    if region_name.get(i.name, None) is None:
                        region_name[i.name] = 1
                    else:
                        message(
                            "Regions in bed file should have "
                            "unique identifier (col 4).",
                            type="ERROR")
                    if i.strand[0] not in ['.', '+', '-']:
                        message("Strand should be one of '+','-' or '.'.",
                                type="ERROR")
                    if ft_type == 'single_nuc':
                        if i.end - i.start != 1:
                            message(
                                "Region length should be 1 nucleotide "
                                "long when 'single_nuc' is set. Use 'user_regions'.",
                                type="ERROR")
                    elif ft_type == 'user_regions':
                        if i.end - i.start == 1:
                            message(
                                "Region length should not be 1 nucleotide "
                                "long when 'user_regions' is set. Use 'single_nuc'.",
                                type="ERROR")

    # -------------------------------------------------------------------------
    # Create a list of labels for the diagrams.
    # Take user input in account
    # -------------------------------------------------------------------------
    message('Checking labels.')

    if labels is not None:
        labels = labels.split(",")
        # Ensure the number of labels is the same as the number of bw files.
        if len(labels) != len(bigwiglist):
            message(
                "The number of labels should be the same as the number of"
                " bigwig files.",
                type="ERROR")
        # Ensure labels are non-redundant
        if len(labels) > len(set(labels)):
            message("Labels must be unique.", type="ERROR")
    else:
        # Default label: bigwig file base name without its extension.
        labels = [
            os.path.splitext(os.path.basename(bw.name))[0]
            for bw in bigwiglist
        ]

    # -------------------------------------------------------------------------
    #
    # Get the requested transcript lines in bed format
    # -------------------------------------------------------------------------
    message('Getting the list of chromosomes declared in bigwig files.')
    bw_chrom = list()
    for i in bigwiglist:
        bw_handle = pyBigWig.open(i.name)
        try:
            bw_chrom += list(bw_handle.chroms().keys())
        finally:
            # Release the handle once the chromosome names are read.
            bw_handle.close()

    bed_col = [0, 1, 2, 3, 4, 5]

    if is_gtf:

        message('Selecting chromosomes declared in bigwig from gtf.')
        tmp = gtf.select_by_key("feature", "transcript").select_by_key(
            "seqid", ",".join(bw_chrom))

        # NOTE(review): the chromosome-restricted selection above is
        # overwritten right away, so transcripts are currently NOT
        # restricted to the chromosomes declared in the bigwig files.
        # Behavior kept as is -- confirm whether the filter is wanted.
        tmp = gtf.select_by_key("feature", "transcript")
        tmp_tx_name = tmp.extract_data("transcript_id", as_list=True)

        # If several transcript records are associated to
        # the same transcript_id, raise an error.
        if len(tmp_tx_name) > len(set(tmp_tx_name)):
            message('Transcripts should have a unique identifier.',
                    type="ERROR")

        message('Selecting requested regions.')

        # ----------------------------------------------------------------------
        #
        # Slop tss and promoters.
        # No need if transcript was requested (it will be flanked by upstream
        # and downstream regions later on).
        # ----------------------------------------------------------------------

        if ft_type == 'transcript':
            message("Getting transcript boundaries (input gtf).")

            main_region_bo = tmp.to_bed(name=["transcript_id"])

        elif ft_type == 'promoter':

            message("Getting promoter regions [-%d,+%d]." %
                    (upstream, downstream))

            main_region_bo = tmp.get_tss(name=["transcript_id"]).slop(
                s=True, l=upstream, r=downstream, g=chrom_info.name)

        elif ft_type == 'tts':

            main_region_bo = tmp.get_tts(name=["transcript_id"]).slop(
                s=True, l=upstream, r=downstream, g=chrom_info.name)

        else:
            # Without this branch main_region_bo would be undefined and
            # the function would stop later on a NameError.
            message(
                "--ft-type should be one of 'transcript', 'promoter' "
                "or 'tts' when a gtf is provided.",
                type="ERROR")

    else:
        message("Loading regions")

        if ft_type == 'user_regions':
            main_region_bo = BedTool(inputfile.name).cut(bed_col)
        elif ft_type == 'single_nuc':
            main_region_bo = BedTool(inputfile.name).cut(bed_col).slop(
                s=True, l=upstream, r=downstream, g=chrom_info.name)
        else:
            # main_region_bo can not be built: report an error instead of
            # an informative message.
            message("Unknown method.", type="ERROR")

    # Save for traceability
    main_region_bed = make_tmp_file(prefix="region" + ft_type, suffix=".bed")
    main_region_bo.saveas(main_region_bed.name)

    # -------------------------------------------------------------------------
    #
    # Print a header in the output file
    #
    # -------------------------------------------------------------------------
    message("Preparing comments")

    comments = "#"
    comments += "ft_type:" + ft_type + ";"
    comments += "from:" + str(upstream) + ";"
    comments += "to:" + str(downstream) + ";"
    comments += "labels:" + ",".join(labels) + ";"

    # -------------------------------------------------------------------------
    # Compute coverage of requested region
    # Each worker will send a file
    # -------------------------------------------------------------------------

    outputfile_list = {}
    message("Using %d bins for main region." % bin_nb)

    tmp_file = bw_profile_mp(in_bed_file=main_region_bed.name,
                             nb_proc=nb_proc,
                             big_wig=[x.name for x in bigwiglist],
                             bin_nb=bin_nb,
                             pseudo_count=pseudo_count,
                             stranded=not no_stranded,
                             type="main",
                             labels=labels,
                             outputfile=outputfile.name,
                             zero_to_na=zero_to_na,
                             verbose=pygtftk.utils.VERBOSITY)

    outputfile_list["main"] = tmp_file

    # -------------------------------------------------------------------------
    # If transcript was requested
    # we must process flanking regions
    # We need to retrieve coverage of promoter [-upstream, 0]
    # as transcript coverage window size will depend on transcript length.
    # For promoter the length of windows will be fixed.
    # -------------------------------------------------------------------------

    if ft_type in ['transcript', 'user_regions']:

        # Number of bins for TTS and TSS
        around_bin_nb = int(round(bin_nb * bin_around_frac))
        if around_bin_nb < 1:
            around_bin_nb = 1

        if upstream > 0:

            if ft_type == 'transcript':
                message("Getting promoter (using %d bins)." % around_bin_nb)
                ups_region_bo = tmp.get_tss(name=["transcript_id"]).slop(
                    s=True, l=upstream, r=-1, g=chrom_info.name).cut(bed_col)

            else:
                message("Getting upstream regions (%d bins)." % around_bin_nb)
                ups_region_bo = main_region_bo.flank(s=True,
                                                     l=upstream,
                                                     r=0,
                                                     g=chrom_info.name)

            upstream_bed_file = make_tmp_file(prefix="upstream_region" +
                                              ft_type,
                                              suffix=".bed")

            ups_region_bo.saveas(upstream_bed_file.name)

            tmp_file = bw_profile_mp(in_bed_file=upstream_bed_file.name,
                                     nb_proc=nb_proc,
                                     big_wig=[x.name for x in bigwiglist],
                                     bin_nb=around_bin_nb,
                                     pseudo_count=pseudo_count,
                                     stranded=not no_stranded,
                                     type="upstream",
                                     labels=labels,
                                     outputfile=outputfile.name,
                                     zero_to_na=zero_to_na,
                                     verbose=pygtftk.utils.VERBOSITY)

            outputfile_list["upstream"] = tmp_file

        if downstream > 0:

            if ft_type == 'transcript':
                message("Getting TTS (using %d bins)." % around_bin_nb)
                dws_region_bo = tmp.get_tts(name=["transcript_id"]).slop(
                    s=True, l=-1, r=downstream, g=chrom_info.name).cut(bed_col)
            else:
                message("Getting downstream regions (%d bins)." %
                        around_bin_nb)

                dws_region_bo = main_region_bo.flank(s=True,
                                                     l=0,
                                                     r=downstream,
                                                     g=chrom_info.name)
            dws_bed_file = make_tmp_file(prefix="dowstream_region" + ft_type,
                                         suffix=".bed")

            dws_region_bo.saveas(dws_bed_file.name)

            tmp_file = bw_profile_mp(in_bed_file=dws_bed_file.name,
                                     nb_proc=nb_proc,
                                     big_wig=[x.name for x in bigwiglist],
                                     bin_nb=around_bin_nb,
                                     pseudo_count=pseudo_count,
                                     stranded=not no_stranded,
                                     type="downstream",
                                     labels=labels,
                                     outputfile=outputfile.name,
                                     zero_to_na=zero_to_na,
                                     verbose=pygtftk.utils.VERBOSITY)

            outputfile_list["downstream"] = tmp_file

    # -------------------------------------------------------------------------
    #
    # Merge file using pandas
    #
    # -------------------------------------------------------------------------

    message("Reading (pandas): " + outputfile_list["main"].name, type="DEBUG")
    df_main = pd.read_csv(outputfile_list["main"].name, sep="\t")
    # Save start and end of the main regions.
    # They are joined back once the flanking regions are merged.
    df_copy = df_main[['bwig', 'chrom', 'gene', 'strand', 'start', 'end']]

    # Remove the coordinates from the main table (kept in df_copy).
    df_main.pop('start')
    df_main.pop('end')

    if "upstream" in outputfile_list:
        message("Merging upstream file")
        message("Reading (pandas): " + outputfile_list["upstream"].name,
                type="DEBUG")
        df_up = pd.read_csv(outputfile_list["upstream"].name, sep="\t")
        # 'columns=' instead of a positional axis, which recent pandas
        # versions no longer accept.
        df_up = df_up.drop(columns=['start', 'end'])
        df_main = df_up.merge(df_main.loc[:, df_main.columns],
                              on=['bwig', 'chrom', 'gene', 'strand'])

    if "downstream" in outputfile_list:
        message("Merging downstream file")
        message("Reading (pandas): " + outputfile_list["downstream"].name,
                type="DEBUG")
        df_dws = pd.read_csv(outputfile_list["downstream"].name, sep="\t")
        df_dws = df_dws.drop(columns=['start', 'end'])
        df_main = df_main.merge(df_dws.loc[:, df_dws.columns],
                                on=['bwig', 'chrom', 'gene', 'strand'])

    # join start and end.
    df_main = df_main.merge(df_copy.loc[:, df_copy.columns],
                            on=['bwig', 'chrom', 'gene', 'strand'])
    # Move start and end right after the first two columns.
    df_start = df_main.pop('start')
    df_end = df_main.pop('end')
    df_main.insert(2, 'start', df_start)
    df_main.insert(3, 'end', df_end)

    message("Writing to file")
    outputfile.close()

    with open(outputfile.name, 'a') as f:
        f.write(comments + "\n")
        df_main.to_csv(f,
                       sep="\t",
                       index=False,
                       mode='a',
                       columns=df_main.columns,
                       na_rep='NA')

    # -------------------------------------------------------------------------
    #
    # Compress
    #
    # -------------------------------------------------------------------------

    message("Compressing")
    path = os.path.abspath(outputfile.name)
    filename = os.path.basename(path)
    message("filename: " + filename, type="DEBUG")
    zip_filename = filename + '.zip'
    message("zip_filename: " + zip_filename, type="DEBUG")
    zip_path = os.path.join(os.path.dirname(path), zip_filename)
    message("zip_path: " + zip_path, type="DEBUG")

    with zipfile.ZipFile(zip_path, 'w', allowZip64=True) as zf:
        zf.write(filename=path, arcname=filename)

    # Only the zip archive is kept.
    for i in outputfile_list:
        message("deleting " + outputfile_list[i].name)
        os.remove(outputfile_list[i].name)
    os.remove(outputfile.name)

    gc.disable()
    close_properly(inputfile, outputfile)
Exemple #11
0
def get_feat_seq(inputfile=None,
                 outputfile=None,
                 genome=None,
                 feature_type="exon",
                 separator="",
                 no_rev_comp=False,
                 label="",
                 rev_comp_to_header=False,
                 unique=False):
    """
    Description: Get the sequences of the requested features in fasta format
    from a GTF file.

    :param inputfile: The GTF input (handed as is to GTF()).
    :param outputfile: A writable file object receiving the fasta records.
    :param genome: An open, uncompressed fasta file object (its name is \
    passed to bedtools and its content is scanned for '>' headers).
    :param feature_type: The value of the 'feature' column to select.
    :param separator: Separator used to join the label keys in the fasta \
    header and to append the strandedness flag.
    :param no_rev_comp: If True sequences are not reverse-complemented \
    (strandedness is not forced).
    :param label: Comma-separated list of keys used to build the header.
    :param rev_comp_to_header: If True append 'rev_comp' or 'no_rev_comp' \
    to every header.
    :param unique: If True a record whose header was already written is \
    skipped together with its sequence line.
    """

    # -------------------------------------------------------------------------
    # Should sequences be reverse-complemented
    # -------------------------------------------------------------------------

    force_strandedness = not no_rev_comp

    # -------------------------------------------------------------------------
    # Check chrom to avoid segfault
    # https://github.com/dputhier/libgtftk/issues/27
    # -------------------------------------------------------------------------

    if genome.name.endswith(".gz"):
        message("Genome in gz format is not currently supported.",
                type="ERROR")

    message("Fasta files found: %s" % genome.name)

    message("Checking fasta file chromosome list")

    # A set is enough here: the names are only used for membership tests.
    with genome as geno:
        genome_chroms = {i.rstrip("\n")[1:] for i in geno
                         if i.startswith(">")}

    gtf = GTF(inputfile, check_ensembl_format=False)

    gtf_chr_list = gtf.get_chroms(nr=True)

    message("Comparing chromosomes from GTF and Fasta files.")
    gtf_chr_list_found = [x for x in gtf_chr_list if x in genome_chroms]

    if len(gtf_chr_list_found) == 0:
        message("Chromosome from GTF were not found in fasta file",
                type="ERROR")

    if len(gtf_chr_list_found) != len(gtf_chr_list):
        not_found = [x for x in gtf_chr_list if x not in genome_chroms]
        message("Some chromosomes were not found in the fasta file: %s" %
                ",".join(not_found),
                type="ERROR")

    # -------------------------------------------------------------------------
    # Retrieving fasta sequences
    #
    # -------------------------------------------------------------------------

    message("Retrieving fasta sequences.")

    def _selected_bed():
        # Build the BED object of the requested features (rebuilt for each
        # attempt, exactly as the two original call sites did).
        return gtf.select_by_key("feature", feature_type).to_bed(
            name=label.split(","),
            sep=separator)

    try:
        # The nameOnly argument is not supported
        # through all Bedtools versions
        feat_seq = _selected_bed().sequence(fi=genome.name,
                                            nameOnly=True,
                                            s=force_strandedness)
    except BEDToolsError:
        feat_seq = _selected_bed().sequence(fi=genome.name,
                                            name=True,
                                            s=force_strandedness)

    id_printed = set()

    to_print = True

    if force_strandedness:
        strand_flag = "rev_comp"
    else:
        strand_flag = "no_rev_comp"

    with open(feat_seq.seqfn) as fasta:
        for line in fasta:

            if line.startswith(">"):

                # Work on the header without its line terminator so that
                # anything appended below stays on the header line.
                header = line.rstrip("\n")

                # This (+/-) may be added by pybedtool
                # but can be accessed though --label
                header = re.sub(r"\(\+\)$", "", header)
                header = re.sub(r"\(-\)$", "", header)

                if rev_comp_to_header:
                    header = header + separator + strand_flag

                if unique and header in id_printed:
                    to_print = False

                if to_print:
                    outputfile.write(header + "\n")
                    id_printed.add(header)

            elif to_print:
                outputfile.write(line)
            else:
                # The sequence of a skipped duplicate has been consumed:
                # resume printing with the next record.
                to_print = True

    # NOTE(review): garbage collection is switched off right before closing;
    # presumably to speed up interpreter exit -- confirm.
    gc.disable()
    close_properly(outputfile, inputfile)
Exemple #12
0
def feature_size(inputfile=None,
                 outputfile=None,
                 ft_type="transcript",
                 names="transcript_id",
                 key_name='feature_size',
                 separator="|",
                 bed=False):
    """
 Get the size of the features enclosed in the GTF.

 If bed format is requested, one BED line is written per selected feature
 with the size (end - start of the BED record) stored as the score.
 Otherwise the GTF is written with the size stored under the key given by
 `key_name`. The special feature type 'mature_rna' relies on the sizes
 returned by gtf.get_transcript_size() and annotates transcript features.
    """

    message("Computing feature sizes.")

    gtf = GTF(inputfile)

    allowed_types = gtf.get_feature_list(nr=True) + ['mature_rna'] + ["*"]

    if ft_type not in allowed_types:
        message("Unable to find requested feature.", type="ERROR")

    name_keys = names.split(",")

    if ft_type == 'mature_rna':

        tx_size = gtf.get_transcript_size()

        if not bed:
            # Only annotate when at least one transcript size is available.
            if len(tx_size):
                gtf = gtf.add_attr_from_dict(feat="transcript",
                                             key="transcript_id",
                                             a_dict=tx_size,
                                             new_key=key_name)

            gtf.write(outputfile, gc_off=True)

        else:
            tx_bed = gtf.select_by_key("feature", 'transcript').to_bed(
                ['transcript_id'] + name_keys,
                add_feature_type=False,
                sep=separator,
                more_name=['mature_rna'])

            for record in tx_bed:
                # The transcript id was placed first in the name only to
                # look up the size: strip it before printing.
                name_fields = record.name.split(separator)
                tx_id = name_fields.pop(0)
                record.score = tx_size[tx_id]
                record.name = separator.join(name_fields)
                write_properly(chomp(str(record)), outputfile)

    elif bed:

        selected = gtf.select_by_key("feature", ft_type)
        feat_bed = selected.to_bed(name=name_keys,
                                   sep=separator,
                                   add_feature_type=True)

        for record in feat_bed:
            record.score = str(record.end - record.start)
            write_properly(chomp(str(record)), outputfile)

    else:

        size_file = make_tmp_file(prefix="feature_size", suffix=".txt")

        rows = gtf.extract_data("feature,start,end",
                                as_list_of_list=True,
                                no_na=False,
                                hide_undef=False)

        # One line per GTF record: the size for the requested features,
        # "?" for every other record.
        for row in rows:
            if ft_type == "*" or row[0] == ft_type:
                size_file.write(str(int(row[2]) - int(row[1]) + 1) + "\n")
            else:
                size_file.write("?\n")

        size_file.close()

        annotated = gtf.add_attr_column(size_file, key_name)
        annotated.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Exemple #13
0
def convergent(inputfile=None,
               outputfile=None,
               upstream=1500,
               downstream=1500,
               chrom_info=None):
    """
    Find transcript with convergent tts.

    Each TTS is extended by `upstream`/`downstream` (strand-aware slop) and
    intersected with the TTS located on the opposite strand. For every
    transcript, the closest such TTS belonging to another gene is stored
    under the 'convergent' key and its distance under 'dist_to_convergent'.
    """

    message("Using -u " + str(upstream) + ".")
    message("Using -d " + str(downstream) + ".")

    # transcript id -> id of the closest convergent transcript
    tx_to_convergent_nm = {}
    # transcript id -> distance to that transcript's tts
    dist_to_convergent = {}
    # transcript id -> tts start coordinate
    tts_pos = {}

    message("Loading GTF.")

    gtf = GTF(inputfile)

    message("Getting transcript coordinates.")

    tx_feat = gtf.select_by_key("feature", "transcript")

    message("Getting tts coordinates.")

    tts_bo = tx_feat.get_tts(name=["transcript_id", "gene_id"], sep="||")

    # get tts position
    for tts in tts_bo:
        tx_id, _gene_id = tts.name.split("||")
        tts_pos[tx_id] = int(tts.start)

    message("Getting tts coordinates.")

    slopped = tts_bo.slop(s=True,
                          l=upstream,
                          r=downstream,
                          g=chrom_info.name)
    tts_region_bo = slopped.cut([0, 1, 2, 3, 4, 5])

    message("Intersecting...")
    tts_intersect_bo = tts_region_bo.intersect(tts_bo,
                                               wb=True,
                                               s=False,
                                               S=True)

    # Keep a copy of the intermediate BED objects in temporary files.
    region_tmp = make_tmp_file("tts_slop", ".bed")
    tts_region_bo.saveas(region_tmp.name)
    intersect_tmp = make_tmp_file("tts_slop_intersection_with_tts_as_", ".bed")
    tts_intersect_bo.saveas(intersect_tmp.name)

    for hit in tts_intersect_bo:

        tx_id_main, gene_id_main = hit.fields[3].split("||")
        tx_id_ov, gn_id_ov = hit.fields[9].split("||")

        # Transcripts of the same gene are not considered convergent.
        if gene_id_main == gn_id_ov:
            continue

        dist = abs(tts_pos[tx_id_main] - tts_pos[tx_id_ov])

        # Record the first candidate, then only strictly closer ones.
        is_first = tx_id_main not in tx_to_convergent_nm
        if is_first or dist < dist_to_convergent[tx_id_main]:
            dist_to_convergent[tx_id_main] = dist
            tx_to_convergent_nm[tx_id_main] = tx_id_ov

    if len(tx_to_convergent_nm):
        for a_dict, new_key in ((tx_to_convergent_nm, "convergent"),
                                (dist_to_convergent, "dist_to_convergent")):
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=a_dict,
                                         new_key=new_key)

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Exemple #14
0
def get_5p_3p_coords(inputfile=None,
                     outputfile=None,
                     ft_type="transcript",
                     names="transcript_id",
                     separator="|",
                     more_names='',
                     transpose=0,
                     invert=False,
                     explicit=False):
    """
    Get the 5p or 3p coordinate for each feature (e.g TSS or TTS for a transcript).

    The 5' end is computed unless `invert` is set, in which case the 3' end
    is returned. When `transpose` is not zero, the coordinates are shifted
    by that amount: added on the '+' strand, subtracted on the '-' strand.
    """

    more_names = [] if more_names is None else more_names.split(',')

    which_end = "3'" if invert else "5'"
    message("Computing " + which_end + " coordinates of '" + ft_type + "'.")

    gtf = GTF(inputfile, check_ensembl_format=False)

    if names == "*":
        # '*' requests every attribute found on transcript features.
        tx_gtf = gtf.select_by_key("feature", "transcript")
        nms = tx_gtf.get_attr_list(add_basic=False)
    else:
        nms = names.split(",")

    get_end = gtf.get_3p_end if invert else gtf.get_5p_end

    bed_obj = get_end(feat_type=ft_type,
                      name=nms,
                      sep=separator,
                      more_name=more_names,
                      explicit=explicit)

    if len(bed_obj) == 0:
        message("Requested feature could not be found. Use convert_ensembl maybe.",
                type="ERROR")

    if transpose == 0:
        for feat in bed_obj:
            write_properly(chomp(str(feat)), outputfile)
    else:
        for feat in bed_obj:
            if feat.strand == "+":
                shift = transpose
            elif feat.strand == "-":
                shift = -transpose
            else:
                shift = None

            if shift is None:
                # Any other strand value yields an empty output line.
                fields = []
            else:
                fields = [feat.chrom,
                          str(feat.start + shift),
                          str(feat.end + shift),
                          feat.name,
                          feat.score,
                          feat.strand]

            outputfile.write("\t".join(fields) + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
def closest_genes(
        inputfile=None,
        outputfile=None,
        from_region_type=None,
        no_header=False,
        nb_neighbors=1,
        to_region_type=None,
        same_strandedness=False,
        diff_strandedness=False,
        text_format=False,
        identifier="gene_id",
        collapse=False):
    """
    Find the n closest genes for each gene.

    Source ('from') and destination ('to') regions can each be the gene
    body ('gene'), its 5' end ('tss') or its 3' end ('tts'). The result is
    written either as a GTF with 'closest_gn' and 'closest_dist' keys added
    to gene features, or as tab-delimited text when `text_format` is set.
    """

    if same_strandedness and diff_strandedness:
        message("--same-strandedness and --diff-strandedness are "
                "mutually exclusive.",
                type="ERROR")

    # ----------------------------------------------------------------------
    # load GTF
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile)
    gn_gtf = gtf.select_by_key("feature", "gene")
    gn_ids = gn_gtf.get_gn_ids(nr=True)

    nb_genes = len(gn_gtf)

    if nb_genes == 0:
        message("No gene feature found. Please use convert_ensembl.",
                type="ERROR")
    if nb_neighbors >= (nb_genes - 1):
        message("Two much neighbors",
                type="ERROR")

    all_ids = gn_gtf.extract_data(identifier, as_list=True, no_na=False)

    if "." in all_ids:
        message("Some identifiers are undefined ('.').",
                type="ERROR")

    if len(all_ids) == 0:
        message("The identifier was not found.",
                type="ERROR")

    # ----------------------------------------------------------------------
    # Requested regions: first for the source ('from') genes, then for the
    # destination ('to') genes.
    # ----------------------------------------------------------------------

    def _gene_regions(region_type):
        # Return the sorted 6-column BED of the requested gene region.
        if region_type == 'tss':
            bed = gn_gtf.get_5p_end(feat_type="gene", name=[identifier])
        elif region_type == 'tts':
            bed = gn_gtf.get_3p_end(feat_type="gene", name=[identifier])
        elif region_type == 'gene':
            bed = gn_gtf.to_bed(name=[identifier])
        else:
            message("Unknown type.", type="ERROR")
        return bed.cut([0, 1, 2, 3, 4, 5]).sort()

    from_regions = _gene_regions(from_region_type)
    to_regions = _gene_regions(to_region_type)

    # ----------------------------------------------------------------------
    # Search closest genes
    # ----------------------------------------------------------------------

    gene_closest = defaultdict(list)
    gene_closest_dist = defaultdict(list)

    closest_bo = from_regions.closest(b=to_regions,
                                      k=nb_neighbors,
                                      N=True,
                                      s=same_strandedness,
                                      S=diff_strandedness,
                                      d=True)

    # Field 3 is the source name, field 9 the neighbour name and field 12
    # the distance column added by d=True.
    for hit in closest_bo:
        gene_closest[hit[3]].append(hit[9])
        gene_closest_dist[hit[3]].append(hit[12])

    if text_format:

        if not no_header:
            outputfile.write("genes\tclosest_genes\tdistances\n")

        for gene in gn_ids:

            if collapse:
                # One line per (gene, neighbour) pair.
                pairs = zip(gene_closest[gene], gene_closest_dist[gene])
                for closest, dist in pairs:
                    outputfile.write("\t".join([gene, closest, dist]) + "\n")
            else:
                # One line per gene, neighbours and distances comma-joined.
                columns = [gene,
                           ",".join(gene_closest[gene]),
                           ",".join(gene_closest_dist[gene])]
                outputfile.write("\t".join(columns) + "\n")

        gc.disable()

    else:

        if len(gene_closest):
            for a_dict, new_key in ((gene_closest, "closest_gn"),
                                    (gene_closest_dist, "closest_dist")):
                gtf = gtf.add_attr_from_dict(feat="gene",
                                             key=identifier,
                                             a_dict=a_dict,
                                             new_key=new_key)

        gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Exemple #16
0
def overlapping(
        inputfile=None,
        outputfile=None,
        key_name=None,
        upstream=1500,
        downstream=1500,
        chrom_info=None,
        feature_type='transcript',
        same_strandedness=False,
        diff_strandedness=False,
        annotate_gtf=False,
        bool=False,
        annotate_all=False,
        invert_match=False):
    """
Description: Find transcripts whose body/TSS/TTS do or do not overlap with any
transcript from another gene.

The region of each transcript (body, promoter or tts depending on
`feature_type`) is extended with `upstream`/`downstream` and intersected with
the transcript bodies. Depending on the flags, the matching transcripts are
selected, discarded (`invert_match`) or the GTF is annotated (`annotate_gtf`).
    """

    # ----------------------------------------------------------------------
    # Prepare key names
    # ----------------------------------------------------------------------

    if annotate_gtf:
        if key_name is None:
            key_name = "_".join(["overlap",
                                 feature_type,
                                 "u" + str(upstream / 1000) + "k",
                                 "d" + str(downstream / 1000) + "k"])

        if invert_match:
            message("--annotate-gtf and --invert-match are "
                    "mutually exclusive.",
                    type="ERROR")

    if same_strandedness and diff_strandedness:
        message("--same-strandedness and --diff-strandedness are "
                "mutually exclusive.",
                type="ERROR")

    message("Using -u " + str(upstream))
    message("Using -d " + str(downstream))

    # transcript id -> list of overlapping transcripts from other genes
    overlapping_tx = defaultdict(list)

    # Load the GTF so that it won't be lost
    # if GTF stream comes from stdin
    gtf = GTF(inputfile)

    message("Getting transcript in bed format")

    tx_feat = gtf.select_by_key("feature",
                                "transcript")

    if annotate_all:
        # Start with an (empty) entry for every transcript id.
        overlapping_tx = gtf.extract_data(keys=["transcript_id"],
                                          as_dict=True,
                                          default_val="0")
        for tx in overlapping_tx:
            overlapping_tx[tx] = []

    # ----------------------------------------------------------------------
    # Get transcript limits
    # ----------------------------------------------------------------------

    id_keys = ["transcript_id", "gene_id"]

    tx_bed = tx_feat.to_bed(name=id_keys, sep="||")

    message("Getting " + feature_type + " and 'slopping'.")

    if feature_type == "transcript":
        unslopped = tx_bed
    elif feature_type == "promoter":
        unslopped = tx_feat.get_tss(name=id_keys, sep="||")
    elif feature_type == "tts":
        unslopped = tx_feat.get_tts(name=id_keys, sep="||")
    else:
        message("Not implemented yet", type="ERROR")

    bed_obj = unslopped.slop(s=True,
                             l=upstream,
                             r=downstream,
                             g=chrom_info.name).cut([0, 1, 2, 3, 4, 5])

    slop_tmp = make_tmp_file(feature_type + "_slopped_region", ".bed")
    bed_obj.saveas(slop_tmp.name)

    overlap_regions = bed_obj.intersect(tx_bed,
                                        wb=True,
                                        s=same_strandedness,
                                        S=diff_strandedness)

    overlap_tmp = make_tmp_file(feature_type + "_overlapping_regions", ".bed")
    overlap_regions.saveas(overlap_tmp.name)

    for hit in overlap_regions:

        tx_other, gn_other = hit.fields[9].split("||")
        tx_id, gene_id = hit.fields[3].split("||")

        # Only overlaps with transcripts of another gene are recorded.
        if gene_id != gn_other:
            overlapping_tx[tx_id].append(tx_other)

    if bool:
        # Replace the lists by a "0"/"1" flag.
        for tx in overlapping_tx:
            overlapping_tx[tx] = "1" if len(overlapping_tx[tx]) else "0"

    if invert_match:
        values = ",".join(set(overlapping_tx.keys()))
        gtf.select_by_key("transcript_id",
                          values,
                          invert_match).write(outputfile, gc_off=True)

    elif annotate_gtf:
        if len(overlapping_tx):
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=overlapping_tx,
                                         new_key=key_name)
        gtf.write(outputfile,
                  gc_off=True)

    else:
        value = ",".join(set(overlapping_tx.keys()))
        gtf.select_by_key("transcript_id",
                          value).write(outputfile,
                                       gc_off=True)

    gc.disable()
    close_properly(outputfile, inputfile)
Exemple #17
0
def coverage(
        inputfile=None,
        outputfile=None,
        bw_list=None,
        labels=None,
        pseudo_count=1,
        nb_window=1,
        ft_type="promoter",
        n_highest=None,
        downstream=1000,
        key_name="cov",
        zero_to_na=False,
        name_column=None,
        upstream=1000,
        chrom_info=None,
        nb_proc=1,
        matrix_out=False,
        stat='mean'):
    """
    Compute transcript coverage with one or several bigWig.

    :param inputfile: A GTF or BED file object (a GTF is assumed for stdin).
    :param outputfile: A writable file object.
    :param bw_list: List of bigWig file objects (only their names are used).
    :param labels: Comma-separated labels, one per bigWig. Derived from the \
    bigWig file names when None.
    :param pseudo_count: Forwarded to bw_cov_mp.
    :param nb_window: Number of bins, forwarded to bw_cov_mp.
    :param ft_type: Region type computed from the GTF ('intergenic', \
    'intron', 'intron_by_tx', 'promoter'/'tss', 'tts'/'terminator', or any \
    value of the feature column).
    :param n_highest: Forwarded to bw_cov_mp; defaults to nb_window and may \
    not exceed it.
    :param downstream: Extension of the regions in 3' (strand-aware slop).
    :param key_name: Not used by this function.
    :param zero_to_na: Forwarded to bw_cov_mp.
    :param name_column: Comma-separated keys used to name the regions.
    :param upstream: Extension of the regions in 5' (strand-aware slop).
    :param chrom_info: Chromosome size file object.
    :param nb_proc: Forwarded to bw_cov_mp.
    :param matrix_out: If True write one line per region with one column \
    per bigWig, otherwise write the lines returned by bw_cov_mp as is.
    :param stat: Forwarded to bw_cov_mp.
    """

    # -------------------------------------------------------------------------
    # Create a list of labels.
    # Take user input in account
    # -------------------------------------------------------------------------

    bw_list = [x.name for x in bw_list]

    if len(bw_list) != len(set(bw_list)):
        message("Found the same bigwigs several times.",
                type="ERROR")

    message('Checking labels.')

    if labels is not None:
        labels = labels.split(",")
        # Ensure the number of labels is the same as the number of bw files.
        if len(labels) != len(bw_list):
            message("The number of labels should be the same as the number of"
                    " bigwig files.", type="ERROR")
        # Ensure labels are non-redundant
        if len(labels) > len(set(labels)):
            message("Labels must be unique.", type="ERROR")
    else:
        # Default label: bigWig file name without directory nor extension.
        labels = [os.path.splitext(os.path.basename(bw))[0]
                  for bw in bw_list]

    # -------------------------------------------------------------------------
    # Check the number of windows
    #
    # -------------------------------------------------------------------------

    if n_highest is None:
        n_highest = nb_window

    message('Number of bins: %d' % nb_window)
    message('N highest values: %d' % n_highest)

    if n_highest > nb_window:
        message('The number of window used for computing the score'
                ' (-n) can not be greater than the number of'
                ' windows (-w)', type="ERROR")
        sys.exit()

    # -------------------------------------------------------------------------
    # Check input file is in bed or GTF format
    #
    # -------------------------------------------------------------------------

    message("Loading input file...")
    if inputfile.name == '<stdin>':
        gtf = GTF(inputfile.name)
        is_gtf = True
    else:
        region_bo = BedTool(inputfile.name)
        if len(region_bo) == 0:
            message("Unable to find requested regions",
                    type="ERROR")

        if region_bo.file_type == 'gff':
            gtf = GTF(inputfile.name)
            is_gtf = True
        else:
            is_gtf = False

    # -------------------------------------------------------------------------
    # Get regions of interest
    #
    # -------------------------------------------------------------------------

    name_column = name_column.split(",")

    def _slop_and_sort(bed):
        # Extend the regions (strand-aware) and sort them.
        return bed.slop(s=True,
                        l=upstream,
                        r=downstream,
                        g=chrom_info.name).sort()

    if is_gtf:

        message("Getting regions of interest...")

        ft_lower = ft_type.lower()

        if ft_lower == "intergenic":
            regions = gtf.get_intergenic(chrom_info, 0, 0)

        elif ft_lower == "intron":
            regions = gtf.get_introns()

        elif ft_type == "intron_by_tx":
            regions = gtf.get_introns(by_transcript=True,
                                      name=name_column)

        elif ft_lower in ["promoter", "tss"]:
            regions = gtf.get_tss(name=name_column)

        elif ft_lower in ["tts", "terminator"]:
            regions = gtf.get_tts(name=name_column)

        else:
            regions = gtf.select_by_key("feature",
                                        ft_type,
                                        0).to_bed(name=name_column)

        region_bo = _slop_and_sort(regions)

        if len(region_bo) == 0:
            message("Unable to find requested regions",
                    type="ERROR")

    else:
        region_bo = _slop_and_sort(region_bo)

    region_bed = make_tmp_file(prefix="region", suffix=".bed")

    region_bo.saveas(region_bed.name)

    # -------------------------------------------------------------------------
    # Compute coverage
    #
    # -------------------------------------------------------------------------

    result_bed = bw_cov_mp(bw_list=bw_list,
                           region_file=open(region_bed.name),
                           labels=labels,
                           bin_nb=nb_window,
                           pseudo_count=pseudo_count,
                           zero_to_na=zero_to_na,
                           nb_proc=nb_proc,
                           n_highest=n_highest,
                           stat=stat,
                           verbose=pygtftk.utils.VERBOSITY)

    if matrix_out:
        result_bed.close()

        df_first = pd.read_csv(result_bed.name, sep="\t", header=None)

        # Put strand before the coverage value. The file has no header so
        # column labels equal positions; DataFrame.ix no longer exists in
        # pandas >= 1.0, hence the positional indexer.
        df_first = df_first.iloc[:, [0, 1, 2, 3, 5, 4]]

        # chrom, start, end, name, strand
        key_columns = [0, 1, 2, 3, 5]

        df_list = []

        for label in labels:
            # create a sub data frame containing the coverage values of the
            # current bwig. The name column is expected to start with
            # "<label>|"; a literal prefix test is used so that labels
            # containing regex metacharacters are handled properly.
            prefix = label + "|"
            is_current = df_first[3].str.startswith(prefix, na=False)
            tmp_df = df_first[is_current].copy()
            tmp_df[3] = tmp_df[3].str[len(prefix):]

            # Name the value column after the label so that successive
            # merges never produce clashing '_x'/'_y' suffixes.
            df_list += [tmp_df.rename(columns={4: label})]

        df_final = df_list.pop(0)

        for tmp_df in df_list:
            # Add columns to df final by joining on
            # chrom, start, end, transcript_id, strand
            df_final = df_final.merge(tmp_df, on=key_columns)

        df_final.columns = ["chrom",
                            "start",
                            "end",
                            "name",
                            "strand"] + labels

        df_final.to_csv(outputfile, sep="\t", index=False)

    else:
        nb_line = 0
        for line in result_bed:
            outputfile.write(line)
            nb_line += 1

        if nb_line == 0:
            message("No line available in output...",
                    type="ERROR")

    # NOTE(review): garbage collection is switched off right before closing;
    # presumably to speed up interpreter exit -- confirm.
    gc.disable()
    close_properly(inputfile, outputfile)
Exemple #18
0
def _read_fasta_chrom_names(fasta_file):
    """Return every fasta header of *fasta_file*, stripped of its leading '>'.

    The file object is used as a context manager, so it is closed on return.
    The whole header line (minus the trailing newline) is kept as the name.
    """
    with fasta_file as handle:
        return [line.rstrip("\n")[1:]
                for line in handle if line.startswith(">")]


def get_tx_seq(inputfile=None,
               outputfile=None,
               genome=None,
               with_introns=False,
               delete_version=False,
               del_chr=False,
               separator="",
               no_rev_comp=False,
               label="",
               sleuth_format=True,
               explicit=True,
               assembly="bla"):
    """
    Description: Get transcripts sequences in fasta format from a GTF file.

    :param inputfile: The GTF source, handed as-is to ``GTF()``.
    :param outputfile: A writable object; one fasta record (header line then \
    sequence line) is written per retrieved sequence.
    :param genome: A list of open fasta file objects. Several files are \
    concatenated into one temporary fasta file before use.
    :param with_introns: Forwarded to ``gtf.get_sequences`` as ``intron``.
    :param delete_version: Sleuth format only. Strip a trailing ``.<digits>`` \
    from the transcript and gene identifiers.
    :param del_chr: Sleuth format only. Remove the string 'chr' from the \
    chromosome name.
    :param separator: Non-sleuth format only. String used to join header fields.
    :param no_rev_comp: When true, ``rev_comp=False`` is passed to \
    ``gtf.get_sequences``.
    :param label: Non-sleuth format only. Comma-separated keys extracted \
    (after transcript_id) to build the header.
    :param sleuth_format: Select the sleuth-like header layout.
    :param explicit: Non-sleuth format only. Write header fields as key=value.
    :param assembly: Sleuth format only. Assembly name placed in the header.
    """

    # -----------------------------------------------------------
    #  Check chromosomes in fasta file
    # -----------------------------------------------------------

    message("%d fasta files found." % len(genome))

    if any(x.name.endswith(".gz") for x in genome):
        # NOTE(review): presumably message(type="ERROR") aborts the run;
        # confirm against the message() implementation.
        message("Genome in gz format is not currently supported.",
                type="ERROR")

    if len(genome) == 1:
        message("Checking fasta file chromosome list")
        genome = genome[0]
    else:
        # Concatenate all fasta files into a single temporary one so that
        # a single path can be handed to gtf.get_sequences().
        message("Merging fasta files")
        tmp_genome = make_tmp_file(prefix="genome", suffix=".fa")
        with tmp_genome as tg:
            for curr_file in genome:
                message("Merging %s" % curr_file.name)
                with curr_file as cf:
                    shutil.copyfileobj(cf, tg, 1024 * 1024 * 100)

        message("Checking fasta file chromosome list")
        genome = open(tmp_genome.name, "r")

    # The genome file is closed by the helper; only genome.name is used below.
    genome_chr_list = _read_fasta_chrom_names(genome)

    rev_comp = not no_rev_comp

    message("Chromosomes in fasta file: " + ",".join(genome_chr_list))

    # -----------------------------------------------------------
    #  Read gtf
    # -----------------------------------------------------------

    gtf = GTF(inputfile)
    nb_tx_before = gtf.extract_data("transcript_id",
                                    as_list=True,
                                    no_na=True,
                                    nr=True)

    # -----------------------------------------------------------
    #  Select genes falling in chrom defined in the fasta file
    # -----------------------------------------------------------

    message("Chromosomes in gtf file: " + ",".join(gtf.get_chroms(nr=True)))

    message("Selecting chromosome defined in the fasta file")

    gtf = gtf.select_by_key(key="seqid", value=",".join(genome_chr_list))

    message("Chromosomes in gtf file: " + ",".join(gtf.get_chroms(nr=True)))

    if len(gtf) == 0:
        message("No genes were found on chromosomes defined in fasta file.",
                type="ERROR")

    nb_tx_after = gtf.extract_data("transcript_id",
                                   as_list=True,
                                   no_na=True,
                                   nr=True)
    if len(nb_tx_after) != len(nb_tx_before):
        # Report (truncated to 100 characters) the transcripts that were lost
        # by the chromosome selection.
        diff = list(set(nb_tx_before) - set(nb_tx_after))
        message("Some transcripts had"
                " no corresponding chromosome"
                " in the fasta file: " + ",".join(diff)[0:100] + "...")

    message("Using genome file: " + genome.name)
    message("Retrieving fasta sequences from " + genome.name)
    fasta_seq = gtf.get_sequences(genome=genome.name,
                                  intron=with_introns,
                                  rev_comp=rev_comp)

    tx_gtf = gtf.select_by_key("feature", "transcript")

    if sleuth_format:

        tx_biotype = tx_gtf.extract_data("transcript_id,transcript_biotype",
                                         as_dict_of_lists=True,
                                         hide_undef=False)
        gn_biotype = tx_gtf.extract_data("gene_id,gene_biotype",
                                         as_dict_of_lists=True,
                                         hide_undef=False)

        # Raw string: '\.' in a plain literal is an invalid escape sequence.
        # Compiled once instead of being re-parsed for every sequence.
        version_suffix = re.compile(r'\.[0-9]+$')

        for i in fasta_seq:
            gene_id = i.gene_id
            transcript_id = i.transcript_id
            chrom = i.chrom

            gn_bio = gn_biotype[i.gene_id][0]
            tx_bio = tx_biotype[i.transcript_id][0]

            if delete_version:
                transcript_id = version_suffix.sub('', transcript_id)
                gene_id = version_suffix.sub('', gene_id)
            if del_chr:
                # NOTE: removes every occurrence of 'chr', not only a prefix.
                chrom = chrom.replace('chr', '')

            header = " ".join([
                transcript_id, ":".join([
                    "chromosome", assembly, chrom,
                    str(i.start),
                    str(i.end), "1"
                ]), "gene:" + gene_id, "gene_biotype:" + gn_bio,
                "transcript_biotype:" + tx_bio
            ])

            outputfile.write(">" + header + "\n")
            outputfile.write(i.sequence + "\n")
    else:
        tx_info = tx_gtf.extract_data("transcript_id," + label,
                                      as_dict_of_lists=True,
                                      hide_undef=False)
        for i in fasta_seq:
            if not explicit:
                header = separator.join(tx_info[i.transcript_id])
            else:
                # Pair each requested key with the value extracted for it.
                header = [
                    str(x[0]) + "=" + x[1]
                    for x in zip(label.split(","), tx_info[i.transcript_id])
                ]
                header = separator.join(header)
            outputfile.write(">" + header + "\n")
            outputfile.write(i.sequence + "\n")

    # NOTE(review): gc is switched off before closing, as in the sibling
    # commands of this file; the reason is not visible here - confirm.
    gc.disable()
    close_properly(outputfile, inputfile)
def select_by_intron_size(inputfile=None,
                          outputfile=None,
                          intron_size=0,
                          merged=False,
                          invert_match=False,
                          delete_monoexonic=False,
                          add_intron_size=False):
    """
    Filter the transcripts of a GTF on intron size and write what remains.

    Without ``merged`` a transcript is discarded as soon as one of its
    introns is shorter than ``intron_size``. With ``merged`` the test is
    applied to the sum of its intron sizes instead. ``invert_match`` flips
    the test (discard when the size is greater than or equal to
    ``intron_size``). Transcripts with no intron (or a zero intron sum) are
    kept unless ``delete_monoexonic`` is set. With ``add_intron_size`` the
    computed size(s) are attached to transcript features under the key
    'intron_size_sum' (merged) or 'intron_size' (comma-separated list).
    """

    message("Searching for intronic regions.")

    gtf = GTF(inputfile, check_ensembl_format=False)

    introns = gtf.get_introns(by_transcript=True,
                              name=["transcript_id"],
                              intron_nb_in_name=False)
    introns = introns.sort()

    tx_ids = gtf.get_tx_ids(nr=True)

    # Transcripts flagged for removal; insertion order drives the log line.
    to_delete = OrderedDict()

    def is_rejected(length):
        # Single place where the size test (and its inversion) is expressed.
        if invert_match:
            return length >= intron_size
        return length < intron_size

    if merged:
        # transcript id -> cumulated intron length
        summed = OrderedDict.fromkeys(tx_ids, 0)

        for intron in introns:
            length = intron.end - intron.start
            summed[intron.name] += length

        for tx_id, total in summed.items():
            if total == 0:
                if delete_monoexonic:
                    to_delete[tx_id] = 1
            elif is_rejected(total):
                to_delete[tx_id] = 1

        if add_intron_size:
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=summed,
                                         new_key="intron_size_sum")

    else:
        # transcript id -> list of intron lengths
        sizes_by_tx = defaultdict(list)

        for tx_id in tx_ids:
            sizes_by_tx[tx_id] = []

        for intron in introns:
            length = intron.end - intron.start
            sizes_by_tx[intron.name].append(length)

        for tx_id, sizes in list(sizes_by_tx.items()):
            if not sizes:
                # No intron: record a size of 0 for the optional annotation.
                sizes_by_tx[tx_id] = [0]
                if delete_monoexonic:
                    to_delete[tx_id] = 1
            elif any(is_rejected(length) for length in sizes):
                to_delete[tx_id] = 1

        if add_intron_size:

            for tx_id, sizes in list(sizes_by_tx.items()):
                sizes_by_tx[tx_id] = ",".join(map(str, sizes))

            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=sizes_by_tx,
                                         new_key="intron_size")

    kept_tx_ids = [tx for tx in gtf.get_tx_ids(nr=True)
                   if tx not in to_delete]

    # Only the first 40 characters of the deleted id list are logged.
    message("Deleting: " + ",".join(to_delete)[:40] + "...")

    gtf = gtf.select_by_key("transcript_id", ",".join(kept_tx_ids))

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Exemple #20
0
def divergent(
        inputfile=None,
        outputfile=None,
        key_name=None,
        upstream=1500,
        downstream=1500,
        chrom_info=None,
        no_strandness=False,
        no_annotation=False):
    """
    Find transcripts whose promoter contains the TSS of another gene.

    A promoter is the TSS extended by ``upstream``/``downstream`` bases
    (strand-aware slop, bounded by the ``chrom_info`` file). Promoters are
    intersected with all TSSs; unless ``no_strandness`` is set the
    intersection is run with ``S=True``. For each promoter transcript the
    closest TSS belonging to a different gene is retained, together with
    its distance.

    Unless ``no_annotation`` is set, the whole GTF is written with two extra
    transcript keys: ``key_name`` (default 'divergent') and 'dist_' +
    ``key_name`` (default 'dist_to_divergent'). With ``no_annotation`` only
    the transcripts found to be divergent are written.
    """

    message("Using -u " + str(upstream) + ".")
    message("Using -d " + str(downstream) + ".")

    message("Loading GTF.")
    gtf = GTF(inputfile)

    message("Getting transcript coordinates.")
    tx_feat = gtf.select_by_key("feature", "transcript")

    message("Getting tss coordinates.")
    tss_bo = tx_feat.get_tss(name=["transcript_id", "gene_id"], sep="||")

    # transcript id -> TSS start coordinate
    tss_pos = {}
    for tss in tss_bo:
        tx_id, _ = tss.name.split("||")
        tss_pos[tx_id] = int(tss.start)

    message("Getting promoter coordinates.")
    slopped = tss_bo.slop(s=True,
                          l=upstream,
                          r=downstream,
                          g=chrom_info.name)
    promoter_bo = slopped.cut([0, 1, 2, 3, 4, 5])

    message("Intersecting...")
    prom_with_tss_bo = promoter_bo.intersect(tss_bo,
                                             wb=True,
                                             s=False,
                                             S=not no_strandness)

    # Keep a copy of both intermediate results in temporary files.
    tmp_file = make_tmp_file("promoter_slop", ".bed")
    promoter_bo.saveas(tmp_file.name)
    tmp_file = make_tmp_file("promoter_intersection_with_tss_as_", ".bed")
    prom_with_tss_bo.saveas(tmp_file.name)

    # promoter transcript id -> closest foreign TSS transcript / its distance
    tx_with_divergent = {}
    dist_to_divergent = {}

    for hit in prom_with_tss_bo:
        # Column 4 names the promoter, column 10 the overlapping TSS.
        tx_id_tss, gn_id_tss = hit.fields[9].split("||")
        tx_id_prom, gn_id_prom = hit.fields[3].split("||")

        if gn_id_prom == gn_id_tss:
            # TSS of the same gene: not a divergent partner.
            continue

        dist = abs(tss_pos[tx_id_prom] - tss_pos[tx_id_tss])
        is_first = tx_id_prom not in tx_with_divergent
        if is_first or dist < dist_to_divergent[tx_id_prom]:
            dist_to_divergent[tx_id_prom] = dist
            tx_with_divergent[tx_id_prom] = tx_id_tss

    if no_annotation:
        selected = gtf.select_by_key("transcript_id",
                                     ",".join(tx_with_divergent))
        selected.write(outputfile, gc_off=True)
    else:
        if key_name is None:
            key_name = "divergent"
            key_name_dist = "dist_to_divergent"
        else:
            key_name_dist = "dist_" + key_name

        if tx_with_divergent:
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=tx_with_divergent,
                                         new_key=key_name)

            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=dist_to_divergent,
                                         new_key=key_name_dist)

        gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
Exemple #21
0
def rm_dup_tss(inputfile=None, outputfile=None):
    """If several transcripts of a gene share the same tss, select only one.

    TSSs are sorted by name (gene_id|transcript_id) so that the transcript
    retained for a given (gene, TSS position) pair is the same from one call
    to the next. Every later transcript of the same gene starting at an
    already seen position is dropped. The GTF is written to ``outputfile``
    after selecting with ``invert_match=True`` on feature 'gene' and on the
    dropped transcript ids.

    :param inputfile: The GTF source, handed as-is to ``GTF()``.
    :param outputfile: The destination passed to ``write()``.
    """

    # ----------------------------------------------------------------------
    # Get the TSS
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile)
    tss_bo = gtf.get_tss(["gene_id", "transcript_id"])

    # ----------------------------------------------------------------------
    # Sort the file by name (4th col) to ensure reproducibility between calls.
    # ----------------------------------------------------------------------

    with open(tss_bo.fn) as f:
        lines = [line.split('\t') for line in f]

    tmp_file = make_tmp_file(prefix="tss_sorted_by_tx_id", suffix=".bed")

    for line in sorted(lines, key=operator.itemgetter(3)):
        tmp_file.write('\t'.join(line))

    tmp_file.close()

    tss_bo = BedTool(tmp_file.name)

    # ----------------------------------------------------------------------
    # Get the list of non redundant TSSs
    # ----------------------------------------------------------------------

    # gene id -> TSS positions already attributed to one of its transcripts
    seen_tss = defaultdict(set)
    to_delete = []

    message("Looking for redundant TSS (gene-wise).")

    for line in tss_bo:

        gene_id, tx_id = line.name.split("|")

        if line.start in seen_tss[gene_id]:
            to_delete.append(tx_id)
        else:
            seen_tss[gene_id].add(line.start)

    # Show up to the first ten deleted transcripts. The former slice started
    # at index 1 and therefore never reported the first one.
    message("Deleted transcripts: " +
            ",".join(to_delete[:10]) + "...",
            type="DEBUG")

    # ----------------------------------------------------------------------
    # Write
    # ----------------------------------------------------------------------

    gtf.select_by_key("feature", "gene",
                      invert_match=True).select_by_key(
                          "transcript_id",
                          ",".join(to_delete),
                          invert_match=True).write(outputfile, gc_off=True)