def __call__(self,
                 parser,
                 namespace,
                 values,
                 option_string=None):
        if values in ["mm8", "mm9", "mm10", "hg19", "hg38", "rn3", "rn4"]:
            chr_size = pybedtools.helpers.chromsizes(values)
            ## Drop haplotype chromosomes, unplaced contigs,
            ## unlocalized contigs and the mitochondrial chromosome
            regexp = re.compile(r'(_random)|(^chrUn)|(_hap\d+)|(_alt)|(^chrM$)')
            chr_size = {key: chr_size[key] for key in chr_size if not regexp.search(key)}
            tmp_file_chr = make_tmp_file(prefix='chromsize', suffix='.txt')
            for chrom, size in chr_size.items():
                tmp_file_chr.write(chrom + "\t" + str(size[1]) + "\n")
            tmp_file_chr.close()
            values = open(tmp_file_chr.name, 'r')
        elif values == 'simple':
            chr_size = {'chr1': 300, 'chr2': 600}
            tmp_file_chr = make_tmp_file(prefix='chromsize', suffix='.txt')
            for chrom, size in chr_size.items():
                tmp_file_chr.write(chrom + "\t" + str(size) + "\n")
            tmp_file_chr.close()
            values = open(tmp_file_chr.name, 'r')
        else:
            check_file_or_dir_exists(values)
            values = open(values, "r")
            # Validate the file content (chrom_info_as_dict raises on error).
            chrom_info_as_dict(values)

        # Add the attribute
        setattr(namespace, self.dest, values)
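
# Illustrative, self-contained sketch of the pattern above: a custom
# argparse.Action whose __call__ replaces the raw command-line value with a
# derived object before storing it on the namespace. All names below are
# hypothetical and not part of pygtftk.
import argparse


class UpperCaseAction(argparse.Action):
    def __call__(self, parser, namespace, values, option_string=None):
        # Transform the value, then store it, as the action above does.
        setattr(namespace, self.dest, values.upper())


demo_parser = argparse.ArgumentParser()
demo_parser.add_argument('--genome', action=UpperCaseAction)
print(demo_parser.parse_args(['--genome', 'hg38']).genome)  # prints 'HG38'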
# Example 2
def bed_to_gtf(inputfile=None,
               outputfile=None,
               ft_type="transcript",
               source="Unknown"):
    """
 Convert a BED file into a GTF. This will make the poor BED feel as if it
 were a nice GTF (but with lots of empty fields...). May be helpful sometimes...
    """

    message("Converting the bed file into GTF file.")

    if inputfile.name == '<stdin>':
        tmp_file = make_tmp_file(prefix="input_bed", suffix=".bed")
        for i in inputfile:
            write_properly(chomp(str(i)), tmp_file)

        tmp_file.close()
        inputfile.close()

        bed_obj = BedTool(tmp_file.name)
    else:
        bed_obj = BedTool(inputfile.name)

    n = 1
    for i in bed_obj:

        if i.strand == "":
            i.strand = "."
        if i.name == "":
            i.name = str("feature_" + str(n))
        if i.score == "":
            i.score = "0"

        if ft_type == "exon":
            key_value = "gene_id \"" + i.name + "\"; " + \
                        "transcript_id \"" + i.name + "\"; " + \
                        "exon_id \"" + i.name + "\";"
        elif ft_type == "gene":
            key_value = "gene_id \"" + i.name + "\";"
        else:
            key_value = "gene_id \"" + i.name + "\"; " + \
                        "transcript_id \"" + i.name + "\";"

        if pygtftk.utils.ADD_CHR == 1:
            chrom_out = "chr" + i.chrom
        else:
            chrom_out = i.chrom

        list_out = [
            chrom_out, source, ft_type,
            str(i.start + 1),
            str(i.end),
            str(i.score), i.strand, ".", key_value
        ]

        write_properly("\t".join(list_out), outputfile)

        n += 1
    gc.disable()
    close_properly(outputfile)
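
# Usage sketch (illustrative; the file paths are hypothetical). bed_to_gtf
# expects open file handles, as supplied by the pygtftk command-line layer:
bed_in = open("peaks.bed")
gtf_out = open("peaks.gtf", "w")
bed_to_gtf(inputfile=bed_in, outputfile=gtf_out, ft_type="gene", source="demo")

# Example 3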
def nb_transcripts(inputfile=None,
                   outputfile=None,
                   text_format=False,
                   key_name=""):
    """
    Compute the number of transcripts per gene.
    """

    gtf = GTF(inputfile)

    message("Computing the number of transcript per gene in input GTF file.")

    # Computation of transcript number is performed on exon lines
    # Just in case some transcript lines would be lacking (but they should
    # not...)

    n_tx = gtf.get_gn_to_tx()

    if not text_format:
        tmp_file = make_tmp_file(prefix="nb_tx", suffix=".txt")

    for i in n_tx:
        if not text_format:
            tmp_file.write(i + "\t" + str(len(n_tx[i])) + "\n")
        else:
            outputfile.write(i + "\t" + str(len(n_tx[i])) + "\n")

    if not text_format:
        tmp_file.close()
        gtf.add_attr_from_file(feat="gene",
                               key="gene_id",
                               new_key=key_name,
                               inputfile=tmp_file.name).write(outputfile,
                                                              gc_off=True)

    close_properly(outputfile, inputfile)
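
# Usage sketch (illustrative; paths are hypothetical). With text_format=True,
# gene_id/count pairs are written as plain text rather than added to the GTF:
nb_transcripts(inputfile=open("input.gtf"),
               outputfile=open("tx_counts.txt", "w"),
               text_format=True)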
# Example 4
def discretize_key(inputfile=None,
                   outputfile=None,
                   src_key=None,
                   dest_key="disc_key",
                   nb_levels=2,
                   percentiles=False,
                   percentiles_of_uniq=False,
                   precision=2,
                   log=False,
                   labels=None):
    """
    Create a new key by discretizing a numeric key.
    """

    if nb_levels < 2:
        message("--nb-levels has to be at least 2.", type="ERROR")

    # -------------------------------------------------------------------------
    #
    # Check labels and nb_levels
    #
    # -------------------------------------------------------------------------

    if labels is not None:
        labels = labels.split(",")
        if len(labels) != nb_levels:
            message(
                "The number of labels should be the same as the number of levels.",
                type="ERROR")
        if len(labels) != len(set(labels)):
            message("Redundant labels not allowed.", type="ERROR")

    # -------------------------------------------------------------------------
    #
    # Load GTF. Retrieve values for src-key
    #
    # -------------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)
    src_values = gtf.extract_data(src_key, as_list=True)

    if len([x for x in src_values if x not in ['.', '?']]) == 0:
        message('The key was not found in this GTF.', type="ERROR")

    min_val = None
    max_val = None

    dest_values = []
    dest_pos = []

    for p, v in enumerate(src_values):
        try:
            a = float(v)
            if min_val is not None:
                if a > max_val:
                    max_val = a
                if a < min_val:
                    min_val = a
            else:
                min_val = a
                max_val = a

            dest_values += [a]
            dest_pos += [p]
        except ValueError:
            pass

    if min_val is None:
        message("Did not find numeric values in the source key.", type="ERROR")
    if min_val == max_val:
        message(
            "The minimum and maximum values found in the source key are the same.",
            type="ERROR")

    if log:
        if 0 in dest_values:
            message("Encountered zero values before log transformation.",
                    type="WARNING",
                    force=True)
            message("Adding a pseudocount (+1).", type="WARNING", force=True)

            pseudo_count = 1
            dest_values = list(np.log2([x + pseudo_count
                                        for x in dest_values]))

        # update max/min values
        max_val = max(dest_values)
        min_val = min(dest_values)

    # Apply the same rule as pandas.cut when bins is an int: extend the
    # lower bound slightly so the minimum value falls inside the first bin.
    min_val = min_val - max_val / 1000

    # -------------------------------------------------------------------------
    #
    # Compute percentiles if required
    #
    # -------------------------------------------------------------------------

    if percentiles:
        if percentiles_of_uniq:
            dest_values_tmp = [min_val] + list(set(dest_values))
        else:
            dest_values_tmp = [min_val] + dest_values
        n = nb_levels

        q = [np.percentile(dest_values_tmp, 100 / n * i) for i in range(0, n)]
        q = q + [np.percentile(dest_values_tmp, 100)]

        if len(q) != len(set(q)):
            message("No ties are accepted in  percentiles.",
                    type="WARNING",
                    force=True)
            message("Breaks: " + str(q), type="WARNING", force=True)
            message("Try -u. Exiting", type="ERROR")

    # -------------------------------------------------------------------------
    #
    # Create a factor
    #
    # -------------------------------------------------------------------------

    if percentiles:

        (breaks, cat_label) = pandas.cut(dest_values,
                                         bins=q,
                                         labels=labels,
                                         retbins=True)
    else:
        (breaks, cat_label) = pandas.cut(dest_values,
                                         bins=nb_levels,
                                         labels=labels,
                                         retbins=True)

    if labels is None:
        # The include_lowest argument of pandas.cut does not behave as
        # expected here. Use this workaround to avoid a minimum value
        # outside of the data range.
        cat_label[0] = min(dest_values)
        cat_label = [round(x, precision) for x in cat_label]
        if precision == 0:
            cat_label = [int(x) for x in cat_label]
        cat_label = [str(x) for x in list(zip(cat_label[:-1], cat_label[1:]))]
        cat_label[0] = cat_label[0].replace("(", "[")
        cat_label = [x.replace(")", "]") for x in cat_label]
        cat_label = [str(x).replace(", ", "_") for x in cat_label]

        # Commas and parentheses in the raw interval strings would be
        # problematic later...
        breaks.categories = cat_label

    message("Categories: " + str(list(breaks.categories)),
            type="INFO",
            force=True)

    # -------------------------------------------------------------------------
    #
    # Write to disk
    #
    # -------------------------------------------------------------------------

    tmp_file = make_tmp_file(prefix="discretized_keys", suffix=".txt")

    with tmp_file as tp_file:
        for p, v in zip(dest_pos, breaks):
            tp_file.write(str(p) + "\t" + str(v) + '\n')

    gtf.add_attr_to_pos(tmp_file, new_key=dest_key).write(outputfile,
                                                          gc_off=True)

    close_properly(outputfile, inputfile)
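
# Usage sketch (illustrative; the path and the source key are hypothetical):
discretize_key(inputfile=open("input.gtf"),
               outputfile=open("discretized.gtf", "w"),
               src_key="exon_nbr",
               dest_key="exon_nbr_class",
               nb_levels=3,
               percentiles=True)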
# Example 5
def overlapping(
        inputfile=None,
        outputfile=None,
        key_name=None,
        upstream=1500,
        downstream=1500,
        chrom_info=None,
        feature_type='transcript',
        same_strandedness=False,
        diff_strandedness=False,
        annotate_gtf=False,
        bool=False,  # note: shadows the built-in 'bool'
        annotate_all=False,
        invert_match=False):
    """
Description: Find transcripts whose body/TSS/TTS do or do not overlap with any
transcript from another gene.
    """

    # ----------------------------------------------------------------------
    # Prepare key names
    # ----------------------------------------------------------------------

    if annotate_gtf:
        if key_name is None:
            key_info = ["overlap",
                        feature_type,
                        "u" + str(upstream / 1000) + "k",
                        "d" + str(downstream / 1000) + "k"
                        ]
            key_name = "_".join(key_info)

        if invert_match:
            message("--annotate-gtf and --invert-match are "
                    "mutually exclusive.",
                    type="ERROR")

    if same_strandedness and diff_strandedness:
        message("--same-strandedness and --diff-strandedness are "
                "mutually exclusive.",
                type="ERROR")

    message("Using -u " + str(upstream))
    message("Using -d " + str(downstream))

    overlapping_tx = defaultdict(list)

    # Load the GTF so that it won't be lost
    # if GTF stream comes from stdin
    gtf = GTF(inputfile)

    message("Getting transcript in bed format")

    tx_feat = gtf.select_by_key("feature",
                                "transcript")

    if annotate_all:
        overlapping_tx = gtf.extract_data(keys=["transcript_id"], as_dict=True, default_val="0")
        for i in overlapping_tx:
            overlapping_tx[i] = []

    # ----------------------------------------------------------------------
    # Get transcript limits
    # ----------------------------------------------------------------------

    tx_bed = tx_feat.to_bed(name=["transcript_id", "gene_id"], sep="||")

    message("Getting " + feature_type + " and 'slopping'.")

    if feature_type == "transcript":

        bed_obj = tx_bed.slop(s=True,
                              l=upstream,
                              r=downstream,
                              g=chrom_info.name).cut([0, 1, 2, 3, 4, 5])

    elif feature_type == "promoter":

        bed_obj = tx_feat.get_tss(name=["transcript_id", "gene_id"],
                                  sep="||").slop(s=True,
                                                 l=upstream,
                                                 r=downstream,
                                                 g=chrom_info.name).cut([0, 1,
                                                                         2, 3,
                                                                         4, 5])

    elif feature_type == "tts":

        bed_obj = tx_feat.get_tts(name=["transcript_id", "gene_id"],
                                  sep="||").slop(s=True,
                                                 l=upstream,
                                                 r=downstream,
                                                 g=chrom_info.name).cut([0, 1,
                                                                         2, 3,
                                                                         4, 5])
    else:
        message("Not implemented yet", type="ERROR")

    tmp_file = make_tmp_file(feature_type + "_slopped_region", ".bed")
    bed_obj.saveas(tmp_file.name)

    overlap_regions = bed_obj.intersect(tx_bed,
                                        wb=True,
                                        s=same_strandedness,
                                        S=diff_strandedness)

    tmp_file = make_tmp_file(feature_type + "_overlapping_regions", ".bed")
    overlap_regions.saveas(tmp_file.name)

    for i in overlap_regions:

        tx_other, gn_other = i.fields[9].split("||")
        tx_id, gene_id = i.fields[3].split("||")
        if gene_id != gn_other:
            overlapping_tx[tx_id] += [tx_other]

    if bool:
        for k, _ in overlapping_tx.items():
            if not len(overlapping_tx[k]):
                overlapping_tx[k] = "0"
            else:
                overlapping_tx[k] = "1"

    if not invert_match:

        if not annotate_gtf:
            value = ",".join(set(overlapping_tx.keys()))
            gtf.select_by_key("transcript_id",
                              value).write(outputfile,
                                           gc_off=True)
        else:

            if len(overlapping_tx):
                gtf = gtf.add_attr_from_dict(feat="transcript",
                                             key="transcript_id",
                                             a_dict=overlapping_tx,
                                             new_key=key_name)
            gtf.write(outputfile,
                      gc_off=True)

    else:
        values = ",".join(set(overlapping_tx.keys()))
        gtf.select_by_key("transcript_id",
                          values,
                          invert_match).write(outputfile, gc_off=True)

    gc.disable()
    close_properly(outputfile, inputfile)
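
# Usage sketch (illustrative; paths are hypothetical; chrom_info expects an
# open chromInfo file, e.g. as produced by the chromsize action shown above):
overlapping(inputfile=open("input.gtf"),
            outputfile=open("overlap_annotated.gtf", "w"),
            chrom_info=open("chrom_info.txt"),
            feature_type="promoter",
            upstream=500,
            downstream=500,
            annotate_gtf=True,
            bool=True)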
# Example 6
def mk_matrix(inputfile=None,
              outputfile=None,
              bigwiglist=None,
              ft_type=None,
              pseudo_count=0,
              upstream=1000,
              downstream=1000,
              bin_around_frac=0.1,
              chrom_info=None,
              bin_nb=100,
              nb_proc=None,
              labels=None,
              no_stranded=False,
              zero_to_na=False):
    """
 Description: Create a matrix to be used by 'profile' and 'heatmap' commands.
    """

    # -------------------------------------------------------------------------
    # Check argument consistency
    #
    # -------------------------------------------------------------------------

    if ft_type in ['single_nuc', 'promoter', 'tts']:
        region_size = upstream + downstream + 1
        if region_size < bin_nb:
            message(
                "The region (-u/-d) needs to be extended given the number "
                "of bins (--bin-nb)",
                type="ERROR")

    # -------------------------------------------------------------------------
    # Check that the output file name does not end with .zip
    #
    # -------------------------------------------------------------------------

    if outputfile.name.endswith(".zip"):
        outfn = outputfile.name.replace(".zip", "")
        outputfile = open(outfn, "w")

    # -------------------------------------------------------------------------
    # Check input file is in bed or GTF format
    #
    # -------------------------------------------------------------------------

    message("Loading input file...")
    if inputfile.name == '<stdin>':
        gtf = GTF(inputfile.name)
        is_gtf = True
        if ft_type == 'user_regions':
            message(
                "--ft-type can not be set to user_regions"
                " when a gtf is provided.",
                type="ERROR")
    else:
        try:

            region_bo = BedTool(inputfile.name)
            len(region_bo)
        except IndexError:
            message("Unable to read the input file. Check format",
                    type="ERROR")
        if len(region_bo) == 0:
            message("Unable to find requested regions", type="ERROR")

        if region_bo.file_type == 'gff':
            message('Loading the GTF file.')
            gtf = GTF(inputfile.name)
            is_gtf = True
        else:
            is_gtf = False

            if ft_type not in ('user_regions', 'single_nuc'):
                message(
                    "Set --ft-type to 'user_regions' or 'single_nuc'"
                    " when using input bed file.",
                    type="ERROR")
            # Check that the strand is provided and
            # check it is located in the right column
            # (not checked by BedTool...).
            if region_bo.field_count() < 6:
                if not no_stranded:
                    message("Strand is undefined. Use -nst.", type="ERROR")
            else:
                region_name = dict()
                for i in region_bo:
                    if region_name.get(i.name, None) is None:
                        region_name[i.name] = 1
                    else:
                        message(
                            "Regions in the bed file should have "
                            "unique identifiers (col 4).",
                            type="ERROR")
                    if i.strand[0] not in ['.', '+', '-']:
                        message("Strand should be one of '+','-' or '.'.",
                                type="ERROR")
                    if ft_type == 'single_nuc':
                        if i.end - i.start != 1:
                            message(
                                "Region length should be 1 nucleotide "
                                "long when 'single_nuc' is set. Use 'user_regions'.",
                                type="ERROR")
                    elif ft_type == 'user_regions':
                        if i.end - i.start == 1:
                            message(
                                "Region length should not be 1 nucleotide "
                                "long when 'user_regions' is set. Use 'single_nuc'.",
                                type="ERROR")

    # -------------------------------------------------------------------------
    # Create a list of labels for the diagrams.
    # Take user input in account
    # -------------------------------------------------------------------------
    message('Checking labels.')

    if labels is not None:
        labels = labels.split(",")
        # Ensure the number of labels is the same as the number of bw files.
        if len(labels) != len(bigwiglist):
            message(
                "The number of labels should be the same as the number of"
                " bigwig files.",
                type="ERROR")
        # Ensure labels are non-redundant
        if len(labels) > len(set(labels)):
            message("Labels must be unique.", type="ERROR")
    else:
        labels = []
        for i in range(len(bigwiglist)):
            labels += [
                os.path.splitext(os.path.basename(bigwiglist[i].name))[0]
            ]

    # -------------------------------------------------------------------------
    #
    # Get the requested transcript lines in BED format.
    # Transcripts are restricted to those found on chromosomes
    # declared in the bigwig files.
    # -------------------------------------------------------------------------
    message('Getting the list of chromosomes declared in bigwig files.')
    bw_chrom = list()
    for i in bigwiglist:
        bw_chrom += list(pyBigWig.open(i.name).chroms().keys())

    bed_col = [0, 1, 2, 3, 4, 5]

    if is_gtf:

        message('Selecting chromosomes declared in bigwig from gtf.')
        # Note: the original code re-assigned tmp to the unfiltered
        # transcript set just below, silently discarding the seqid filter;
        # the filtered set is kept here, as the message above intends.
        tmp = gtf.select_by_key("feature", "transcript").select_by_key(
            "seqid", ",".join(bw_chrom))

        tmp_tx_name = tmp.extract_data("transcript_id", as_list=True)

        # If several transcript records are associated with
        # the same transcript_id, raise an error.
        if len(tmp_tx_name) > len(set(tmp_tx_name)):
            message('Transcripts should have a unique identifier.',
                    type="ERROR")

        message('Selecting requested regions.')

        # ----------------------------------------------------------------------
        #
        # Slop tss and promoters.
        # No need if transcript was requested (it will be flanked by upstream
        # and downstream regions later on).
        # ----------------------------------------------------------------------

        if ft_type == 'transcript':
            message("Getting transcript boundaries (input gtf).")

            main_region_bo = tmp.to_bed(name=["transcript_id"])

        elif ft_type == 'promoter':

            message("Getting promoter regions [-%d,+%d]." %
                    (upstream, downstream))

            main_region_bo = tmp.get_tss(name=["transcript_id"]).slop(
                s=True, l=upstream, r=downstream, g=chrom_info.name)

        elif ft_type == 'tts':

            main_region_bo = tmp.get_tts(name=["transcript_id"]).slop(
                s=True, l=upstream, r=downstream, g=chrom_info.name)

    else:
        message("Loading regions")

        if ft_type == 'user_regions':
            main_region_bo = BedTool(inputfile.name).cut(bed_col)
        elif ft_type == 'single_nuc':
            main_region_bo = BedTool(inputfile.name).cut(bed_col).slop(
                s=True, l=upstream, r=downstream, g=chrom_info.name)
        else:
            message("Unknown method.")

    # Save for traceability
    main_region_bed = make_tmp_file(prefix="region" + ft_type, suffix=".bed")
    main_region_bo.saveas(main_region_bed.name)

    # -------------------------------------------------------------------------
    #
    # Print a header in the output file
    #
    # -------------------------------------------------------------------------
    message("Preparing comments")

    comments = "#"
    comments += "ft_type:" + ft_type + ";"
    comments += "from:" + str(upstream) + ";"
    comments += "to:" + str(downstream) + ";"
    comments += "labels:" + ",".join(labels) + ";"

    # -------------------------------------------------------------------------
    # Compute coverage of requested region
    # Each worker will send a file
    # -------------------------------------------------------------------------

    outputfile_list = {}
    message("Using %d bins for main region." % bin_nb)

    tmp_file = bw_profile_mp(in_bed_file=main_region_bed.name,
                             nb_proc=nb_proc,
                             big_wig=[x.name for x in bigwiglist],
                             bin_nb=bin_nb,
                             pseudo_count=pseudo_count,
                             stranded=not no_stranded,
                             type="main",
                             labels=labels,
                             outputfile=outputfile.name,
                             zero_to_na=zero_to_na,
                             verbose=pygtftk.utils.VERBOSITY)

    outputfile_list["main"] = tmp_file

    # -------------------------------------------------------------------------
    # If transcript was requested
    # we must process flanking regions
    # We need to retrieve coverage of promoter [-upstream, 0]
    # as transcript coverage window size will depend on transcript length.
    # For promoter the length of windows will be fixed.
    # -------------------------------------------------------------------------

    if ft_type in ['transcript', 'user_regions']:

        # Number of bins for TTS and TSS
        around_bin_nb = int(round(bin_nb * bin_around_frac))
        if around_bin_nb < 1:
            around_bin_nb = 1

        if upstream > 0:

            if ft_type == 'transcript':
                message("Getting promoter (using %d bins)." % around_bin_nb)
                ups_region_bo = tmp.get_tss(name=["transcript_id"]).slop(
                    s=True, l=upstream, r=-1, g=chrom_info.name).cut(bed_col)

            else:
                message("Getting upstream regions (%d bins)." % around_bin_nb)
                ups_region_bo = main_region_bo.flank(s=True,
                                                     l=upstream,
                                                     r=0,
                                                     g=chrom_info.name)

            upstream_bed_file = make_tmp_file(prefix="upstream_region" +
                                              ft_type,
                                              suffix=".bed")

            ups_region_bo.saveas(upstream_bed_file.name)

            tmp_file = bw_profile_mp(in_bed_file=upstream_bed_file.name,
                                     nb_proc=nb_proc,
                                     big_wig=[x.name for x in bigwiglist],
                                     bin_nb=around_bin_nb,
                                     pseudo_count=pseudo_count,
                                     stranded=not no_stranded,
                                     type="upstream",
                                     labels=labels,
                                     outputfile=outputfile.name,
                                     zero_to_na=zero_to_na,
                                     verbose=pygtftk.utils.VERBOSITY)

            outputfile_list["upstream"] = tmp_file

        if downstream > 0:

            if ft_type == 'transcript':
                message("Getting TTS (using %d bins)." % around_bin_nb)
                dws_region_bo = tmp.get_tts(name=["transcript_id"]).slop(
                    s=True, l=-1, r=downstream, g=chrom_info.name).cut(bed_col)
            else:
                message("Getting downstream regions (%d bins)." %
                        around_bin_nb)

                dws_region_bo = main_region_bo.flank(s=True,
                                                     l=0,
                                                     r=downstream,
                                                     g=chrom_info.name)
            dws_bed_file = make_tmp_file(prefix="dowstream_region" + ft_type,
                                         suffix=".bed")

            dws_region_bo.saveas(dws_bed_file.name)

            tmp_file = bw_profile_mp(in_bed_file=dws_bed_file.name,
                                     nb_proc=nb_proc,
                                     big_wig=[x.name for x in bigwiglist],
                                     bin_nb=around_bin_nb,
                                     pseudo_count=pseudo_count,
                                     stranded=not no_stranded,
                                     type="downstream",
                                     labels=labels,
                                     outputfile=outputfile.name,
                                     zero_to_na=zero_to_na,
                                     verbose=pygtftk.utils.VERBOSITY)

            outputfile_list["downstream"] = tmp_file

    # -------------------------------------------------------------------------
    #
    # Merge file using pandas
    #
    # -------------------------------------------------------------------------

    message("Reading (pandas): " + outputfile_list["main"].name, type="DEBUG")
    df_main = pd.read_csv(outputfile_list["main"].name, sep="\t")
    # Save a copy of the coordinate columns;
    # they will be re-joined later.
    df_copy = df_main[['bwig', 'chrom', 'gene', 'strand', 'start', 'end']]

    df_main.pop('start')
    df_main.pop('end')

    if "upstream" in outputfile_list:
        message("Merging upstream file")
        message("Reading (pandas): " + outputfile_list["upstream"].name,
                type="DEBUG")
        df_up = pd.read_csv(outputfile_list["upstream"].name, sep="\t")
        df_up = df_up.drop(columns=['start', 'end'])
        df_main = df_up.merge(df_main.loc[:, df_main.columns],
                              on=['bwig', 'chrom', 'gene', 'strand'])

    if "downstream" in outputfile_list:
        message("Merging downstream file")
        message("Reading (pandas): " + outputfile_list["downstream"].name,
                type="DEBUG")
        df_dws = pd.read_csv(outputfile_list["downstream"].name, sep="\t")
        df_dws = df_dws.drop(columns=['start', 'end'])
        df_main = df_main.merge(df_dws.loc[:, df_dws.columns],
                                on=['bwig', 'chrom', 'gene', 'strand'])

    # join start and end.
    df_main = df_main.merge(df_copy.loc[:, df_copy.columns],
                            on=['bwig', 'chrom', 'gene', 'strand'])
    df_start = df_main.pop('start')
    df_end = df_main.pop('end')
    df_main.insert(2, 'start', df_start)
    df_main.insert(3, 'end', df_end)

    message("Writing to file")
    outputfile.close()

    with open(outputfile.name, 'a') as f:
        f.write(comments + "\n")
        df_main.to_csv(f,
                       sep="\t",
                       index=False,
                       mode='a',
                       columns=df_main.columns,
                       na_rep='NA')

    # -------------------------------------------------------------------------
    #
    # Compress
    #
    # -------------------------------------------------------------------------

    message("Compressing")
    path = os.path.abspath(outputfile.name)
    filename = os.path.basename(path)
    message("filename: " + filename, type="DEBUG")
    zip_filename = filename + '.zip'
    message("zip_filename: " + zip_filename, type="DEBUG")
    zip_path = os.path.join(os.path.dirname(path), zip_filename)
    message("zip_path: " + zip_path, type="DEBUG")

    with zipfile.ZipFile(zip_path, 'w', allowZip64=True) as zf:
        zf.write(filename=path, arcname=filename)

    for i in outputfile_list:
        message("deleting " + outputfile_list[i].name)
        os.remove(outputfile_list[i].name)
    os.remove(outputfile.name)

    gc.disable()
    close_properly(inputfile, outputfile)
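
# Usage sketch (illustrative; paths are hypothetical; bigwiglist and
# chrom_info expect open file handles):
mk_matrix(inputfile=open("input.gtf"),
          outputfile=open("matrix.txt", "w"),
          bigwiglist=[open("h3k4me3.bw", "rb")],
          ft_type="promoter",
          upstream=2000,
          downstream=2000,
          bin_nb=100,
          nb_proc=2,
          chrom_info=open("chrom_info.txt"))

# Example 7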
    def __call__(self, string):

        # ---------------------------------------------------------------
        # Check file extension
        # ---------------------------------------------------------------

        fasta_format_1 = r'(\.[Ff][Aa][Ss][Tt][Aa]$)|(\.[Ff][Nn][Aa]$)'
        fasta_format_2 = r'|(\.[Ff][Aa]$)|(\.[Ff][Aa][Ss]$)|(\.[Ff][Ff][Nn]$)|(\.[Ff][Rr][Nn]$)'
        fasta_regexp = fasta_format_1 + fasta_format_2
        fasta_regexp_gz = re.sub(r"\$", r"\.[Gg][Zz]$", fasta_regexp)
        bed_regexp = r'\.[Bb][Ee][Dd][3456]{0,1}$'
        bed_regexp_gz = re.sub(r"\$", r"\.[Gg][Zz]$", bed_regexp)
        gtf_regexp = r'\.[Gg][Tt][Ff]$'
        gtf_regexp_gz = re.sub(r"\$", r"\.[Gg][Zz]$", gtf_regexp)
        txt_regexp = r'(\.[Tt][Xx][Tt]$)|(\.[Cc][Ss][Vv]$)|(\.[Dd][Ss][Vv]$)|(\.[Tt][Aa][Bb]$)|(\.[Tt][Ss][Vv]$)'
        txt_regexp_gz = re.sub(r"\$", r"\.[Gg][Zz]$", txt_regexp)
        bigwig_regexp = r'(\.[Bb][Ww]$)|(\.[Bb][Ii][Gg][Ww][Ii][Gg]$)'
        zip_regexp = r'\.[Zz][Ii][Pp]$'
        pdf_regexp = r'\.[Pp][Dd][Ff]$'

        ext2regexp = {'bed': bed_regexp,
                      'bed.gz': bed_regexp_gz,
                      'gtf': gtf_regexp,
                      'gtf.gz': gtf_regexp_gz,
                      'fasta': fasta_regexp,
                      'fasta.gz': fasta_regexp_gz,
                      'txt': txt_regexp,
                      'txt.gz': txt_regexp_gz,
                      'bigwig': bigwig_regexp,
                      'zip': zip_regexp,
                      'pdf': pdf_regexp}

        # Set verbosity system wide as depending on
        # command line argument order, VERBOSITY (-V) can
        # be evaluated later...
        if '-V' in sys.argv:
            sys_args = ' '.join(sys.argv)
            verbosity_val = re.search('-V ?([01234])?', sys_args)
            # group(1) is None when -V is given without a level.
            if verbosity_val and verbosity_val.group(1) is not None:
                pygtftk.utils.VERBOSITY = int(verbosity_val.group(1))
            else:
                pygtftk.utils.VERBOSITY = 0

        match = False

        if isinstance(self.file_ext, str):
            extension_list = [self.file_ext]
        else:
            extension_list = list(self.file_ext)

        for this_ext in extension_list:
            if re.search(ext2regexp[this_ext], string):
                match = True
                break

        if not match:
            message('Not a valid filename extension: ' + string, type="WARNING")
            message('Extension expected: ' + ext2regexp[this_ext], type="ERROR")
            sys.exit()

        # ---------------------------------------------------------------
        # Check directory
        # ---------------------------------------------------------------

        outputdir = os.path.dirname(os.path.abspath(string))

        if not os.path.exists(outputdir):
            if 'w' in self._mode:
                message("Directory not found. Creating.", type="WARNING")
                os.makedirs(outputdir)

        # ---------------------------------------------------------------
        # Check format
        # ---------------------------------------------------------------

        # If bed3, bed4 or bed5, convert to bed6.

        if self._mode == 'r':
            if self.file_ext == 'bed':

                message("Checking BED file format (" + string + ").",
                        type="INFO")

                try:
                    file_bo = BedTool(string)
                    nb_line = len(file_bo)
                except Exception:
                    msg = "Unable to load file: " + string + "."
                    message(msg, type="ERROR")
                    sys.exit()

                if nb_line == 0:
                    msg = "It seems that file " + string + " is empty."
                    message(msg, type="ERROR")
                    sys.exit()

                if file_bo.file_type != 'bed':
                    msg = "File {f} is not a valid bed file."
                    msg = msg.format(f=string)
                    message(msg, type="ERROR")
                    sys.exit()

                region_nb = 0
                field_count = file_bo.field_count()

                if field_count != 6:
                    message("Converting to bed6 format (" + string + ").", type="WARNING")
                    tmp_file = make_tmp_file(prefix="bed6_",
                                             suffix=".bed")
                    for record in file_bo:
                        region_nb += 1

                        if field_count < 4:
                            name = 'region_' + str(region_nb)
                        else:
                            name = record.name

                        fields = record.fields[0:3]
                        fields += [name, '0', '.']

                        tmp_file.write("\t".join(fields) + "\n")

                    close_properly(tmp_file)
                    string = tmp_file.name

        # we will work with string
        if 'w' in self._mode:
            self._mode = 'w'

        return super(FormattedFile, self).__call__(string)
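
# Usage sketch (illustrative and commented out: FormattedFile appears to
# subclass argparse.FileType, and the constructor arguments shown here are
# an assumption, not confirmed by this excerpt):
#
#   parser.add_argument('-i', '--inputfile',
#                       type=FormattedFile(mode='r', file_ext='bed'))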
# Example 8
def bw_cov_mp(bw_list=None,
              region_file=None,
              labels=None,
              bin_nb=None,
              nb_proc=None,
              n_highest=None,
              zero_to_na=False,
              pseudo_count=None,
              stat='mean',
              verbose=False):
    """
    Compute bigwig coverage (multi-processed) for a set of regions.

    :param bw_list: the list of bigWig files to be processed.
    :param region_file: the bed file containing the region for which coverage is to be computed.
    :param labels: short names for the bigWig files.
    :param bin_nb: the number of bins into which each region should be split.
    :param nb_proc: Number of threads to be used.
    :param n_highest: compute the mean coverage based on the n highest values in the bins.
    :param pseudo_count: The value for a pseudo-count.
    :param verbose: run in verbose mode.
    :param stat: mean (default) or sum.
    :param zero_to_na: Convert missing values to NA, not zero.


    Returns a file.

    """

    n_region_to_proceed = len(BedTool(region_file.name))

    message("Received " + str(n_region_to_proceed) +
            " regions to proceed for each bigwig")

    tokens = intervals(list(range(n_region_to_proceed)), nb_proc)

    pool = multiprocessing.Pool(nb_proc)
    coverage_list = pool.map_async(
        _big_wig_coverage_worker,
        list(
            zip(tokens, repeat(bw_list), repeat(region_file.name),
                repeat(bin_nb), repeat(pseudo_count), repeat(n_highest),
                repeat(False), repeat(False), repeat(None), repeat(labels),
                repeat(zero_to_na), repeat(stat),
                repeat(verbose)))).get(9999999)

    if False in coverage_list:
        sys.stderr.write("Aborting...")
        sys.exit()

    # Unlist the list of list

    coverage_list = [item for sublist in coverage_list for item in sublist]

    tmp_file = make_tmp_file(prefix="region_coverage", suffix=".bed")
    for i in coverage_list:
        tmp_file.write(i)

    tmp_file.close()

    return open(tmp_file.name)
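
# Usage sketch (illustrative; paths are hypothetical; bw_list takes bigWig
# file names and region_file an open BED file handle):
cov_bed = bw_cov_mp(bw_list=["h3k4me3.bw"],
                    region_file=open("promoters.bed"),
                    labels=["h3k4me3"],
                    bin_nb=1,
                    nb_proc=2,
                    n_highest=1,
                    pseudo_count=1,
                    stat="mean")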
# Example 9
def get_ceas_records(inputfile=None,
                     outputfile=None,
                     show_tables=False,
                     target_table='GeneTable'):
    """
    Convert a CEAS sqlite file back into a flat file.
    """

    # ----------------------------------------------------------------------
    # load the CEAS file
    # ----------------------------------------------------------------------

    if inputfile.name.endswith('gz'):
        tmp_file = make_tmp_file(prefix='ceas_gunzip', suffix='.txt')
        with gzip.open(inputfile.name, 'rb') as f_in:
            with open(tmp_file.name, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        inputfile = open(tmp_file.name)

    conn = sqlite3.connect(inputfile.name)
    cursor = conn.cursor()

    # ----------------------------------------------------------------------
    # A func to get the list of tables
    # ----------------------------------------------------------------------

    def get_tables(cursor):

        out_list = list()
        cursor.execute("SELECT name FROM sqlite_master WHERE type='table'")

        for rec in cursor.fetchall():
            out_list += [rec[0]]

        return out_list

    # ----------------------------------------------------------------------
    # Get table list
    # ----------------------------------------------------------------------

    tables = get_tables(cursor)

    # ----------------------------------------------------------------------
    # To show table
    # ----------------------------------------------------------------------

    if show_tables:

        for tab in tables:
            outputfile.write(tab + "\n")
        sys.exit()
    # ----------------------------------------------------------------------
    # loop through records
    # Each line contains:
    #   chrom,name,strand,txStart,txEnd,cdsStart,
    #   cdsEnd,exonCount,exonStarts,exonEnds,name
    # ----------------------------------------------------------------------

    # Check the requested table exists

    if target_table not in tables:
        message('Table not found in the database.', type="ERROR")

    # target_table has been validated against the table list above.
    # Note: the original nested loop (iterating the cursor and calling
    # fetchall inside it) silently dropped the first record.
    cursor.execute('SELECT * FROM %s' % target_table)

    for rec in cursor.fetchall():
        out_list = []
        for element in rec:
            out_list += [str(element)]
        outputfile.write("\t".join(out_list) + "\n")
# Example 10
def get_tx_seq(inputfile=None,
               outputfile=None,
               genome=None,
               with_introns=False,
               delete_version=False,
               del_chr=False,
               separator="",
               no_rev_comp=False,
               label="",
               sleuth_format=True,
               explicit=True,
               assembly="bla"):
    """
    Description: Get transcript sequences in fasta format from a GTF file.
    """

    # -----------------------------------------------------------
    #  Check chromosomes in fasta file
    # -----------------------------------------------------------

    genome_chr_list = []

    message("%d fasta files found." % len(genome))

    if any(x.name.endswith(".gz") for x in genome):
        message("Genome in gz format is not currently supported.",
                type="ERROR")

    if len(genome) == 1:
        message("Checking fasta file chromosome list")
        genome = genome[0]
        with genome as genome_file:
            for i in genome_file:
                if i.startswith(">"):
                    i = i.rstrip("\n")
                    genome_chr_list += [i[1:]]
    else:
        message("Merging fasta files")
        tmp_genome = make_tmp_file(prefix="genome", suffix=".fa")
        with tmp_genome as tg:
            for curr_file in genome:
                message("Merging %s" % curr_file.name)
                with curr_file as cf:
                    shutil.copyfileobj(cf, tg, 1024 * 1024 * 100)

        message("Checking fasta file chromosome list")
        genome = open(tmp_genome.name, "r")
        with genome as genome_file:
            for i in genome_file:
                if i.startswith(">"):
                    i = i.rstrip("\n")
                    genome_chr_list += [i[1:]]

    rev_comp = not no_rev_comp

    message("Chromosomes in fasta file: " + ",".join(genome_chr_list))

    # -----------------------------------------------------------
    #  Read gtf
    # -----------------------------------------------------------

    gtf = GTF(inputfile)
    nb_tx_before = gtf.extract_data("transcript_id",
                                    as_list=True,
                                    no_na=True,
                                    nr=True)

    # -----------------------------------------------------------
    #  Select genes falling in chrom defined in the fasta file
    # -----------------------------------------------------------

    message("Chromosomes in gtf file: " + ",".join(gtf.get_chroms(nr=True)))

    message("Selecting chromosome defined in the fasta file")

    gtf = gtf.select_by_key(key="seqid", value=",".join(genome_chr_list))

    message("Chromosomes in gtf file: " + ",".join(gtf.get_chroms(nr=True)))

    if len(gtf) == 0:
        message("No genes were found on chromosomes defined in fasta file.",
                type="ERROR")

    nb_tx_after = gtf.extract_data("transcript_id",
                                   as_list=True,
                                   no_na=True,
                                   nr=True)
    if len(nb_tx_after) != len(nb_tx_before):
        diff = list(set(nb_tx_before) - set(nb_tx_after))
        message("Some transcripts had"
                " no corresponding chromosome"
                " in the fasta file: " + ",".join(diff)[0:100] + "...")

    message("Using genome file: " + genome.name)
    message("Retrieving fasta sequences from " + genome.name)
    fasta_seq = gtf.get_sequences(genome=genome.name,
                                  intron=with_introns,
                                  rev_comp=rev_comp)

    tx_gtf = gtf.select_by_key("feature", "transcript")

    if sleuth_format:

        tx_biotype = tx_gtf.extract_data("transcript_id,transcript_biotype",
                                         as_dict_of_lists=True,
                                         hide_undef=False)
        gn_biotype = tx_gtf.extract_data("gene_id,gene_biotype",
                                         as_dict_of_lists=True,
                                         hide_undef=False)

        for i in fasta_seq:
            gene_id = i.gene_id
            transcript_id = i.transcript_id
            chrom = i.chrom

            gn_bio = gn_biotype[i.gene_id][0]
            tx_bio = tx_biotype[i.transcript_id][0]

            if delete_version:
                transcript_id = re.sub(r'\.[0-9]+$', '', transcript_id)
                gene_id = re.sub(r'\.[0-9]+$', '', gene_id)
            if del_chr:
                chrom = chrom.replace('chr', '')

            header = " ".join([
                transcript_id, ":".join([
                    "chromosome", assembly, chrom,
                    str(i.start),
                    str(i.end), "1"
                ]), "gene:" + gene_id, "gene_biotype:" + gn_bio,
                "transcript_biotype:" + tx_bio
            ])

            outputfile.write(">" + header + "\n")
            outputfile.write(i.sequence + "\n")
    else:
        tx_info = tx_gtf.extract_data("transcript_id," + label,
                                      as_dict_of_lists=True,
                                      hide_undef=False)
        for i in fasta_seq:
            if not explicit:
                header = separator.join(tx_info[i.transcript_id])
            else:
                header = [
                    str(x[0]) + "=" + x[1]
                    for x in zip(label.split(","), tx_info[i.transcript_id])
                ]
                header = separator.join(header)
            outputfile.write(">" + header + "\n")
            outputfile.write(i.sequence + "\n")

    gc.disable()
    close_properly(outputfile, inputfile)
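
# Usage sketch (illustrative; paths are hypothetical; genome expects a list
# of open, uncompressed fasta file handles):
get_tx_seq(inputfile=open("input.gtf"),
           outputfile=open("tx_sequences.fa", "w"),
           genome=[open("genome.fa")],
           assembly="GRCh38")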
# Example 11
def great_reg_domains(inputfile=None,
                      outputfile=None,
                      go_id="GO:0003700",
                      species="hsapiens",
                      upstream=1000,
                      downstream=1000,
                      chrom_info=None,
                      distal=1000000,
                      mode='basal_plus_extension',
                      http_proxy=None,
                      https_proxy=None):
    """ Given a GTF and a GO term, attempt compute labeled regions using GREAT 'association rule'. """

    # -------------------------------------------------------------------------
    # chrom_len will store the chromosome sizes.
    # -------------------------------------------------------------------------

    chrom_len = chrom_info_as_dict(chrom_info)

    # -------------------------------------------------------------------------
    # Read the GTF
    # -------------------------------------------------------------------------

    gtf = GTF(inputfile, check_ensembl_format=False)

    # -------------------------------------------------------------------------
    # Get the TSSs -- Extend them by upstream/downstream
    # -------------------------------------------------------------------------

    message("Defining basal regulatory domains.", type="INFO")
    basal_reg_bed = gtf.get_tss(name=['gene_id', 'gene_name']).slop(
        s=True, l=upstream, r=downstream, g=chrom_info.name).sort()

    basal_reg_bed_file = make_tmp_file(prefix='basal_reg', suffix='.bed')
    basal_reg_bed.saveas(basal_reg_bed_file.name)

    if mode == 'basal_plus_extension':
        # -------------------------------------------------------------------------
        # Search for upstream limits of each basal_reg_bed
        # Here we ignore overlapping basal_reg_bed regions, as the way they
        # are processed is not documented in GREAT to our knowledge
        # -------------------------------------------------------------------------
        message("Defining regulatory domains upstream regions.", type="INFO")

        regulatory_region_start = dict()
        regulatory_region_end = dict()
        chroms = dict()
        strands = dict()

        basal_reg_bed_upstream = basal_reg_bed.closest(
            basal_reg_bed,
            # Ignore features in B that overlap A
            io=True,
            # In addition to the closest feature in B report distance
            # use negative distances to report upstream features.
            # Report distance with respect to A.
            # When A is on the - strand, "upstream" means B has a
            # higher(start, stop).
            D="a",
            # Ignore features in B that are downstream of features in A
            id=True,
            # How ties are handled. "first"  Report the first tie
            t="first",
            # Require that the query and the closest hit have different names/gene_ids.
            N=True)

        basal_reg_bed_upstream_file = make_tmp_file(
            prefix='basal_reg_bed_upstream', suffix='.bed')
        basal_reg_bed_upstream.saveas(basal_reg_bed_upstream_file.name)

        for line in basal_reg_bed_upstream:

            gene_id = line.name
            strand = line.strand
            end = line.end
            start = line.start
            gene_id = "|".join([gene_id, str(start), str(end), strand])
            chroms[gene_id] = line.chrom
            strands[gene_id] = strand

            if strand == '+':

                # if the feature chromosome in B is
                # '.' we have reached the start of the chr
                if line.fields[6] == '.':
                    regulatory_region_start[gene_id] = max(
                        0, line.start - distal)
                else:
                    padding = min(distal, abs(int(line.fields[12])) - 1)
                    regulatory_region_start[gene_id] = line.start - padding

            elif strand == '-':
                # if the feature chromosome in B is
                # '.' we have reached the end of the chr
                if line.fields[6] == '.':
                    regulatory_region_end[gene_id] = min(
                        int(chrom_len[line.chrom]), line.end + distal)
                else:
                    padding = min(distal, abs(int(line.fields[12])) - 1)
                    regulatory_region_end[gene_id] = line.end + padding
            else:
                message("Cannot process genes without strand", type="WARNING")
                message("Please check:" + gene_id, type="ERROR")

        # -------------------------------------------------------------------------
        # Search for downstream limits of each basal_reg_bed
        # Here we ignore overlapping basal_reg_bed regions, as the way they
        # are processed is not documented in GREAT to our knowledge
        # -------------------------------------------------------------------------
        message("Defining regulatory domains downstream regions.", type="INFO")

        basal_reg_bed_downstream = basal_reg_bed.closest(
            basal_reg_bed,
            # Ignore features in B that overlap A
            io=True,
            # In addition to the closest feature in B report distance
            # use negative distances to report upstream features.
            # Report distance with respect to A.
            # When A is on the - strand, "upstream" means B has a
            # higher(start, stop).
            D="a",
            # Ignore features in B that are upstream of features in A
            iu=True,
            # How ties are handled. "first"  Report the first tie
            t="first",
            # Require that the query and the closest hit have different names/gene_ids.
            N=True)

        basal_reg_bed_downstream_file = make_tmp_file(
            prefix='basal_reg_bed_downstream', suffix='.bed')
        basal_reg_bed_downstream.saveas(basal_reg_bed_downstream_file.name)

        for line in basal_reg_bed_downstream:

            gene_id = line.name
            strand = line.strand
            end = line.end
            start = line.start
            gene_id = "|".join([gene_id, str(start), str(end), strand])
            chroms[gene_id] = line.chrom
            strands[gene_id] = strand

            if strand == '+':
                # if the feature chromosome in B is
                # '.' we have reached the start of the chr
                if line.fields[6] == '.':
                    regulatory_region_end[gene_id] = min(
                        int(chrom_len[line.chrom]), line.end + distal)
                else:
                    padding = min(distal, abs(int(line.fields[12])) - 1)
                    regulatory_region_end[gene_id] = line.end + padding
            elif strand == '-':
                if line.fields[6] == '.':
                    # if the feature chromosome in B is
                    # '.' we have reached the start of the chr
                    regulatory_region_start[gene_id] = max(
                        0, line.start - distal)
                else:
                    padding = min(distal, abs(int(line.fields[12])) - 1)
                    regulatory_region_start[gene_id] = max(
                        0, line.start - padding)
            else:
                message("Cannot process genes without strand", type="WARNING")
                message("Please check:" + gene_id, type="ERROR")

    else:
        message(
            "Only 'basal_plus_extension' association rule is currently supported.",
            type='ERROR')

    # -------------------------------------------------------------------------
    # Print the regulatory regions of all genes
    # By default print all genes
    # -------------------------------------------------------------------------

    if go_id is None:
        for gene_id in regulatory_region_start:
            outlist = [
                chroms[gene_id],
                str(regulatory_region_start[gene_id]),
                str(regulatory_region_end[gene_id]),
                gene_id.split("|")[0], "0", strands[gene_id]
            ]

            outputfile.write("\t".join(outlist) + "\n")
    else:

        # -------------------------------------------------------------------------
        # Get the list of gene/transcript associated with a particular GO term
        # -------------------------------------------------------------------------

        message("Getting Gene Ontology annotations.")

        if not go_id.startswith("GO:"):
            go_id = "GO:" + go_id

        is_associated = set()

        bm = Biomart(http_proxy=http_proxy, https_proxy=https_proxy)

        bm.get_datasets('ENSEMBL_MART_ENSEMBL')

        if species + "_gene_ensembl" not in bm.datasets:
            message("Unknow dataset/species.", type="ERROR")

        bm.query({'query': XML.format(species=species, go=go_id)})

        for i in bm.response.content.decode().split("\n"):
            i = i.rstrip("\n")
            if i != '':
                is_associated.add(i)

        for gene_id in regulatory_region_start:
            gene_id_short = gene_id.split("|")[0]
            if gene_id_short in is_associated:
                outlist = [
                    chroms[gene_id],
                    str(regulatory_region_start[gene_id]),
                    str(regulatory_region_end[gene_id]),
                    gene_id.split("|")[0], "0", strands[gene_id]
                ]

                outputfile.write("\t".join(outlist) + "\n")
# Example 12
def coverage(
        inputfile=None,
        outputfile=None,
        bw_list=None,
        labels=None,
        pseudo_count=1,
        nb_window=1,
        ft_type="promoter",
        n_highest=None,
        downstream=1000,
        key_name="cov",
        zero_to_na=False,
        name_column=None,
        upstream=1000,
        chrom_info=None,
        nb_proc=1,
        matrix_out=False,
        stat='mean'):
    """
    Compute transcript coverage with one or several bigWig files.
    """

    # -------------------------------------------------------------------------
    # Create a list of labels.
    # Take user input in account
    # -------------------------------------------------------------------------

    bw_list = [x.name for x in bw_list]

    if len(bw_list) != len(set(bw_list)):
        message("Found the same bigwigs several times.",
                type="ERROR")

    message('Checking labels.')

    if labels is not None:
        labels = labels.split(",")
        # Ensure the number of labels is the same as the number of bw files.
        if len(labels) != len(bw_list):
            message("The number of labels should be the same as the number of"
                    " bigwig files.", type="ERROR")
        # Ensure labels are non-redundant
        if len(labels) > len(set(labels)):
            message("Labels must be unique.", type="ERROR")
    else:
        labels = []
        for i in range(len(bw_list)):
            labels += [
                os.path.splitext(
                    os.path.basename(
                        bw_list[i]))[0]]

    # -------------------------------------------------------------------------
    # Check the number of windows
    #
    # -------------------------------------------------------------------------

    if n_highest is None:
        n_highest = nb_window

    message('Number of bins: %d' % nb_window)
    message('N highest values: %d' % n_highest)

    if n_highest > nb_window:
        message('The number of windows used for computing the score'
                ' (-n) cannot be greater than the number of'
                ' windows (-w).', type="ERROR")
        sys.exit()

    # -------------------------------------------------------------------------
    # Check input file is in bed or GTF format
    #
    # -------------------------------------------------------------------------

    message("Loading input file...")
    if inputfile.name == '<stdin>':
        # Input read from stdin is assumed to be in GTF format.
        gtf = GTF(inputfile.name)
        is_gtf = True
    else:
        region_bo = BedTool(inputfile.name)
        if len(region_bo) == 0:
            message("Unable to find requested regions",
                    type="ERROR")

        if region_bo.file_type == 'gff':
            gtf = GTF(inputfile.name)
            is_gtf = True
        else:
            is_gtf = False

    # -------------------------------------------------------------------------
    # Get regions of interest
    #
    # -------------------------------------------------------------------------

    if name_column is not None:
        name_column = name_column.split(",")

    if is_gtf:

        message("Getting regions of interest...")

        if ft_type.lower() == "intergenic":

            region_bo = gtf.get_intergenic(chrom_info, 0, 0).slop(s=True,
                                                                  l=upstream,
                                                                  r=downstream,
                                                                  g=chrom_info.name).sort()

        elif ft_type.lower() == "intron":

            region_bo = gtf.get_introns().slop(s=True,
                                               l=upstream,
                                               r=downstream,
                                               g=chrom_info.name).sort()

        elif ft_type == "intron_by_tx":

            region_bo = gtf.get_introns(by_transcript=True,
                                        name=name_column,
                                        ).slop(s=True,
                                               l=upstream,
                                               r=downstream,
                                               g=chrom_info.name).sort()

        elif ft_type.lower() in ["promoter", "tss"]:

            region_bo = gtf.get_tss(name=name_column, ).slop(s=True,
                                                             l=upstream,
                                                             r=downstream,
                                                             g=chrom_info.name).sort()

        elif ft_type.lower() in ["tts", "terminator"]:

            region_bo = gtf.get_tts(name=name_column).slop(s=True,
                                                           l=upstream,
                                                           r=downstream,
                                                           g=chrom_info.name).sort()

        else:

            region_bo = gtf.select_by_key(
                "feature",
                ft_type, 0
            ).to_bed(name=name_column).slop(s=True,
                                            l=upstream,
                                            r=downstream,
                                            g=chrom_info.name).sort()

        if len(region_bo) == 0:
            message("Unable to find requested regions",
                    type="ERROR")

    else:
        region_bo = region_bo.slop(s=True,
                                   l=upstream,
                                   r=downstream,
                                   g=chrom_info.name).sort()

    region_bed = make_tmp_file(prefix="region", suffix=".bed")

    region_bo.saveas(region_bed.name)

    # -------------------------------------------------------------------------
    # Compute coverage
    #
    # -------------------------------------------------------------------------

    result_bed = bw_cov_mp(bw_list=bw_list,
                           region_file=open(region_bed.name),
                           labels=labels,
                           bin_nb=nb_window,
                           pseudo_count=pseudo_count,
                           zero_to_na=zero_to_na,
                           nb_proc=nb_proc,
                           n_highest=n_highest,
                           stat=stat,
                           verbose=pygtftk.utils.VERBOSITY)

    if matrix_out:
        result_bed.close()

        df_first = pd.read_csv(result_bed.name, sep="\t", header=None)

        df_first = df_first.iloc[:, [0, 1, 2, 3, 5, 4]]  # .ix was removed from pandas

        df_list = []

        for i in range(len(labels)):
            # Create a sub data frame containing the coverage values of the
            # current bigwig.
            str_to_find = r"^" + labels[i] + r"\|"
            tmp_df = df_first[df_first[3].str.match(str_to_find)].copy()
            to_replace = r"^" + labels[i] + r"\|"
            tmp_df.iloc[:, 3] = tmp_df.iloc[:, 3].replace(to_replace,
                                                          r"", regex=True)

            df_list += [tmp_df]

        df_final = df_list.pop(0)

        for i in df_list:
            # Add columns to df final by joining on
            # chrom, start, end, transcript_id, strand
            df_final = df_final.merge(i.iloc[:,
                                      list(range(6))], on=[0, 1,
                                                           2, 3, 5])

        df_final.columns = ["chrom",
                            "start",
                            "end",
                            "name",
                            "strand"] + labels

        df_final.to_csv(outputfile, sep="\t", index=False)

    else:
        nb_line = 0
        for i in result_bed:
            outputfile.write(i)
            nb_line += 1

        if nb_line == 0:
            message("No line available in output...",
                    type="ERROR")
    gc.disable()
    close_properly(inputfile, outputfile)
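
The matrix_out pivot above is easier to see on toy data. Below is a self-contained pandas sketch of the same reshaping; the "label|name" convention in column 4 is inferred from the regexes above, and all values are hypothetical.

import pandas as pd

# bw_cov_mp-style rows: one per (bigwig, region), name prefixed by the label.
df = pd.DataFrame([
    ["chr1", 100, 200, "bw1|tx_1", 0.5, "+"],
    ["chr1", 100, 200, "bw2|tx_1", 1.5, "+"],
])
df = df.iloc[:, [0, 1, 2, 3, 5, 4]]  # move strand before the score

frames = []
for label in ["bw1", "bw2"]:
    sub = df[df[3].str.match(r"^" + label + r"\|")].copy()
    sub.iloc[:, 3] = sub.iloc[:, 3].replace(r"^" + label + r"\|", r"", regex=True)
    frames.append(sub)

out = frames.pop(0)
for sub in frames:
    # Join on chrom, start, end, name, strand; each merge adds one score column.
    out = out.merge(sub.iloc[:, :6], on=[0, 1, 2, 3, 5])

out.columns = ["chrom", "start", "end", "name", "strand", "bw1", "bw2"]
print(out)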
Example #13
def convergent(inputfile=None,
               outputfile=None,
               upstream=1500,
               downstream=1500,
               chrom_info=None):
    """
    Find transcript with convergent tts.
    """

    message("Using -u " + str(upstream) + ".")
    message("Using -d " + str(downstream) + ".")

    tx_to_convergent_nm = dict()
    dist_to_convergent = dict()
    tts_pos = dict()

    message("Loading GTF.")

    gtf = GTF(inputfile)

    message("Getting transcript coordinates.")

    tx_feat = gtf.select_by_key("feature", "transcript")

    message("Getting tts coordinates.")

    tts_bo = tx_feat.get_tts(name=["transcript_id", "gene_id"], sep="||")

    # get tts position
    for i in tts_bo:
        tx_id_ov, gn_id_ov = i.name.split("||")
        tts_pos[tx_id_ov] = int(i.start)

    message("Getting tts coordinates.")

    tts_region_bo = tts_bo.slop(s=True,
                                l=upstream,
                                r=downstream,
                                g=chrom_info.name).cut([0, 1, 2, 3, 4, 5])

    message("Intersecting...")
    tts_intersect_bo = tts_region_bo.intersect(tts_bo,
                                               wb=True,
                                               s=False,
                                               S=True)

    tmp_file = make_tmp_file("tts_slop", ".bed")
    tts_region_bo.saveas(tmp_file.name)
    tmp_file = make_tmp_file("tts_slop_intersection_with_tts_as_", ".bed")
    tts_intersect_bo.saveas(tmp_file.name)

    for i in tts_intersect_bo:

        tx_id_main, gene_id_main = i.fields[3].split("||")
        tx_id_ov, gn_id_ov = i.fields[9].split("||")

        if gene_id_main != gn_id_ov:
            if tx_id_main in tx_to_convergent_nm:
                dist = abs(tts_pos[tx_id_main] - tts_pos[tx_id_ov])
                if dist < dist_to_convergent[tx_id_main]:
                    dist_to_convergent[tx_id_main] = dist
                    tx_to_convergent_nm[tx_id_main] = tx_id_ov
            else:
                dist = abs(tts_pos[tx_id_main] - tts_pos[tx_id_ov])
                dist_to_convergent[tx_id_main] = dist
                tx_to_convergent_nm[tx_id_main] = tx_id_ov

    if len(tx_to_convergent_nm):
        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=tx_to_convergent_nm,
                                     new_key="convergent")

        gtf = gtf.add_attr_from_dict(feat="transcript",
                                     key="transcript_id",
                                     a_dict=dist_to_convergent,
                                     new_key="dist_to_convergent")

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
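
The S=True intersection above is what restricts hits to opposite-strand (hence convergent) pairs. A toy pybedtools sketch with hypothetical coordinates:

from pybedtools import BedTool

# A slopped TTS region on the + strand and a TTS on the - strand.
tts_regions = BedTool("chr1\t900\t1200\ttxA||gnA\t0\t+", from_string=True)
tts = BedTool("chr1\t1000\t1001\ttxB||gnB\t0\t-", from_string=True)

# S=True keeps only overlaps between features on opposite strands.
for hit in tts_regions.intersect(tts, wb=True, S=True):
    # fields 0-5: the slopped region; fields 6-11: the overlapping TTS.
    print(hit.fields[3], "is convergent with", hit.fields[9])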
Example #14
def feature_size(inputfile=None,
                 outputfile=None,
                 ft_type="transcript",
                 names="transcript_id",
                 key_name='feature_size',
                 separator="|",
                 bed=False):
    """
 Get the size and limits (start/end) of features enclosed in the GTF. If bed
 format is requested returns the limits zero-based half open and the size as a score.
 Otherwise output GTF file with 'feat_size' as a new key and size as value.
    """

    message("Computing feature sizes.")

    gtf = GTF(inputfile)

    feat_list = gtf.get_feature_list(nr=True) + ['mature_rna']

    if ft_type not in feat_list + ["*"]:
        message("Unable to find requested feature.", type="ERROR")

    names = names.split(",")

    if ft_type != 'mature_rna':

        if bed:
            bed_obj = gtf.select_by_key("feature",
                                        ft_type).to_bed(name=names,
                                                        sep=separator,
                                                        add_feature_type=True)

            for i in bed_obj:
                i.score = str(i.end - i.start)
                write_properly(chomp(str(i)), outputfile)
        else:

            tmp_file = make_tmp_file(prefix="feature_size", suffix=".txt")

            elmt = gtf.extract_data("feature,start,end",
                                    as_list_of_list=True,
                                    no_na=False,
                                    hide_undef=False)

            for i in elmt:
                if i[0] != ft_type and ft_type != "*":
                    tmp_file.write("?\n")
                else:
                    tmp_file.write(str(int(i[2]) - int(i[1]) + 1) + "\n")

            tmp_file.close()

            gtf.add_attr_column(tmp_file, key_name).write(outputfile,
                                                          gc_off=True)

    else:

        tx_size = gtf.get_transcript_size()

        if bed:
            bed_obj = gtf.select_by_key("feature", 'transcript').to_bed(
                ['transcript_id'] + names,
                add_feature_type=False,
                sep=separator,
                more_name=['mature_rna'])

            for i in bed_obj:
                names = i.name.split(separator)
                tx_id = names.pop(0)
                i.score = tx_size[tx_id]
                i.name = separator.join(names)
                write_properly(chomp(str(i)), outputfile)
        else:

            if len(tx_size):
                gtf = gtf.add_attr_from_dict(feat="transcript",
                                             key="transcript_id",
                                             a_dict=tx_size,
                                             new_key=key_name)

            gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
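
The two size formulas above differ only by coordinate convention; a worked check (GTF is 1-based and end-inclusive, BED is 0-based and half-open):

# The same 100 bp feature in both conventions.
gtf_start, gtf_end = 101, 200
bed_start, bed_end = 100, 200
assert gtf_end - gtf_start + 1 == 100   # GTF branch: int(i[2]) - int(i[1]) + 1
assert bed_end - bed_start == 100       # BED branch: i.end - i.start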
def get_midpoints(self):
    """Returns a bedtools object containing the midpoints of features.


        :Example:
        >>> from pygtftk import bedtool_extension
        >>> fromscratch1 = bedtool_extension.BedTool('chrX 0 100', from_string=True)
        >>> for i in fromscratch1.get_midpoints(): pass
        >>> assert i.start == 49
        >>> assert i.end == 51
        >>> fromscratch1 = bedtool_extension.BedTool('chrX 0 101', from_string=True)
        >>> for i in fromscratch1.get_midpoints(): pass
        >>> assert i.start == 50
        >>> assert i.end == 51
    """

    message("Calling 'get_midpoints'.", type="DEBUG")

    midpoints_bed = make_tmp_file("Midpoints", ".bed")

    n = 1

    for line in self:

        if line.name == ".":
            name = str(n)
        else:
            name = line.name

        if line.strand == ".":
            strand = "+"
        else:
            strand = line.strand

        if line.score == ".":
            score = "."
        else:
            score = line.score

        diff = line.end - line.start

        if diff % 2 != 0:
            # Odd length: a single central base exists.
            # e.g. 10-13 (zero-based) -> 11-13 (one-based);
            # midpoint is 12 (one-based) -> 11-12 (zero-based).
            line.end = line.start + diff // 2 + 1
            line.start = line.end - 1
        else:
            # Even length: no single central base, keep both.
            # e.g. 10-14 (zero-based) -> 11-14 (one-based);
            # midpoint is 12-13 (one-based) -> 11-13 (zero-based).
            line.start = line.start + diff // 2 - 1
            line.end = line.start + 2

        midpoints_bed.write("\t".join(
            [line.chrom,
             str(line.start),
             str(line.end), name, score, strand]) + "\n")
        n += 1

    midpoints_bed.close()

    return BedTool(fn=midpoints_bed.name)
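
A compact restatement of the midpoint arithmetic above, as a worked check on 0-based half-open intervals:

def midpoint(start, end):
    # Mirrors the branch logic of get_midpoints().
    diff = end - start
    if diff % 2 != 0:  # odd length: a single central base
        return start + diff // 2, start + diff // 2 + 1
    return start + diff // 2 - 1, start + diff // 2 + 1  # even length: keep both

assert midpoint(0, 101) == (50, 51)
assert midpoint(0, 100) == (49, 51)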
def tss_numbering(inputfile=None,
                  outputfile=None,
                  compute_dist=False,
                  key_name='tss_number',
                  key_name_dist='dist_to_first_tss',
                  add_nb_tss_to_gene=False,
                  gene_key='nb_tss'):
    """
    Computes the distance between TSS of gene transcripts.
    """

    gtf = GTF(inputfile, check_ensembl_format=True)

    gn_tss_dist = defaultdict(dict)

    message("Getting TSSs.")
    tss = gtf.get_tss(name=["transcript_id"], as_dict=True)
    tx_to_gn = gtf.get_tx_to_gn()

    for k in tss:
        gn_id = tx_to_gn[k]
        gn_tss_dist[gn_id][k] = int(tss[k])

    # If as_dict_of_dict is True, get_gn_to_tx() returns a dict of dicts
    # that maps gene_id to transcript_id and transcript_id to TSS
    # numbering (1 for the most 5', then 2...). Transcripts sharing
    # the same TSS get the same number.
    gn_to_tx_to_tss = gtf.get_gn_to_tx(as_dict_of_dict=True)

    message("Numbering TSSs.")

    tss_number_file = make_tmp_file(prefix='tx_to_tss_number', suffix='.txt')

    gn_how_many_tss = dict()

    for gn_id in gn_to_tx_to_tss:
        for tx_id in gn_to_tx_to_tss[gn_id]:
            tss_num = str(gn_to_tx_to_tss[gn_id][tx_id])
            tss_number_file.write(tx_id + "\t" + tss_num + "\n")
            if gn_id not in gn_how_many_tss:
                gn_how_many_tss[gn_id] = tss_num
            else:
                if int(tss_num) > int(gn_how_many_tss[gn_id]):
                    gn_how_many_tss[gn_id] = tss_num

    tss_number_file.close()

    gtf = gtf.add_attr_from_file(feat='transcript',
                                 key='transcript_id',
                                 new_key=key_name,
                                 inputfile=open(tss_number_file.name),
                                 has_header=False)

    if add_nb_tss_to_gene:

        gn_how_many_tss_file = make_tmp_file(prefix='gn_how_many_tss',
                                             suffix='.txt')

        for a_key, a_val in gn_how_many_tss.items():
            gn_how_many_tss_file.write(a_key + "\t" + a_val + "\n")

        gn_how_many_tss_file.close()

        gtf = gtf.add_attr_from_file(feat='gene',
                                     key='gene_id',
                                     new_key=gene_key,
                                     inputfile=open(gn_how_many_tss_file.name),
                                     has_header=False)

    if compute_dist:
        gn_to_tx_ordered_by_tss = gtf.get_gn_to_tx(ordered_5p=True)
        tss_dist_file = make_tmp_file(prefix='tx_tss_dist_to_first_tss',
                                      suffix='.txt')

        for gn_id in gn_to_tx_to_tss:
            tx_list = gn_to_tx_ordered_by_tss[gn_id]
            tx_first = tx_list.pop(0)
            # The first TSS has distance 0 to the first TSS by definition.
            tss_dist_file.write(tx_first + "\t0\n")
            for tx_id in tx_list:
                dist_to_first = abs(int(tss[tx_first]) - int(tss[tx_id]))
                tss_dist_file.write(tx_id + "\t" + str(dist_to_first) + "\n")

        tss_dist_file.close()

        gtf = gtf.add_attr_from_file(feat='transcript',
                                     key='transcript_id',
                                     new_key=key_name_dist,
                                     inputfile=open(tss_dist_file.name),
                                     has_header=False)

    gtf.write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
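
The loop filling gn_how_many_tss above is a max-reduction over a dict of dicts; an equivalent toy version (hypothetical IDs, using ints where the code above keeps strings):

gn_to_tx_to_tss = {"gnA": {"txA1": 1, "txA2": 2, "txA3": 2},
                   "gnB": {"txB1": 1}}
gn_how_many_tss = {gn: max(tx2num.values())
                   for gn, tx2num in gn_to_tx_to_tss.items()}
assert gn_how_many_tss == {"gnA": 2, "gnB": 1}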
Example #17
def divergent(
        inputfile=None,
        outputfile=None,
        key_name=None,
        upstream=1500,
        downstream=1500,
        chrom_info=None,
        no_strandness=False,
        no_annotation=False):
    """
Find transcript with divergent promoters.
    """

    message("Using -u " + str(upstream) + ".")
    message("Using -d " + str(downstream) + ".")

    tx_with_divergent = dict()
    dist_to_divergent = dict()
    tss_pos = dict()

    message("Loading GTF.")

    gtf = GTF(inputfile)

    message("Getting transcript coordinates.")

    tx_feat = gtf.select_by_key("feature",
                                "transcript")
    message("Getting tss coordinates.")

    tss_bo = tx_feat.get_tss(name=["transcript_id", "gene_id"],
                             sep="||")

    # get tss position
    for i in tss_bo:
        tx_id_tss, gn_id_tss = i.name.split("||")
        tss_pos[tx_id_tss] = int(i.start)

    message("Getting promoter coordinates.")

    promoter_bo = tss_bo.slop(s=True,
                              l=upstream,
                              r=downstream,
                              g=chrom_info.name).cut([0, 1,
                                                      2, 3,
                                                      4, 5])
    message("Intersecting...")

    if no_strandness:
        prom_with_tss_bo = promoter_bo.intersect(tss_bo,
                                                 wb=True,
                                                 s=False,
                                                 S=False)
    else:
        prom_with_tss_bo = promoter_bo.intersect(tss_bo,
                                                 wb=True,
                                                 s=False,
                                                 S=True)

    tmp_file = make_tmp_file("promoter_slop", ".bed")
    promoter_bo.saveas(tmp_file.name)
    tmp_file = make_tmp_file("promoter_intersection_with_tss_as_", ".bed")
    prom_with_tss_bo.saveas(tmp_file.name)

    for i in prom_with_tss_bo:

        tx_id_tss, gn_id_tss = i.fields[9].split("||")
        tx_id_prom, gene_id_prom = i.fields[3].split("||")

        if gene_id_prom != gn_id_tss:
            if tx_id_prom in tx_with_divergent:
                dist = abs(tss_pos[tx_id_prom] - tss_pos[tx_id_tss])
                if dist < dist_to_divergent[tx_id_prom]:
                    dist_to_divergent[tx_id_prom] = dist
                    tx_with_divergent[tx_id_prom] = tx_id_tss
            else:

                dist = abs(tss_pos[tx_id_prom] - tss_pos[tx_id_tss])
                dist_to_divergent[tx_id_prom] = dist
                tx_with_divergent[tx_id_prom] = tx_id_tss

    if not no_annotation:

        if key_name is None:
            key_name = "divergent"
            key_name_dist = "dist_to_divergent"
        else:
            key_name_dist = "dist_" + key_name

        if len(tx_with_divergent):
            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=tx_with_divergent,
                                         new_key=key_name)

            gtf = gtf.add_attr_from_dict(feat="transcript",
                                         key="transcript_id",
                                         a_dict=dist_to_divergent,
                                         new_key=key_name_dist)

        gtf.write(outputfile,
                  gc_off=True)

    else:
        gtf.select_by_key("transcript_id",
                          ",".join(list(tx_with_divergent.keys()))).write(outputfile, gc_off=True)

    close_properly(outputfile, inputfile)
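
The update rule above keeps, for each transcript, only its closest divergent partner. A toy reduction of the same logic with hypothetical IDs and TSS positions:

tss_pos = {"txA": 1000, "txB": 1400, "txC": 1100}
tx_with_divergent, dist_to_divergent = {}, {}

# Candidate pairs: (main transcript, opposite-strand transcript of another gene).
for main, other in [("txA", "txB"), ("txA", "txC")]:
    dist = abs(tss_pos[main] - tss_pos[other])
    if main not in tx_with_divergent or dist < dist_to_divergent[main]:
        dist_to_divergent[main] = dist
        tx_with_divergent[main] = other

assert tx_with_divergent == {"txA": "txC"}
assert dist_to_divergent == {"txA": 100}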
Example #18
def rm_dup_tss(inputfile=None, outputfile=None):
    """If several transcripts of a gene share the same tss, select only one."""

    # ----------------------------------------------------------------------
    # Get the TSS
    # ----------------------------------------------------------------------

    gtf = GTF(inputfile)
    tss_bo = gtf.get_tss(["gene_id", "transcript_id"])

    # ----------------------------------------------------------------------
    # Sort the file by name (4th col) to ensure reproducibility between calls.
    # ----------------------------------------------------------------------

    with open(tss_bo.fn) as f:
        lines = [line.split('\t') for line in f]

    tmp_file = make_tmp_file(prefix="tss_sorted_by_tx_id", suffix=".bed")

    for line in sorted(lines, key=operator.itemgetter(3)):
        tmp_file.write('\t'.join(line))

    tmp_file.close()

    tss_bo = BedTool(tmp_file.name)

    # ----------------------------------------------------------------------
    # Get the list of non redundant TSSs
    # ----------------------------------------------------------------------

    gene_dict = defaultdict(dict)
    to_delete = []

    message("Looking for redundant TSS (gene-wise).")

    for line in tss_bo:

        tss = line.start
        gene_id, tx_id = line.name.split("|")

        # gene_dict is a defaultdict: the first transcript seen at a given
        # TSS is kept, subsequent ones are scheduled for deletion.
        if tss not in gene_dict[gene_id]:
            gene_dict[gene_id][tss] = tx_id
        else:
            to_delete += [tx_id]

    message("Deleted transcripts: " +
            ",".join(to_delete[1:min(10, len(to_delete))]) + "...",
            type="DEBUG")

    # ----------------------------------------------------------------------
    # Write
    # ----------------------------------------------------------------------

    gtf.select_by_key("feature", "gene",
                      invert_match=True).select_by_key(
                          "transcript_id",
                          ",".join(to_delete),
                          invert_match=True).write(outputfile, gc_off=True)
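
The name-sort step above is what makes the kept/dropped choice deterministic; a toy check with hypothetical IDs:

import operator

lines = [["chr1", "100", "101", "gnA|tx2", "0", "+"],
         ["chr1", "100", "101", "gnA|tx1", "0", "+"]]
lines.sort(key=operator.itemgetter(3))

# tx1 sorts first, so it is the transcript kept at this shared TSS.
assert [l[3] for l in lines] == ["gnA|tx1", "gnA|tx2"]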