Example 1
    def __init__ (self, fp, name=None, refid_list=[], output_index=False, verbose=False, **kwargs):
        """
         * fp
            A fasta file containing the reference sequences OR a tab-separated index file containing at least 2 columns
            with the refid and the length in bases (like a .fa.fai file generated by samtools faidx).
            The fasta option will take more time as the file has to be parsed to get the refid and length of each sequence.
            A 2-column index tsv file will be automatically generated for later usage as an index file.
            Both the fasta and the index file can be gzipped
        *  name
            Name of the data file that will be used as track name for plotting. If not given, will be deduced from fp
            file name
        * refid_list
            List of reference sequence ids to select from the data file; by default all are selected [ DEFAULT: [] ]
        * output_index
            If True, will write a simple 2-column index tsv file containing the reference sequence ids and their
            lengths [ DEFAULT: False ]
        """
        # Save self variables
        self.verbose = verbose
        self.refid_list = refid_list

        # Store the reference genome information
        if verbose: jprint("Add reference genome file", bold=True)
        self.reference = Reference(
            fp=fp,
            name=name,
            verbose=verbose,
            refid_list=self.refid_list,
            output_index=output_index)

        # List to store annotations and alignment tracks
        self.annotations = []
        self.alignments = []
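
A minimal usage sketch of this constructor. The import path and the file name genome.fa.gz are assumptions; the arguments follow the signature and docstring above.

# Hypothetical example: the import path and file path are assumptions
from JGV.JGV import JGV

j = JGV(
    fp="genome.fa.gz",             # reference fasta (or 2-column index tsv), may be gzipped
    name="my_genome",              # track name used for plotting
    refid_list=["chrI", "chrII"],  # restrict to these reference sequences
    output_index=True,             # also write a 2-column index tsv for faster reloading
    verbose=True)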
Example 2
    def _bam_parser(self,
                    fp,
                    min_coverage=5,
                    refid_list=[],
                    verbose=False,
                    **kwargs):
        """Parse a sam or bam formated file
        """
        d = OrderedDict()
        with pysam.AlignmentFile(fp) as bam:
            # Compute the genomic coverage for each read
            if verbose: jprint("\tTally coverage for each base")
            for line in bam:
                # Skip unaligned reads
                if line.reference_id == -1:
                    continue
                refid = line.reference_name
                # If there is no refid filter or if the refid is in the authorized list
                if not refid_list or refid in refid_list:
                    # Create a new entry if not in the dict
                    if refid not in d:
                        d[refid] = {
                            "nbases": 0,
                            "+": Counter(),
                            "-": Counter()
                        }
                    # Save coverage
                    strand = "-" if line.is_reverse else "+"
                    for position in line.get_reference_positions():
                        d[refid][strand][position] += 1

        d = self._clean_d(d=d, min_coverage=min_coverage, verbose=verbose)
        return d
Example 3
    def add_annotation(self, fp, name=None, min_len=None, max_len=None, refid_list=None, type_list=None, verbose=False, **kwargs):
        """
          * fp
            A URL to a standard genomic file containing feature annotations in one of the following formats:
              gff3: http://www.ensembl.org/info/website/upload/gff3.html
              gtf:  http://www.ensembl.org/info/website/upload/gff.html
              bed:  http://www.ensembl.org/info/website/upload/bed.html
            Valid URL schemes include http, ftp, s3, and file.
            The file can optionally be compressed in ‘gzip’, ‘bz2’, ‘zip’ or ‘xz’ format
        *  name
            Name of the data file that will be used as track name for plotting. If not given, will be deduced from fp
            file name  [ DEFAULT: None ]
        * min_len
            Minimal size (start to end) of a feature to be selected [default None]
        * max_len
            Maximal size (start to end) of a feature to be selected [default None]
        * refid_list
            List of reference id to select. Example: ["chr1", "chr2", "chr3"] [default None]
        * type_list
            List of feature type to select. Example: ["exon", "gene"] [default None]
        """
        if verbose: jprint("Add annotation file", bold=True)
        a = Annotation(fp=fp, name=name, min_len=min_len, max_len=max_len, refid_list=refid_list, type_list=type_list, verbose=verbose)

        if verbose:
            not_found = set(self.reference.refid_list) - set(a.refid_list)
            if not_found:
                warnings.warn("No annotation found for {}".format(",".join(not_found)))

        self.annotations.append(a)
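
A hedged sketch of calling add_annotation on an existing JGV instance; annotation.gtf.gz and the filter values are placeholders, the keyword names come from the signature above.

# Assumes `j` is a JGV instance; annotation.gtf.gz is a placeholder path
j.add_annotation(
    fp="annotation.gtf.gz",        # gff3/gtf/bed file, optionally compressed
    name="genes",                  # track name used for plotting
    min_len=100,                   # keep only features of at least 100 bases
    refid_list=["chrI", "chrII"],  # keep only features on these references
    type_list=["gene", "exon"],    # keep only these feature types
    verbose=True)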
Example 4
    def add_alignment(self, fp, name=None, min_coverage=5, refid_list=[], output_bed=False, verbose=False, **kwargs):
        """
         * fp
             A standard BAM or SAM (http://samtools.sourceforge.net/SAM1.pdf) containing aligned reads and a standard
             header. The files do not need to be sorted or indexed.
             One can also use a 6-field bed (chrom, chromStart, chromEnd, name, score, strand) file with a hashtag-
             commented header listing the reference sequence ids and lengths, similar to the format generated by the
             output_bed option (much faster than from a BAM/SAM file, can be gzipped). http://www.ensembl.org/info/website/upload/bed.html
        *  name
            Name of the data file that will be used as track name for plotting. If not given, will be deduced from fp
            file name  [ DEFAULT: None ]
        * min_coverage
            Minimal coverage to compute the data. If less, the coverage will be considered null. Not used if fp is a
            bed coverage file [ DEFAULT: 5 ]
        * output_bed
            If True, will write a 6-column compressed bed file containing the coverage values for the + and - strands,
            excluding positions with coverage lower than min_coverage. The option only applies if the input file is
            BAM or SAM. The file starts with a header consisting of a list of the ids of the reference sequences and
            their lengths [ DEFAULT: False ]. Example:
              #chr20	64444167
              #chr21	46709983
              chr20	276516	276516	pos1	5	+
              chr20	276517	276517	pos2	5	+
        """
        if verbose: jprint("Add alignment file", bold=True)
        a = Alignment(fp=fp, name=name, min_coverage=min_coverage, refid_list=refid_list, output_bed=output_bed, verbose=verbose)

        if verbose:
            not_found = set(self.reference.refid_list) - set(a.refid_list)
            if not_found:
                warnings.warn("No coverage found for {}".format(",".join(not_found)))

        self.alignments.append(a)
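
A hedged sketch of calling add_alignment on an existing JGV instance; sample.bam is a placeholder path.

# Assumes `j` is a JGV instance; sample.bam is a placeholder path
j.add_alignment(
    fp="sample.bam",         # BAM/SAM with a standard header, or a coverage bed file
    name="sample_coverage",  # track name used for plotting
    min_coverage=5,          # positions below this coverage are considered null
    output_bed=True,         # also write the coverage as a compressed bed file for faster reloading
    verbose=True)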
Example 5
 def _pickle_parser(self, fp, verbose=False, **kwargs):
     """
     Parse a pickle database
     """
     # Import the file in a dataframe
     if verbose: jprint ("\tTry to load as a pickle file")
     df = pd.read_pickle(fp)
     return df
Example 6
 def get_refid_len(self, refid, verbose=False, **kwargs):
     """ Return the length of a given refid, If the reference is not found return None"""
     if refid not in self.d:
         if verbose:
             jprint(
                 "The reference sequence {} was not found in the reference list"
                 .format(refid))
         return None
     else:
         return self.d[refid]
Example 7
    def to_pickle (self, fp=None, verbose=False, **kwargs):
        """
        Store the parsed file in a pickle file for further use.
        * fp
            Path where to save the pickle file. By default, the original annotation file path (minus .gz/.tgz) + ".pkl"
        """
        if not fp:
            if self.fp.endswith(".gz") or self.fp.endswith(".tgz"):
                fp = self.fp.rpartition(".")[0]+".pkl"
            else:
                fp = self.fp+".pkl"

        if verbose: jprint ("Pickle dataframe in file {}".format(fp))
        self.feature_df.to_pickle(fp)
        return fp
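
A small round-trip sketch, assuming `annot` is an Annotation object that was parsed from a hypothetical annotation.gtf.gz; with that input the default output path would be annotation.gtf.pkl.

import pandas as pd

# Assumes `annot` is an Annotation instance built from "annotation.gtf.gz"
pkl_fp = annot.to_pickle(verbose=True)   # -> "annotation.gtf.pkl" by default

# The pickled dataframe can be reloaded directly, or the .pkl path can be
# passed back as fp when creating a new Annotation object
feature_df = pd.read_pickle(pkl_fp)
print(feature_df.head())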
Example 8
    def _gtf_parser(self, fp, compression=None, verbose=False, **kwargs):
        """
        Parse a gtf formatted file
        """
        if verbose: jprint("\tUse GTF parser to parse annotations")
        # Import the file in a dataframe
        col_names = ["refid","source","type","start","end","score","strand","frame","attribute"]
        df = pd.read_csv(fp, sep="\t", names=col_names, index_col=False, comment="#", compression=compression)

        # Extract the ID field = value of the first attribute (text between the first pair of double quotes)
        df['ID'] = df["attribute"].str.split('\"').str[1]
        if verbose: jprint("\tSuccessfully imported as a gtf file")

        # Clean df
        df = self._clean_df(df, verbose=verbose)
        return df
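
The ID extraction works because the first attribute value in a GTF line is enclosed in double quotes, so splitting on the quote character and taking element 1 returns that value. A toy illustration of that expression (not part of the package):

import pandas as pd

# Toy attribute column mimicking the GTF attribute format
df = pd.DataFrame({"attribute": ['gene_id "YAL069W"; gene_version "1";']})

# Same expression as in the parser: text between the first pair of double quotes
df["ID"] = df["attribute"].str.split('"').str[1]
print(df["ID"].iloc[0])  # -> YAL069W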
Example 9
 def _clean_d(self, d, min_coverage=5, verbose=False, **kwargs):
     """ Remove base with coverage below threshold and transform in Pandas Series
     """
     if verbose:
         jprint("\tFilter and sort the coverage results by position")
     for refid in d.keys():
         for strand in ["+", "-"]:
             s = OrderedDict()
             for position, coverage in d[refid][strand].items():
                 if coverage >= min_coverage:
                     s[position] = coverage
                     self.nbases += coverage
                     d[refid]["nbases"] += coverage
             d[refid][strand] = pd.Series(s)
             d[refid][strand].sort_index(inplace=True)
     return d
Example 10
    def interval_features (self, refid, start, end, feature_types=None, max_features_per_type=None, verbose=False, **kwargs):
        """
        Parse the annotation file for the given refid and interval and return a dataframe containing all the features
        found in the interval, one row per original line. Features are identified by their ID field for gff3 files,
        by the entire attribute field for bed files and by the first element of the attribute field for gtf files
        * refid
            Name of the sequence from the original fasta file to display
        * start
            Start of the window to display. The coordinate is not verified; if outside of the range, an empty
            dataframe will be returned
        * end
            End of the window to display. The coordinate is not verified; if outside of the range, an empty
            dataframe will be returned
        * feature_types
            Name of a valid feature type ("exon"|"transcript"|"gene"|"CDS"...) or list of feature type names for
            which a row will be returned. The option is not available for bed files. If not given, all feature types
            found in the interval will be returned [ DEFAULT: None ]
        * max_features_per_type
            Maximal total number of features for a particular feature type. If more are found, a random sampling will
            be performed. If None, all the features will be returned [ DEFAULT: None ]
        """
        # Verifications and auto adjustment of coordinates
        if refid not in self.refid_list:
            if verbose: jprint ("The reference {} is not in the list of references with alignment".format(refid))
            return pd.DataFrame(columns=["refid","start","end","strand","ID","type"])

        # Select the refid and coordinates
        df = self.feature_df[(self.feature_df["refid"] == refid)&(self.feature_df["end"] > start)&(self.feature_df["start"] < end)]
        if df.empty:
            if verbose: jprint ("No feature found in the requested interval")
            return pd.DataFrame(columns=["refid","start","end","strand","ID","type"])

        # Cast str to list
        if type(feature_types) == str: feature_types = [feature_types]

        # Filter_df by type and max number per type
        select_list = []
        for type_name, type_df in df.groupby("type"):
            # Filter out if not in the list
            if not feature_types or type_name in feature_types:
                sdf = df[(df["type"] == type_name)]
                if sdf.empty:
                    if verbose: jprint ("No feature of type {} found in the requested interval".format(type_name))
                elif max_features_per_type and len(sdf)>max_features_per_type:
                    select_list.append(sdf.sample(max_features_per_type))
                else:
                    select_list.append(sdf)
        # Merge the selected features in a single df
        if select_list:
            df = pd.concat(select_list)
        else:
            if verbose: jprint ("No feature found in the requested interval")
            return pd.DataFrame(columns=["refid","start","end","strand","ID","type"])

        # Return a sorted copy of the  df
        df.sort_values(by=["refid","start","end"], inplace=True)
        return df.copy().reset_index(drop=True)
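
A usage sketch, assuming `annot` is an Annotation instance and that the refid and coordinates below exist in the loaded file.

# Assumes `annot` is an Annotation instance; refid and coordinates are placeholders
df = annot.interval_features(
    refid="chrI",
    start=10000,
    end=50000,
    feature_types=["gene", "exon"],  # restrict to these feature types
    max_features_per_type=100,       # random sampling above this count
    verbose=True)

# The returned dataframe has the columns refid/start/end/strand/ID/type, sorted by position
print(df.head())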
Example 11
    def __init__(self,
                 fp,
                 name=None,
                 min_coverage=5,
                 refid_list=[],
                 output_bed=False,
                 verbose=False,
                 **kwargs):
        """
         * fp
             A standard BAM or SAM (http://samtools.sourceforge.net/SAM1.pdf) containing aligned reads and a standard
             header. The files do not need to be sorted or indexed.
             One can also use a 6-field bed (chrom, chromStart, chromEnd, name, score, strand) file, where score is the
             coverage value (much faster than from a BAM/SAM file, can be gzipped). http://www.ensembl.org/info/website/upload/bed.html
        *  name
            Name of the data file that will be used as track name for plotting. If not given, will be deduced from fp
            file name  [ DEFAULT: None ]
        * min_coverage
            Minimal coverage to compute the data. If less, the coverage will be considered null. Not used if fp is a
            bed coverage file [ DEFAULT: 5 ]
        * refid_list
            List of reference sequence ids to select from the data file; by default all are selected. Not used if fp
            is a bed coverage file [ DEFAULT: [] ]
        * output_bed
            If True, will write a 6-column compressed bed file containing the coverage values for the + and - strands,
            excluding positions with coverage lower than min_coverage. The option only applies if the input file is
            BAM or SAM. [ DEFAULT: False ]
        """
        # Verify that the file is readable
        is_readable_file(fp)

        #Save self variable
        self.fp = fp
        self.name = name if name else file_basename(fp)
        self.ext = extensions_list(fp)[0]
        self.nbases = 0

        if self.ext in ["bam", "sam"]:
            if verbose: jprint("Compute coverage from bam/sam file ", self.fp)
            self.d = self._bam_parser(fp, min_coverage, refid_list, verbose=verbose)
            if output_bed:
                outfp = "{}/{}.bed.gz".format(dir_path(fp), file_basename(fp))
                if verbose: jprint("Write coverage data in file ", outfp)
                self._write_coverage_file(outfp)
                self.outfp = outfp

        elif self.ext == "bed":
            if verbose: jprint("Extract coverage from bed file", self.fp)
            self.d = self._bed_parser(fp, min_coverage, refid_list, verbose=verbose)

        else:
            msg = "The file is not in SAM/BAM/BED format. Please provide a correctly formated file"
            raise ValueError(msg)

        if verbose:
            jprint("\tTotal base coverage {} in {} reference sequences".format(
                self.nbases, self.refid_count))
Example 12
    def alignment_summary (self, verbose=False, **kwargs):
        """Display table summarizing annotation file information"""
        if not self.alignments:
            warnings.warn("No alignment track loaded")
            return None

        count_df = pd.DataFrame(columns=["Refid count", "Base coverage"])
        rbc_df = pd.DataFrame()

        for a in self.alignments:
            count_df.loc[a.name] = [a.refid_count, a.nbases]
            rbc = pd.DataFrame(a.refid_nbases)
            rbc.columns=[a.name]
            rbc_df = pd.merge(left=rbc_df, right=rbc, how='outer', right_index=True, left_index=True)

        jprint("Counts per Alignment file", bold=True)
        display(count_df)
        jprint("Counts per Reference sequence", bold=True)
        rbc_df = rbc_df.sort_index()
        display(rbc_df)
Example 13
    def annotation_summary (self, verbose=False, **kwargs):
        """Display table summarizing annotation file information"""
        if not self.annotations:
            warnings.warn("No annotation track loaded")
            return None

        count_df = pd.DataFrame(columns=["Feature count", "Refid count", "Feature type count"])
        rcu_df = pd.DataFrame()
        tcu_df = pd.DataFrame()

        for a in self.annotations:
            count_df.loc[a.name] = [a.feature_count, a.refid_count, a.type_count]
            rcu = a.refid_count_uniq
            rcu.columns=[a.name]
            rcu_df = pd.merge(left=rcu_df, right=rcu, how='outer', right_index=True, left_index=True)
            tcu = a.type_count_uniq
            tcu.columns=[a.name]
            tcu_df = pd.merge(left=tcu_df, right=tcu, how='outer', right_index=True, left_index=True)

        jprint("Counts per Annotation file", bold=True)
        display(count_df)
        jprint("Counts per Reference sequence", bold=True)
        rcu_df = rcu_df.sort_index()
        display(rcu_df)
        jprint("Counts per feature types", bold=True)
        tcu_df = tcu_df.sort_index()
        display(tcu_df)
Example 14
 def select_len (self, min_len=None, max_len=None, verbose=False, **kwargs):
     """ Select features longer or shorter that given values
     """
     if verbose:
         jprint ("Selecting features based on length")
         jprint ("\tFeatures before filtering: {}".format(self.feature_count))
     # Filter min len
     if min_len:
         self.feature_df = self.feature_df[((self.feature_df["end"]-self.feature_df["start"]) >= min_len)]
     if verbose: jprint ("\tFeatures after minimal length filtering: {}".format(self.feature_count))
     # Filter max len
     if max_len:
         self.feature_df = self.feature_df[((self.feature_df["end"]-self.feature_df["start"]) <= max_len)]
     if verbose: jprint ("\tFeatures after maximal length filtering: {}".format(self.feature_count))
Example 15
    def _bed_parser(self,
                    fp,
                    min_coverage=5,
                    refid_list=[],
                    verbose=False,
                    **kwargs):
        """Extract data from a coverage bad file
        """
        d = OrderedDict()

        # File handling for both uncompressed and compressed bed files
        if fp.endswith(".gz"):
            open_fun, open_mode = gzip.open, "rt"
        else:
            open_fun, open_mode = open, "r"
        # Parse the bed file line by line
        with open_fun(fp, open_mode) as fin:
            if verbose: jprint("\tExtract base coverage data")
            for line in fin:
                sl = line.rstrip("\n").split("\t")
                refid = sl[0]
                if not refid_list or refid in refid_list:
                    # Create a new entry if not in the dict
                    if refid not in d:
                        d[refid] = {
                            "nbases": 0,
                            "+": Counter(),
                            "-": Counter()
                        }
                    position = int(sl[1])
                    coverage = int(sl[4])
                    strand = sl[5]
                    d[refid][strand][position] = coverage
                    d[refid]["nbases"] += coverage
                    self.nbases += coverage

        d = self._clean_d(d=d, min_coverage=min_coverage, verbose=verbose)
        return d
Example 16
    def select_references (self, refid_list, verbose=False, **kwargs):
        """ Select features which reference sequence id is in the given list or a single entry. Example: ["chr1", "chr2", "chr3"]
        """
        # Cast in list
        if type(refid_list) == str:
            refid_list = [refid_list]

        if verbose:
            jprint ("Selecting features based on reference id")
            jprint ("\tFeatures before filtering: {}".format(self.feature_count))
        self.feature_df = self.feature_df[(self.feature_df["refid"].isin(refid_list))]
        if verbose: jprint ("\tFeatures after filtering: {}".format(self.feature_count))
Example 17
    def select_types (self, type_list, verbose=False, **kwargs):
        """ Select features which type is in the given list or a single entry. Example: ["exon", "gene"]
        """
        # Cast in list
        if type(type_list) == str:
            type_list = [type_list]

        if verbose:
            jprint ("Selecting features based on type")
            jprint ("\tFeatures before filtering: {}".format(self.feature_count))
        self.feature_df = self.feature_df[(self.feature_df["type"].isin(type_list))]
        if verbose: jprint ("\tFeatures after filtering: {}".format(self.feature_count))
Example 18
    def _clean_df (self, df, verbose=False, **kwargs):
        """
        Clean dataframe after parsing
        """
        # Select fields
        df = df[["refid","start","end","ID","score","strand","type"]].copy()

        # Drop rows with NA values
        if verbose: jprint("\tRemove null values")
        l = len(df)
        df.dropna(inplace=True)
        if verbose: jprint("\tRemoved {} invalid lines".format(l-len(df)))

        # Cast the start and end field in integer
        if verbose: jprint("\tCast coordinates to integer and id to str")
        df[['start', 'end']] = df[['start', 'end']].astype(int)
        df[['ID']] = df[['ID']].astype(str)

        # Verify that the dataframe is not empty
        if df.empty:
            raise ValueError("No valid features imported. Is the file valid?")
        return df
Example 19
    def _bed_parser(self, fp, compression=None, verbose=False, **kwargs):
        """
        Parse a bed formatted file
        """
        if verbose: jprint("\tUse BED parser to parse annotations")
        # try to import the file as a bed6 in a dataframe
        try:
            col_names = ["refid","start","end","ID","score","strand"]
            df = pd.read_csv(fp, sep="\t", names=col_names, index_col=False, comment="#", compression=compression)
            if verbose: jprint("\tSuccessfully imported as a bed6 file")

        # else try to import as a bed12
        except IndexError as E:
            col_names = ["refid","start","end","ID","score","strand","thickStart","thickEnd","itemRgb","blockCount","blockSizes","blockStarts"]
            df = pd.read_csv(fp, sep="\t", names=col_names, index_col=False, comment="#", compression=compression)
            if verbose: jprint("\tSuccessfully imported as a bed12 file")

        # Type is not available from bed files
        df['type'] = "."

        # Clean df
        df = self._clean_df(df, verbose=verbose)
        return df
Example 20
    def interval_plot (self,
        refid,
        start=None,
        end=None,
        plot_style="ggplot",
        figwidth = 30,
        alignment_track_height=5,
        annotation_track_height=2,
        alignment_bins = 500,
        alignment_bin_repr_fun = "max",
        alignment_log=True,
        alignment_color=("dodgerblue", "darkorange"),
        alignment_alpha=0.5,
        feature_types=[],
        max_features_per_type=500,
        annotation_offset=None,
        annotation_label=False,
        max_label_size=50,
        annotation_color="grey",
        verbose=False, **kwargs):
        """
        * refid
            Name of the sequence from the original fasta file to display
        * start
            Start of the window to display. If not given, will be set to 0 [ DEFAULT: None ]
        * end
            End of the window to display. If not given, will be set to the length of refid [ DEFAULT: None ]
        * plot_style
            Default plot style for pyplot ('grayscale'|'bmh'|'ggplot'|'dark_background'|'classic'|'fivethirtyeight'...)
            [ DEFAULT: "ggplot" ]
        * figwidth
             Width of the plotting area in inches [ DEFAULT: 30 ]
        * alignment_track_height
            Height of individual alignment tracks [DEFAULT : 5 ]
        * annotation_track_height
            Height of individual annotation tracks for each feature types [DEFAULT : 2 ]
        * alignment_bins
            Number of bins to divide the displayed window into. A low number will result in a low resolution, while a
            high value could result in a long plotting time. The value is automatically lowered to the base resolution
            if the requested interval is smaller than the number of bins [ DEFAULT: 500 ]
        * alignment_bin_repr_fun
            Function to represent each bin ("max", "mean" and "sum") [ DEFAULT: "max" ]
        * alignment_log
            If True, the y scale will be log10, else it will be linear [ DEFAULT: True ]
        * alignment_color
            Tuple of 2 colors for the alignment + and - tracks [DEFAULT : ("dodgerblue", "darkorange") ]
        * alignment_alpha
            Transparency of the alignment coverage area between 0 and 1 [ DEFAULT: 0.5 ]
        * feature_types
            Name of a valid feature type ("exon"|"transcript"|"gene"|"CDS"...) or list of feature type names for
            which a row will be plotted. The option is not available for bed files. If not given, all feature types
            found in the interval will be plotted [ DEFAULT: [] ]
        * max_features_per_type
            Maximal total number of features for a particular feature type. If more are found, a random sampling will
            be performed. If None, all the features will be returned [ DEFAULT: 500 ]
        * annotation_offset
            Minimal distance between 2 contiguous annotation features on the same level. If not given, will be
            automatically set to 1/400 of the window to display [DEFAULT : None ]
        * annotation_label
            If True, labels of features will be plotted. To be avoided when expecting many features [DEFAULT : False ]
        * max_label_size
            Limit the size of the label text for each feature [DEFAULT : 50 ]
        * annotation_color
            Color of the annotation arrows [DEFAULT : "grey" ]
        * kwargs
        """
        # Verify that the sequence is in the refid list and that at least one alignment or annotation file was loaded
        if refid not in self.reference.refid_list:
            warnings.warn("Requested reference sequence not found: {}".format(refid))
            return None
        if not self.alignments and not self.annotations:
            warnings.warn("No annotation and alignment track loaded")
            return None

        # Auto define start and stop and overlapping annotation offset if not given
        if not start:
            start = 0
            if verbose: jprint ("Autodefine start position: {}".format(start))
        if not end:
            end = self.reference.get_refid_len(refid)-1
            if verbose: jprint ("Autodefine end position: {}".format(end))
        if start >= end:
            raise ValueError ("Invalid coordinates (start: {}, end :{}) start has to be greater than end")
        if not annotation_offset:
            annotation_offset = int((end-start)/400)
            if verbose:jprint ("Estimated overlap offset: {}".format(annotation_offset))

        figheight = 0

        # Extract alignment coverage data and compute the coverage tracks height
        alignments_dict = OrderedDict()
        if self.alignments:
            if verbose: jprint ("Extract alignment data", bold=True)
            for a in self.alignments:
                alignments_dict[a.name] = a.interval_coverage(
                    refid=refid, start=start, end=end, bins=alignment_bins, bin_repr_fun=alignment_bin_repr_fun)
                # +1 for space between tracks
                figheight += alignment_track_height+1

        # Extract feature annotation data and compute the feature tracks height
        annot_tracks_height = 0
        annotation_dict = OrderedDict()
        if self.annotations:
            if verbose: jprint ("Extract annotation data", bold=True)
            for a in self.annotations:
                annotation_dict[a.name] = a.interval_features(
                    refid=refid, start=start, end=end, feature_types=feature_types,
                    max_features_per_type=max_features_per_type)
                # Take empty df into account for plotting
                n = 1 if annotation_dict[a.name].empty else annotation_dict[a.name].type.nunique()
                # +1 for space between tracks
                figheight += n*annotation_track_height+1

        # Create a pyplot figure object with an empty grid
        fig = pl.figure (figsize= (figwidth, figheight))
        grid = GridSpec (nrows=figheight, ncols=1, hspace=0.5)
        pl.style.use (plot_style)

        # Current height marker
        h = 0
        if self.alignments:
            for track_name, track_df in alignments_dict.items():
                if verbose: jprint ("\tAlignment track name: {}".format(track_name))

                # Prepare the subplot grid
                ax = pl.subplot(grid[h:h+alignment_track_height])
                h+=alignment_track_height
                ax.set_xlim((start, end))
                ax.ticklabel_format(useOffset=False, style='plain')
                if alignment_log: ax.set_yscale("log")
                ax.yaxis.set_tick_params(left=True, right=False, labelleft=True, labelright=False)
                ax.xaxis.set_tick_params(bottom=False, top=False, labelbottom=False, labeltop=False)
                ax.set_ylabel(track_name)

                # Plot the positive strand
                if track_df["+"].sum () == 0:
                    ax.text(0.5, 0.6,'No Coverage on the + strand', ha='center', va='center', transform=ax.transAxes)
                else:
                    ax.fill_between(x=track_df.index, y1=0, y2=list(track_df["+"]),
                        alpha=alignment_alpha, color=alignment_color[0], label="Positive strand")
                # Plot the negative strand
                if track_df["-"].sum () == 0:
                    ax.text(0.5, 0.4,'No Coverage on the - strand', ha='center', va='center', transform=ax.transAxes)
                else:
                    ax.fill_between(x=track_df.index, y1=0, y2=list(track_df["-"]),
                        alpha=alignment_alpha, color=alignment_color[1], label="Negative strand")
                # If elements were added to the ax
                if ax.collections: ax.legend(bbox_to_anchor=(1, 1), loc=2,frameon=False)

            # Add x labels if last element
            ax.xaxis.set_tick_params(bottom=True, labelbottom=True)

        if self.annotations:
            for track_name, track_df in annotation_dict.items():
                h+=1
                if verbose: jprint ("\tAlignment track name: {}".format(track_name))

                # No feature case
                if track_df.empty:
                    ax = pl.subplot(grid[h:h+annotation_track_height])
                    ax.set_xlim((start, end))
                    ax.text(0.5, 0.5,'No feature found', ha='center', va='center', transform=ax.transAxes)
                    ax.yaxis.set_tick_params(left=False, right=False, labelleft=False, labelright=False)
                    ax.xaxis.set_tick_params(bottom=True, labelbottom=True)
                    ax.ticklabel_format(useOffset=False, style='plain')
                    ax.grid(axis="y", b=False)
                    ax.set_title (track_name)
                    h+=annotation_track_height

                # General case
                else:
                    first=True
                    for feature_type, feature_df in track_df.groupby("type"):

                        # Prepare the ploting area
                        ax = pl.subplot(grid[h:h+annotation_track_height])
                        h+=annotation_track_height
                        ax.set_xlim((start, end))
                        ax.yaxis.set_tick_params(left=False, right=False, labelleft=False, labelright=False)
                        ax.xaxis.set_tick_params(bottom=False, top=False, labelbottom=False, labeltop=False)
                        ax.ticklabel_format(useOffset=False, style='plain')
                        ax.grid(axis="y", b=False)
                        ax.set_ylabel(feature_type)

                        # Compute the non-overlapping level where to plot the arrow
                        level = Level(offset=annotation_offset)
                        for n, feature in feature_df.iterrows():
                            fl = level(feature.ID, feature.start, feature.end, feature.strand)
                            if fl:
                                ax.add_patch( Arrow( posA=[fl.start, fl.level], posB=[fl.end, fl.level], linewidth=3,
                                    color=annotation_color, arrowstyle=fl.arrowstyle))
                                if annotation_label:
                                    text_end = fl.end if fl.end < end-annotation_offset else end-annotation_offset
                                    text_start = fl.start if fl.start > start+annotation_offset else start+annotation_offset
                                    text = fl.ID[0:max_label_size]+"..." if len(fl.ID) > max_label_size else fl.ID
                                    ax.text (x=text_start+ (text_end-text_start)/2, y=fl.level, s=text, ha="center", fontsize=8)

                        ax.set_ylim(level.min_level-0.5, level.max_level+0.5)

                        # First element exception
                        if first:
                            ax.set_title (track_name)
                            first = False
                    # Last element exception
                    ax.xaxis.set_tick_params(bottom=True, labelbottom=True)
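
A usage sketch of the main plotting call, assuming `j` is a JGV instance with at least one alignment or annotation track loaded; refid and coordinates are placeholders.

# Assumes `j` is a JGV instance with loaded tracks; refid and coordinates are placeholders
j.interval_plot(
    refid="chrI",
    start=0,
    end=100000,
    alignment_bins=500,            # resolution of the coverage tracks
    alignment_bin_repr_fun="max",  # each bin shows the maximal coverage in its window
    alignment_log=True,            # log scale for the coverage y axis
    feature_types=["gene"],        # only plot gene features from the annotation tracks
    annotation_label=True,         # write the feature IDs next to the arrows
    verbose=True)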
Example 21
    def interval_coverage(self,
                          refid,
                          start,
                          end,
                          bins=500,
                          bin_repr_fun="max",
                          verbose=False,
                          **kwargs):
        """
        Parse the alignment file for a given refid and interval. The interval is split into a number of windows equal to
        bins, for which the coverage is computed. The method returns a dataframe containing the starting positions of
        the windows and the coverage for the + and - strands. If the refid or the coordinates are invalid, a zero-filled
        dataframe will be returned.
        * refid
            Name of the sequence from the original fasta file to display
        * start
            Start of the window to display. The coordinate is not verified; if outside of the range, empty bins will
            be returned
        * end
            End of the window to display. The coordinate is not verified; if outside of the range, empty bins will
            be returned
        * bins
            Number of bins to divide the displayed window into. A low number will result in a low resolution, while a
            high value could result in a long plotting time. The value is automatically lowered to the base resolution
            if the requested interval is smaller than the number of bins [ DEFAULT: 500 ]
        * bin_repr_fun
            Function to represent each bin ("max", "mean" and "sum") [ DEFAULT: "max" ]
        """
        if verbose:
            jprint("Compute coverage from the windows: {}:{}-{}".format(
                refid, start, end))
        df = pd.DataFrame(columns=["+", "-"], dtype=int)

        # Adjust number of bins and calculate step
        if bins > end - start:
            bins = end - start
            if verbose:
                jprint(
                    "\tAuto adjust the number of bins to match the interval: {}"
                    .format(bins))
        step = (end - start) / bins
        if verbose: jprint("\tDefine size of each bin: {}".format(step))

        # If refid is not in the self refid-list
        if refid not in self.refid_list:
            if verbose:
                jprint(
                    "\tThe reference {} is not in the list of references with alignment"
                    .format(refid))
            for i in np.arange(start, end, step):
                for strand in ["+", "-"]:
                    df.loc[int(i), strand] = 0
            return df

        # Select position windows and compute the per-bin value
        if verbose: jprint("\tCompute coverage...")
        for i in np.arange(start, end, step):
            winstart = int(i)
            winend = int(i + step)
            for strand in ["+", "-"]:
                l = self.d[refid][strand][
                    (self.d[refid][strand].index >= winstart)
                    & (self.d[refid][strand].index < winend)]
                if l.empty:
                    df.loc[winstart, strand] = 0
                elif bin_repr_fun == "max":
                    df.loc[winstart, strand] = l.max()
                elif bin_repr_fun == "sum":
                    df.loc[winstart, strand] = l.sum()
                elif bin_repr_fun == "mean":
                    df.loc[winstart, strand] = l.sum() / step
        if verbose:
            if df["+"].sum() + df["-"].sum() == 0:
                jprint(
                    "\tNull coverage for both strands in the requested interval"
                )
            elif df["+"].sum() == 0:
                jprint(
                    "\tNull coverage for the positive strand in the requested interval"
                )
            elif df["-"].sum() == 0:
                jprint(
                    "\tNull coverage for the negative strand in the requested interval"
                )
        return df
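
A usage sketch, assuming `aln` is an Alignment instance; refid and coordinates are placeholders.

# Assumes `aln` is an Alignment instance; refid and coordinates are placeholders
cov_df = aln.interval_coverage(
    refid="chrI",
    start=0,
    end=50000,
    bins=500,             # number of windows the interval is split into
    bin_repr_fun="mean",  # "max", "mean" or "sum" per window
    verbose=True)

# One row per window start position, one column per strand ("+" and "-")
print(cov_df.head())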
Example 22
        print ("Can not import a local packages. Please verify source code directory")
        sysexit()
    
# Third party imports
try:
    import numpy as np
    from matplotlib.patches import FancyArrowPatch as Arrow
    from matplotlib.gridspec import GridSpec
    import pylab as pl
    import pandas as pd
    import pysam
    from IPython.core.display import display
    from pycl.pycl import jhelp, jprint, get_package_file
except ImportError as E:
    print (E)
    jprint ("A third party package is missing. Please verify your dependencies")
    sysexit()

#~~~~~~~CLASS~~~~~~~#
class JGV(object):
    
    ##~~~~~~~ SAMPLE FILES ~~~~~~~#
    @classmethod
    def example_bam (cls):
        return get_package_file("JGV","JGV/data/yeast.bam")
    @classmethod
    def example_fasta (cls):
        return get_package_file("JGV","JGV/data/yeast.fa.gz")
    @classmethod
    def example_gtf (cls):
        return get_package_file("JGV","JGV/data/yeast.gtf.gz")
Example 23
    def __init__(self,
                 fp,
                 name=None,
                 refid_list=[],
                 output_index=False,
                 verbose=False,
                 **kwargs):
        """
         * fp
            A fasta file containing the reference sequences OR a tab-separated index file containing at least 2 columns
            with the refid and the length in bases (like a .fa.fai file generated by samtools faidx, or with the
            output_index option of this function).
            The fasta option will take more time as the file has to be parsed to get the refid and length of each sequence.
            Both the fasta and the index file can be gzipped
        *  name
            Name of the data file that will be used as track name for plotting. If not given, will be deduced from fp
            file name
        * refid_list
            List of reference sequence ids to select from the data file; by default all are selected [ DEFAULT: [] ]
        * output_index
            If True, will write a simple 2-column index tsv file containing the reference sequence ids and their
            lengths [ DEFAULT: False ]
        """
        # Verify that the file is readable
        is_readable_file(fp)

        #Save self variable
        self.fp = fp
        self.name = name if name else file_basename(fp)
        self.ext = extensions_list(fp)[0]

        # If the file is in fasta format
        if self.ext in ["fa", "fasta"]:
            if verbose: jprint("Parsing fasta file")

            # File handling for both uncompressed or compressed fasta file
            if fp.endswith(".gz"):
                open_fun, open_mode = gzip.open, "rt"
            else:
                open_fun, open_mode = open, "r"

            # Parse fasta file refid and count the length of each sequence if in the refid_list
            with open_fun(fp, open_mode) as f:
                d = OrderedDict()
                last_ref = None
                for l in f:
                    if l.startswith(">"):
                        refid = l[1:].split()[0].strip()
                        if not refid_list or refid in refid_list:
                            d[refid] = 0
                            last_ref = refid
                        else:
                            last_ref = None
                    elif last_ref:
                        d[last_ref] += len(l.strip())

            # Check if sequences found
            assert d, "No Sequence found"

            # Transform the counter into a Series and sort by length
            self.d = pd.Series(d, name="length", dtype="int64")
            self.d.sort_values(inplace=True, ascending=False)

            # Write the index in a file for quicker loading next time
            if output_index:
                index_file = "{}/{}.tsv".format(dir_path(fp),
                                                file_basename(fp))
                if verbose:
                    jprint("Write a fasta index file: {}".format(index_file))
                self.d.to_csv(index_file, sep="\t")

        # If the file is not in fasta format, try to parse it as a 2-column tabulated file with the refid and length of each sequence
        else:
            if verbose: jprint("Assume the file is a fasta index")
            self.d = pd.read_csv(fp,
                                 sep="\t",
                                 squeeze=True,
                                 comment="#",
                                 usecols=[0, 1],
                                 index_col=0,
                                 header=None)
            if refid_list: self.d = self.d[(self.d.index.isin(refid_list))]
            self.d.name = "length"
            self.d.sort_values(inplace=True, ascending=False)

        if verbose:
            jprint("\tFound {} reference sequences".format(self.refid_count))
Example 24
    def __init__ (self, fp, name=None, min_len=None, max_len=None, refid_list=None, type_list=None, verbose=False, **kwargs):
        """
         * fp
            A path to a standard genomic file containing feature annotations in one of the following formats:
            gff3: http://www.ensembl.org/info/website/upload/gff3.html
            gtf: http://www.ensembl.org/info/website/upload/gff.html
            bed:  http://www.ensembl.org/info/website/upload/bed.html
            Alternatively, one can use a python pickle file (.pkl) generated during a previous run.
            The file can optionally be compressed in ‘gzip’ format
        * min_len
            Minimal size (start to end) of a feature to be selected [default None]
        * max_len
            Maximal size (start to end) of a feature to be selected [default None]
        * refid_list
            List of reference id to select. Example: ["chr1", "chr2", "chr3"] [default None]
        * type_list
            List of feature type to select. Example: ["exon", "gene"] [default None]
        """
        if verbose: jprint ("Parse Annotation file")
        # Verify that the file is readable
        is_readable_file(fp)

        #Save self variable
        self.fp = fp
        self.name = name if name else file_basename(fp)

        # Find out if the file is gzipped
        if has_extension (fp, pos=-1, ext=["gz","tgz"]):
            if verbose: jprint("\tFile is gziped")
            compression="gzip"
            ext_pos=-2
        else:
            if verbose: jprint("\tFile is not compressed")
            compression=None
            ext_pos=-1

        # Find extension type
        if has_extension (fp, pos=ext_pos, ext="gtf"):
            self.feature_df = self._gtf_parser(fp=fp, compression=compression, verbose=verbose)
        elif has_extension (fp, pos=ext_pos, ext="gff3"):
            self.feature_df = self._gff3_parser(fp=fp, compression=compression, verbose=verbose)
        elif has_extension (fp, pos=ext_pos, ext="bed"):
            self.feature_df = self._bed_parser(fp=fp, compression=compression, verbose=verbose)

        # Else try to import as a pickled file
        else:
            try:
                self.feature_df = self._pickle_parser(fp=fp, verbose=verbose)
            # If invalid file format
            except Exception as E:
                raise ValueError("Cannot open file or the file is not in a valid format")

        # Optional filtering steps
        if min_len or max_len:
            self.select_len (min_len=min_len, max_len=max_len, verbose=verbose)
        if refid_list:
            self.select_references (refid_list=refid_list, verbose=verbose)
        if type_list:
            self.select_types (type_list=type_list, verbose=verbose)

        # Sort the dataframe and reset index
        if verbose: jprint("Sorting and final cleanup")
        self.feature_df.sort_values(by=["refid","start","end"], inplace=True)
        self.feature_df.reset_index(drop=True, inplace=True)

        if verbose: jprint("\tNumber of features imported: {}".format(self.feature_count))