def __init__(self, fp, name=None, refid_list=None, output_index=False, verbose=False, **kwargs):
    """Initialize a genome view from a reference genome file.

    * fp
        A fasta file containing the reference sequences OR a tab separated index file
        containing at least 2 columns with the refid and the length in bases (like a
        .fa.fai file generated by samtools faidx). The fasta option will take more time
        as the file has to be parsed to get the refid and length of sequences. A 2 column
        index tsv file can be generated for later usage as an index file (see
        output_index). Both fasta and index file can be gziped.
    * name
        Name of the data file that will be used as track name for plotting.
        If not given, will be deduced from fp file name.
    * refid_list
        List of reference sequence ids to select from the data file, by default all
        [ DEFAULT: None ]
    * output_index
        If True, write a simple 2 column index tsv file containing the reference
        sequence ids and their lengths [ DEFAULT: False ]
    """
    # Save self variables.
    # Fix: the original did `verbose = verbose` (a no-op local rebind) instead of
    # storing the flag on the instance; also replaced the mutable default [] for
    # refid_list with None to avoid a shared default list across calls.
    self.verbose = verbose
    self.refid_list = refid_list if refid_list else []
    # Store the reference genome informations
    if verbose:
        jprint("Add reference genome file", bold=True)
    self.reference = Reference(
        fp=fp, name=name, verbose=verbose, refid_list=self.refid_list, output_index=output_index)
    # Lists to store annotation and alignment tracks added later
    self.annotations = []
    self.alignments = []
def _bam_parser(self, fp, min_coverage=5, refid_list=None, verbose=False, **kwargs):
    """Parse a sam or bam formated file and tally per-base coverage by strand.

    Returns an OrderedDict keyed by refid; each value holds an "nbases" counter and
    one Counter of position->coverage per strand ("+"/"-"), after filtering through
    self._clean_d.
    """
    # Fix: mutable default argument [] replaced by None (shared-list pitfall)
    refid_list = refid_list if refid_list else []
    d = OrderedDict()
    with pysam.AlignmentFile(fp) as bam:
        # Compute the genomic coverage for each read
        if verbose:
            jprint("\tTally coverage for each base")
        for line in bam:
            # Skip non aligned reads
            if line.reference_id == -1:
                continue
            refid = line.reference_name
            # If no refid filter, or if the refid is in the authorized list
            if not refid_list or refid in refid_list:
                # Create a new entry if not in the dict
                if refid not in d:
                    d[refid] = {"nbases": 0, "+": Counter(), "-": Counter()}
                # Tally coverage on the read's strand for every aligned position
                strand = "-" if line.is_reverse else "+"
                for position in line.get_reference_positions():
                    d[refid][strand][position] += 1
    # Drop low-coverage positions, convert counters to sorted Series and count bases
    d = self._clean_d(d=d, min_coverage=min_coverage, verbose=verbose)
    return d
def add_annotation(self, fp, name=None, min_len=None, max_len=None, refid_list=None, type_list=None, verbose=False, **kwargs):
    """Load a genomic annotation file and register it as an annotation track.

    * fp
        URL to a standard genomic annotation file (gff3, gtf or bed). Valid URL
        schemes include http, ftp, s3 and file; the file may be compressed in
        gzip, bz2, zip or xz.
    * name
        Track name used for plotting; deduced from fp if not given [ DEFAULT: None ]
    * min_len / max_len
        Minimal / maximal feature size (start to end) to be selected [ DEFAULT: None ]
    * refid_list
        List of reference ids to select, e.g. ["chr1", "chr2"] [ DEFAULT: None ]
    * type_list
        List of feature types to select, e.g. ["exon", "gene"] [ DEFAULT: None ]
    """
    if verbose:
        jprint("Add annotation file", bold=True)
    annotation = Annotation(
        fp=fp, name=name, min_len=min_len, max_len=max_len,
        refid_list=refid_list, type_list=type_list, verbose=verbose)
    if verbose:
        # Warn about reference sequences that received no annotation at all
        missing = set(self.reference.refid_list) - set(annotation.refid_list)
        if missing:
            warnings.warn("No annotation found for {}".format(",".join(missing)))
    self.annotations.append(annotation)
def add_alignment(self, fp, name=None, min_coverage=5, refid_list=None, output_bed=False, verbose=False, **kwargs):
    """Load an aligned-reads file and register it as an alignment track.

    * fp
        A standard BAM or SAM file (does not need to be sorted or indexed), or a
        6 field bed file (chrom, chromStart, chromEnd, name, score, strand) with a
        hashtag-commented header listing the reference sequence ids and lengths,
        similar to the format generated by the output_bed option (much faster than
        BAM/SAM, can be gzipped).
    * name
        Track name used for plotting; deduced from fp if not given [ DEFAULT: None ]
    * min_coverage
        Minimal coverage to keep a position; below this the coverage is considered
        null. Not used if fp is a bed coverage file [ DEFAULT: 5 ]
    * refid_list
        List of reference sequence ids to select, by default all [ DEFAULT: None ]
    * output_bed
        If True, write a 6 column compressed bed file with the +/- strand coverage
        values (positions under min_coverage excluded). Only applies when the input
        is BAM or SAM [ DEFAULT: False ]
    """
    if verbose:
        jprint("Add alignment file", bold=True)
    # Fix: mutable default argument [] replaced by None (shared-list pitfall)
    a = Alignment(
        fp=fp, name=name, min_coverage=min_coverage,
        refid_list=refid_list if refid_list else [],
        output_bed=output_bed, verbose=verbose)
    if verbose:
        # Warn about reference sequences with no coverage at all
        not_found = set(self.reference.refid_list) - set(a.refid_list)
        if not_found:
            warnings.warn("No coverage found for {}".format(",".join(not_found)))
    self.alignments.append(a)
def _pickle_parser(self, fp, verbose=False, **kwargs): """ Parse a pickle database """ # Import the file in a dataframe if verbose: jprint ("\tTry to load as a pickle file") df = pd.read_pickle(fp) return df
def get_refid_len(self, refid, verbose=False, **kwargs):
    """Return the length of a given refid, or None if the reference is unknown."""
    try:
        return self.d[refid]
    except KeyError:
        if verbose:
            jprint(
                "The reference sequence {} was not found in the reference list"
                .format(refid))
        return None
def to_pickle(self, fp=None, verbose=False, **kwargs):
    """Serialize the parsed feature dataframe to a pickle file for later reuse.

    * fp
        Destination path. Defaults to the original annotation file path with the
        compression extension (.gz/.tgz) stripped and .pkl appended.
    Returns the path of the written pickle file.
    """
    if not fp:
        base = self.fp
        # Strip a trailing compression extension before appending .pkl
        if base.endswith((".gz", ".tgz")):
            base = base.rpartition(".")[0]
        fp = base + ".pkl"
    if verbose:
        jprint("Pickle dataframe in file {}".format(fp))
    self.feature_df.to_pickle(fp)
    return fp
def _gtf_parser(self, fp, compression=None, verbose=False, **kwargs): """ Parse a gtf formated file """ if verbose: jprint("\tUse GTF parser to parse annotations") # Import the file in a dataframe col_names = ["refid","source","type","start","end","score","strand","frame","attribute"] df = pd.read_csv(fp, sep="\t", names=col_names, index_col=False, comment="#", compression=compression) # Extract the ID field = first field of attribute= df['ID'] = df["attribute"].str.split('\"').str[1] if verbose: jprint("\tSuccessfully imported as a gtf file") # Clean df df = self._clean_df(df, verbose=verbose) return df
def _clean_d(self, d, min_coverage=5, verbose=False, **kwargs): """ Remove base with coverage below threshold and transform in Pandas Series """ if verbose: jprint("\tFilter and sort the coverage results by position") for refid in d.keys(): for strand in ["+", "-"]: s = OrderedDict() for position, coverage in d[refid][strand].items(): if coverage >= min_coverage: s[position] = coverage self.nbases += coverage d[refid]["nbases"] += coverage d[refid][strand] = pd.Series(s) d[refid][strand].sort_index(inplace=True) return d
def interval_features(self, refid, start, end, feature_types=None, max_features_per_type=None, verbose=False, **kwargs):
    """Return a dataframe of all annotation features overlapping the given interval.

    Features are identified by their ID field (gff3), the whole attribute field
    (bed) or the first attribute element (gtf).

    * refid
        Name of the sequence from the original fasta file to display.
    * start / end
        Interval bounds. Coordinates are not verified; an out-of-range window
        simply yields an empty dataframe.
    * feature_types
        A feature type name ("exon"|"gene"|"CDS"...) or a list of them to keep.
        If not given, all types found in the interval are returned
        [ DEFAULT: None ]
    * max_features_per_type
        Maximal number of features per type; if more are found a random sample is
        taken. If None all features are returned [ DEFAULT: None ]
    """
    empty_cols = ["refid", "start", "end", "strand", "ID", "type"]
    # Verify the reference is known
    if refid not in self.refid_list:
        if verbose:
            jprint("The reference {} is not in the list of references with alignment".format(refid))
        return pd.DataFrame(columns=empty_cols)
    # Select features of this refid overlapping the requested window
    df = self.feature_df[
        (self.feature_df["refid"] == refid)
        & (self.feature_df["end"] > start)
        & (self.feature_df["start"] < end)]
    if df.empty:
        if verbose:
            jprint("No feature found in the requested interval")
        return pd.DataFrame(columns=empty_cols)
    # Cast str to list
    if isinstance(feature_types, str):
        feature_types = [feature_types]
    # Filter by type and subsample overly populated types.
    # Fix: use the groupby group directly instead of re-filtering the whole
    # dataframe per type (the original `df[df["type"] == type_name]` recomputed
    # what groupby already provides, and its "empty" branch was unreachable).
    select_list = []
    for type_name, type_df in df.groupby("type"):
        if not feature_types or type_name in feature_types:
            if max_features_per_type and len(type_df) > max_features_per_type:
                select_list.append(type_df.sample(max_features_per_type))
            else:
                select_list.append(type_df)
    if select_list:
        df = pd.concat(select_list)
    else:
        if verbose:
            jprint("No feature found in the requested interval")
        return pd.DataFrame(columns=empty_cols)
    # Return a sorted copy with a fresh index (sort on the concat result, not on
    # a slice of self.feature_df, avoiding SettingWithCopy issues)
    df = df.sort_values(by=["refid", "start", "end"])
    return df.copy().reset_index(drop=True)
def __init__(self, fp, name=None, min_coverage=5, refid_list=None, output_bed=False, verbose=False, **kwargs):
    """Parse an aligned-reads file (BAM/SAM) or a coverage bed file.

    * fp
        A standard BAM or SAM file containing aligned reads and a standard header
        (does not need to be sorted or indexed), or a 6 field bed file
        (chrom, chromStart, chromEnd, name, score, strand) where score is the
        coverage value (much faster than BAM/SAM, can be gzipped).
    * name
        Track name used for plotting; deduced from fp if not given [ DEFAULT: None ]
    * min_coverage
        Minimal coverage to keep a position; below this the coverage is considered
        null. Not used if fp is a bed coverage file [ DEFAULT: 5 ]
    * refid_list
        List of reference sequence ids to select, by default all. Not used if fp is
        a bed coverage file [ DEFAULT: None ]
    * output_bed
        If True, write a 6 column compressed bed file with the +/- strand coverage
        values (positions under min_coverage excluded). Only applies when the input
        is BAM or SAM [ DEFAULT: False ]
    """
    # Verify that the file is readable
    is_readable_file(fp)
    # Save self variables.
    # Fix: mutable default argument [] replaced by None (shared-list pitfall)
    refid_list = refid_list if refid_list else []
    self.fp = fp
    self.name = name if name else file_basename(fp)
    self.ext = extensions_list(fp)[0]
    self.nbases = 0
    if self.ext in ["bam", "sam"]:
        if verbose:
            jprint("Compute coverage from bam/sam file ", self.fp)
        self.d = self._bam_parser(fp, min_coverage, refid_list)
        if output_bed:
            outfp = "{}/{}.bed.gz".format(dir_path(fp), file_basename(fp))
            if verbose:
                jprint("Write coverage data in file ", outfp)
            self._write_coverage_file(outfp)
            self.outfp = outfp
    elif self.ext == "bed":
        if verbose:
            jprint("Extract coverage from bed file", self.fp)
        self.d = self._bed_parser(fp, min_coverage, refid_list)
    else:
        msg = "The file is not in SAM/BAM/BED format. Please provide a correctly formated file"
        raise ValueError(msg)
    if verbose:
        jprint("\tTotal base coverage {} in {} reference sequences".format(
            self.nbases, self.refid_count))
def alignment_summary(self, verbose=False, **kwargs):
    """Display tables summarizing the loaded alignment files."""
    if not self.alignments:
        warnings.warn("No alignment track loaded")
        return None
    count_df = pd.DataFrame(columns=["Refid count", "Base coverage"])
    rbc_df = pd.DataFrame()
    for a in self.alignments:
        # Per-file summary row
        count_df.loc[a.name] = [a.refid_count, a.nbases]
        # Per-reference base coverage, merged across all alignment tracks
        rbc = pd.DataFrame(a.refid_nbases)
        rbc.columns = [a.name]
        rbc_df = pd.merge(left=rbc_df, right=rbc, how='outer', right_index=True, left_index=True)
    jprint("Counts per Alignment file", bold=True)
    display(count_df)
    jprint("Counts per Reference sequence", bold=True)
    # Fix: sort_index returns a new frame; the original discarded its result,
    # displaying the table unsorted
    rbc_df = rbc_df.sort_index()
    display(rbc_df)
def annotation_summary(self, verbose=False, **kwargs):
    """Display tables summarizing the loaded annotation files."""
    if not self.annotations:
        warnings.warn("No annotation track loaded")
        return None
    count_df = pd.DataFrame(columns=["Feature count", "Refid count", "Feature type count"])
    rcu_df = pd.DataFrame()
    tcu_df = pd.DataFrame()
    for a in self.annotations:
        # Per-file summary row
        count_df.loc[a.name] = [a.feature_count, a.refid_count, a.type_count]
        # Per-reference feature counts, merged across all annotation tracks
        rcu = a.refid_count_uniq
        rcu.columns = [a.name]
        rcu_df = pd.merge(left=rcu_df, right=rcu, how='outer', right_index=True, left_index=True)
        # Per-type feature counts, merged across all annotation tracks
        tcu = a.type_count_uniq
        tcu.columns = [a.name]
        tcu_df = pd.merge(left=tcu_df, right=tcu, how='outer', right_index=True, left_index=True)
    jprint("Counts per Annotation file", bold=True)
    display(count_df)
    jprint("Counts per Reference sequence", bold=True)
    # Fix: sort_index returns a new frame; the original discarded its result,
    # displaying the tables unsorted
    rcu_df = rcu_df.sort_index()
    display(rcu_df)
    jprint("Counts per feature types", bold=True)
    tcu_df = tcu_df.sort_index()
    display(tcu_df)
def select_len(self, min_len=None, max_len=None, verbose=False, **kwargs):
    """Keep only features whose length (end - start) is within the given bounds.

    * min_len
        Minimal feature size to be selected [ DEFAULT: None = no lower bound ]
    * max_len
        Maximal feature size to be selected [ DEFAULT: None = no upper bound ]
    """
    if verbose:
        jprint("Selecting features based on length")
        jprint("\tFeatures before filtering: {}".format(self.feature_count))
    # Fix: explicit `is not None` checks so a 0 threshold is not silently
    # ignored (the original truthiness test skipped min_len=0 / max_len=0)
    if min_len is not None:
        self.feature_df = self.feature_df[
            (self.feature_df["end"] - self.feature_df["start"]) >= min_len]
        if verbose:
            jprint("\tFeatures after minimal length filtering: {}".format(self.feature_count))
    if max_len is not None:
        self.feature_df = self.feature_df[
            (self.feature_df["end"] - self.feature_df["start"]) <= max_len]
        if verbose:
            jprint("\tFeatures after maximal length filtering: {}".format(self.feature_count))
def _bed_parser(self, fp, min_coverage=5, refid_list=[], verbose=False, **kwargs): """Extract data from a coverage bad file """ d = OrderedDict() # File handling for both uncompressed or compressed fasta file if fp.endswith(".gz"): open_fun, open_mode = gzip.open, "rt" else: open_fun, open_mode = open, "r" # Parse fasta file refid with open_fun(fp, open_mode) as fin: if verbose: jprint("\tExtract base coverage data") for line in fin: sl = line[0:-1].split("\t") refid = sl[0] if not refid_list or refid in refid_list: # Create a new entry if not in the dict if not refid in d: d[refid] = { "nbases": 0, "+": Counter(), "-": Counter() } position = int(sl[1]) coverage = int(sl[4]) strand = sl[5] d[refid][strand][position] = coverage d[refid]["nbases"] += coverage self.nbases += coverage d = self._clean_d(d=d, min_coverage=min_coverage, verbose=verbose) return d
def select_references(self, refid_list, verbose=False, **kwargs):
    """Restrict features to those whose reference sequence id appears in refid_list.

    Accepts a list or a single id string. Example: ["chr1", "chr2", "chr3"]
    """
    # Accept a single id by wrapping it in a list
    if isinstance(refid_list, str):
        refid_list = [refid_list]
    if verbose:
        jprint("Selecting features based on reference id")
        jprint("\tFeatures before filtering: {}".format(self.feature_count))
    keep = self.feature_df["refid"].isin(refid_list)
    self.feature_df = self.feature_df[keep]
    if verbose:
        jprint("\tFeatures after filtering: {}".format(self.feature_count))
def select_types(self, type_list, verbose=False, **kwargs):
    """Restrict features to those whose type appears in type_list.

    Accepts a list or a single type string. Example: ["exon", "gene"]
    """
    # Accept a single type by wrapping it in a list
    if isinstance(type_list, str):
        type_list = [type_list]
    if verbose:
        jprint("Selecting features based on type")
        jprint("\tFeatures before filtering: {}".format(self.feature_count))
    keep = self.feature_df["type"].isin(type_list)
    self.feature_df = self.feature_df[keep]
    if verbose:
        jprint("\tFeatures after filtering: {}".format(self.feature_count))
def _clean_df (self, df, verbose=False, **kwargs): """ Clean dataframe after parsing """ # Select fields df = df[["refid","start","end","ID","score","strand","type"]].copy() # Drop column with NA values if verbose: jprint("\tRemove null values") l = len(df) df.dropna(inplace=True) if verbose: jprint("\tRemoved {} invalid lines".format(l-len(df))) # Cast the start and end field in integer if verbose: jprint("\tCast coordinates to integer and id to str") df[['start', 'end']] = df[['start', 'end']].astype(int) df[['ID']] = df[['ID']].astype(str) # Verify than the dataframe is not empty if df.empty: raise ValueError("No valid features imported. Is the file valid?") return df
def _bed_parser(self, fp, compression=None, verbose=False, **kwargs):
    """ Parse a bed formated file into a cleaned feature dataframe.

    First attempts a bed6 import; on failure falls back to bed12 column names.
    The 'type' column is set to "." since bed files carry no feature type.
    """
    if verbose: jprint("\tUse BED parser to parse annotations")
    # try to import the file as a bed6 in a dataframe
    try:
        col_names = ["refid","start","end","ID","score","strand"]
        df = pd.read_csv(fp, sep="\t", names=col_names, index_col=False, comment="#", compression=compression)
        if verbose: jprint("\tSuccessfully imported as a bed6 file")
    # else try to import as a bed12
    # NOTE(review): pandas normally raises ParserError/ValueError (not IndexError)
    # when a file has more columns than `names` — confirm this fallback can ever
    # trigger with the pandas version in use
    except IndexError as E:
        col_names = ["refid","start","end","ID","score","strand","thickStart","thickEnd","itemRgb","blockCount","blockSizes","blockStarts"]
        df = pd.read_csv(fp, sep="\t", names=col_names, index_col=False, comment="#", compression=compression)
        if verbose: jprint("\tSuccessfully imported as a bed12 file")
    # Type is not available from bed files
    df['type'] = "."
    # Clean df (column selection, NA removal, type casting)
    df = self._clean_df(df, verbose=verbose)
    return df
def interval_plot(self, refid, start=None, end=None, plot_style="ggplot", figwidth=30,
                  alignment_track_height=5, annotation_track_height=2, alignment_bins=500,
                  alignment_bin_repr_fun="max", alignment_log=True,
                  alignment_color=("dodgerblue", "darkorange"), alignment_alpha=0.5,
                  feature_types=None, max_features_per_type=500, annotation_offset=None,
                  annotation_label=False, max_label_size=50, annotation_color="grey",
                  verbose=False, **kwargs):
    """Plot the alignment coverage and annotation tracks over a genomic interval.

    * refid
        Name of the sequence from the original fasta file to display.
    * start / end
        Window bounds; default to 0 and the refid length [ DEFAULT: None ]
    * plot_style
        pyplot style name ('ggplot'|'bmh'|'classic'...) [ DEFAULT: "ggplot" ]
    * figwidth
        Width of the plotting area in inches [ DEFAULT: 30 ]
    * alignment_track_height / annotation_track_height
        Height of individual alignment tracks / of each feature-type annotation
        track [ DEFAULT: 5 / 2 ]
    * alignment_bins
        Number of coverage bins across the window; auto-adjusted when the interval
        is smaller than the bin count [ DEFAULT: 500 ]
    * alignment_bin_repr_fun
        Per-bin representation: "max", "mean" or "sum" [ DEFAULT: "max" ]
    * alignment_log
        If True, use a log10 y scale for coverage [ DEFAULT: True ]
    * alignment_color / alignment_alpha
        Colors for the +/- coverage areas and their transparency
        [ DEFAULT: ("dodgerblue", "darkorange") / 0.5 ]
    * feature_types
        Feature type name or list of names to display; all if not given
        [ DEFAULT: None ]  (fix: was a mutable default [])
    * max_features_per_type
        Maximal number of features per type; random sample if more [ DEFAULT: 500 ]
    * annotation_offset
        Minimal distance between contiguous features on the same level; defaults
        to 1/400 of the window [ DEFAULT: None ]
    * annotation_label / max_label_size
        Whether to draw feature labels and their maximal text length
        [ DEFAULT: False / 50 ]
    * annotation_color
        Color of the annotation arrows [ DEFAULT: "grey" ]
    """
    # Verify the sequence is known and at least one track is loaded
    if refid not in self.reference.refid_list:
        warnings.warn("Requested reference sequence not found: {}".format(refid))
        return None
    if not self.alignments and not self.annotations:
        warnings.warn("No annotation and alignment track loaded")
        return None
    # Auto define start/end and the overlapping annotation offset if not given
    if not start:
        start = 0
        if verbose:
            jprint("Autodefine start position: {}".format(start))
    if not end:
        end = self.reference.get_refid_len(refid) - 1
        if verbose:
            jprint("Autodefine end position: {}".format(end))
    if start >= end:
        # Fix: the original message had {} placeholders but no .format() call,
        # and the wording inverted the actual requirement
        raise ValueError(
            "Invalid coordinates (start: {}, end: {}): end has to be greater than start"
            .format(start, end))
    if not annotation_offset:
        annotation_offset = int((end - start) / 400)
        if verbose:
            jprint("Estimated overlap offset: {}".format(annotation_offset))
    figheight = 0
    # Extract alignment coverage data and compute the coverage tracks height
    alignments_dict = OrderedDict()
    if self.alignments:
        if verbose:
            jprint("Extract alignment data", bold=True)
        for a in self.alignments:
            alignments_dict[a.name] = a.interval_coverage(
                refid=refid, start=start, end=end, bins=alignment_bins,
                bin_repr_fun=alignment_bin_repr_fun)
            # +1 for space between tracks
            figheight += alignment_track_height + 1
    # Extract feature annotation data and compute the feature tracks height
    annotation_dict = OrderedDict()
    if self.annotations:
        if verbose:
            jprint("Extract annotation data", bold=True)
        for a in self.annotations:
            annotation_dict[a.name] = a.interval_features(
                refid=refid, start=start, end=end, feature_types=feature_types,
                max_features_per_type=max_features_per_type)
            # Reserve one row even for an empty track; +1 for inter-track space
            n = 1 if annotation_dict[a.name].empty else annotation_dict[a.name].type.nunique()
            figheight += n * annotation_track_height + 1
    # Create a pyplot figure with an empty grid
    fig = pl.figure(figsize=(figwidth, figheight))
    grid = GridSpec(nrows=figheight, ncols=1, hspace=0.5)
    pl.style.use(plot_style)
    # Current height marker within the grid
    h = 0
    if self.alignments:
        for track_name, track_df in alignments_dict.items():
            if verbose:
                jprint("\tAlignment track name: {}".format(track_name))
            # Prepare the subplot area
            ax = pl.subplot(grid[h:h + alignment_track_height])
            h += alignment_track_height
            ax.set_xlim((start, end))
            ax.ticklabel_format(useOffset=False, style='plain')
            if alignment_log:
                ax.set_yscale("log")
            ax.yaxis.set_tick_params(left=True, right=False, labelleft=True, labelright=False)
            ax.xaxis.set_tick_params(bottom=False, top=False, labelbottom=False, labeltop=False)
            ax.set_ylabel(track_name)
            # Plot the positive strand
            if track_df["+"].sum() == 0:
                ax.text(0.5, 0.6, 'No Coverage on the + strand',
                        ha='center', va='center', transform=ax.transAxes)
            else:
                ax.fill_between(x=track_df.index, y1=0, y2=list(track_df["+"]),
                                alpha=alignment_alpha, color=alignment_color[0],
                                label="Positive strand")
            # Plot the negative strand
            if track_df["-"].sum() == 0:
                ax.text(0.5, 0.4, 'No Coverage on the - strand',
                        ha='center', va='center', transform=ax.transAxes)
            else:
                ax.fill_between(x=track_df.index, y1=0, y2=list(track_df["-"]),
                                alpha=alignment_alpha, color=alignment_color[1],
                                label="Negative strand")
            # Only add a legend if elements were added to the ax
            if ax.collections:
                ax.legend(bbox_to_anchor=(1, 1), loc=2, frameon=False)
        # Add x labels on the last alignment track
        ax.xaxis.set_tick_params(bottom=True, labelbottom=True)
    if self.annotations:
        for track_name, track_df in annotation_dict.items():
            h += 1
            # Fix: the original log message said "Alignment track name" here
            if verbose:
                jprint("\tAnnotation track name: {}".format(track_name))
            # No feature case: render a placeholder track
            if track_df.empty:
                ax = pl.subplot(grid[h:h + annotation_track_height])
                ax.set_xlim((start, end))
                ax.text(0.5, 0.5, 'No feature found',
                        ha='center', va='center', transform=ax.transAxes)
                ax.yaxis.set_tick_params(left=False, right=False, labelleft=False, labelright=False)
                ax.xaxis.set_tick_params(bottom=True, labelbottom=True)
                ax.ticklabel_format(useOffset=False, style='plain')
                ax.grid(axis="y", b=False)
                ax.set_title(track_name)
                h += annotation_track_height
            # General case: one sub-track per feature type
            else:
                first = True
                for feature_type, feature_df in track_df.groupby("type"):
                    # Prepare the plotting area
                    ax = pl.subplot(grid[h:h + annotation_track_height])
                    h += annotation_track_height
                    ax.set_xlim((start, end))
                    ax.yaxis.set_tick_params(left=False, right=False, labelleft=False, labelright=False)
                    ax.xaxis.set_tick_params(bottom=False, top=False, labelbottom=False, labeltop=False)
                    ax.ticklabel_format(useOffset=False, style='plain')
                    ax.grid(axis="y", b=False)
                    ax.set_ylabel(feature_type)
                    # Compute the non overlapping level where to plot each arrow
                    level = Level(offset=annotation_offset)
                    for n, feature in feature_df.iterrows():
                        fl = level(feature.ID, feature.start, feature.end, feature.strand)
                        if fl:
                            ax.add_patch(Arrow(
                                posA=[fl.start, fl.level], posB=[fl.end, fl.level],
                                linewidth=3, color=annotation_color, arrowstyle=fl.arrowstyle))
                            if annotation_label:
                                # Clamp the label inside the window and truncate long IDs
                                text_end = fl.end if fl.end < end - annotation_offset else end - annotation_offset
                                text_start = fl.start if fl.start > start + annotation_offset else start + annotation_offset
                                text = fl.ID[0:max_label_size] + "..." if len(fl.ID) > max_label_size else fl.ID
                                ax.text(x=text_start + (text_end - text_start) / 2,
                                        y=fl.level, s=text, ha="center", fontsize=8)
                    ax.set_ylim(level.min_level - 0.5, level.max_level + 0.5)
                    # Title only on the first sub-track of the file
                    if first:
                        ax.set_title(track_name)
                        first = False
                # Add x labels on the last annotation sub-track
                ax.xaxis.set_tick_params(bottom=True, labelbottom=True)
def interval_coverage(self, refid, start, end, bins=500, bin_repr_fun="max", verbose=False, **kwargs):
    """ Parse the alignment file for a given refid and interval. The interval is
    splited in a number of windows equal to bins, for which the coverage in
    computed. The method return a dataframe containing the starting positions of
    the windows and the coverage for the + and - strands. If the refid or the
    coordinates are invalid a zero filled dataframe will be returned.
    * refid
        Name of the sequence from the original fasta file to display
    * start
        Start of the window to display. The coordinate is not verified, if outside
        of the range it will return empty bins
    * end
        End of the window to display. The coordinate is not verified, if outside
        of the range it will return empty bins
    * bins
        Number of alignment count bins to divide the displayed window. Low number
        will result in low resolution high value could result in a long ploting
        time. The value is automatically adjusted if lower than base resolution,
        ie if the requested interval is lower than the number of bins
        [ DEFAULT: 500 ]
    * bin_repr_fun
        Function to represent each bin ("max", "mean" and "sum") [ DEFAULT: "max" ]
    """
    if verbose: jprint("Compute coverage from the windows: {}:{}-{}".format(
        refid, start, end))
    df = pd.DataFrame(columns=["+", "-"], dtype=int)
    # Adjust number of bins and calculate step
    if bins > end - start:
        bins = end - start
        if verbose: jprint(
            "\tAuto adjust the number of bins to match the interval: {}"
            .format(bins))
    # step may be fractional; window bounds below are truncated with int()
    step = (end - start) / bins
    if verbose: jprint("\tDefine size of each bin: {}".format(step))
    # If refid is not in the self refid-list, return a zero-filled frame with the
    # same window starting positions
    if not refid in self.refid_list:
        if verbose: jprint(
            "\tThe reference {} is not in the list of references with alignment"
            .format(refid))
        for i in np.arange(start, end, step):
            for strand in ["+", "-"]:
                df.loc[int(i), strand] = 0
        return df
    # Select positions windows and get maximun
    if verbose: jprint("\tCompute coverage...")
    for i in np.arange(start, end, step):
        winstart = int(i)
        winend = int(i + step)
        for strand in ["+", "-"]:
            # Slice of the per-position coverage Series falling inside the window
            l = self.d[refid][strand][
                (self.d[refid][strand].index >= winstart) &
                (self.d[refid][strand].index < winend)]
            if l.empty:
                df.loc[winstart, strand] = 0
            elif bin_repr_fun == "max":
                df.loc[winstart, strand] = l.max()
            elif bin_repr_fun == "sum":
                df.loc[winstart, strand] = l.sum()
            elif bin_repr_fun == "mean":
                # NOTE(review): "mean" divides by the full bin width (step), not
                # by the number of covered positions in the bin — confirm intended
                df.loc[winstart, strand] = l.sum() / step
    if verbose:
        if df["+"].sum() + df["-"].sum() == 0:
            jprint(
                "\tNull coverage for both strands in the requested interval"
            )
        elif df["+"].sum() == 0:
            jprint(
                "\tNull coverage for the positive strand in the requested interval"
            )
        elif df["-"].sum() == 0:
            jprint(
                "\tNull coverage for the negative strand in the requested interval"
            )
    return df
print ("Can not import a local packages. Please verify source code directory") sysexit() # Third party imports try: import numpy as np from matplotlib.patches import FancyArrowPatch as Arrow from matplotlib.gridspec import GridSpec import pylab as pl import pandas as pd import pysam from IPython.core.display import display from pycl.pycl import jhelp, jprint, get_package_file except ImportError as E: print (E) jprint ("A third party package is missing. Please verify your dependencies") sysexit() #~~~~~~~CLASS~~~~~~~# class JGV(object): ##~~~~~~~ SAMPLE FILES ~~~~~~~# @ classmethod def example_bam (self): return get_package_file("JGV","JGV/data/yeast.bam") @ classmethod def example_fasta (self): return get_package_file("JGV","JGV/data/yeast.fa.gz") @ classmethod def example_gtf (self): return get_package_file("JGV","JGV/data/yeast.gtf.gz")
def __init__(self, fp, name=None, refid_list=None, output_index=False, verbose=False, **kwargs):
    """Parse a reference genome file into a Series of sequence lengths.

    * fp
        A fasta file containing the reference sequences OR a tab separated index
        file containing at least 2 columns with the refid and the length in bases
        (like a .fa.fai file generated by samtools faidx, or the output_index
        option of this class). The fasta option takes more time as the file has to
        be parsed to get the refid and length of sequences. Both fasta and index
        file can be gziped.
    * name
        Name of the data file that will be used as track name for plotting.
        If not given, will be deduced from fp file name.
    * refid_list
        List of reference sequence ids to select from the data file, by default
        all [ DEFAULT: None ]
    * output_index
        If True, write a simple 2 column index tsv file containing the reference
        sequence ids and their lengths [ DEFAULT: False ]
    """
    # Verify that the file is readable
    is_readable_file(fp)
    # Save self variables.
    # Fix: mutable default argument [] replaced by None (shared-list pitfall)
    refid_list = refid_list if refid_list else []
    self.fp = fp
    self.name = name if name else file_basename(fp)
    self.ext = extensions_list(fp)[0]
    # If the file is in fasta format
    if self.ext in ["fa", "fasta"]:
        if verbose:
            jprint("Parsing fasta file")
        # File handling for both uncompressed or compressed fasta file
        if fp.endswith(".gz"):
            open_fun, open_mode = gzip.open, "rt"
        else:
            open_fun, open_mode = open, "r"
        # Parse fasta refids and count the length of each selected sequence
        with open_fun(fp, open_mode) as f:
            d = OrderedDict()
            last_ref = None
            for l in f:
                if l.startswith(">"):
                    refid = l[1:].split()[0].strip()
                    if not refid_list or refid in refid_list:
                        d[refid] = 0
                        last_ref = refid
                    else:
                        # Skip sequence lines of unselected refids
                        last_ref = None
                elif last_ref:
                    d[last_ref] += len(l.strip())
        # Check if sequences were found
        assert d, "No Sequence found"
        # Transform the counter in a Series sorted by decreasing length
        self.d = pd.Series(d, name="length", dtype="int64")
        self.d.sort_values(inplace=True, ascending=False)
        # Write the index in a file for quicker loading next time
        if output_index:
            index_file = "{}/{}.tsv".format(dir_path(fp), file_basename(fp))
            if verbose:
                jprint("Write a fasta index file: {}".format(index_file))
            self.d.to_csv(index_file, sep="\t")
    # Otherwise try to parse it as a 2 column tabulated index file
    else:
        if verbose:
            jprint("Assume the file is a fasta index")
        self.d = pd.read_csv(
            fp, sep="\t", squeeze=True, comment="#", usecols=[0, 1], index_col=0, header=None)
        if refid_list:
            self.d = self.d[(self.d.index.isin(refid_list))]
        self.d.name = "length"
        self.d.sort_values(inplace=True, ascending=False)
    if verbose:
        jprint("\tFound {} reference sequences".format(self.refid_count))
def __init__(self, fp, name=None, min_len=None, max_len=None, refid_list=None, type_list=None, verbose=False, **kwargs):
    """Parse a genomic annotation file into a feature dataframe.

    * fp
        Path to a standard annotation file in gff3, gtf or bed format, or a python
        pickle file (.pkl) generated during a previous run. The file can be gzip
        compressed.
    * name
        Track name used for plotting; deduced from fp if not given [ DEFAULT: None ]
    * min_len / max_len
        Minimal / maximal feature size (start to end) to be selected [ DEFAULT: None ]
    * refid_list
        List of reference ids to select, e.g. ["chr1", "chr2"] [ DEFAULT: None ]
    * type_list
        List of feature types to select, e.g. ["exon", "gene"] [ DEFAULT: None ]
    """
    if verbose:
        jprint("Parse Annotation file")
    # Verify that the file is readable
    is_readable_file(fp)
    # Save self variables
    self.fp = fp
    self.name = name if name else file_basename(fp)
    # Find if gziped: the real format extension is then one position earlier
    if has_extension(fp, pos=-1, ext=["gz", "tgz"]):
        if verbose:
            jprint("\tFile is gziped")
        compression = "gzip"
        ext_pos = -2
    else:
        if verbose:
            jprint("\tFile is not compressed")
        compression = None
        ext_pos = -1
    # Dispatch to the parser matching the file extension
    if has_extension(fp, pos=ext_pos, ext="gtf"):
        self.feature_df = self._gtf_parser(fp=fp, compression=compression, verbose=verbose)
    elif has_extension(fp, pos=ext_pos, ext="gff3"):
        self.feature_df = self._gff3_parser(fp=fp, compression=compression, verbose=verbose)
    elif has_extension(fp, pos=ext_pos, ext="bed"):
        self.feature_df = self._bed_parser(fp=fp, compression=compression, verbose=verbose)
    # Else try to import as a pickled file
    else:
        try:
            self.feature_df = self._pickle_parser(fp=fp, verbose=verbose)
        except Exception as E:
            # Fix: chain the original exception so the root cause is not lost
            raise ValueError("Cannot open file or the file is not in a valid format") from E
    # Optional filtering steps
    if min_len or max_len:
        self.select_len(min_len=min_len, max_len=max_len, verbose=verbose)
    if refid_list:
        self.select_references(refid_list=refid_list, verbose=verbose)
    if type_list:
        self.select_types(type_list=type_list, verbose=verbose)
    # Sort the dataframe and reset index
    if verbose:
        jprint("Sorting and final cleanup")
    self.feature_df.sort_values(by=["refid", "start", "end"], inplace=True)
    self.feature_df.reset_index(drop=True, inplace=True)
    if verbose:
        jprint("\tNumber of features imported: {}".format(self.feature_count))