def _set_config(self):
    """Build the Busco configuration and save it as ``config.ini``.

    Creates :attr:`outpath` if needed and stores the resulting file path
    in :attr:`config_filename`.
    """
    import os
    config = BuscoConfig(
        self.species,
        sample_name=self.sample_name,
        outpath=self.outpath,
        conda_bin_path=self.conda_bin_path,
        Rscript_bin_path="",  # not required by our analysis
        tmp_path="./tmp_{}".format(self.sample_name))
    # stdlib replacement for easydev.mkdirs: create the directory tree,
    # silently succeeding when it already exists
    os.makedirs(self.outpath, exist_ok=True)
    self.config_filename = self.outpath + "/config.ini"
    config.save_config_file(self.config_filename)
def fetch_ic50s(self):
    """Download the IC50 supplementary table (TableS4A.xlsx) and format it.

    The file is fetched from :attr:`url_base` into
    :attr:`data_folder_name` and then post-processed by
    :meth:`_format_data`.
    """
    # create the data folder if missing; replaces the awkward
    # "if exists: pass else: mkdirs" construct and the easydev dependency
    os.makedirs(self.data_folder_name, exist_ok=True)
    # NOTE(review): the target concatenates folder + "ic50s.xlsx" with no
    # separator; assumes data_folder_name ends with a path separator --
    # confirm against the attribute's definition
    urllib.request.urlretrieve(
        self.url_base + "TableS4A.xlsx",
        self.data_folder_name + "ic50s.xlsx")
    self._format_data()
def __init__(self):
    """Locate the damona root from DAMONA_PATH and create its layout.

    Exits with status 1 when the DAMONA_PATH environment variable is
    not defined.
    """
    if "DAMONA_PATH" not in os.environ:
        logger.error("DAMONA_PATH not found in your environment. You must define "
            "it. In this shell, type 'export DAMONA_PATH=PATH_WHERE_TO_PLACE_DAMONA'")
        sys.exit(1)
    self.damona_path = pathlib.Path(os.environ["DAMONA_PATH"])
    # create the root directory and every sub-directory damona relies on,
    # in the same order as before (joinpath(*[]) is the root itself)
    for parts in ([], ["envs"], ["images"], ["images", "damona_buffer"], ["bin"]):
        easydev.mkdirs(self.damona_path.joinpath(*parts))
def _set_config(self):
    """Create the Busco configuration file for this sample.

    The output directory is created if necessary and the configuration
    is written to ``<outpath>/config.ini`` (path kept in
    :attr:`config_filename`).
    """
    import os
    config = BuscoConfig(
        self.species,
        sample_name=self.sample_name,
        outpath=self.outpath,
        conda_bin_path=self.conda_bin_path,
        Rscript_bin_path="",  # not required by our analysis
        tmp_path="./tmp_{}".format(self.sample_name)
    )
    # os.makedirs with exist_ok=True is the stdlib equivalent of
    # easydev.mkdirs, removing the third-party dependency
    os.makedirs(self.outpath, exist_ok=True)
    self.config_filename = self.outpath + "/config.ini"
    config.save_config_file(self.config_filename)
def main(args=None):
    """Entry point of the sequana_bam_splitter standalone.

    :param args: full command-line argument list (including program
        name); defaults to a copy of :data:`sys.argv`.
    """
    if args is None:
        args = sys.argv[:]

    print(purple("Welcome to sequana_bam_splitter"))
    user_options = Options(prog="sequana_bam_splitter")
    if len(args) == 1:
        args.append("--help")

    # check the parsed argument list (not sys.argv) so that programmatic
    # calls with an explicit *args* behave like a real command line
    if "--version" in args:
        import sequana
        print(sequana.version)
        sys.exit(0)

    options = user_options.parse_args(args[1:])

    # set the level
    logger.level = options.level

    logger.info("This SAM/BAM/CRAM splitter is used for paired or un-paired "
                "reads with perfectly mapped or unmapped reads (flags 0, 4, "
                "16). Others are dropped.")
    logger.info("Reading {}".format(options.input))

    # What prefix used for the output filename ?
    if options.prefix is None:
        # Remove the .bam extension when present. The previous code used
        # rstrip(".bam") -- which strips any trailing '.', 'a', 'b', 'm'
        # characters, not the suffix -- and then overwrote the result
        # with a hard-coded "test" (debug leftover, now removed).
        if options.input.endswith(".bam"):
            prefix = options.input[:-len(".bam")]
        else:
            prefix = options.input
    else:
        prefix = options.prefix

    if options.outdir:
        prefix = options.outdir + os.sep + prefix
        if not os.path.exists(options.outdir):
            logger.info("Creating {} directory".format(options.outdir))
            os.makedirs(options.outdir, exist_ok=True)

    match, unmatch, flags = _main(options.input, prefix,
                                  keep_unmapped=options.keep_unmapped)
    logger.info("Matched: {}".format(match))
    logger.info("Unmatched (flag 4 and 256): {}".format(unmatch))
    logger.info("All flags: {}".format(Counter(flags)))
def krakendb():
    """Download the kraken toy database, falling back to manual retrieval.

    ``taxonomy.main`` may raise TypeError on CI (appdirs returns None
    instead of the expected user config path); in that case the four
    database files are fetched directly from the sequana data repository
    into ``$HOME/database``.
    """
    try:
        taxonomy.main([prog, '--download', 'toydb'])
    except TypeError:
        HOME = os.getenv('HOME')
        from sequana.misc import wget
        baseurl = "https://github.com/sequana/data/raw/master/kraken_toydb/"
        filenames = [
            "database.idx",
            "database.kdb",
            "taxonomy/names.dmp",
            "taxonomy/nodes.dmp"]
        # hoisted out of the loop: the deepest target directory only
        # needs to be created once (covers $HOME/database as well)
        os.makedirs(HOME + os.sep + "database/taxonomy", exist_ok=True)
        for filename in filenames:
            # URLs always use '/'; the previous code joined with os.sep,
            # which is wrong on Windows and doubled the slash elsewhere
            # since baseurl already ends with '/'
            wget(baseurl + filename,
                 os.sep.join([HOME, "database", filename]))
    except SystemExit:
        pass
def run_analysis(chrom, options, feature_dict):
    """Coverage analysis of one chromosome: running median, zscore, ROIs.

    Detected ROIs are written to
    ``<output_directory>/coverage_reports/<chrom_name>/rois.csv``.

    :param chrom: chromosome coverage object (provides running_median,
        compute_zscore, get_centralness, ...).
    :param options: parsed command-line options.
    :param feature_dict: optional mapping chromosome name -> feature list
        used to annotate the ROIs.
    """
    if options.verbose:
        print(chrom)
    if options.verbose:
        logger.info('Computing running median (w=%s)' % options.w_median)

    # compute running median
    chrom.running_median(n=options.w_median, circular=options.circular)

    stats = chrom.get_stats(output="dataframe")
    stats.set_index("name", inplace=True)
    # .loc replaces the .ix indexer, which was removed in pandas 1.0
    DOC = stats.loc['DOC'].Value

    # pick a default number of mixture models based on the depth
    if options.k is None and DOC < 8:
        options.k = 1
    elif options.k is None:
        options.k = 2

    if options.verbose:
        print("Number of mixture model %s " % options.k)
        print('Computing zscore')

    # Compute zscore
    chrom.compute_zscore(k=options.k, verbose=options.verbose)

    # Save the CSV file of the ROIs (positions beyond the double thresholds)
    high = chrom.thresholds.high2
    low = chrom.thresholds.low2
    query = "zscore > @high or zscore < @low"
    if feature_dict and chrom.chrom_name in feature_dict:
        f = FilteredGenomeCov(chrom.df.query(query), chrom.thresholds,
                              feature_list=feature_dict[chrom.chrom_name])
    else:
        f = FilteredGenomeCov(chrom.df.query(query), chrom.thresholds)
    directory = options.output_directory
    directory += os.sep + "coverage_reports"
    directory += os.sep + chrom.chrom_name
    mkdirs(directory)
    f.df.to_csv("{}/rois.csv".format(directory))

    if options.verbose:
        logger.info("Computing centralness")

    # Let us save the thresholds first and then change it to compute
    # centralness at 3 and 4 sigma
    thresholds = chrom.thresholds.copy()
    chrom.thresholds.low = -3
    chrom.thresholds.high = 3
    c3 = chrom.get_centralness()
    chrom.thresholds.low = -4
    chrom.thresholds.high = 4
    c4 = chrom.get_centralness()
    chrom.thresholds = thresholds.copy()  # Get back to the original values

    if options.verbose and chrom.thresholds:
        print(chrom.thresholds)
    if options.verbose:
        res = chrom._get_best_gaussian()
        print("sigma and mu of the central distribution: mu=%s, sigma=%s" %
              (round(res["mu"], 3), round(res['sigma'], 3)))
        print("Evenness: %8.3f" % chrom.get_evenness())
        print("Centralness (3 sigma): %f" % round(c3, 3))
        print("Centralness (4 sigma): %f" % round(c4, 4))
    if options.verbose:
        print("\n\n")
def run_analysis(chrom, options, feature_dict):
    """Coverage analysis of one chromosome: running median, mixture model
    fit, ROI detection and (optionally) an HTML report.

    ROIs and a JSON summary are written under
    ``<output_directory>/coverage_reports/<chrom_name>/``.

    :param chrom: chromosome coverage object.
    :param options: parsed command-line options.
    :param feature_dict: unused here; kept for interface compatibility.
    """
    logger.info("Computing some metrics")
    if chrom.DOC < 8:
        logger.warning("The depth of coverage is below 8. sequana_coverage is"
                       " not optimised for such depth. You may want to "
                       " increase the threshold to avoid too many false detections")
    logger.info(chrom.__str__())

    # clamp the running-median window to at most a quarter of the
    # chromosome length, keeping it odd as a running median requires
    if options.w_median > len(chrom.df) / 4:
        NW = int(len(chrom.df) / 4)
        if NW % 2 == 0:
            NW += 1
        # message fixed: the code divides by 4 (a quarter), the previous
        # message wrongly said "a fifth"
        logger.warning("median window length is too long. \n"
                       "  Setting the window length automatically to a quarter of\n"
                       "  the chromosome length ({})".format(NW))
        options.w_median = NW

    # compute the running median, zscore and ROIs for each chunk,
    # summarizing the results in a ChromosomeCovMultiChunk instance
    logger.info('Using running median (w=%s)' % options.w_median)
    logger.info("Number of mixture models %s " % options.k)
    results = chrom.run(options.w_median, options.k,
                        circular=options.circular, binning=options.binning,
                        cnv_delta=options.cnv_clustering)

    # Print some info related to the fitted mixture models (best effort:
    # the fit summary may be missing for some chunks)
    try:
        mu = results.data[0][0].as_dict()['data']['fit_mu']
        sigma = results.data[0][0].as_dict()['data']['fit_sigma']
        pi = results.data[0][0].as_dict()['data']['fit_pi']
        logger.info("Fitted central distribution (first chunk): mu=%s, sigma=%s, pi=%s" %
                    (round(mu, 3), round(sigma, 3), round(pi, 3)))
    except (KeyError, IndexError, TypeError) as err:
        # narrowed from a bare "except: pass" that hid all failures
        logger.debug("Could not report the fitted distribution: {}".format(err))

    # some information about the ROIs found
    high = chrom.thresholds.high2
    low = chrom.thresholds.low2
    logger.info("Searching for ROIs (threshold=[{},{}] ; double =[{},{}])".format(
        chrom.thresholds.low, chrom.thresholds.high, low, high))
    ROIs = results.get_rois()  # results is a ChromosomeCovMultiChunk instance
    logger.info("Number of ROIs found: {}".format(len(ROIs.df)))
    logger.info(" - below average: {}".format(len(ROIs.get_low_rois())))
    logger.info(" - above average: {}".format(len(ROIs.get_high_rois())))

    # Create directory and save ROIs
    directory = options.output_directory
    directory += os.sep + "coverage_reports"
    directory += os.sep + chrom.chrom_name
    mkdirs(directory)
    ROIs.df.to_csv("{}/rois.csv".format(directory))

    # save summary and metrics
    logger.info("Computing extra metrics")
    summary = results.get_summary()
    summary.to_json(directory + os.sep + "sequana_summary_coverage.json")
    logger.info("Evenness: {}".format(summary.data['evenness']))
    logger.info("Centralness (3 sigma): {}".format(summary.data['C3']))
    logger.info("Centralness (4 sigma): {}".format(summary.data['C4']))

    if options.skip_html:
        return

    logger.info("Creating report in %s. Please wait" % config.output_dir)
    if chrom._mode == "chunks":
        logger.warning(("This chromosome is large. "
                        "Plots in the HTML reports are skipped"))
    datatable = CoverageModule.init_roi_datatable(ROIs)
    ChromosomeCoverageModule(chrom, datatable,
                             options={"W": options.w_median, "k": options.k,
                                      "ROIs": ROIs, "circular": options.circular},
                             command=" ".join(["sequana_coverage"] + sys.argv[1:]))
def run_analysis(chrom, options, feature_dict):
    """Coverage analysis of one chromosome: running median, mixture model
    fit, ROI detection, plots and (optionally) an HTML report.

    Outputs (rois.csv, coverage.png, sequana_summary_coverage.json) are
    written under ``<output_directory>/<chrom_name>/``.

    :param chrom: chromosome coverage object.
    :param options: parsed command-line options.
    :param feature_dict: unused here; kept for interface compatibility.
    """
    logger.info("Computing some metrics")
    if chrom.DOC < 8:
        logger.warning("The depth of coverage is below 8. sequana_coverage is"
                       " not optimised for such depth. You may want to "
                       " increase the threshold to avoid too many false detections")
    logger.info(chrom.__str__())

    # clamp the running-median window to at most a quarter of the
    # chromosome length, keeping it odd as a running median requires
    if options.w_median > len(chrom.df) / 4:
        NW = int(len(chrom.df) / 4)
        if NW % 2 == 0:
            NW += 1
        # message fixed: the code divides by 4 (a quarter), the previous
        # message wrongly said "a fifth"
        logger.warning("median window length is too long. \n"
                       "  Setting the window length automatically to a quarter of\n"
                       "  the chromosome length ({})".format(NW))
    else:
        NW = options.w_median

    ######################### DEFINES OUTPUT DIR AND SAMPLE NAME ###########
    config.output_dir = options.output_directory
    config.sample_name = os.path.basename(options.input).split('.')[0]
    #########################################################################

    # compute the running median, zscore and ROIs for each chunk,
    # summarizing the results in a ChromosomeCovMultiChunk instance
    logger.info('Using running median (w=%s)' % NW)
    logger.info("Number of mixture models %s " % options.k)
    results = chrom.run(NW, options.k,
                        circular=options.circular, binning=options.binning,
                        cnv_delta=options.cnv_clustering)

    # Print some info related to the fitted mixture models (best effort:
    # the fit summary may be missing for some chunks)
    try:
        mu = results.data[0][0].as_dict()['data']['fit_mu']
        sigma = results.data[0][0].as_dict()['data']['fit_sigma']
        pi = results.data[0][0].as_dict()['data']['fit_pi']
        logger.info("Fitted central distribution (first chunk): mu=%s, sigma=%s, pi=%s" %
                    (round(mu, 3), round(sigma, 3), round(pi, 3)))
    except (KeyError, IndexError, TypeError) as err:
        # narrowed from a bare "except: pass" that hid all failures
        logger.debug("Could not report the fitted distribution: {}".format(err))

    # some information about the ROIs found
    high = chrom.thresholds.high2
    low = chrom.thresholds.low2
    logger.info("Searching for ROIs (threshold=[{},{}] ; double =[{},{}])".format(
        chrom.thresholds.low, chrom.thresholds.high, low, high))
    ROIs = results.get_rois()  # results is a ChromosomeCovMultiChunk instance
    logger.info("Number of ROIs found: {}".format(len(ROIs.df)))
    logger.info(" - below average: {}".format(len(ROIs.get_low_rois())))
    logger.info(" - above average: {}".format(len(ROIs.get_high_rois())))

    # Create directory and save ROIs (the former dead assignment of
    # options.output_directory to *directory* was removed)
    directory = "{}/{}".format(options.output_directory, chrom.chrom_name)
    mkdirs(directory)
    ROIs.df.to_csv("{}/rois.csv".format(directory))

    # save summary and metrics
    logger.info("Computing extra metrics")
    summary = results.get_summary(caller="sequana_coverage")
    summary.to_json("{}/sequana_summary_coverage.json".format(directory))
    logger.info("Evenness: {}".format(summary.data['evenness']))
    logger.info("Centralness (3 sigma): {}".format(summary.data['C3']))
    logger.info("Centralness (4 sigma): {}".format(summary.data['C4']))

    if options.skip_html:
        return

    chrom.plot_coverage("{}/coverage.png".format(directory))
    logger.info("Creating report in %s. Please wait" % options.output_directory)
    if chrom._mode == "chunks":
        logger.warning(("This chromosome is large. "
                        "Plots in the HTML reports are skipped"))
    datatable = CoverageModule.init_roi_datatable(ROIs)
    # sample name not important for the standalone
    config.sample_name = "subreports"
    ChromosomeCoverageModule(chrom, datatable,
                             options={"W": NW, "k": options.k,
                                      "ROIs": ROIs, "circular": options.circular},
                             command=" ".join(["sequana_coverage"] + sys.argv[1:]))
def __init__(self, dbname):
    """Create a Kraken2 builder for *dbname* and its taxonomy directory.

    :param dbname: name of the kraken2 database to build.
    """
    super(Kraken2Builder, self).__init__(dbname)
    self.path_to_taxonomy = sequana_config_path + os.sep + "kraken2_taxonomy"
    # stdlib equivalent of easydev.mkdirs (no error if it already exists)
    os.makedirs(self.path_to_taxonomy, exist_ok=True)
def run_analysis(chrom, options, feature_dict):
    """Coverage analysis of one chromosome: running median, mixture model
    fit, ROI detection and (optionally) an HTML report.

    ROIs and a JSON summary are written under
    ``<output_directory>/coverage_reports/<chrom_name>/``.

    :param chrom: chromosome coverage object.
    :param options: parsed command-line options.
    :param feature_dict: unused here; kept for interface compatibility.
    """
    logger.info("Computing some metrics")
    if chrom.DOC < 8:
        logger.warning(
            "The depth of coverage is below 8. sequana_coverage is"
            " not optimised for such depth. You may want to "
            " increase the threshold to avoid too many false detections")
    logger.info(chrom.__str__())

    # clamp the running-median window to at most a fifth of the
    # chromosome length, keeping it odd as a running median requires
    if options.w_median > len(chrom.df) / 5:
        NW = int(len(chrom.df) / 5)
        if NW % 2 == 0:
            NW += 1
        logger.warning(
            "median window length is too long. \n"
            "  Setting the window length automatically to a fifth of\n"
            "  the chromosome length ({})".format(NW))
        options.w_median = NW

    # compute the running median, zscore and ROIs for each chunk,
    # summarizing the results in a ChromosomeCovMultiChunk instance
    logger.info('Using running median (w=%s)' % options.w_median)
    logger.info("Number of mixture models %s " % options.k)
    results = chrom.run(options.w_median, options.k,
                        circular=options.circular)

    # Print some info related to the fitted mixture models (best effort:
    # the fit summary may be missing for some chunks)
    try:
        mu = results.data[0][0].as_dict()['data']['fit_mu']
        sigma = results.data[0][0].as_dict()['data']['fit_sigma']
        pi = results.data[0][0].as_dict()['data']['fit_pi']
        logger.info(
            "Fitted central distribution (first chunk): mu=%s, sigma=%s, pi=%s"
            % (round(mu, 3), round(sigma, 3), round(pi, 3)))
    except (KeyError, IndexError, TypeError) as err:
        # narrowed from a bare "except: pass" that hid all failures
        logger.debug("Could not report the fitted distribution: {}".format(err))

    # some information about the ROIs found
    high = chrom.thresholds.high2
    low = chrom.thresholds.low2
    logger.info(
        "Searching for ROIs (threshold=[{},{}] ; double =[{},{}])".format(
            chrom.thresholds.low, chrom.thresholds.high, low, high))
    ROIs = results.get_rois()  # results is a ChromosomeCovMultiChunk instance
    logger.info("Number of ROIs found: {}".format(len(ROIs.df)))
    logger.info(" - below average: {}".format(len(ROIs.get_low_rois())))
    logger.info(" - above average: {}".format(len(ROIs.get_high_rois())))

    # Create directory and save ROIs
    directory = options.output_directory
    directory += os.sep + "coverage_reports"
    directory += os.sep + chrom.chrom_name
    mkdirs(directory)
    ROIs.df.to_csv("{}/rois.csv".format(directory))

    # save summary and metrics
    logger.info("Computing extra metrics")
    summary = results.get_summary()
    summary.to_json(directory + os.sep + "sequana_summary_coverage.json")
    logger.info("Evenness: {}".format(summary.data['evenness']))
    logger.info("Centralness (3 sigma): {}".format(summary.data['C3']))
    logger.info("Centralness (4 sigma): {}".format(summary.data['C4']))

    if options.skip_html:
        return

    logger.info("Creating report in %s. Please wait" % config.output_dir)
    if chrom._mode == "chunks":
        logger.warning(
            ("This chromosome is large (more than {0}). Producing "
             "plots and HTML sub coverage plots only for data from 0 to "
             "{0} bases. Neccesitate to recompute some metrics. Please wait"
             ).format(options.chunksize))
    datatable = CoverageModule.init_roi_datatable(ROIs)
    ChromosomeCoverageModule(chrom, datatable,
                             options={
                                 "W": options.w_median,
                                 "k": options.k,
                                 "ROIs": ROIs,
                                 "circular": options.circular
                             })
def rnadiff(**kwargs):
    """Perform RNA-seq differential analysis.

    This command performs the differential analysis of gene expression. The
    analysis is performed on feature counts generated by a RNA-seq analysis
    (see e.g. https://github.com/sequana/rnaseq pipeline). The analysis is
    performed by DESeq2. A HTML report is created as well as a set of output
    files, including summary table of the analysis.

    To perform this analysis, you will need the GFF file used during the
    RNA-seq analysis, the feature stored altogether in a single file, an
    experimental design file, and the feature and attribute used during the
    feature count.

    Here is an example:

    \b
        sequana rnadiff --annotation Lepto.gff --design design.csv
        --features all_features.out --feature-name gene --attribute-name ID

    """
    import pandas as pd

    from sequana.featurecounts import FeatureCount
    from sequana.rnadiff import RNADiffAnalysis, RNADesign
    from sequana.modules_report.rnadiff import RNAdiffModule

    logger.setLevel(kwargs['logger'])

    outdir = kwargs['output_directory']
    feature = kwargs['feature_name']
    attribute = kwargs['attribute_name']
    design = kwargs['design']
    reference = kwargs['reference']

    if kwargs['annotation']:
        # sanity checks: the requested feature and attribute must exist in
        # the provided GFF, otherwise the downstream R analysis would fail
        gff = kwargs['annotation']
        logger.info(f"Checking annotation file")
        from sequana import GFF3
        g = GFF3(gff)
        if feature not in g.features:
            logger.critical(
                f"{feature} not found in the GFF. Most probably a wrong feature name")
            # exit here as well; previously only the attribute check below
            # exited, letting a bad feature name crash later
            sys.exit(1)
        attributes = g.get_attributes(feature)
        if attribute not in attributes:
            logger.critical(
                f"{attribute} not found in the GFF for the provided feature. "
                f"Most probably a wrong feature name. Please change "
                f"--attribute-name option or do not provide any GFF")
            sys.exit(1)
    else:
        gff = None

    design_check = RNADesign(design, reference=reference)

    compa_csv = kwargs['comparisons']
    if compa_csv:
        compa_df = pd.read_csv(compa_csv)
        comparisons = list(zip(compa_df["alternative"], compa_df["reference"]))
    else:
        comparisons = design_check.comparisons

    if kwargs['report_only'] is False:
        logger.info(
            f"Processing features counts and saving into {outdir}/light_counts.csv")
        fc = FeatureCount(kwargs['features'])
        # stdlib replacement for easydev.mkdirs
        os.makedirs(outdir, exist_ok=True)
        fc.rnadiff_df.to_csv(f"{outdir}/light_counts.csv")

        logger.info(f"Differential analysis to be saved into ./{outdir}")
        for k in sorted(["independent_filtering", "beta_prior",
                         "cooks_cutoff", "fit_type", "reference"]):
            logger.info(f" Parameter {k} set to : {kwargs[k]}")
        r = RNADiffAnalysis(
            f"{outdir}/light_counts.csv",
            design,
            condition=kwargs["condition"],
            comparisons=comparisons,
            fc_feature=feature,
            fc_attribute=attribute,
            outdir=outdir,
            gff=gff,
            cooks_cutoff=kwargs.get("cooks_cutoff"),
            independent_filtering=kwargs.get("independent_filtering"),
            beta_prior=kwargs.get("beta_prior"),
            fit_type=kwargs.get('fit_type'))

        logger.info(f"Saving output files into {outdir}/rnadiff.csv")
        try:
            results = r.run()
            results.to_csv(f"{outdir}/rnadiff.csv")
        except Exception as err:
            logger.error(err)
            sys.exit(1)
        else:
            logger.info(f"DGE done.")
            # cleanup if successful
            os.remove(f"{outdir}/rnadiff.err")
            os.remove(f"{outdir}/rnadiff.out")
            os.remove(f"{outdir}/rnadiff_light.R")

    logger.info(f"Reporting. Saving in rnadiff.html")
    report = RNAdiffModule(outdir, kwargs['design'], gff=gff,
                           fc_attribute=attribute,
                           fc_feature=feature,
                           alpha=0.05,
                           log2_fc=0,
                           condition=kwargs["condition"],
                           annot_cols=None,
                           pattern="*vs*_degs_DESeq2.csv")