def __init__(self, pattern="**/summary.json", output_filename=None,
             verbose=True, **kargs):
    super().__init__()
    from sequana import logger
    logger.level = "INFO"
    if verbose is False:
        logger.level = "WARNING"
    logger.info("Sequana Summary is still a tool in progress and has been "
                "tested with the quality_control pipeline only for now.")
    self.title = "Sequana multiple summary"
    self.devtools = DevTools()
    self.filenames = list(glob.iglob(pattern, recursive=True))
    self.summaries = [ReadSummary(filename) for filename in self.filenames]
    self.projects = [ReadSummary(filename).data['project']
                     for filename in self.filenames]
    self.create_report_content()
    self.create_html(output_filename)
def switch_header_to_gi(self, acc):
    """Kraken will only accept the GI from NCBI so we need to convert
    the ENA accession to GI numbers"""
    # Accessions may have a version .1, .2, hence this try/except: first
    # without the version and then with the version.
    # Note also that some accessions differ from an earlier version.
    # For instance, AF525933 is in the virus.txt list from ENA but
    # the new updated accession is AH012103, showing that the list and DB
    # may not be fully synchronised.
    # http://www.ebi.ac.uk/ena/data/search?query=AF525933
    # In such a case, the results attribute will be missing that accession,
    # which needs to be searched for specifically. We cannot know its name
    # before downloading the fasta.
    if acc in self.results.keys():
        res = self.results[acc]
    else:
        try:
            res = self.results[acc.split(".")[0]]
        except KeyError:
            logger.warning(
                "\nUnknown accession (%s). May be an updated version. Checking..." % acc)
            res = self.ena_id_to_gi_number([acc])
            self.results.update(res)
            res = res[acc]
            logger.info('Found %s using GI number' % acc)
    return ">" + res['identifier'] + " " + res['comment']
def random_selection(self, output_filename, nreads=None,
                     expected_coverage=None, reference_length=None):
    """Select random reads

    :param nreads: number of reads to select randomly. Must be less than
        the number of available reads in the original file.
    :param expected_coverage:
    :param reference_length:

    If expected_coverage and reference_length are provided, nreads is
    replaced automatically.
    """
    assert output_filename != self.filename, \
        "output filename should be different from the input filename"
    self.reset()
    if expected_coverage and reference_length:
        mu = self.stats['mean']
        nreads = int(expected_coverage * reference_length / mu)

    assert nreads < len(self), "nreads parameter larger than actual number of reads"
    selector = random.sample(range(len(self)), nreads)
    logger.info("Creating a pacbio BAM file with {} reads".format(nreads))

    with pysam.AlignmentFile(output_filename, "wb", template=self.data) as fh:
        for i, read in enumerate(self.data):
            if i in selector:
                fh.write(read)
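
# Self-contained sketch of the coverage-based selection above: the number of
# reads to sample is expected_coverage * reference_length / mean_read_length.
# All values below are hypothetical, not taken from a real data set.
expected_coverage = 50          # target depth (X)
reference_length = 4_000_000    # genome size in bases
mean_read_length = 8_000        # mean read length in the BAM
nreads = int(expected_coverage * reference_length / mean_read_length)
print(nreads)  # -> 25000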
def _get_df(self):
    if self._df is None:
        self.reset()
        N = 0
        all_results = []
        for read in self.data:
            res = []
            # count reads
            N += 1
            if (N % 10000) == 0:
                logger.info("Read %d sequences" % N)
            # res[0] = read length
            res.append(read.query_length)
            # res[1] = GC content
            c = collections.Counter(read.query_sequence)
            res.append(100 * (c['g'] + c['G'] + c['c'] + c['C']) /
                       float(sum(c.values())))
            # aggregate results
            all_results.append(res)

        self._df = pd.DataFrame(all_results,
                                columns=['read_length', 'GC_content'])
        self.reset()
    return self._df
def get_most_probable_strand_consensus(rnaseq_folder):
    """From a sequana rna-seq run folder, get the most probable strand,
    based on the frequencies of counts assigned with '0', '1' or '2' type
    strandness (featureCounts nomenclature).
    """
    rnaseq_folder = Path(rnaseq_folder)
    sample_folders = list(
        set([x.parent for x in rnaseq_folder.glob("*/feature_counts_[012]")]))

    df = pd.concat([
        get_most_probable_strand(sample_folder)
        for sample_folder in sample_folders
    ])

    logger.info("Strandness probability report:")
    logger.info(df)

    probable_strands = df.loc[:, "strand"].unique()

    if len(probable_strands) == 1:
        return probable_strands[0]
    else:
        raise IOError(
            f"No consensus on most probable strand. Could be: {probable_strands}")
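
# Minimal, self-contained sketch of the consensus rule used above, with a toy
# pandas DataFrame standing in for the per-sample reports (sample names and
# strand values are hypothetical): a consensus exists only if every sample
# reports the same strand.
import pandas as pd

toy = pd.DataFrame({"sample": ["S1", "S2", "S3"], "strand": [1, 1, 1]})
strands = toy["strand"].unique()
if len(strands) == 1:
    print("consensus strand:", strands[0])   # -> consensus strand: 1
else:
    print("no consensus:", strands)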
def _enrichr(self, category, background=None, verbose=True):
    if background is None:
        background = self.background

    if isinstance(category, list):
        gene_list = category
    else:
        assert category in ['up', 'down', 'all']
        gene_list = list(self.rnadiff.gene_lists[category])

    if self.mapper is not None:
        logger.info("Input gene list of {} ids".format(len(gene_list)))
        #gene_list = [x.replace("gene:", "") for x in gene_list]
        identifiers = self.mapper.loc[gene_list]['name'].drop_duplicates().values
        logger.info("Mapped gene list of {} ids".format(len(identifiers)))
        gene_list = list(identifiers)

    enr = gseapy.enrichr(gene_list=gene_list,
                         gene_sets=self.gene_sets,
                         verbose=verbose,
                         background=background,
                         outdir="test",
                         no_plot=True)
    return enr
def save_significant_pathways(self, mode, cutoff=0.05, nmax=20,
                              background=None):  #pragma: no cover
    """mode should be up, down or all"""
    if background is None:
        background = self.background

    # select the relevant pathways
    df = self._enrichr(mode, background).results
    df = self._get_final_df(df, cutoff=cutoff, nmax=nmax)
    logger.warning("Found {} pathways to save".format(len(df)))
    if len(df) == nmax:
        logger.warning("Restricted pathways to {}".format(nmax))

    logger.info("saving {} deregulated pathways".format(len(df)))

    summaries = {}
    # save them
    for ID in df['Term']:
        summary = self.save_pathway(ID, filename="{}_{}.png".format(ID, mode))
        summaries[ID] = summary

    return summaries
def __init__(self, filename, verbose=False):
    if filename.endswith(".gz"):
        raise ValueError("Must be decompressed.")
    self._fasta = FastxFile(filename)
    self.filename = filename
    logger.info("Reading input fasta file...please wait")
    self._N = len([x for x in FastxFile(filename)])
def _get_df(self):
    if self._df is None:
        self.reset()
        N = 0
        all_results = []
        for read in self.data:
            res = []
            # count reads
            N += 1
            if (N % 10000) == 0:
                logger.info("Read %d sequences" % N)
            # res[0] = read length
            res.append(read.query_length)
            # res[1] = GC content
            c = collections.Counter(read.query_sequence)
            res.append(100 * (c['g'] + c['G'] + c['c'] + c['C']) /
                       float(sum(c.values())))
            # aggregate results
            all_results.append(res)

        self._df = pd.DataFrame(all_results,
                                columns=['read_length', 'GC_content'])
        self.reset()
    return self._df
def _get_data(self):
    # return a list of lists
    # each list is made of 3 values: mapq, length, concordance
    from sequana import Cigar
    data = []
    self.reset()
    count = 0
    for align in self.data:
        mapq = align.mapq
        length = align.rlen
        if self.method in ["blasr", "minimap2"]:
            this = Cigar(align.cigarstring).stats()
            S, D, I, M = this[4], this[2], this[1], this[0]
            concordance = 1 - (D + I + S) / (D + I + M + S)
        else:
            this = align.get_cigar_stats()[0]
            error = this[-1]  # supposed to be I + D + X
            total = this[-1] + this[0]
            if total:
                concordance = 1 - error / total
            else:
                concordance = 0
        data.append([mapq, length, concordance])
        if count % 10000 == 0:
            logger.info("%s" % count)
        count += 1
    return data
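
# Self-contained sketch of the concordance formula used above: given counts of
# deletions (D), insertions (I), substitutions (S) and matches (M) extracted
# from a CIGAR, concordance = 1 - (D + I + S) / (D + I + M + S). The counts
# below are hypothetical.
D, I, S, M = 5, 3, 2, 990
concordance = 1 - (D + I + S) / (D + I + M + S)
print(round(concordance, 4))  # -> 0.99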
def _parse_data(self):
    taxonomy = {}
    logger.info("Reading kraken data")
    columns = ["status", "taxon", "length"]

    # we select only columns 0, 2 and 3 to save memory, which is required on
    # very large files
    try:
        # each call to concat in the for loop below will take time and
        # increase with chunk position. For 15M reads, this has a big cost,
        # so a chunksize of 1M is better than 1000 and still reasonable in
        # memory.
        reader = pd.read_csv(self.filename, sep="\t", header=None,
                             usecols=[0, 2, 3], chunksize=1000000)
    except pd.parser.CParserError:
        raise NotImplementedError
        # this section is for the only_classified_output case, when no
        # classified read was found
        self.unclassified = N  # size of the input data set
        self.classified = 0
        self._df = pd.DataFrame([], columns=columns)
        self._taxons = self._df.taxon
        return

    for chunk in reader:
        try:
            self._df
            self._df = pd.concat([self._df, chunk])
        except AttributeError:
            self._df = chunk

    self._df.columns = columns

    count = sum(self._df.taxon == 1)
    if count:
        logger.warning("Found %s taxons with root ID (1)" % count)

    # This gives the list of taxons as index and their amount.
    # Above, we select only columns 0, 2, 3; the columns are still labelled
    # 0, 2, 3 in the df
    self._taxons = self._df.groupby("taxon").size()
    try:
        self._taxons.drop(0, inplace=True)
    except:
        pass  # 0 may not be there
    self._taxons.sort_values(ascending=False, inplace=True)

    category = self.df.groupby("status").size()

    if 'C' in category.index:
        self.classified = category['C']
    else:
        self.classified = 0

    if 'U' in category.index:
        self.unclassified = category['U']
    else:
        self.unclassified = 0
def __init__(self, filename, force=False, **kwargs):
    """.. rubric:: constructor

    :param filename:
    :param force: even though the file format is not recognised,
        you can force the instantiation. Then, you can use your own filters.
    """
    vcf = VCFBase(filename, verbose=False, **kwargs)

    if vcf.version == "4.1":
        logger.info("Reading VCF v 4.1")
        self.vcf = VCF_mpileup_4dot1(filename, **kwargs)
    elif vcf.version == "4.2" and vcf.source.startswith("freeBayes"):
        logger.info("Reading VCF v 4.2 (freebayes)")
        from sequana.freebayes_vcf_filter import VCF_freebayes
        self.vcf = VCF_freebayes(filename, **kwargs)
    else:
        print(vcf.version)
        print(vcf.source)
        msg = """This VCF file is not recognised. So far we handle version
v4.1 with mpileup and v4.2 with freebayes. You may use the force option but
not all filters will be recognised"""
        if force is True:
            print("VCF version %s not tested" % vcf.version)
            self.vcf = vcf
        else:
            raise ValueError(msg)
def _add_db_in_config(self):
    """Add new annotation at the end of snpEff.config file."""
    logger.info("Updating configuration file")
    if not self._check_database(self.ref_name):
        with open("snpEff.config", "a") as fp:
            print(self.ref_name + ".genome : " + self.ref_name, file=fp)
def splitter_mapped_unmapped(filename, prefix):
    logger.info("Creating 2 files (mapped and unmapped reads)")
    data = sniff(filename)

    count = 0
    flags = []
    match = 0
    unmatch = 0

    logger.info("Please wait while creating output files")
    with open("{}.unmapped.fastq".format(prefix), "w") as fnosirv:
        with open("{}.mapped.fastq".format(prefix), "w") as fsirv:
            for a in data:
                if a.flag & 256:
                    # secondary alignment (SAM flag 256): skip the sequence
                    unmatch += 1
                elif a.flag & 4:
                    # unmapped read (SAM flag 4)
                    read = "@{}\n{}\n+\n{}\n".format(a.qname, a.query_sequence, a.qual)
                    assert len(a.query_sequence) == len(a.qual)
                    fnosirv.write(read)
                    unmatch += 1
                else:
                    read = "@{}\n{}\n+\n{}\n".format(a.qname, a.query_sequence, a.qual)
                    assert len(a.query_sequence) == len(a.qual)
                    fsirv.write(read)
                    match += 1
                flags.append(a.flag)

    return match, unmatch, flags
def df(self):
    # RG: ID read group ??
    # np: number of passes
    # rq ?
    # rs: list 6 numbers ?
    # za:
    # zm: ID of the ZMW
    # sn: SNR how is this computed ?
    # zs
    # - sn: list of ACGT SNRs. A, C, G, T in that order
    if self._df is not None:
        return self._df

    logger.info("Scanning input file. Please wait")
    self.reset()
    N = 0
    all_results = []
    # This takes 60% of the time...could use cython ?
    for read in self.data:
        tags = dict(read.tags)  # 11% of the time
        res = []
        # count reads
        N += 1
        if (N % 10000) == 0:
            logger.info("Read %d sequences" % N)
        # res[0] = read length
        res.append(read.query_length)  # also stored in tags["qe"] - tags["qs"]
        # collections.Counter is slow, let us do it ourselves
        res.append(100. / read.qlen * sum(
            [read.query_sequence.count(letter) if read.query_sequence else 0
             for letter in "CGcgSs"]))
        # res[1:4] contains SNR stored in tags['sn'] in the order A, C, G, T
        try:
            snr = list(tags['sn'])
        except KeyError:
            snr = [None] * 4
        res = res + snr
        # res[6] = ZMW name, also stored in tags["zm"]
        res.append(int(tags['zm']))
        res.append(tags['np'])
        # aggregate results
        all_results.append(res)

    self._df = pd.DataFrame(all_results,
                            columns=['read_length', 'GC_content',
                                     'snr_A', 'snr_C', 'snr_G', 'snr_T',
                                     'ZMW', "nb_passes"])
    self._df.ZMW = self._df.ZMW.astype(int)

    if len(self._df.ZMW.unique()) != len(self._df):
        logger.warning("Found non unique ZMW. This may not be a CCS but "
                       "a subread file. Consider using PacbioSubreads class")
    self.reset()
    return self._df
def _get_data(self):
    # return a list of lists
    # each list is made of 3 values: mapq, length, concordance
    from sequana import Cigar
    data = []
    self.reset()
    count = 0
    for align in self.data:
        mapq = align.mapq
        length = align.rlen
        if self.method in ["blasr", "minimap2"]:
            this = Cigar(align.cigarstring).stats()
            S, D, I, M = this[4], this[2], this[1], this[0]
            concordance = 1 - (D + I + S) / (D + I + M + S)
        else:
            this = align.get_cigar_stats()[0]
            error = this[-1]  # supposed to be I + D + X
            total = this[-1] + this[0]
            if total:
                concordance = 1 - error / total
            else:
                concordance = 0
        data.append([mapq, length, concordance])
        if count % 10000 == 0:
            logger.info("%s" % count)
        count += 1
    return data
def test_analysis(krakendb):
    file1 = sequana_data("Hm2_GTGAAA_L005_R1_001.fastq.gz")
    file2 = sequana_data("Hm2_GTGAAA_L005_R2_001.fastq.gz")

    # Test that a database must be provided
    try:
        df = taxonomy.main([prog, '--file1', file1])
        assert False
    except:
        assert True

    from tempfile import TemporaryDirectory
    directory = TemporaryDirectory()

    # If on travis and we could not load the database, use the local one
    # that must have been downloaded
    try:
        df = taxonomy.main([
            prog, '--file1', file1, "--database", "toydb", "--file2", file2,
            "--level", "INFO", "--output-directory", directory.name,
            "--thread", "1"
        ])
    except:
        # For travis test
        HOME = os.getenv('HOME')
        database = os.sep.join([HOME, '.config', 'sequana', 'kraken_toydb'])
        df = taxonomy.main([
            prog, '--file1', file1, "--database", database, "--file2", file2,
            "--output-directory", directory.name, "--thread", "1"
        ])

    from sequana import logger
    logger.info(directory.name)
def download_accession_from_ncbi(self, accession):
    # the input can be a list, a unique string, or a filename with a
    # 1-column list of accessions to retrieve
    if isinstance(accession, list):
        accessions = accession
    elif isinstance(accession, str):
        if os.path.exists(accession):
            with open(accession, "r") as fin:
                accessions = fin.read().split()
        else:
            accessions = [accession]

    from easydev import Progress
    N = len(accessions)
    pb = Progress(N)
    logger.info("Fetching {} accession fasta files from NCBI".format(N))

    for i, accession in enumerate(accessions):
        data = self.eutils.EFetch("nucleotide", rettype="fasta",
                                  id=accession, retmode="text")
        if isinstance(data, int):
            logger.info(
                "Could not fetch this accession: {}. continue".format(accession))
            print("Could not fetch this accession: {}. continue".format(accession))
        else:
            outname = "{}/library/{}.fa".format(self.dbname, accession)
            with open(outname, "wb") as fout:
                fout.write(data)
        pb.animate(i + 1)
def create_taxonomy_file(self, filename="taxonomy.dat"):
    logger.info("Please wait while creating the output file. "
                "This may take a few minutes")
    from easydev import Progress
    pb = Progress(len(self.df_nodes))
    count = 0
    df_names = self.df_names.query("key == 'scientific name'").copy()
    with open(filename, "w") as fout:
        for taxid in self.df_nodes.index:
            row = self.df_nodes.loc[taxid]
            fout.write("ID : {}\n".format(taxid))
            fout.write("PARENT ID : {}\n".format(row.parent))
            fout.write("RANK : {}\n".format(row['rank']))
            #names = df_names.loc[taxid]
            fout.write("{:26s}: {}\n".format("SCIENTIFIC NAME",
                                             df_names.loc[taxid, "name"]))
            fout.write("//\n")
            count += 1
            pb.animate(count)
def main(args=None):
    if args is None:
        args = sys.argv[:]

    print("Welcome to sequana_vcf_filter")

    user_options = Options(prog="sequana_vcf_filter")

    if "--version" in args:
        import sequana
        print(sequana.version)
        sys.exit()
    elif len(args) == 1 or "--help" in args:
        user_options.parse_args(["prog", "--help"])
    elif len(args) == 2:
        class SimpleOpt():
            pass
        options = SimpleOpt()
        options.input_filename = args[1]
    else:
        options = user_options.parse_args(args[1:])

    # set the level
    logger.level = options.level

    vcf = VCF(options.input_filename)
    vcf.vcf.filter_dict['QUAL'] = options.quality
    vcf.vcf.apply_indel_filter = options.apply_indel_filter
    vcf.vcf.apply_dp4_filter = options.apply_dp4_filter
    vcf.vcf.apply_af1_filter = options.apply_af1_filter
    vcf.vcf.dp4_minimum_depth = options.minimum_depth
    vcf.vcf.dp4_minimum_depth_strand = options.minimum_depth_strand
    vcf.vcf.dp4_minimum_ratio = options.minimum_ratio
    vcf.vcf.minimum_af1 = options.minimum_af1
    vcf.vcf.filter_dict['INFO'] = {}
    vcf.vcf.filter_dict['QUAL'] = options.quality

    for this in options.filter:
        this = this[0]
        signs = [">", "<", ">=", "<="]
        for sign in signs:
            if sign in this:
                key, value = this.split(sign, 1)
                key = key.strip()
                value = sign.strip() + value.strip()
                vcf.vcf.filter_dict['INFO'][key] = value
                break

    logger.info(vcf.vcf.filter_dict)

    res = vcf.vcf.filter_vcf(options.output_filename,
                             output_filtered=options.output_filtered_filename)
    print()
    #print(res)
    return res
def get_data(self, ontologies, include_negative_enrichment=True, fdr=0.05):
    if isinstance(ontologies, str):
        ontologies = [ontologies]
    else:
        assert isinstance(ontologies, list)

    # First, we select the required ontologies and build a common data set
    all_data = []
    for ontology in ontologies:
        data = self.enrichment[ontology]['result']
        if isinstance(data, dict):
            # there was only one hit; we expect a list
            data = [data]
        all_data.extend(data)
    data = all_data

    # remove unclassified GO terms
    unclassified = [x for x in data if x['term']['label'] == "UNCLASSIFIED"]
    logger.info("Found {} unclassified".format(len(unclassified)))
    data = [x for x in data if x['term']['label'] != "UNCLASSIFIED"]

    df = pd.DataFrame(data)
    if len(df) == 0:
        return df
    else:
        logger.info("Found {} GO terms".format(len(df)))

    df = df.query("number_in_list!=0").copy()
    logger.info("Found {} GO terms with at least 1 gene in reference".format(len(df)))

    # extract the ID and label
    df['id'] = [x['id'] for x in df['term']]
    df['label'] = [x['label'] for x in df['term']]

    # some extra information for convenience
    df["pct_diff_expr"] = df['number_in_list'] * 100 / df['number_in_reference']
    df["log2_fold_enrichment"] = pylab.log2(df['fold_enrichment'])
    df["abs_log2_fold_enrichment"] = abs(pylab.log2(df['fold_enrichment']))

    # Some users may want to include GO terms with fold enrichment
    # significantly below 1, others not.
    if include_negative_enrichment is False:
        df = df.query("fold_enrichment>=1").copy()
        logger.info("Found {} GO terms after keeping only positive enrichment".format(len(df)))

    # filter out FDR > fdr (0.05 by default)
    df = df.query("fdr<=@fdr").copy()
    logger.info("Found {} GO terms after keeping only FDR<{}".format(len(df), fdr))

    return df
def get_taxonomy_biokit(self, ids):
    """Retrieve taxons given a list of taxons

    :param list ids: list of taxons as strings or integers. Could also be
        a single string or a single integer
    :return: a dataframe

    .. note:: the first call loads all taxons in memory and takes a few
        seconds, but subsequent calls are much faster
    """
    # filter the lineage to keep only information from one of the main ranks,
    # that is superkingdom, kingdom, phylum, class, order, family, genus and
    # species
    ranks = ('kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species')

    if isinstance(ids, int):
        ids = [ids]

    if len(ids) == 0:
        return pd.DataFrame()

    logger.info('Retrieving taxon using biokit.Taxonomy')

    if isinstance(ids, list) is False:
        ids = [ids]

    lineage = [self.tax.get_lineage_and_rank(x) for x in ids]

    # Now, we filter each lineage to keep only relevant ranks.
    # We drop the 'no rank' entries and create a dictionary.
    # Not nice but works for now.
    results = []
    for i, this in enumerate(lineage):
        default = dict.fromkeys(ranks, ' ')
        for entry in this:
            if entry[1] in ranks:
                default[entry[1]] = entry[0]
            elif entry[1] == "superkingdom":
                default["kingdom"] = entry[0]
        # Scientific name is the last entry tagged as no_rank following
        # species TODO (check this assumption)
        # e.g. 351680 and 151529 have the same 7 ranks, so to differentiate
        # them, the scientific name should be used.
        # By default, we will take the last one. If species or genus, we
        # repeat the term
        try:
            default['name'] = this[-1][0]
        except IndexError:
            default['name'] = "root (ambiguous kingdom)"
        results.append(default)

    df = pd.DataFrame.from_records(results)
    df.index = ids
    df = df[list(ranks) + ['name']]
    df.index = df.index.astype(int)

    return df
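
# Minimal sketch of the lineage-filtering step above, using a hypothetical
# lineage given as (name, rank) tuples instead of a real taxonomy lookup:
# only the main ranks are kept and 'superkingdom' is mapped onto 'kingdom'.
ranks = ('kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species')
lineage = [("Bacteria", "superkingdom"), ("Proteobacteria", "phylum"),
           ("Escherichia", "genus"), ("Escherichia coli", "species")]

row = dict.fromkeys(ranks, ' ')
for name, rank in lineage:
    if rank in ranks:
        row[rank] = name
    elif rank == "superkingdom":
        row["kingdom"] = name
row['name'] = lineage[-1][0]    # scientific name taken from the last entry
print(row)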
def filter(self, identifiers_list=[], min_bp=None, max_bp=None,
           progressbar=True, output_filename='filtered.fastq'):
    """Save reads in a new file if they are not in the identifier_list

    :param int min_bp: ignore reads with length shorter than min_bp
    :param int max_bp: ignore reads with length above max_bp

    """
    # 7 seconds without identifiers to scan a file
    # with 750,000 reads
    if min_bp is None:
        min_bp = 0

    if max_bp is None:
        max_bp = 1e9

    # make sure we are at the beginning
    self.rewind()

    output_filename, tozip = self._istozip(output_filename)

    with open(output_filename, "w") as fout:
        pb = Progress(self.n_reads)
        buf = ""
        filtered = 0
        saved = 0

        for count, lines in enumerate(grouper(self._fileobj)):
            identifier = lines[0].split()[0]
            if lines[0].split()[0].decode() in identifiers_list:
                filtered += 1
            else:
                N = len(lines[1])
                if N <= max_bp and N >= min_bp:
                    buf += "{}{}+\n{}".format(
                        lines[0].decode("utf-8"),
                        lines[1].decode("utf-8"),
                        lines[3].decode("utf-8"))
                    saved += 1
                else:
                    filtered += 1
            if count % 100000 == 0:
                fout.write(buf)
                buf = ""
                if progressbar is True:
                    pb.animate(count + 1)
        fout.write(buf)

    if filtered < len(identifiers_list):
        print("\nWARNING: not all identifiers were found in the fastq file "
              "to be filtered.")
    logger.info("\n{} reads were filtered out and {} saved in {}".format(
        filtered, saved, output_filename))

    if tozip is True:
        logger.info("Compressing file")
        self._gzip(output_filename)
def _parse_data(self):
    taxonomy = {}
    logger.info("Reading kraken data")
    columns = ["status", "taxon", "length"]

    # we select only columns 0, 2 and 3 to save memory, which is required on
    # very large files
    try:
        # each call to concat in the for loop below will take time and
        # increase with chunk position. For 15M reads, this has a big cost,
        # so a chunksize of 1M is better than 1000 and still reasonable in
        # memory.
        reader = pd.read_csv(self.filename, sep="\t", header=None,
                             usecols=[0, 2, 3], chunksize=1000000)
    except pd.parser.CParserError:
        raise NotImplementedError
        # this section is for the only_classified_output case, when no
        # classified read was found
        self.unclassified = N  # size of the input data set
        self.classified = 0
        self._df = pd.DataFrame([], columns=columns)
        self._taxons = self._df.taxon
        return

    for chunk in reader:
        try:
            self._df
            self._df = pd.concat([self._df, chunk])
        except AttributeError:
            self._df = chunk

    self._df.columns = columns

    count = sum(self._df.taxon == 1)
    if count:
        logger.warning("Found %s taxons with root ID (1)" % count)

    # This gives the list of taxons as index and their amount.
    # Above, we select only columns 0, 2, 3; the columns are still labelled
    # 0, 2, 3 in the df
    self._taxons = self._df.groupby("taxon").size()
    try:
        self._taxons.drop(0, inplace=True)
    except:
        pass  # 0 may not be there
    self._taxons.sort_values(ascending=False, inplace=True)

    category = self.df.groupby("status").size()

    if 'C' in category.index:
        self.classified = category['C']
    else:
        self.classified = 0

    if 'U' in category.index:
        self.unclassified = category['U']
    else:
        self.unclassified = 0
def download_taxonomic_file(self, overwrite=False):
    """Loads entire flat file from EBI

    Do not overwrite the file by default.
    """
    import ftplib
    from sequana import sequana_config_path
    if os.path.exists(self.database) and overwrite is False:
        logger.info("Found taxonomy.dat file in your sequana path {}".format(
            sequana_config_path))
        return
    else:
        logger.info("Downloading and extracting the taxonomy file from the "
                    "web. Please be patient.")

    if self.source == "ena":
        url = 'ftp.ebi.ac.uk'
    else:
        url = 'ftp.ncbi.nlm.nih.gov'

    self.ftp = ftplib.FTP(url)
    self.ftp.login()
    if self.source == "ena":
        # for the EBI ftp only:
        self.ftp.cwd('databases')
        self.ftp.cwd('pub')
        self.ftp.cwd('databases')
        self.ftp.cwd('taxonomy')
        logger.warning('Downloading and saving in %s. This is from ebi and '
                       'may be behind the NCBI taxonomy' % self.database)
        self.ftp.retrbinary('RETR taxonomy.dat',
                            open(self.database, 'wb').write)
        self.ftp.close()
    else:
        self.ftp.cwd('pub')
        self.ftp.cwd('taxonomy')
        logger.warning('Downloading and saving in %s from ncbi ftp' % self.database)
        import tempfile
        import shutil
        with tempfile.TemporaryDirectory() as tmpdir:
            filename = tmpdir + os.sep + "taxdump.tar.gz"
            self.ftp.retrbinary('RETR taxdump.tar.gz',
                                open(filename, "wb").write)
            import tarfile
            tf = tarfile.open(filename)
            assert "nodes.dmp" in tf.getnames()
            assert "names.dmp" in tf.getnames()
            tf.extract("nodes.dmp", tmpdir)
            tf.extract("names.dmp", tmpdir)
            ncbi = NCBITaxonomy(tmpdir + os.sep + "names.dmp",
                                tmpdir + os.sep + "nodes.dmp")
            ncbi.create_taxonomy_file(tmpdir + os.sep + "taxonomy.dat")
            shutil.move(tmpdir + os.sep + "taxonomy.dat", self.database)
        self.ftp.close()
def copy_config_from_sequana(module, source="config.yaml",
                             target="config.yaml"):
    # identify config name from the requested module
    user_config = module.path + os.sep + source
    if os.path.exists(user_config):
        shutil.copy(user_config, target)
        txt = "copied %s from sequana %s pipeline"
        logger.info(txt % (source, module.name))
    else:
        logger.warning(user_config + " not found")
def run(self, dbname="multiple", output_prefix="kraken_final"):
    """Run the hierarchical analysis

    This method does not return anything but creates a set of files:

    - kraken_final.out
    - krona_final.html
    - kraken.png  (pie plot of the classified/unclassified reads)

    .. note:: the databases are run in the order provided in the constructor.
    """
    # list of all output to merge at the end
    self._list_kraken_output = []
    self._list_kraken_input = []

    # Iteration over the databases
    for iteration in range(len(self.databases)):
        status = self._run_one_analysis(iteration)
        last_unclassified = self._list_kraken_input[-1]

        # if the last database left no unclassified read, we can stop here
        stat = os.stat(last_unclassified)
        if stat.st_size == 0:
            break

    # concatenate all kraken output files
    file_output_final = self.output_directory + os.sep + "%s.out" % output_prefix
    with open(file_output_final, 'w') as outfile:
        for fname in self._list_kraken_output:
            with open(fname) as infile:
                for line in infile:
                    outfile.write(line)

    # create html report
    logger.info("Analysing results")
    result = KrakenResults(file_output_final)

    # TODO: this looks similar to the code in KrakenPipeline. could be factorised
    result.to_js("%s%s%s.html" % (self.output_directory, os.sep, output_prefix))
    result.plot(kind="pie")
    pylab.savefig(self.output_directory + os.sep + "kraken.png")

    prefix = self.output_directory + os.sep
    result.kraken_to_json(prefix + "kraken.json", dbname)
    result.kraken_to_csv(prefix + "kraken.csv", dbname)

    # remove kraken intermediate files (including unclassified files)
    if self.unclassified_output:
        # Just cp the last unclassified file
        import shutil
        shutil.copy2(self._list_kraken_input[-1], self.unclassified_output)

    if not self.keep_temp_files:
        for f_temp in self._list_kraken_output:
            os.remove(f_temp)
        for f_temp in self._list_kraken_input:
            os.remove(f_temp)
def _download_sequana_db1(self, verbose=True):
    dbname = "sequana_db1"
    from easydev import md5
    dir1 = sequana_config_path + os.sep + dbname
    dir2 = dir1 + os.sep + "taxonomy"
    self.dv.mkdir(dir1)
    self.dv.mkdir(dir2)

    logger.info("Downloading about 8Gb of data (if not already downloaded) "
                "from Synapse into %s" % dir1)

    from os.path import exists
    filename = dir1 + "ena_list.txt"
    if exists(filename) and md5(filename) == "a9cc6268f3338d1632c4712a412593f2":
        pass
    else:
        self._download_from_synapse('syn6171700', dir1)

    # database.idx
    filename = dir1 + "database.idx"
    if exists(filename) and md5(filename) == "2fa4a99a4f52f2f04c5a965adb1534ac":
        pass
    else:
        self._download_from_synapse('syn6171017', dir1)

    # database.kdb ; this one is large (8Gb)
    filename = dir1 + "database.kdb"
    if exists(filename) and md5(filename) == "ff698696bfc88fe83bc201937cd9cbdf":
        pass
    else:
        self._download_from_synapse('syn6171107', dir1)

    # Then, the taxonomy directory
    filename = dir1 + "names.dmp"
    if exists(filename) and md5(filename) == "10bc7a63c579de02112d125a51fd65d0":
        pass
    else:
        self._download_from_synapse('syn6171286', dir2)

    filename = dir1 + "nodes.dmp"
    if exists(filename) and md5(filename) == "a68af5a60434e2067c4a0a16df873980":
        pass
    else:
        self._download_from_synapse('syn6171289', dir2)

    filename = dir1 + "taxons.txt"
    if exists(filename) and md5(filename) == "e78fbb43b3b41cbf4511d6af16c0287f":
        pass
    else:
        self._download_from_synapse('syn6171290', dir2)

    logger.info('done. You should have a kraken DB in %s' % dir1)

    # The annotations
    wget("https://github.com/sequana/data/raw/master/sequana_db1/annotations.csv",
         dir1 + os.sep + "annotations.csv")
def _get_qualities(self):
    from sequana import logger
    logger.info("Extracting qualities")
    qualities = []
    with FastqReader(self.filename) as f:
        for i, record in enumerate(f):
            if i < self.max_sample:
                quality = [ord(x) - 33 for x in record.qualities]
                qualities.append(quality)
            else:
                break
    return qualities
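
# Self-contained sketch of the quality decoding used above: FASTQ quality
# characters are Phred scores offset by 33 (Sanger encoding), so 'I' maps to
# Q40 and '!' to Q0. The quality string below is hypothetical.
quality_string = "II!#5"
phred = [ord(c) - 33 for c in quality_string]
print(phred)  # -> [40, 40, 0, 2, 20]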
def ena_id_to_gi_number(self, identifiers):
    # Now, let us convert the ENA accessions to NCBI GI numbers once for all.
    # We can fetch at most 200 identifiers per request:
    logger.info("Fetching %s identifiers from NCBI" % len(identifiers))
    Nbaskets = int(math.ceil(len(identifiers) / 200.))
    results = {}
    from easydev import split_into_chunks
    for chunk in split_into_chunks(identifiers, Nbaskets):
        result = self.eutils.accession_to_info(",".join(chunk))
        results.update(result)
    return results
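
# Minimal sketch of the batching logic above, without the easydev dependency:
# a list of identifiers is split into ceil(N / 200) chunks so that each web
# request carries at most 200 accessions. The identifiers are hypothetical.
import math

identifiers = ["ACC%05d" % i for i in range(450)]
chunk_size = 200
nchunks = math.ceil(len(identifiers) / chunk_size)
chunks = [identifiers[i * chunk_size:(i + 1) * chunk_size] for i in range(nchunks)]
print([len(c) for c in chunks])  # -> [200, 200, 50]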
def _download_assembly_report(self, category):
    assert category in self.category

    ftp = ftplib.FTP("ftp.ncbi.nlm.nih.gov")
    ftp.login("anonymous", "anonymous")
    ftp.cwd("genomes/refseq/{}".format(category))

    filename = "assembly_summary.txt"
    ftp.retrbinary('RETR ' + filename,
                   open(filename.replace(".txt", "_{}.txt".format(category)),
                        "wb").write)
    logger.info(filename)
def _download_minikraken(self, verbose=True):
    dv = DevTools()
    base = sequana_config_path + os.sep + ""
    taxondir = base + os.sep + "taxonomy"
    dv.mkdir(base)
    dv.mkdir(taxondir)

    logger.info("Downloading minikraken (4Gb)")

    filename = base + os.sep + "minikraken.tgz"
    if os.path.exists(filename) and md5(filename) == "30eab12118158d0b31718106785195e2":
        logger.warning("%s already present" % filename)
    else:
        wget("https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz", filename)
def __init__(self, directory=".", prefix=""):
    self.prefix = prefix
    self.directory = directory
    self.sample_name = "undefined"

    # low quality isoforms
    filename = "all.polished_lq.fastq"
    self.lq_isoforms = self.get_file(filename)
    if self.lq_isoforms:
        logger.info("Reading {}".format(filename))
        self.lq_sequence = FastQ(self.lq_isoforms)

    # high quality isoforms
    filename = "all.polished_hq.fastq"
    self.hq_isoforms = self.get_file(filename)
    if self.hq_isoforms:
        logger.info("Reading {}".format(filename))
        self.hq_sequence = FastQ(self.hq_isoforms)

    # General info
    filename = "file.csv"
    self.csv = self.get_file(filename)
    if self.csv:
        logger.info("Reading {}".format(filename))
        self.data = pd.read_csv(self.csv)

    # CCS fasta sequence
    #self.ccs = self.get_file("-ccs.tar.gz")
    filename = "ccs.fasta"
    self.ccs = self.get_file(filename, noprefix=True)
    if self.ccs:
        logger.info("Reading {}".format(filename))
        self.ccs = FastA(self.ccs)
def _set_window(self, window):
    if (window > 0) & (window < 1):
        self._type_window = ('adapted to genome length: %.1f %% of total '
                             'length' % (window * 100))
        self._window = int(round(self.__len__() * window))
    elif (window >= 1) & (window <= self.__len__()):
        self._type_window = 'fixed window length: %d' % window
        self._window = int(window)
    else:
        raise ValueError("Incorrect value for window: choose either a float "
                         "in ]0,1[ (fraction of genome) or an integer in "
                         "[1, genome_length] (window size)")

    logger.info("Computing GC skew")
    self._compute_skews()
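
# Self-contained sketch of the window interpretation above: a value strictly
# between 0 and 1 is treated as a fraction of the genome length, while a value
# >= 1 is an absolute window size in bases. The genome length is hypothetical.
genome_length = 3_000_000

def resolve_window(window, genome_length):
    if 0 < window < 1:                      # fraction of the genome
        return int(round(genome_length * window))
    elif 1 <= window <= genome_length:      # absolute size in bases
        return int(window)
    raise ValueError("window must be in ]0,1[ or [1, genome_length]")

print(resolve_window(0.01, genome_length))   # -> 30000
print(resolve_window(20000, genome_length))  # -> 20000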
def export(self, filename='test.png'):
    if self.status is None:
        logger.error("Upload the tree first with upload() method")

    export = self.itol.get_itol_export()

    # Set the format
    if filename.endswith(".png"):
        logger.info("Exporting in {} format".format("png"))
        export.params['format'] = "png"
    elif filename.endswith(".svg"):
        logger.info("Exporting in {} format".format("svg"))
        export.params['format'] = "svg"
    elif filename.endswith(".pdf"):
        logger.info("Exporting in {} format".format("pdf"))
        export.params['format'] = "pdf"
    elif filename.endswith(".eps"):
        logger.info("Exporting in {} format".format("eps"))
        export.params['format'] = "eps"
    else:
        raise ValueError("filename must end in pdf, png, svg or eps")

    export.params.update(**self.params)
    export.export(filename)
def run(self, dbname="multiple", output_prefix="kraken_final"):
    """Run the hierarchical analysis

    This method does not return anything but creates a set of files:

    - kraken_final.out
    - krona_final.html
    - kraken.png  (pie plot of the classified/unclassified reads)

    .. note:: the databases are run in the order provided in the constructor.
    """
    # list of all output to merge at the end
    self._list_kraken_output = []
    self._list_kraken_input = []

    # Iteration over the databases
    for iteration in range(len(self.databases)):
        self._run_one_analysis(iteration)

    # concatenate all kraken output files
    file_output_final = self.output_directory + os.sep + "%s.out" % output_prefix
    with open(file_output_final, 'w') as outfile:
        for fname in self._list_kraken_output:
            with open(fname) as infile:
                for line in infile:
                    outfile.write(line)

    # create html report
    logger.info("Analysing results")
    result = KrakenResults(file_output_final)

    # TODO: this looks similar to the code in KrakenPipeline. could be factorised
    result.to_js("%s%s%s.html" % (self.output_directory, os.sep, output_prefix))
    result.plot(kind="pie")
    pylab.savefig(self.output_directory + os.sep + "kraken.png")

    prefix = self.output_directory + os.sep
    result.kraken_to_json(prefix + "kraken.json", dbname)
    result.kraken_to_csv(prefix + "kraken.csv", dbname)

    # remove kraken intermediate files (including unclassified files)
    if not self.keep_temp_files:
        for f_temp in self._list_kraken_output:
            os.remove(f_temp)
        for f_temp in self._list_kraken_input:
            os.remove(f_temp)
def _run_one_analysis(self, iteration):
    """Run one analysis"""
    db = self.databases[iteration]
    logger.info("Analysing data using database {}".format(db))

    # By default, the output contains only classified reads
    only_classified_output = True

    # a convenient alias
    _pathto = lambda x: self.output_directory + x

    # the output is saved in this file
    file_kraken_class = _pathto("kraken_%d.out" % iteration)
    output_filename_unclassified = _pathto("unclassified_%d.fastq" % iteration)
    file_fastq_unclass = _pathto("unclassified_%d.fastq" % iteration)

    if iteration == 0:
        inputs = self.inputs
    else:
        # previous results
        inputs = self._list_kraken_input[iteration - 1]

    # if this is the last iteration (even if iteration is zero), save
    # classified and unclassified in the final kraken results.
    if iteration == len(self.databases) - 1:
        only_classified_output = False

    analysis = KrakenAnalysis(inputs, db, self.threads)
    analysis.run(output_filename=file_kraken_class,
                 output_filename_unclassified=output_filename_unclassified,
                 only_classified_output=only_classified_output)

    self._list_kraken_input.append(file_fastq_unclass)
    self._list_kraken_output.append(file_kraken_class)

    if self.keep_temp_files:
        result = KrakenResults(file_kraken_class)
        result.to_js("%skrona_%d.html" % (self.output_directory, iteration))
def filter_mapq(self, output_filename, threshold_min=0, threshold_max=255):
    """Select and write reads within a given MAPQ range

    :param str output_filename: name of the output file
    :param int threshold_min: minimum MAPQ of the reads to keep
    :param int threshold_max: maximum MAPQ of the reads to keep

    """
    assert threshold_min < threshold_max
    assert output_filename != self.filename, \
        "output filename should be different from the input filename"
    self.reset()
    count = 0
    with pysam.AlignmentFile(output_filename, "wb", template=self.data) as fh:
        for read in self.data:
            if (read.mapq < threshold_max) & (read.mapq > threshold_min):
                fh.write(read)
            count += 1
            if count % 10000 == 0:
                logger.info("%s sequences processed" % count)
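
# Self-contained sketch of the strict-inequality range used above: a read is
# kept only if threshold_min < mapq < threshold_max, so reads whose MAPQ
# equals either bound are excluded. The MAPQ values below are hypothetical.
threshold_min, threshold_max = 30, 255
mapqs = [0, 30, 31, 60, 255]
kept = [q for q in mapqs if threshold_min < q < threshold_max]
print(kept)  # -> [31, 60]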
def random_selection(self, output_filename, nreads=None,
                     expected_coverage=None, reference_length=None,
                     read_lengths=None):
    """Select random reads

    :param nreads: number of reads to select randomly. Must be less than
        the number of available reads in the original file.
    :param expected_coverage:
    :param reference_length: if expected_coverage and reference_length are
        provided, nreads is replaced automatically.

    .. note:: to speed up the computation (if you need to call
        random_selection many times), you can provide the read lengths
        manually
    """
    assert output_filename != self.filename, \
        "output filename should be different from the input filename"

    if read_lengths is None:
        self.reset()
        read_lengths = [read.query_length for i, read in enumerate(self.data)]

    N = len(read_lengths)

    if expected_coverage and reference_length:
        mu = pylab.mean(read_lengths)
        nreads = int(expected_coverage * reference_length / mu)

    assert nreads < N, "nreads parameter larger than actual number of reads"
    selector = random.sample(range(N), nreads)
    logger.info("Creating a pacbio BAM file with {} reads".format(nreads))

    with pysam.AlignmentFile(output_filename, "wb", template=self.data) as fh:
        self.reset()
        for i, read in enumerate(self.data):
            if i in selector:
                fh.write(read)
def _download_kraken_toydb(self, verbose=True):
    """Download the kraken DB toy example from sequana_data into
    .config/sequana directory

    Checks the md5 checksums. About 32Mb of data.
    """
    dv = DevTools()
    base = sequana_config_path + os.sep + "kraken_toydb"
    taxondir = base + os.sep + "taxonomy"
    dv.mkdir(base)
    dv.mkdir(taxondir)

    baseurl = "https://github.com/sequana/data/raw/master/"

    # download only if required
    logger.info("Downloading the database into %s" % base)

    md5sums = [
        "28661f8baf0514105b0c6957bec0fc6e",
        "97a39d44ed86cadea470352d6f69748d",
        "d91a0fcbbc0f4bbac918755b6400dea6",
        "c8bae69565af2170ece194925b5fdeb9"]
    filenames = [
        "database.idx",
        "database.kdb",
        "taxonomy/names.dmp",
        "taxonomy/nodes.dmp"]

    for filename, md5sum in zip(filenames, md5sums):
        url = baseurl + "kraken_toydb/%s" % filename
        filename = base + os.sep + filename
        if os.path.exists(filename) and md5(filename) == md5sum:
            logger.warning("%s already present" % filename)
        else:
            logger.info("Downloading %s" % url)
            wget(url, filename)
def _to_fastX(self, mode, output_filename, threads=2):
    """
    :param mode: fastq or fasta
    """
    # for now, we use samtools.
    # bamtools could be used as well but is as slow and its output is 10%
    # larger (sequences are split on 80-character lines)
    from snakemake import shell
    cmd = "samtools %s -@ %s %s > %s" % (mode, threads, self.filename,
                                         output_filename)
    logger.info("Please be patient")
    logger.info("This may be long depending on your input data file: ")
    logger.info("typically, a minute per 500,000 reads")
    shell(cmd)
    logger.info("done")
def stats(self):
    results = {}
    if self.data is not None:
        logger.info("Reading strand")
        results['strand'] = {
            "+": sum(self.data.strand == "+"),
            "-": sum(self.data.strand == "-"),
            "?": sum(self.data.strand.isnull())
        }

        results['classification'] = {
            "total_ccs_reads": len(self.data),
            "five_prime_reads": int(self.data.fiveseen.sum()),
            "three_prime_reads": int(self.data.threeseen.sum()),
            "chimera": int(self.data.chimera.sum()),
            "polyA_reads": int(self.data.polyAseen.sum()),
        }

    if self.lq_isoforms:
        logger.info("Reading LQ isoforms")
        results['lq_isoform'] = self.lq_sequence.stats()  # number of

    if self.hq_isoforms:
        logger.info("Reading HQ isoforms")
        results['hq_isoform'] = self.hq_sequence.stats()  # number of polished HQ isoform

    if self.ccs:
        seq = [len(read.sequence) for read in self.ccs]
        results["CCS"] = {
            "mean_length": pylab.mean(seq),
            "number_ccs_bases": sum(seq),
            "number_ccs_reads": len(seq)
        }

    self.idents_v = []
    self.full_v = []
    self.non_full_v = []
    self.isoform_lengths = []
    for read in self.lq_sequence:
        ident, full, non_full, length = read['identifier'].decode().split(";")
        self.idents_v.append(ident)
        self.full_v.append(int(full.split("=")[1]))
        self.non_full_v.append(int(non_full.split("=")[1]))
        self.isoform_lengths.append(int(length.split("=")[1]))

    return results
def get_df_concordance(self, max_align=-1):
    """This method returns a dataframe with Insert, Deletion, Match,
    Substitution, read length, concordance (see below for a definition).

    Be aware that the SAM or BAM file must be created using minimap2 and the
    --cs option to store the CIGAR in a new CS format, which also contains
    the information about substitutions. Other mappers are also handled
    (e.g. bwa) but the substitutions are solely based on the NM tag if it
    exists.

    Alignments that have no CS tag or CIGAR are ignored.
    """
    from sequana import Cigar

    count = 0
    I, D, M, L, mapq, flags, NM = [], [], [], [], [], [], []
    S = []
    for i, a in enumerate(self._data):
        # tags and cigar are populated if there is a match.
        # if we use --cs, cigar is not populated so we can only look at tags.
        # tags can be an empty list
        if a.tags is None or len(a.tags) == 0:
            continue
        count += 1
        mapq.append(a.mapq)
        L.append(a.qlen)
        try:
            NM.append([x[1] for x in a.tags if x[0] == "NM"][0])
        except IndexError:
            NM.append(-1)

        flags.append(a.flag)

        if 'cs' in dict(a.tags):
            cs = CS(dict(a.tags)['cs'])
            S.append(cs['S'])
            I.append(cs['I'])
            D.append(cs['D'])
            M.append(cs['M'])
        elif a.cigarstring:
            cigar = Cigar(a.cigarstring).as_dict()
            I.append(cigar["I"])
            D.append(cigar['D'])
            M.append(cigar['M'])
            S.append(None)  # no info about substitutions in the cigar
        else:
            I.append(0)
            D.append(0)
            M.append(0)
            S.append(0)

        if max_align > 0 and count == max_align:
            break

        if count % 10000 == 0:
            logger.debug("Read {} alignments".format(count))

    I = np.array(I)
    D = np.array(D)
    M = np.array(M)
    NM = np.array(NM)

    try:
        S = np.array(S)
        C = 1 - (I + D + S) / (S + I + D + M)
        logger.info("computed concordance based on minimap2 --cs option")
    except:
        logger.info("computed concordance based on standard CIGAR information "
                    "using INDEL and the NM tag")
        computed_S = NM - D - I
        C = 1 - (I + D + computed_S) / (computed_S + I + D + M)

    df = pd.DataFrame([C, L, I, D, M, mapq, flags, NM, S])
    df = df.T
    df.columns = ["concordance", 'length', "I", "D", "M", "mapq", "flags",
                  "NM", "mismatch"]
    return df
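
# Self-contained sketch of the fallback used above when no --cs tag is present:
# the number of substitutions is estimated from the NM tag (edit distance) as
# S = NM - D - I, and concordance = 1 - (I + D + S) / (I + D + S + M).
# The counts below are hypothetical.
NM, D, I, M = 12, 4, 3, 980
S = NM - D - I                       # substitutions inferred from edit distance
concordance = 1 - (I + D + S) / (I + D + S + M)
print(S, round(concordance, 4))      # -> 5 0.9879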
def _get_df(self):
    # When scanning the BAM, we can extract the length, the SNR of ACGT (still
    # need to know how to use it), the GC content (note there is no ambiguity
    # so no S character) and the ZMW. Also, from the tags we could get more.
    #
    # In each alignment, there is a lot of information to retrieve.
    # One could for instance introspect the tags:
    # - cx: subread local context flags
    # - ip: vector of length qlen from 0 to 250. This is the IPD (raw frames
    #       or codec V1)
    # - np: number of passes (1 for subread, variable for CCS)
    # - pw: vector of length qlen from 0 to 128? This is the PulseWidth (raw
    #       frames or codec V1)
    # - qs: 0-based start of query in the ZMW read (absent in CCS)
    # - qe: 0-based end of query in the ZMW read (absent in CCS)
    # - zm: position/ID of the ZMW
    # - sn: list of ACGT SNRs. A, C, G, T in that order
    # - rq: float encoding expected accuracy
    # - dq: DeletionQV
    # - dt: deletion Tag
    # - iq: insertionQV
    # - mq: mergeQV
    # - sq: substitutionQV
    # - st: substitution tag
    # - RG: ?
    # See http://pacbiofileformats.readthedocs.io/en/3.0/BAM.html
    if self._df is None:
        logger.info("Scanning input file. Please wait")
        self.reset()
        N = 0
        all_results = []
        # This takes 60% of the time...could use cython ?
        for i, read in enumerate(self.data):
            tags = dict(read.tags)
            res = []
            # count reads
            N += 1
            if (N % 10000) == 0:
                logger.info("Read %d sequences" % N)
            # res[0] = read length
            res.append(read.query_length)  # also stored in tags["qe"] - tags["qs"]
            res.append(read.reference_length)
            # collections.Counter is slow, let us do it ourselves
            if read.query_length and read.query_sequence:
                res.append(100. / read.query_length * sum(
                    [read.query_sequence.count(letter) for letter in "CGcgSs"]))
            else:
                res.append(None)

            # res[1:4] contains SNR stored in tags['sn'] in the order A, C, G, T
            try:
                snr = list(tags['sn'])
            except KeyError:
                snr = [None] * 4
            res = res + snr

            # res[6] = ZMW name, also stored in tags["zm"]
            try:
                res.append(int(read.qname.split('/')[1]))
            except (IndexError, ValueError):
                # simulated data may not have the ZMW info, in which
                # case we store just a unique ID
                res.append(i)

            # aggregate results
            all_results.append(res)

            if self._sample and N >= self._sample:
                break

        self._df = pd.DataFrame(all_results,
                                columns=['read_length', "reference_length",
                                         'GC_content', 'snr_A', 'snr_C',
                                         'snr_G', 'snr_T', 'ZMW'])

        # populate the nb passes from the ZMW
        grouped = self._df.groupby("ZMW")
        agg = grouped.agg({"read_length": len})

        ZMW = self._df.ZMW.unique()
        aa = list(pylab.flatten([[agg.loc[this][0]] * agg.loc[this][0]
                                 for this in ZMW]))
        self._df['nb_passes'] = aa
        self._df['nb_passes'] -= 1  # nb passes starts at 0

    self.reset()
    return self._df
def run_analysis(chrom, options, feature_dict):
    logger.info("Computing some metrics")
    if chrom.DOC < 8:
        logger.warning("The depth of coverage is below 8. sequana_coverage is"
                       " not optimised for such depth. You may want to"
                       " increase the threshold to avoid too many false detections")
    logger.info(chrom.__str__())

    if options.w_median > len(chrom.df) / 4:
        NW = int(len(chrom.df) / 4)
        if NW % 2 == 0:
            NW += 1
        logger.warning("median window length is too long. \n"
                       "  Setting the window length automatically to a quarter of\n"
                       "  the chromosome length ({})".format(NW))
        options.w_median = NW

    # compute the running median, zscore and ROIs for each chunk, summarizing
    # the results in a ChromosomeCovMultiChunk instance
    logger.info('Using running median (w=%s)' % options.w_median)
    logger.info("Number of mixture models %s " % options.k)
    results = chrom.run(options.w_median, options.k,
                        circular=options.circular,
                        binning=options.binning,
                        cnv_delta=options.cnv_clustering)

    # Print some info related to the fitted mixture models
    try:
        mu = results.data[0][0].as_dict()['data']['fit_mu']
        sigma = results.data[0][0].as_dict()['data']['fit_sigma']
        pi = results.data[0][0].as_dict()['data']['fit_pi']
        logger.info("Fitted central distribution (first chunk): mu=%s, sigma=%s, pi=%s" %
                    (round(mu, 3), round(sigma, 3), round(pi, 3)))
    except Exception:
        pass

    # some information about the ROIs found
    high = chrom.thresholds.high2
    low = chrom.thresholds.low2
    logger.info("Searching for ROIs (threshold=[{},{}] ; double =[{},{}])".format(
        chrom.thresholds.low, chrom.thresholds.high, low, high))
    ROIs = results.get_rois()  # results is a ChromosomeCovMultiChunk instance
    logger.info("Number of ROIs found: {}".format(len(ROIs.df)))
    logger.info("  - below average: {}".format(len(ROIs.get_low_rois())))
    logger.info("  - above average: {}".format(len(ROIs.get_high_rois())))

    # Create directory and save ROIs
    directory = options.output_directory
    directory += os.sep + "coverage_reports"
    directory += os.sep + chrom.chrom_name
    mkdirs(directory)
    ROIs.df.to_csv("{}/rois.csv".format(directory))

    # save summary and metrics
    logger.info("Computing extra metrics")
    summary = results.get_summary()

    summary.to_json(directory + os.sep + "sequana_summary_coverage.json")
    logger.info("Evenness: {}".format(summary.data['evenness']))
    logger.info("Centralness (3 sigma): {}".format(summary.data['C3']))
    logger.info("Centralness (4 sigma): {}".format(summary.data['C4']))

    if options.skip_html:
        return

    logger.info("Creating report in %s. Please wait" % config.output_dir)
    if chrom._mode == "chunks":
        logger.warning("This chromosome is large. "
                       "Plots in the HTML reports are skipped")

    datatable = CoverageModule.init_roi_datatable(ROIs)
    ChromosomeCoverageModule(chrom, datatable,
                             options={"W": options.w_median,
                                      "k": options.k,
                                      "ROIs": ROIs,
                                      "circular": options.circular},
                             command=" ".join(["sequana_coverage"] + sys.argv[1:]))
def main(args=None):
    if args is None:
        args = sys.argv[:]

    user_options = Options(prog="sequana")

    # If --help or no options provided, show the help
    if len(args) == 1:
        user_options.parse_args(["prog", "--help"])
    else:
        options = user_options.parse_args(args[1:])

    logger.level = options.logging_level

    if options.download_reference:
        logger.info("Downloading reference %s from %s\n" %
                    (options.download_reference, options.database))
        from bioservices.apps import download_fasta as df
        df.download_fasta(options.download_reference, method=options.database)
        if options.download_genbank is None:
            return

    if options.download_genbank:
        logger.info("Downloading genbank %s from %s\n" %
                    (options.download_genbank, options.database))
        from sequana.snpeff import download_fasta_and_genbank
        download_fasta_and_genbank(options.download_genbank,
                                   options.download_genbank,
                                   genbank=True, fasta=False)
        return

    if options.genbank:
        assert os.path.exists(options.genbank), \
            "%s does not exist" % options.genbank

    logger.info("Reading %s. This may take time depending on "
                "your input file" % options.input)

    # Convert BAM to BED
    if options.input.endswith(".bam"):
        bedfile = options.input.replace(".bam", ".bed")
        logger.info("Converting BAM into BED file")
        shellcmd("bedtools genomecov -d -ibam %s > %s" % (options.input, bedfile))
    elif options.input.endswith(".bed"):
        bedfile = options.input
    else:
        raise ValueError("Input file must be a BAM or BED file")

    # Set the thresholds
    if options.low_threshold is None:
        options.low_threshold = -options.threshold

    if options.high_threshold is None:
        options.high_threshold = options.threshold

    # and output directory
    config.output_dir = options.output_directory
    config.sample_name = os.path.basename(options.input).split('.')[0]

    # Now we can create the instance of GenomeCoverage
    if options.chromosome == -1:
        chrom_list = []
    else:
        chrom_list = [options.chromosome]
    gc = GenomeCov(bedfile, options.genbank, options.low_threshold,
                   options.high_threshold, options.double_threshold,
                   options.double_threshold, chunksize=options.chunksize,
                   chromosome_list=chrom_list)

    # if we have the reference, let us use it
    if options.reference:
        logger.info('Computing GC content')
        gc.compute_gc_content(options.reference, options.w_gc,
                              options.circular)

    # Now we scan the chromosomes
    if len(gc.chrom_names) == 1:
        logger.warning("There is only one chromosome. Selected automatically.")
        run_analysis(gc.chr_list[0], options, gc.feature_dict)
    elif options.chromosome < -1 or options.chromosome > len(gc.chrom_names):
        msg = "invalid chromosome index; must be in [1;{}]".format(len(gc.chrom_names))
        logger.error(msg)
        sys.exit(1)
    else:
        if options.chromosome == -1:
            chromosomes = gc.chrom_names  # take all chromosomes
        else:
            # For the user, we start at position 1 but in python we start at zero
            chromosomes = [gc.chrom_names[options.chromosome - 1]]

        logger.info("There are %s chromosomes/contigs." % len(gc))
        for this in gc.chrom_names:
            data = (this, gc.positions[this]["start"], gc.positions[this]["end"])
            logger.info("  {} (starting pos: {}, ending pos: {})".format(*data))

        # here we read chromosome by chromosome to save memory.
        # However, if the data is small.
        for i, chrom in enumerate(chromosomes):
            logger.info("==================== analysing chrom/contig %s/%s (%s)"
                        % (i + 1, len(gc), gc.chrom_names[i]))
            # since we read just one contig/chromosome, the chr_list contains
            # only one contig, so we access it with index 0
            run_analysis(gc.chr_list[i], options, gc.feature_dict)

    if options.skip_multiqc is False:
        logger.info("=========================")
        logger.info("Creating multiqc report")
        pathtocfg = sequana_data("multiqc_config.yaml", "../multiqc/")
        cmd = 'multiqc . -m sequana_coverage -f -c {}'.format(pathtocfg)
        import subprocess
        proc = subprocess.Popen(cmd.split(), cwd=options.output_directory)
        proc.wait()
def _compute_skews(self):
    ### initialisation: compute GC skew and AT skew for the first window
    self._init_sliding_window()
    GC_content_slide, GC_skew_slide = self._init_list_results()
    AT_content_slide, AT_skew_slide = self._init_list_results()
    self._init_cumul_nuc()

    c = Counter(self._slide_window)
    dict_counts = {'G': c['G'], 'C': c['C'], 'A': c['A'], 'T': c['T']}
    i = 0

    # GC
    sumGC = float(dict_counts['G'] + dict_counts['C'])
    GC_content_slide[0][i] = sumGC
    if sumGC > 0:
        GC_skew_slide[0][i] = (dict_counts['G'] - dict_counts['C']) / sumGC
    # AT
    sumAT = float(dict_counts['A'] + dict_counts['T'])
    AT_content_slide[0][i] = sumAT
    if sumAT > 0:
        AT_skew_slide[0][i] = (dict_counts['A'] - dict_counts['T']) / sumAT

    ### Compute for the whole genome
    while self._seq_right:
        out_nuc = self._slide_window.popleft()
        in_nuc = self._seq_right.popleft()
        self._slide_window.append(in_nuc)

        i += 1

        if i % 500000 == 0:
            logger.info("%d / %d" % (i, self.__len__()))

        # if in and out are the same: do nothing, append same result
        if out_nuc != in_nuc:
            # remove out from counters, add in
            if out_nuc in self._dict_nuc:
                dict_counts[out_nuc] -= 1
            if in_nuc in self._dict_nuc:
                dict_counts[in_nuc] += 1
            sumGC = float(dict_counts['G'] + dict_counts['C'])
            sumAT = float(dict_counts['A'] + dict_counts['T'])

        # fill results
        # GC
        GC_content_slide[0][i] = sumGC
        if sumGC > 0:
            GC_skew_slide[0][i] = (dict_counts['G'] - dict_counts['C']) / sumGC
        # AT
        AT_content_slide[0][i] = sumAT
        if sumAT > 0:
            AT_skew_slide[0][i] = (dict_counts['A'] - dict_counts['T']) / sumAT
        # cumul
        if in_nuc in self._dict_nuc:
            self._cumul[self._dict_nuc[in_nuc]][i + self._window - 1] += 1

    self._GC_content_slide = GC_content_slide / float(self._window)
    self._AT_content_slide = AT_content_slide / float(self._window)
    self._cumul = np.delete(self._cumul,
                            range(self.__len__(), self._cumul.shape[1]), 1)
    self._cumul = np.cumsum(self._cumul, axis=1)

    ### save results for the Z curve
    self._Xn = list((self._cumul[self._dict_nuc['A']] + self._cumul[self._dict_nuc['G']]) -
                    (self._cumul[self._dict_nuc['C']] + self._cumul[self._dict_nuc['T']]))
    self._Yn = list((self._cumul[self._dict_nuc['A']] + self._cumul[self._dict_nuc['C']]) -
                    (self._cumul[self._dict_nuc['G']] + self._cumul[self._dict_nuc['T']]))
    self._Zn = list((self._cumul[self._dict_nuc['A']] + self._cumul[self._dict_nuc['T']]) -
                    (self._cumul[self._dict_nuc['C']] + self._cumul[self._dict_nuc['G']]))

    self._AT_skew_slide = AT_skew_slide
    self._GC_skew_slide = GC_skew_slide

    ### check the proportion of ignored nucleotides
    GC_content_total = (self._cumul[self._dict_nuc['G']][-1] +
                        self._cumul[self._dict_nuc['C']][-1]) / float(self.__len__())
    AT_content_total = (self._cumul[self._dict_nuc['A']][-1] +
                        self._cumul[self._dict_nuc['T']][-1]) / float(self.__len__())
    self._ignored_nuc = 1.0 - GC_content_total - AT_content_total
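
# Self-contained sketch of the skew formulas used above, on a hypothetical
# window of sequence: GC skew = (G - C) / (G + C) and AT skew = (A - T) / (A + T).
from collections import Counter

window = "ATGCGGGCCATATTTA"
c = Counter(window)
gc_skew = (c['G'] - c['C']) / (c['G'] + c['C'])
at_skew = (c['A'] - c['T']) / (c['A'] + c['T'])
print(round(gc_skew, 3), round(at_skew, 3))  # -> 0.143 -0.111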